/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *      Sleepycat Software.  All rights reserved.
 */
#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)mp_sync.c     10.25 (Sleepycat) 4/26/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"

static int __bhcmp __P((const void *, const void *));
static int __memp_fsync __P((DB_MPOOLFILE *));
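
/*
 * (The __P() macro in the declarations above is the usual BSD
 * compatibility idiom: it expands to an ANSI prototype when the compiler
 * supports prototypes and to empty parentheses for K&R compilers.)
 */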

/*
 * memp_sync --
 *      Mpool sync function.
 */
int
memp_sync(dbmp, lsnp)
        DB_MPOOL *dbmp;
        DB_LSN *lsnp;
{
        BH *bhp, **bharray;
        DB_ENV *dbenv;
        MPOOL *mp;
        MPOOLFILE *mfp;
        int ar_cnt, cnt, nalloc, next, ret, wrote;

        dbenv = dbmp->dbenv;

        if (dbenv->lg_info == NULL) {
                __db_err(dbenv, "memp_sync: requires logging");
                return (EINVAL);
        }

        /*
         * We try and write the buffers in page order so that the underlying
         * filesystem doesn't have to seek and can write contiguous blocks,
         * plus, we don't want to hold the region lock while we write the
         * buffers.  Get memory to hold the buffer pointers.  Get a good-size
         * block, too, because we realloc while holding the region lock if we
         * run out.
         */
        if ((bharray =
            (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
                return (ENOMEM);

        LOCKREGION(dbmp);

        /*
         * If the application is asking about a previous call to memp_sync(),
         * and we haven't found any buffers that the application holding the
         * pin couldn't write, return yes or no based on the current count.
         * Note, if the application is asking about a LSN *smaller* than one
         * we've already handled or are currently handling, then we return a
         * result based on the count for the larger LSN.
         */
        mp = dbmp->mp;
        if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
                if (mp->lsn_cnt == 0) {
                        *lsnp = mp->lsn;
                        ret = 0;
                } else
                        ret = DB_INCOMPLETE;
                goto done;
        }
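
        /*
         * (DB_INCOMPLETE means some dirty buffers were pinned and could not
         * be written; the caller is expected to retry the sync later with
         * the same LSN.)
         */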

        /* Else, it's a new checkpoint. */
        F_CLR(mp, MP_LSN_RETRY);

        /*
         * Save the LSN.  We know that it's a new LSN or larger than the one
         * for which we were already doing a checkpoint.  (BTW, I don't expect
         * to see multiple LSN's from the same or multiple processes, but You
         * Just Never Know.  Responding as if they all called with the largest
         * of the LSNs specified makes everything work.)
         *
         * We don't currently use the LSN we save.  We could potentially save
         * the last-written LSN in each buffer header and use it to determine
         * what buffers need to be written.  The problem with this is that it's
         * sizeof(LSN) more bytes of buffer header.  We currently write all the
         * dirty buffers instead.
         *
         * Walk the list of shared memory segments clearing the count of
         * buffers waiting to be written.
         */
        mp->lsn = *lsnp;
        mp->lsn_cnt = 0;
        for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
            mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
                mfp->lsn_cnt = 0;

        /*
         * Walk the list of buffers and mark all dirty buffers to be written
         * and all pinned buffers to be potentially written (we can't know if
         * we'll need to write them until the holding process returns them to
         * the cache).  We do this in one pass while holding the region locked
         * so that processes can't make new buffers dirty, causing us to never
         * finish.  Since the application may have restarted the sync, clear
         * any BH_WRITE flags that appear to be left over from previous calls.
         *
         * Keep a count of the total number of buffers we need to write in
         * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
         */
        ar_cnt = 0;
        for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
                if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
                        F_SET(bhp, BH_WRITE);

                        ++mp->lsn_cnt;

                        mfp = R_ADDR(dbmp, bhp->mf_offset);
                        ++mfp->lsn_cnt;

                        /*
                         * If the buffer isn't in use, we should be able to
                         * write it immediately, so save a reference to it.
                         */
                        if (bhp->ref == 0) {
                                if (ar_cnt == nalloc) {
                                        nalloc *= 2;
                                        if ((bharray =
                                            (BH **)__db_realloc(bharray,
                                            nalloc * sizeof(BH *))) == NULL) {
                                                ret = ENOMEM;
                                                goto err;
                                        }
                                }
                                bharray[ar_cnt++] = bhp;
                        }
                } else
                        if (F_ISSET(bhp, BH_WRITE))
                                F_CLR(bhp, BH_WRITE);

        /* If there are no buffers we can write immediately, we're done. */
        if (ar_cnt == 0) {
                ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
                goto done;
        }

        /* Lock down the buffers and their contents. */
        for (cnt = 0; cnt < ar_cnt; ++cnt)
                ++bharray[cnt]->ref;
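
        /*
         * (The extra reference pins each buffer so it cannot be discarded
         * or reused while the region lock is dropped for the sort below.)
         */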

        UNLOCKREGION(dbmp);

        /* Sort the buffers we're going to write. */
        qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        LOCKREGION(dbmp);

        /* Walk the array, writing buffers. */
        for (next = 0; next < ar_cnt; ++next) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[next]->ref > 1) {
                        --bharray[next]->ref;
                        continue;
                }

                /* Write the buffer. */
                mfp = R_ADDR(dbmp, bharray[next]->mf_offset);
                ret = __memp_bhwrite(dbmp, mfp, bharray[next], NULL, &wrote);

                /* Release the buffer. */
                --bharray[next]->ref;

                /* If there's an error, release the rest of the buffers. */
                if (ret != 0 || !wrote) {
                        /*
                         * Any process syncing the shared memory buffer pool
                         * had better be able to write to any underlying file.
                         * Be understanding, but firm, on this point.
                         */
                        if (ret == 0) {
                                __db_err(dbenv, "%s: unable to flush page: %lu",
                                    __memp_fns(dbmp, mfp),
                                    (u_long)bharray[next]->pgno);
                                ret = EPERM;
                        }
                        while (++next < ar_cnt)
                                --bharray[next]->ref;
                        goto err;
                }
        }
        ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
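
        /*
         * (The if (0) below is a deliberate idiom: straight-line code that
         * reaches "done" jumps over the error cleanup at "err", and both
         * paths then share the unlock/free/return sequence.)
         */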

done:   if (0) {
err:            /*
                 * On error, clear:
                 *      MPOOL->lsn_cnt (the total sync count)
                 *      MPOOLFILE->lsn_cnt (the per-file sync count)
                 *      BH_WRITE flag (the scheduled for writing flag)
                 */
                mp->lsn_cnt = 0;
                for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
                    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
                        mfp->lsn_cnt = 0;
                for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
                    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
                        F_CLR(bhp, BH_WRITE);
        }
        UNLOCKREGION(dbmp);
        __db_free(bharray);
        return (ret);
}

/*
 * memp_fsync --
 *      Mpool file sync function.
 */
int
memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        DB_MPOOL *dbmp;
        int is_tmp;

        dbmp = dbmfp->dbmp;

        /*
         * If this handle doesn't have a file descriptor that's open for
         * writing, or if the file is a temporary, there's no reason to
         * proceed further.
         */
        if (F_ISSET(dbmfp, MP_READONLY))
                return (0);

        LOCKREGION(dbmp);
        is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
        UNLOCKREGION(dbmp);
        if (is_tmp)
                return (0);

        return (__memp_fsync(dbmfp));
}

/*
 * __mp_xxx_fd --
 *      Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fd __P((DB_MPOOLFILE *, int *));
 */
int
__mp_xxx_fd(dbmfp, fdp)
        DB_MPOOLFILE *dbmfp;
        int *fdp;
{
        int ret;

        /*
         * This is a truly spectacular layering violation, intended ONLY to
         * support compatibility for the DB 1.85 DB->fd call.
         *
         * Sync the database file to disk, creating the file as necessary.
         *
         * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
         * The MP_READONLY test isn't interesting because we will either
         * already have a file descriptor (we opened the database file for
         * reading) or we aren't readonly (we created the database which
         * requires write privileges).  The MP_TEMP test isn't interesting
         * because we want to write to the backing file regardless so that
         * we get a file descriptor to return.
         */
        ret = dbmfp->fd == -1 ? __memp_fsync(dbmfp) : 0;

        return ((*fdp = dbmfp->fd) == -1 ? ENOENT : ret);
}

/*
 * __memp_fsync --
 *      Mpool file internal sync function.
 */
static int
__memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        BH *bhp, **bharray;
        DB_MPOOL *dbmp;
        size_t mf_offset;
        int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;

        ret = 0;
        dbmp = dbmfp->dbmp;
        mf_offset = R_OFFSET(dbmp, dbmfp->mfp);

        /*
         * We try and write the buffers in page order so that the underlying
         * filesystem doesn't have to seek and can write contiguous blocks,
         * plus, we don't want to hold the region lock while we write the
         * buffers.  Get memory to hold the buffer pointers.  Get a good-size
         * block, too, because we realloc while holding the region lock if we
         * run out.
         */
        nalloc = 1024;
        if ((bharray =
            (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
                return (ENOMEM);

        LOCKREGION(dbmp);

        /*
         * Walk the LRU list of buffer headers, and get a list of buffers to
         * write for this MPOOLFILE.
         */
        ar_cnt = pincnt = 0;
        for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
                        continue;
                if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
                        ++pincnt;
                        continue;
                }

                if (ar_cnt == nalloc) {
                        nalloc *= 2;
                        if ((bharray = (BH **)__db_realloc(bharray,
                            nalloc * sizeof(BH *))) == NULL) {
                                ret = ENOMEM;
                                goto err;
                        }
                }

                bharray[ar_cnt++] = bhp;
        }
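
        /*
         * (pincnt counts dirty buffers belonging to this file that are
         * pinned or locked for I/O and so cannot be written now; if any
         * remain unwritten at the end, we return DB_INCOMPLETE.)
         */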

        /* Lock down the buffers and their contents. */
        for (cnt = 0; cnt < ar_cnt; ++cnt)
                ++bharray[cnt]->ref;

        UNLOCKREGION(dbmp);

        /* Sort the buffers we're going to write. */
        qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        LOCKREGION(dbmp);

        /* Walk the array, writing buffers. */
        for (next = 0; next < ar_cnt; ++next) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[next]->ref > 1) {
                        ++pincnt;

                        --bharray[next]->ref;
                        continue;
                }

                /* Write the buffer. */
                ret = __memp_pgwrite(dbmfp, bharray[next], NULL, &wrote);

                /* Release the buffer. */
                --bharray[next]->ref;

                /* If there's an error, release the rest of the buffers. */
                if (ret != 0) {
                        while (++next < ar_cnt)
                                --bharray[next]->ref;
                        goto err;
                }

                if (!wrote)
                        ++pincnt;
        }

err:    UNLOCKREGION(dbmp);

        __db_free(bharray);

        /*
         * Sync the underlying file as the last thing we do, so that the OS
         * has maximal opportunity to flush buffers before we request it.
         *
         * XXX:
         * Don't lock the region around the sync, fsync(2) has no atomicity
         * issues.
         */
        if (ret == 0)
                return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
        return (ret);
}

/*
 * memp_trickle --
 *      Keep a specified percentage of the buffers clean.
 */
int
memp_trickle(dbmp, pct, nwrotep)
        DB_MPOOL *dbmp;
        int pct, *nwrotep;
{
        BH *bhp;
        MPOOL *mp;
        MPOOLFILE *mfp;
        u_long total;
        int ret, wrote;

        mp = dbmp->mp;
        if (nwrotep != NULL)
                *nwrotep = 0;

        if (pct < 1 || pct > 100)
                return (EINVAL);

        LOCKREGION(dbmp);

        /*
         * If there are sufficient clean buffers, or no buffers or no dirty
         * buffers, we're done.
         *
         * XXX
         * Using st_page_clean and st_page_dirty is our only choice at the
         * moment, but it's not as correct as we might like in the presence
         * of pools with more than one buffer size, as a free 512-byte buffer
         * isn't the same as a free 8K buffer.
         */
loop:   total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
        if (total == 0 || mp->stat.st_page_dirty == 0 ||
            (mp->stat.st_page_clean * 100) / total >= (u_long)pct) {
                UNLOCKREGION(dbmp);
                return (0);
        }
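
        /*
         * (Example: with pct == 20 and a pool of 15 clean and 85 dirty
         * pages, the clean ratio is 15%, so we keep writing buffers until
         * at least 20% of the pages are clean.)
         */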

        /* Loop until we write a buffer. */
        for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                if (bhp->ref != 0 ||
                    !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
                        continue;

                mfp = R_ADDR(dbmp, bhp->mf_offset);

                /*
                 * We can't write to temporary files -- see the comment in
                 * mp_bh.c:__memp_bhwrite().
                 */
                if (F_ISSET(mfp, MP_TEMP))
                        continue;

                if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
                        goto err;

                /*
                 * Any process syncing the shared memory buffer pool had better
                 * be able to write to any underlying file.  Be understanding,
                 * but firm, on this point.
                 */
                if (!wrote) {
                        __db_err(dbmp->dbenv, "%s: unable to flush page: %lu",
                            __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
                        ret = EPERM;
                        goto err;
                }

                ++mp->stat.st_page_trickle;
                if (nwrotep != NULL)
                        ++*nwrotep;
                goto loop;
        }

        /*
         * No more buffers to write.  Fall through to the error label so
         * that the region lock is released before we return.
         */
        ret = 0;

err:    UNLOCKREGION(dbmp);
        return (ret);
}
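
/*
 * __bhcmp --
 *      qsort(3) comparison function: order buffer headers by file (the
 *      shared memory pool offset of the MPOOLFILE), then by page number
 *      within the file.
 */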
static int
__bhcmp(p1, p2)
        const void *p1, *p2;
{
        BH *bhp1, *bhp2;

        bhp1 = *(BH * const *)p1;
        bhp2 = *(BH * const *)p2;

        /* Sort by file (shared memory pool offset). */
        if (bhp1->mf_offset < bhp2->mf_offset)
                return (-1);
        if (bhp1->mf_offset > bhp2->mf_offset)
                return (1);
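
        /*
         * (No equality case is needed below: the pool never holds two
         * buffers for the same page of the same file, so the mf_offset/pgno
         * pair is unique across the array.)
         */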

        /* Sort by page in file. */
        return (bhp1->pgno < bhp2->pgno ? -1 : 1);
}