/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
#ifndef lint
static const char sccsid[] = "@(#)mp_sync.c	10.25 (Sleepycat) 4/26/98";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"
static int __bhcmp __P((const void *, const void *));
static int __memp_fsync __P((DB_MPOOLFILE *));
/*
 * memp_sync --
 *	Mpool sync function.
 */
int
memp_sync(dbmp, lsnp)
	DB_MPOOL *dbmp;
	DB_LSN *lsnp;
{
	BH *bhp, **bharray;
	DB_ENV *dbenv;
	MPOOL *mp;
	MPOOLFILE *mfp;
	int ar_cnt, cnt, nalloc, next, ret, wrote;

	dbenv = dbmp->dbenv;
	if (dbenv->lg_info == NULL) {
		__db_err(dbenv, "memp_sync: requires logging");
		return (EINVAL);
	}
	/*
	 * We try and write the buffers in page order so that the underlying
	 * filesystem doesn't have to seek and can write contiguous blocks,
	 * plus, we don't want to hold the region lock while we write the
	 * buffers.  Get memory to hold the buffer pointers.  Get a good-size
	 * block, too, because we realloc while holding the region lock if we
	 * run out.
	 */
	if ((bharray =
	    (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
		return (ENOMEM);
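	/*
	 * (Sizing note: 1024 pointers is 4KB or 8KB of memory, and the array
	 * doubles on overflow -- 1024, 2048, 4096, ... -- so the reallocs
	 * done while the region is locked cost amortized O(1) per buffer.)
	 */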

	LOCKREGION(dbmp);

	/*
	 * If the application is asking about a previous call to memp_sync(),
	 * and we haven't found any buffers that the application holding the
	 * pin couldn't write, return yes or no based on the current count.
	 * Note, if the application is asking about an LSN *smaller* than one
	 * we've already handled or are currently handling, then we return a
	 * result based on the count for the larger LSN.
	 */
	mp = dbmp->mp;
	if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
		if (mp->lsn_cnt == 0) {
			*lsnp = mp->lsn;
			ret = 0;
		} else
			ret = DB_INCOMPLETE;
		goto done;
	}
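	/*
	 * (log_compare() returns a value less than, equal to or greater than
	 * zero as its first LSN is less than, equal to or greater than its
	 * second, so the test above catches both a retry of the current
	 * checkpoint LSN and any earlier one.)
	 */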
	/* Else, it's a new checkpoint. */
	F_CLR(mp, MP_LSN_RETRY);
	/*
	 * Save the LSN.  We know that it's a new LSN or larger than the one
	 * for which we were already doing a checkpoint.  (BTW, I don't expect
	 * to see multiple LSNs from the same or multiple processes, but You
	 * Just Never Know.  Responding as if they all called with the largest
	 * of the LSNs specified makes everything work.)
	 *
	 * We don't currently use the LSN we save.  We could potentially save
	 * the last-written LSN in each buffer header and use it to determine
	 * what buffers need to be written.  The problem with this is that it's
	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
	 * dirty buffers instead.
	 *
	 * Walk the list of shared memory segments clearing the count of
	 * buffers waiting to be written.
	 */
	mp->lsn = *lsnp;
	mp->lsn_cnt = 0;
	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
		mfp->lsn_cnt = 0;
	/*
	 * Walk the list of buffers and mark all dirty buffers to be written
	 * and all pinned buffers to be potentially written (we can't know if
	 * we'll need to write them until the holding process returns them to
	 * the cache).  We do this in one pass while holding the region locked
	 * so that processes can't make new buffers dirty, causing us to never
	 * finish.  Since the application may have restarted the sync, clear
	 * any BH_WRITE flags that appear to be left over from previous calls.
	 *
	 * Keep a count of the total number of buffers we need to write in
	 * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
	 */
	ar_cnt = 0;
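	/*
	 * (Invariant established by the loop below: MPOOL->lsn_cnt is the
	 * number of buffers with BH_WRITE set, and each MPOOLFILE->lsn_cnt
	 * is that file's share of the total; the counts drop back toward
	 * zero as the buffers get written.)
	 */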
	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
		if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
			F_SET(bhp, BH_WRITE);

			++mp->lsn_cnt;

			mfp = R_ADDR(dbmp, bhp->mf_offset);
			++mfp->lsn_cnt;
			/*
			 * If the buffer isn't in use, we should be able to
			 * write it immediately, so save a reference to it.
			 */
			if (bhp->ref == 0) {
				if (ar_cnt == nalloc) {
					nalloc *= 2;
					if ((bharray =
					    (BH **)__db_realloc(bharray,
					    nalloc * sizeof(BH *))) == NULL) {
						ret = ENOMEM;
						goto err;
					}
				}
				bharray[ar_cnt++] = bhp;
			}
		} else
			if (F_ISSET(bhp, BH_WRITE))
				F_CLR(bhp, BH_WRITE);
	/* If there are no buffers we can write immediately, we're done. */
	if (ar_cnt == 0) {
		ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
		goto done;
	}
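	/*
	 * (DB_INCOMPLETE is the "call me again" return: pinned buffers still
	 * count against mp->lsn_cnt and get written as their holders return
	 * them to the cache, so the application retries memp_sync() with the
	 * same LSN to check on the progress.)
	 */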
	/* Lock down the buffers and their contents. */
	for (cnt = 0; cnt < ar_cnt; ++cnt)
		++bharray[cnt]->ref;

	UNLOCKREGION(dbmp);
	/* Sort the buffers we're going to write. */
	qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

	LOCKREGION(dbmp);
	/* Walk the array, writing buffers. */
	for (next = 0; next < ar_cnt; ++next) {
		/*
		 * It's possible for a thread to have gotten the buffer since
		 * we listed it for writing.  If the reference count is still
		 * 1, we're the only ones using the buffer, so go ahead and
		 * write.  If it's >1, skip the buffer and assume that it will
		 * be written when it's returned to the cache.
		 */
		if (bharray[next]->ref > 1) {
			--bharray[next]->ref;
			continue;
		}
		/* Write the buffer. */
		mfp = R_ADDR(dbmp, bharray[next]->mf_offset);
		ret = __memp_bhwrite(dbmp, mfp, bharray[next], NULL, &wrote);

		/* Release the buffer. */
		--bharray[next]->ref;
		/* If there's an error, release the rest of the buffers. */
		if (ret != 0 || !wrote) {
			/*
			 * Any process syncing the shared memory buffer pool
			 * had better be able to write to any underlying file.
			 * Be understanding, but firm, on this point.
			 */
			if (ret == 0) {
				__db_err(dbenv,
				    "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp),
				    (u_long)bharray[next]->pgno);
				ret = EPERM;
			}

			while (++next < ar_cnt)
				--bharray[next]->ref;
			goto err;
		}
	}
	ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
done:	if (0) {
err:		/*
		 * On error, clear:
		 *	MPOOL->lsn_cnt (the total sync count)
		 *	MPOOLFILE->lsn_cnt (the per-file sync count)
		 *	BH_WRITE flag (the scheduled-for-writing flag)
		 */
		mp->lsn_cnt = 0;
		for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
			mfp->lsn_cnt = 0;
		for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
			F_CLR(bhp, BH_WRITE);
	}

	UNLOCKREGION(dbmp);
	__db_free(bharray);
	return (ret);
}
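#if 0
/*
 * Usage sketch, not part of the original file: a checkpointing caller
 * retries on DB_INCOMPLETE, giving buffer holders time to return pinned
 * pages.  The dbmp/ckp_lsn/ret variables and the __db_sleep() utility
 * are assumed here for illustration.
 */
while ((ret = memp_sync(dbmp, &ckp_lsn)) == DB_INCOMPLETE)
	(void)__db_sleep(1, 0);
#endif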
/*
 * memp_fsync --
 *	Mpool file sync function.
 */
int
memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_MPOOL *dbmp;
	int is_tmp;

	dbmp = dbmfp->dbmp;
	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, there's no reason to
	 * proceed further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);
	LOCKREGION(dbmp);
	is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
	UNLOCKREGION(dbmp);
	if (is_tmp)
		return (0);
	return (__memp_fsync(dbmfp));
}
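/*
 * (Note: like memp_sync(), __memp_fsync() returns DB_INCOMPLETE when some
 * of the file's dirty buffers were pinned or locked at the time of the
 * call; the file isn't durable yet and the caller is expected to retry.)
 */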
/*
 * __mp_xxx_fd --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fd __P((DB_MPOOLFILE *, int *));
 */
int
__mp_xxx_fd(dbmfp, fdp)
	DB_MPOOLFILE *dbmfp;
	int *fdp;
{
	int ret;
	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database, which
	 * requires write privileges).  The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless, so that
	 * we get a file descriptor to return.
	 */
	ret = dbmfp->fd == -1 ? __memp_fsync(dbmfp) : 0;

	return ((*fdp = dbmfp->fd) == -1 ? ENOENT : ret);
}
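/*
 * (Hypothetical usage sketch: the DB 1.85 compatibility code does roughly
 * "ret = __mp_xxx_fd(dbmfp, &fd);" and then locks fd, which is why a
 * missing descriptor maps to a plain ENOENT instead of an assertion.)
 */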
/*
 * __memp_fsync --
 *	Mpool file internal sync function.
 */
static int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	BH *bhp, **bharray;
	DB_MPOOL *dbmp;
	size_t mf_offset;
	int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;
	ret = 0;
	dbmp = dbmfp->dbmp;
	mf_offset = R_OFFSET(dbmp, dbmfp->mfp);
	/*
	 * We try and write the buffers in page order so that the underlying
	 * filesystem doesn't have to seek and can write contiguous blocks,
	 * plus, we don't want to hold the region lock while we write the
	 * buffers.  Get memory to hold the buffer pointers.  Get a good-size
	 * block, too, because we realloc while holding the region lock if we
	 * run out.
	 */
	nalloc = 1024;
	if ((bharray =
	    (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
		return (ENOMEM);

	LOCKREGION(dbmp);
	/*
	 * Walk the LRU list of buffer headers, and get a list of buffers to
	 * write for this MPOOLFILE.
	 */
	ar_cnt = pincnt = 0;
	for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
		if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
			continue;
		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
			++pincnt;
			continue;
		}
		if (ar_cnt == nalloc) {
			nalloc *= 2;
			if ((bharray = (BH **)__db_realloc(bharray,
			    nalloc * sizeof(BH *))) == NULL) {
				ret = ENOMEM;
				goto err;
			}
		}
		bharray[ar_cnt++] = bhp;
	}
	/* Lock down the buffers and their contents. */
	for (cnt = 0; cnt < ar_cnt; ++cnt)
		++bharray[cnt]->ref;

	UNLOCKREGION(dbmp);
	/* Sort the buffers we're going to write. */
	qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

	LOCKREGION(dbmp);
	/* Walk the array, writing buffers. */
	for (next = 0; next < ar_cnt; ++next) {
		/*
		 * It's possible for a thread to have gotten the buffer since
		 * we listed it for writing.  If the reference count is still
		 * 1, we're the only ones using the buffer, so go ahead and
		 * write.  If it's >1, skip the buffer and assume that it will
		 * be written when it's returned to the cache.
		 */
		if (bharray[next]->ref > 1) {
			++pincnt;
			--bharray[next]->ref;
			continue;
		}
		/* Write the buffer. */
		ret = __memp_pgwrite(dbmfp, bharray[next], NULL, &wrote);

		/* Release the buffer. */
		--bharray[next]->ref;
		/* If there's an error, release the rest of the buffers. */
		if (ret != 0) {
			while (++next < ar_cnt)
				--bharray[next]->ref;
			goto err;
		}

		if (!wrote)
			++pincnt;
	}
err:	UNLOCKREGION(dbmp);

	__db_free(bharray);
	/*
	 * Sync the underlying file as the last thing we do, so that the OS
	 * has maximal opportunity to flush buffers before we request it.
	 *
	 * XXX:
	 * Don't lock the region around the sync, fsync(2) has no atomicity
	 * issues.
	 */
	if (ret != 0)
		return (ret);
	return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
}
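/*
 * (Note the interaction above: when pincnt != 0 the fsync is skipped
 * entirely and DB_INCOMPLETE is returned, since making a partial set of
 * pages durable buys nothing when the caller has to repeat the pass.)
 */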
/*
 * memp_trickle --
 *	Keep a specified percentage of the buffers clean.
 */
int
memp_trickle(dbmp, pct, nwrotep)
	DB_MPOOL *dbmp;
	int pct, *nwrotep;
{
	BH *bhp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	u_long total;
	int ret, wrote;

	mp = dbmp->mp;

	if (nwrotep != NULL)
		*nwrotep = 0;
	if (pct < 1 || pct > 100)
		return (EINVAL);

	LOCKREGION(dbmp);
	/*
	 * If there are sufficient clean buffers, or no buffers or no dirty
	 * buffers, we're done.
	 *
	 * XXX
	 * Using st_page_clean and st_page_dirty is our only choice at the
	 * moment, but it's not as correct as we might like in the presence
	 * of pools with more than one buffer size, as a free 512-byte buffer
	 * isn't the same as a free 8K buffer.
	 */
loop:	total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
	if (total == 0 || mp->stat.st_page_dirty == 0 ||
	    (mp->stat.st_page_clean * 100) / total >= (u_long)pct) {
		UNLOCKREGION(dbmp);
		return (0);
	}
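	/*
	 * (Worked example: with st_page_clean == 900 and st_page_dirty ==
	 * 300, total is 1200 and the clean ratio is 900 * 100 / 1200 == 75,
	 * so a pct of 70 returns immediately while a pct of 80 writes.)
	 */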
	/* Loop until we write a buffer. */
	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
		if (bhp->ref != 0 ||
		    !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
			continue;
		mfp = R_ADDR(dbmp, bhp->mf_offset);
		/*
		 * We can't write to temporary files -- see the comment in
		 * mp_bh.c:__memp_bhwrite().
		 */
		if (F_ISSET(mfp, MP_TEMP))
			continue;
		if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
			goto err;
		/*
		 * Any process syncing the shared memory buffer pool had better
		 * be able to write to any underlying file.  Be understanding,
		 * but firm, on this point.
		 */
		if (!wrote) {
			__db_err(dbmp->dbenv, "%s: unable to flush page: %lu",
			    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
			ret = EPERM;
			goto err;
		}
		++mp->stat.st_page_trickle;
		if (nwrotep != NULL)
			++*nwrotep;
		goto loop;
	}
	/* No more buffers to write. */
	ret = 0;
err:	UNLOCKREGION(dbmp);
	return (ret);
}
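#if 0
/*
 * Usage sketch, not part of the original file: a background thread nudges
 * the pool toward 20% clean buffers once a second.  The dbmp handle, the
 * ret/nwrote variables and the __db_sleep() utility are assumed here.
 */
for (;;) {
	if ((ret = memp_trickle(dbmp, 20, &nwrote)) != 0)
		break;
	(void)__db_sleep(1, 0);
}
#endif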
static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH *bhp1, *bhp2;

	bhp1 = *(BH * const *)p1;
	bhp2 = *(BH * const *)p2;
	/* Sort by file (shared memory pool offset). */
	if (bhp1->mf_offset < bhp2->mf_offset)
		return (-1);
	if (bhp1->mf_offset > bhp2->mf_offset)
		return (1);
	/* Sort by page in file. */
	return (bhp1->pgno < bhp2->pgno ? -1 : 1);
}
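/*
 * (Note: __bhcmp never returns 0 -- two entries with equal mf_offset and
 * pgno would compare as 1.  That's harmless here because the pool holds at
 * most one buffer header per page of a file, so the sort never actually
 * sees equal keys.)
 */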