Update.
[glibc.git] / db2 / mp / mp_bh.c
blobd89f9c2ded4f743e0fba40e45e747fe0cd5f26b8
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 #include "config.h"
9 #ifndef lint
10 static const char sccsid[] = "@(#)mp_bh.c 10.38 (Sleepycat) 5/20/98";
11 #endif /* not lint */
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
16 #include <errno.h>
17 #include <string.h>
18 #include <unistd.h>
19 #endif
21 #include "db_int.h"
22 #include "shqueue.h"
23 #include "db_shash.h"
24 #include "mp.h"
25 #include "common_ext.h"
27 static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
30 * __memp_bhwrite --
31 * Write the page associated with a given bucket header.
33 * PUBLIC: int __memp_bhwrite
34 * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
36 int
37 __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
38 DB_MPOOL *dbmp;
39 MPOOLFILE *mfp;
40 BH *bhp;
41 int *restartp, *wrotep;
43 DB_MPOOLFILE *dbmfp;
44 DB_MPREG *mpreg;
46 if (restartp != NULL)
47 *restartp = 0;
48 if (wrotep != NULL)
49 *wrotep = 0;
52 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
53 * the file. We also check that the descriptor is open for writing.
54 * If we find a descriptor on the file that's not open for writing, we
55 * try and upgrade it to make it writeable. If that fails, we're done.
57 LOCKHANDLE(dbmp, dbmp->mutexp);
58 for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
59 dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
60 if (dbmfp->mfp == mfp) {
61 if (F_ISSET(dbmfp, MP_READONLY) &&
62 __memp_upgrade(dbmp, dbmfp, mfp)) {
63 UNLOCKHANDLE(dbmp, dbmp->mutexp);
64 return (0);
66 break;
68 UNLOCKHANDLE(dbmp, dbmp->mutexp);
69 if (dbmfp != NULL)
70 goto found;
73 * It's not a page from a file we've opened. If the file requires
74 * input/output processing, see if this process has ever registered
75 * information as to how to write this type of file. If not, there's
76 * nothing we can do.
78 if (mfp->ftype != 0) {
79 LOCKHANDLE(dbmp, dbmp->mutexp);
80 for (mpreg = LIST_FIRST(&dbmp->dbregq);
81 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
82 if (mpreg->ftype == mfp->ftype)
83 break;
84 UNLOCKHANDLE(dbmp, dbmp->mutexp);
85 if (mpreg == NULL)
86 return (0);
90 * Try and open the file, attaching to the underlying shared area.
92 * XXX
93 * Don't try to attach to temporary files. There are two problems in
94 * trying to do that. First, if we have different privileges than the
95 * process that "owns" the temporary file, we might create the backing
96 * disk file such that the owning process couldn't read/write its own
97 * buffers, e.g., memp_trickle() running as root creating a file owned
98 * as root, mode 600. Second, if the temporary file has already been
99 * created, we don't have any way of finding out what its real name is,
100 * and, even if we did, it was already unlinked (so that it won't be
101 * left if the process dies horribly). This decision causes a problem,
102 * however: if the temporary file consumes the entire buffer cache,
103 * and the owner doesn't flush the buffers to disk, we could end up
104 * with resource starvation, and the memp_trickle() thread couldn't do
105 * anything about it. That's a pretty unlikely scenario, though.
107 * XXX
108 * There's no negative cache, so we may repeatedly try and open files
109 * that we have previously tried (and failed) to open.
111 * Ignore any error, assume it's a permissions problem.
113 if (F_ISSET(mfp, MP_TEMP))
114 return (0);
116 if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp, mfp->path_off),
117 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
118 return (0);
120 found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep));
124 * __memp_pgread --
125 * Read a page from a file.
127 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
130 __memp_pgread(dbmfp, bhp, can_create)
131 DB_MPOOLFILE *dbmfp;
132 BH *bhp;
133 int can_create;
135 DB_MPOOL *dbmp;
136 MPOOLFILE *mfp;
137 size_t pagesize;
138 ssize_t nr;
139 int ret;
141 dbmp = dbmfp->dbmp;
142 mfp = dbmfp->mfp;
143 pagesize = mfp->stat.st_pagesize;
145 F_SET(bhp, BH_LOCKED | BH_TRASH);
146 LOCKBUFFER(dbmp, bhp);
147 UNLOCKREGION(dbmp);
150 * Temporary files may not yet have been created.
152 * Seek to the page location.
154 ret = 0;
155 LOCKHANDLE(dbmp, dbmfp->mutexp);
156 if (dbmfp->fd == -1 || (ret =
157 __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0) {
158 if (!can_create) {
159 if (dbmfp->fd == -1)
160 ret = EINVAL;
161 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
162 __db_err(dbmp->dbenv,
163 "%s: page %lu doesn't exist, create flag not set",
164 __memp_fn(dbmfp), (u_long)bhp->pgno);
165 goto err;
167 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
169 /* Clear the created page. */
170 if (mfp->clear_len == 0)
171 memset(bhp->buf, 0, pagesize);
172 else {
173 memset(bhp->buf, 0, mfp->clear_len);
174 #ifdef DIAGNOSTIC
175 memset(bhp->buf + mfp->clear_len,
176 0xff, pagesize - mfp->clear_len);
177 #endif
180 goto pgin;
184 * Read the page; short reads are treated like creates, although
185 * any valid data is preserved.
187 ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr);
188 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
189 if (ret != 0)
190 goto err;
192 if (nr == (ssize_t)pagesize)
193 can_create = 0;
194 else {
195 if (!can_create) {
196 ret = EINVAL;
197 goto err;
201 * If we didn't fail until we tried the read, don't clear the
202 * whole page, it wouldn't be insane for a filesystem to just
203 * always behave that way. Else, clear any uninitialized data.
205 if (nr == 0)
206 memset(bhp->buf, 0,
207 mfp->clear_len == 0 ? pagesize : mfp->clear_len);
208 else
209 memset(bhp->buf + nr, 0, pagesize - nr);
212 /* Call any pgin function. */
213 pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
215 /* Unlock the buffer and reacquire the region lock. */
216 err: UNLOCKBUFFER(dbmp, bhp);
217 LOCKREGION(dbmp);
220 * If no errors occurred, the data is now valid, clear the BH_TRASH
221 * flag; regardless, clear the lock bit and let other threads proceed.
223 F_CLR(bhp, BH_LOCKED);
224 if (ret == 0) {
225 F_CLR(bhp, BH_TRASH);
227 /* Update the statistics. */
228 if (can_create) {
229 ++dbmp->mp->stat.st_page_create;
230 ++mfp->stat.st_page_create;
231 } else {
232 ++dbmp->mp->stat.st_page_in;
233 ++mfp->stat.st_page_in;
237 return (ret);
241 * __memp_pgwrite --
242 * Write a page to a file.
244 * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
247 __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
248 DB_MPOOLFILE *dbmfp;
249 BH *bhp;
250 int *restartp, *wrotep;
252 DB_ENV *dbenv;
253 DB_LOG *lg_info;
254 DB_LSN lsn;
255 DB_MPOOL *dbmp;
256 MPOOL *mp;
257 MPOOLFILE *mfp;
258 size_t pagesize;
259 ssize_t nw;
260 int callpgin, ret, syncfail;
261 const char *fail;
263 dbmp = dbmfp->dbmp;
264 dbenv = dbmp->dbenv;
265 mp = dbmp->mp;
266 mfp = dbmfp->mfp;
268 if (restartp != NULL)
269 *restartp = 0;
270 if (wrotep != NULL)
271 *wrotep = 0;
272 callpgin = 0;
273 pagesize = mfp->stat.st_pagesize;
276 * Check the dirty bit -- this buffer may have been written since we
277 * decided to write it.
279 if (!F_ISSET(bhp, BH_DIRTY)) {
280 if (wrotep != NULL)
281 *wrotep = 1;
282 return (0);
285 LOCKBUFFER(dbmp, bhp);
288 * If there were two writers, we may have just been waiting while the
289 * other writer completed I/O on this buffer. Check the dirty bit one
290 * more time.
292 if (!F_ISSET(bhp, BH_DIRTY)) {
293 UNLOCKBUFFER(dbmp, bhp);
295 if (wrotep != NULL)
296 *wrotep = 1;
297 return (0);
300 F_SET(bhp, BH_LOCKED);
301 UNLOCKREGION(dbmp);
303 if (restartp != NULL)
304 *restartp = 1;
306 /* Copy the LSN off the page if we're going to need it. */
307 lg_info = dbenv->lg_info;
308 if (lg_info != NULL || F_ISSET(bhp, BH_WRITE))
309 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
311 /* Ensure the appropriate log records are on disk. */
312 if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0)
313 goto err;
316 * Call any pgout function. We set the callpgin flag so that we flag
317 * that the contents of the buffer will need to be passed through pgin
318 * before they are reused.
320 if (mfp->ftype == 0)
321 ret = 0;
322 else {
323 callpgin = 1;
324 if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
325 goto err;
328 /* Temporary files may not yet have been created. */
329 LOCKHANDLE(dbmp, dbmfp->mutexp);
330 if (dbmfp->fd == -1 &&
331 ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
332 DB_CREATE | DB_EXCL | DB_TEMPORARY, &dbmfp->fd, NULL)) != 0 ||
333 dbmfp->fd == -1)) {
334 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
335 __db_err(dbenv, "unable to create temporary backing file");
336 goto err;
340 * Write the page out.
342 * XXX
343 * Shut the compiler up; it doesn't understand the correlation between
344 * the failing clauses to __db_lseek and __db_write and this ret != 0.
346 COMPQUIET(fail, NULL);
347 if ((ret =
348 __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0)
349 fail = "seek";
350 else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
351 fail = "write";
352 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
353 if (ret != 0)
354 goto syserr;
356 if (nw != (ssize_t)pagesize) {
357 ret = EIO;
358 fail = "write";
359 goto syserr;
362 if (wrotep != NULL)
363 *wrotep = 1;
365 /* Unlock the buffer and reacquire the region lock. */
366 UNLOCKBUFFER(dbmp, bhp);
367 LOCKREGION(dbmp);
370 * Clean up the flags based on a successful write.
372 * If we rewrote the page, it will need processing by the pgin
373 * routine before reuse.
375 if (callpgin)
376 F_SET(bhp, BH_CALLPGIN);
377 F_CLR(bhp, BH_DIRTY | BH_LOCKED);
380 * If we write a buffer for which a checkpoint is waiting, update
381 * the count of pending buffers (both in the mpool as a whole and
382 * for this file). If the count for this file goes to zero, flush
383 * the writes.
385 * XXX:
386 * Don't lock the region around the sync, fsync(2) has no atomicity
387 * issues.
389 * XXX:
390 * We ignore errors from the sync -- it makes no sense to return an
391 * error to the calling process, so set a flag causing the checkpoint
392 * to be retried later.
394 if (F_ISSET(bhp, BH_WRITE)) {
395 if (mfp->lsn_cnt == 1) {
396 UNLOCKREGION(dbmp);
397 syncfail = __db_fsync(dbmfp->fd) != 0;
398 LOCKREGION(dbmp);
399 if (syncfail)
400 F_SET(mp, MP_LSN_RETRY);
404 F_CLR(bhp, BH_WRITE);
407 * If the buffer just written has a larger LSN than the current
408 * max LSN written for this checkpoint, update the saved value.
410 if (log_compare(&lsn, &mp->lsn) > 0)
411 mp->lsn = lsn;
413 --mp->lsn_cnt;
414 --mfp->lsn_cnt;
417 /* Update the page clean/dirty statistics. */
418 ++mp->stat.st_page_clean;
419 --mp->stat.st_page_dirty;
421 /* Update I/O statistics. */
422 ++mp->stat.st_page_out;
423 ++mfp->stat.st_page_out;
425 return (0);
427 syserr: __db_err(dbenv, "%s: %s failed for page %lu",
428 __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
430 err: /* Unlock the buffer and reacquire the region lock. */
431 UNLOCKBUFFER(dbmp, bhp);
432 LOCKREGION(dbmp);
435 * Clean up the flags based on a failure.
437 * The page remains dirty but we remove our lock. If we rewrote the
438 * page, it will need processing by the pgin routine before reuse.
440 if (callpgin)
441 F_SET(bhp, BH_CALLPGIN);
442 F_CLR(bhp, BH_LOCKED);
444 return (ret);
448 * __memp_pg --
449 * Call the pgin/pgout routine.
451 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
454 __memp_pg(dbmfp, bhp, is_pgin)
455 DB_MPOOLFILE *dbmfp;
456 BH *bhp;
457 int is_pgin;
459 DBT dbt, *dbtp;
460 DB_MPOOL *dbmp;
461 DB_MPREG *mpreg;
462 MPOOLFILE *mfp;
463 int ftype, ret;
465 dbmp = dbmfp->dbmp;
466 mfp = dbmfp->mfp;
468 LOCKHANDLE(dbmp, dbmp->mutexp);
470 ftype = mfp->ftype;
471 for (mpreg = LIST_FIRST(&dbmp->dbregq);
472 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
473 if (ftype != mpreg->ftype)
474 continue;
475 if (mfp->pgcookie_len == 0)
476 dbtp = NULL;
477 else {
478 dbt.size = mfp->pgcookie_len;
479 dbt.data = R_ADDR(dbmp, mfp->pgcookie_off);
480 dbtp = &dbt;
482 UNLOCKHANDLE(dbmp, dbmp->mutexp);
484 if (is_pgin) {
485 if (mpreg->pgin != NULL && (ret =
486 mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
487 goto err;
488 } else
489 if (mpreg->pgout != NULL && (ret =
490 mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)
491 goto err;
492 break;
495 if (mpreg == NULL)
496 UNLOCKHANDLE(dbmp, dbmp->mutexp);
498 return (0);
500 err: UNLOCKHANDLE(dbmp, dbmp->mutexp);
501 __db_err(dbmp->dbenv, "%s: %s failed for page %lu",
502 __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
503 return (ret);
507 * __memp_bhfree --
508 * Free a bucket header and its referenced data.
510 * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
512 void
513 __memp_bhfree(dbmp, mfp, bhp, free_mem)
514 DB_MPOOL *dbmp;
515 MPOOLFILE *mfp;
516 BH *bhp;
517 int free_mem;
519 size_t off;
521 /* Delete the buffer header from the hash bucket queue. */
522 off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno);
523 SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh);
525 /* Delete the buffer header from the LRU queue. */
526 SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
529 * If we're not reusing it immediately, free the buffer header
530 * and data for real.
532 if (free_mem) {
533 __db_shalloc_free(dbmp->addr, bhp);
534 --dbmp->mp->stat.st_page_clean;
539 * __memp_upgrade --
540 * Upgrade a file descriptor from readonly to readwrite.
542 static int
543 __memp_upgrade(dbmp, dbmfp, mfp)
544 DB_MPOOL *dbmp;
545 DB_MPOOLFILE *dbmfp;
546 MPOOLFILE *mfp;
548 int fd, ret;
549 char *rpath;
552 * !!!
553 * We expect the handle to already be locked.
556 /* Check to see if we've already upgraded. */
557 if (F_ISSET(dbmfp, MP_UPGRADE))
558 return (0);
560 /* Check to see if we've already failed. */
561 if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
562 return (1);
565 * Calculate the real name for this file and try to open it read/write.
566 * We know we have a valid pathname for the file because it's the only
567 * way we could have gotten a file descriptor of any kind.
569 if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
570 NULL, R_ADDR(dbmp, mfp->path_off), 0, NULL, &rpath)) != 0)
571 return (ret);
572 if (__db_open(rpath, 0, 0, 0, &fd) != 0) {
573 F_SET(dbmfp, MP_UPGRADE_FAIL);
574 ret = 1;
575 } else {
576 /* Swap the descriptors and set the upgrade flag. */
577 (void)__db_close(dbmfp->fd);
578 dbmfp->fd = fd;
579 F_SET(dbmfp, MP_UPGRADE);
580 ret = 0;
582 FREES(rpath);
583 return (ret);