/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *      Sleepycat Software.  All rights reserved.
 */
#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)mp_sync.c     10.25 (Sleepycat) 4/26/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"

static int __bhcmp __P((const void *, const void *));
static int __memp_fsync __P((DB_MPOOLFILE *));
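
/*
 * (The __P() macro in the declarations above is the usual BSD
 * compatibility idiom: it expands to an ANSI prototype when the compiler
 * supports prototypes and to empty parentheses for K&R compilers.)
 */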

/*
 * memp_sync --
 *      Mpool sync function.
 */
int
memp_sync(dbmp, lsnp)
        DB_MPOOL *dbmp;
        DB_LSN *lsnp;
{
        BH *bhp, **bharray;
        DB_ENV *dbenv;
        MPOOL *mp;
        MPOOLFILE *mfp;
        int ar_cnt, cnt, nalloc, next, ret, wrote;

        dbenv = dbmp->dbenv;

        if (dbenv->lg_info == NULL) {
                __db_err(dbenv, "memp_sync: requires logging");
                return (EINVAL);
        }

        /*
         * We try and write the buffers in page order so that the underlying
         * filesystem doesn't have to seek and can write contiguous blocks,
         * plus, we don't want to hold the region lock while we write the
         * buffers.  Get memory to hold the buffer pointers.  Get a good-size
         * block, too, because we realloc while holding the region lock if we
         * run out.
         */
        if ((bharray =
            (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
                return (ENOMEM);

        LOCKREGION(dbmp);

        /*
         * If the application is asking about a previous call to memp_sync(),
         * and we haven't found any buffers that the application holding the
         * pin couldn't write, return yes or no based on the current count.
         * Note, if the application is asking about a LSN *smaller* than one
         * we've already handled or are currently handling, then we return a
         * result based on the count for the larger LSN.
         */
        mp = dbmp->mp;
        if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
                if (mp->lsn_cnt == 0) {
                        *lsnp = mp->lsn;
                        ret = 0;
                } else
                        ret = DB_INCOMPLETE;
                goto done;
        }
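
        /*
         * (DB_INCOMPLETE means some dirty buffers were pinned and could not
         * be written; the caller is expected to retry the sync later with
         * the same LSN.)
         */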

        /* Else, it's a new checkpoint. */
        F_CLR(mp, MP_LSN_RETRY);

        /*
         * Save the LSN.  We know that it's a new LSN or larger than the one
         * for which we were already doing a checkpoint.  (BTW, I don't expect
         * to see multiple LSN's from the same or multiple processes, but You
         * Just Never Know.  Responding as if they all called with the largest
         * of the LSNs specified makes everything work.)
         *
         * We don't currently use the LSN we save.  We could potentially save
         * the last-written LSN in each buffer header and use it to determine
         * what buffers need to be written.  The problem with this is that it's
         * sizeof(LSN) more bytes of buffer header.  We currently write all the
         * dirty buffers instead.
         *
         * Walk the list of shared memory segments clearing the count of
         * buffers waiting to be written.
         */
        mp->lsn = *lsnp;
        mp->lsn_cnt = 0;
        for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
            mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
                mfp->lsn_cnt = 0;

        /*
         * Walk the list of buffers and mark all dirty buffers to be written
         * and all pinned buffers to be potentially written (we can't know if
         * we'll need to write them until the holding process returns them to
         * the cache).  We do this in one pass while holding the region locked
         * so that processes can't make new buffers dirty, causing us to never
         * finish.  Since the application may have restarted the sync, clear
         * any BH_WRITE flags that appear to be left over from previous calls.
         *
         * Keep a count of the total number of buffers we need to write in
         * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
         */
        ar_cnt = 0;
        for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
                if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
                        F_SET(bhp, BH_WRITE);

                        ++mp->lsn_cnt;

                        mfp = R_ADDR(dbmp, bhp->mf_offset);
                        ++mfp->lsn_cnt;

                        /*
                         * If the buffer isn't in use, we should be able to
                         * write it immediately, so save a reference to it.
                         */
                        if (bhp->ref == 0) {
                                if (ar_cnt == nalloc) {
                                        nalloc *= 2;
                                        if ((bharray =
                                            (BH **)__db_realloc(bharray,
                                            nalloc * sizeof(BH *))) == NULL) {
                                                ret = ENOMEM;
                                                goto err;
                                        }
                                }
                                bharray[ar_cnt++] = bhp;
                        }
                } else
                        if (F_ISSET(bhp, BH_WRITE))
                                F_CLR(bhp, BH_WRITE);

        /* If there are no buffers we can write immediately, we're done. */
        if (ar_cnt == 0) {
                ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
                goto done;
        }

        /* Lock down the buffers and their contents. */
        for (cnt = 0; cnt < ar_cnt; ++cnt)
                ++bharray[cnt]->ref;
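
        /*
         * (The extra reference pins each buffer so it cannot be discarded
         * or reused while the region lock is dropped for the sort below.)
         */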

        UNLOCKREGION(dbmp);

        /* Sort the buffers we're going to write. */
        qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        LOCKREGION(dbmp);

        /* Walk the array, writing buffers. */
        for (next = 0; next < ar_cnt; ++next) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[next]->ref > 1) {
                        --bharray[next]->ref;
                        continue;
                }

                /* Write the buffer. */
                mfp = R_ADDR(dbmp, bharray[next]->mf_offset);
                ret = __memp_bhwrite(dbmp, mfp, bharray[next], NULL, &wrote);

                /* Release the buffer. */
                --bharray[next]->ref;

                /* If there's an error, release the rest of the buffers. */
                if (ret != 0 || !wrote) {
                        /*
                         * Any process syncing the shared memory buffer pool
                         * had better be able to write to any underlying file.
                         * Be understanding, but firm, on this point.
                         */
                        if (ret == 0) {
                                __db_err(dbenv, "%s: unable to flush page: %lu",
                                    __memp_fns(dbmp, mfp),
                                    (u_long)bharray[next]->pgno);
                                ret = EPERM;
                        }
                        while (++next < ar_cnt)
                                --bharray[next]->ref;
                        goto err;
                }
        }
        ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
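
        /*
         * (The if (0) below is a deliberate idiom: straight-line code that
         * reaches "done" jumps over the error cleanup at "err", and both
         * paths then share the unlock/free/return sequence.)
         */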

done:   if (0) {
err:            /*
                 * On error, clear:
                 *      MPOOL->lsn_cnt (the total sync count)
                 *      MPOOLFILE->lsn_cnt (the per-file sync count)
                 *      BH_WRITE flag (the scheduled for writing flag)
                 */
                mp->lsn_cnt = 0;
                for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
                    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
                        mfp->lsn_cnt = 0;
                for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
                    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
                        F_CLR(bhp, BH_WRITE);
        }
        UNLOCKREGION(dbmp);
        __db_free(bharray);
        return (ret);
}

/*
 * memp_fsync --
 *      Mpool file sync function.
 */
int
memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        DB_MPOOL *dbmp;
        int is_tmp;

        dbmp = dbmfp->dbmp;

        /*
         * If this handle doesn't have a file descriptor that's open for
         * writing, or if the file is a temporary, there's no reason to
         * proceed further.
         */
        if (F_ISSET(dbmfp, MP_READONLY))
                return (0);

        LOCKREGION(dbmp);
        is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
        UNLOCKREGION(dbmp);
        if (is_tmp)
                return (0);

        return (__memp_fsync(dbmfp));
}

/*
 * __mp_xxx_fd --
 *      Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fd __P((DB_MPOOLFILE *, int *));
 */
int
__mp_xxx_fd(dbmfp, fdp)
        DB_MPOOLFILE *dbmfp;
        int *fdp;
{
        int ret;

        /*
         * This is a truly spectacular layering violation, intended ONLY to
         * support compatibility for the DB 1.85 DB->fd call.
         *
         * Sync the database file to disk, creating the file as necessary.
         *
         * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
         * The MP_READONLY test isn't interesting because we will either
         * already have a file descriptor (we opened the database file for
         * reading) or we aren't readonly (we created the database which
         * requires write privileges).  The MP_TEMP test isn't interesting
         * because we want to write to the backing file regardless so that
         * we get a file descriptor to return.
         */
        ret = dbmfp->fd == -1 ? __memp_fsync(dbmfp) : 0;

        return ((*fdp = dbmfp->fd) == -1 ? ENOENT : ret);
}

/*
 * __memp_fsync --
 *      Mpool file internal sync function.
 */
static int
__memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        BH *bhp, **bharray;
        DB_MPOOL *dbmp;
        size_t mf_offset;
        int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;

        ret = 0;
        dbmp = dbmfp->dbmp;
        mf_offset = R_OFFSET(dbmp, dbmfp->mfp);

        /*
         * We try and write the buffers in page order so that the underlying
         * filesystem doesn't have to seek and can write contiguous blocks,
         * plus, we don't want to hold the region lock while we write the
         * buffers.  Get memory to hold the buffer pointers.  Get a good-size
         * block, too, because we realloc while holding the region lock if we
         * run out.
         */
        nalloc = 1024;
        if ((bharray =
            (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
                return (ENOMEM);

        LOCKREGION(dbmp);

        /*
         * Walk the LRU list of buffer headers, and get a list of buffers to
         * write for this MPOOLFILE.
         */
        ar_cnt = pincnt = 0;
        for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
                        continue;
                if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
                        ++pincnt;
                        continue;
                }

                if (ar_cnt == nalloc) {
                        nalloc *= 2;
                        if ((bharray = (BH **)__db_realloc(bharray,
                            nalloc * sizeof(BH *))) == NULL) {
                                ret = ENOMEM;
                                goto err;
                        }
                }

                bharray[ar_cnt++] = bhp;
        }
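
        /*
         * (pincnt counts dirty buffers belonging to this file that are
         * pinned or locked for I/O and so cannot be written now; if any
         * remain unwritten at the end, we return DB_INCOMPLETE.)
         */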

        /* Lock down the buffers and their contents. */
        for (cnt = 0; cnt < ar_cnt; ++cnt)
                ++bharray[cnt]->ref;

        UNLOCKREGION(dbmp);

        /* Sort the buffers we're going to write. */
        qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        LOCKREGION(dbmp);

        /* Walk the array, writing buffers. */
        for (next = 0; next < ar_cnt; ++next) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[next]->ref > 1) {
                        ++pincnt;

                        --bharray[next]->ref;
                        continue;
                }

                /* Write the buffer. */
                ret = __memp_pgwrite(dbmfp, bharray[next], NULL, &wrote);

                /* Release the buffer. */
                --bharray[next]->ref;

                /* If there's an error, release the rest of the buffers. */
                if (ret != 0) {
                        while (++next < ar_cnt)
                                --bharray[next]->ref;
                        goto err;
                }

                if (!wrote)
                        ++pincnt;
        }

err:    UNLOCKREGION(dbmp);

        __db_free(bharray);

        /*
         * Sync the underlying file as the last thing we do, so that the OS
         * has maximal opportunity to flush buffers before we request it.
         *
         * XXX:
         * Don't lock the region around the sync, fsync(2) has no atomicity
         * issues.
         */
        if (ret == 0)
                return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
        return (ret);
}

/*
 * memp_trickle --
 *      Keep a specified percentage of the buffers clean.
 */
int
memp_trickle(dbmp, pct, nwrotep)
        DB_MPOOL *dbmp;
        int pct, *nwrotep;
{
        BH *bhp;
        MPOOL *mp;
        MPOOLFILE *mfp;
        u_long total;
        int ret, wrote;

        mp = dbmp->mp;
        if (nwrotep != NULL)
                *nwrotep = 0;

        if (pct < 1 || pct > 100)
                return (EINVAL);

        LOCKREGION(dbmp);

        /*
         * If there are sufficient clean buffers, or no buffers or no dirty
         * buffers, we're done.
         *
         * XXX
         * Using st_page_clean and st_page_dirty is our only choice at the
         * moment, but it's not as correct as we might like in the presence
         * of pools with more than one buffer size, as a free 512-byte buffer
         * isn't the same as a free 8K buffer.
         */
loop:   total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
        if (total == 0 || mp->stat.st_page_dirty == 0 ||
            (mp->stat.st_page_clean * 100) / total >= (u_long)pct) {
                UNLOCKREGION(dbmp);
                return (0);
        }
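
        /*
         * (Example: with pct == 20 and a pool of 15 clean and 85 dirty
         * pages, the clean ratio is 15%, so we keep writing buffers until
         * at least 20% of the pages are clean.)
         */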

        /* Loop until we write a buffer. */
        for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
            bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                if (bhp->ref != 0 ||
                    !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
                        continue;

                mfp = R_ADDR(dbmp, bhp->mf_offset);

                /*
                 * We can't write to temporary files -- see the comment in
                 * mp_bh.c:__memp_bhwrite().
                 */
                if (F_ISSET(mfp, MP_TEMP))
                        continue;

                if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
                        goto err;

                /*
                 * Any process syncing the shared memory buffer pool had better
                 * be able to write to any underlying file.  Be understanding,
                 * but firm, on this point.
                 */
                if (!wrote) {
                        __db_err(dbmp->dbenv, "%s: unable to flush page: %lu",
                            __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
                        ret = EPERM;
                        goto err;
                }

                ++mp->stat.st_page_trickle;
                if (nwrotep != NULL)
                        ++*nwrotep;
                goto loop;
        }

        /*
         * No more buffers to write.  Fall through to the error label so
         * that the region lock is released before we return.
         */
        ret = 0;

err:    UNLOCKREGION(dbmp);
        return (ret);
}
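
/*
 * __bhcmp --
 *      qsort(3) comparison function: order buffer headers by file (the
 *      shared memory pool offset of the MPOOLFILE), then by page number
 *      within the file.
 */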
static int
__bhcmp(p1, p2)
        const void *p1, *p2;
{
        BH *bhp1, *bhp2;

        bhp1 = *(BH * const *)p1;
        bhp2 = *(BH * const *)p2;

        /* Sort by file (shared memory pool offset). */
        if (bhp1->mf_offset < bhp2->mf_offset)
                return (-1);
        if (bhp1->mf_offset > bhp2->mf_offset)
                return (1);
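
        /*
         * (No equality case is needed below: the pool never holds two
         * buffers for the same page of the same file, so the mf_offset/pgno
         * pair is unique across the array.)
         */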

        /* Sort by page in file. */
        return (bhp1->pgno < bhp2->pgno ? -1 : 1);
}