[glibc.git] / db2 / mp / mp_fget.c
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)mp_fget.c	10.48 (Sleepycat) 6/2/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"

/*
 * memp_fget --
 *	Get a page from the file.
 */
int
memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	size_t bucket, mf_offset;
	u_int32_t st_hsearch;
	int b_incr, first, ret;
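
	/* Cache local pointers: the per-process pool handle, the shared
	 * MPOOL region, and the file's shared MPOOLFILE descriptor. */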
	dbmp = dbmfp->dbmp;
	mp = dbmp->mp;
	mfp = dbmfp->mfp;

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret =
		    __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
		case 0:
			break;
		default:
			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * XXX
	 * We want to switch threads as often as possible.  Sleep every time
	 * we get a new page to make it more likely.
	 */
	if (DB_GLOBAL(db_pageyield) &&
	    (__db_yield == NULL || __db_yield() != 0))
		__db_sleep(0, 1);
#endif

	/* Initialize remaining local variables. */
	mf_offset = R_OFFSET(dbmp, mfp);
	bhp = NULL;
	st_hsearch = 0;
	b_incr = ret = 0;

	/* Determine the hash bucket where this page will live. */
	bucket = BUCKET(mp, mf_offset, *pgnoaddr);
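
	/*
	 * (The bucket is a function of both the file's region offset and the
	 * page number; the same pair identifies the buffer in the hash-chain
	 * search below.)
	 */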
	LOCKREGION(dbmp);

	/*
	 * Check for the last or last + 1 page requests.
	 *
	 * Examine and update the file's last_pgno value.  We don't care if
	 * the last_pgno value immediately changes due to another thread --
	 * at this instant in time, the value is correct.  We do increment the
	 * current last_pgno value if the thread is asking for a new page,
	 * however, to ensure that two threads creating pages don't get the
	 * same one.
	 */
	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
		if (LF_ISSET(DB_MPOOL_NEW))
			++mfp->last_pgno;
		*pgnoaddr = mfp->last_pgno;
		bucket = BUCKET(mp, mf_offset, mfp->last_pgno);

		if (LF_ISSET(DB_MPOOL_NEW))
			goto alloc;
	}
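
	/*
	 * (A DB_MPOOL_NEW page cannot already be in the cache, so the
	 * hash-chain search below is skipped and we go straight to the
	 * buffer allocation at "alloc".)
	 */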

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * XXX
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
		if (*pgnoaddr > mfp->orig_last_pgno) {
			/*
			 * !!!
			 * See the comment above about non-existent pages and
			 * the hash access method.
			 */
			if (!LF_ISSET(DB_MPOOL_CREATE)) {
				__db_err(dbmp->dbenv,
				    "%s: page %lu doesn't exist",
				    __memp_fn(dbmfp), (u_long)*pgnoaddr);
				ret = EINVAL;
				goto err;
			}
		} else {
			*(void **)addrp =
			    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
			++mp->stat.st_map;
			++mfp->stat.st_map;
			goto done;
		}
	}

	/* Search the hash chain for the page. */
	for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Check the reference count for overflow. */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbmp->dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			goto err;
		}

		/*
		 * Increment the reference count.  We may discard the region
		 * lock as we evaluate and/or read the buffer, so we need to
		 * ensure that it doesn't move and that its contents remain
		 * unchanged.
		 */
		++bhp->ref;
		b_incr = 1;

		/*
		 * Any buffer we find might be trouble.
		 *
		 * BH_LOCKED --
		 * I/O is in progress.  Because we've incremented the buffer
		 * reference count, we know the buffer can't move.  Unlock
		 * the region lock, wait for the I/O to complete, and reacquire
		 * the region.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
			UNLOCKREGION(dbmp);

			/*
			 * Explicitly yield the processor if it's not the first
			 * pass through this loop -- if we don't, we might end
			 * up running to the end of our CPU quantum as we will
			 * simply be swapping between the two locks.
			 */
			if (!first && (__db_yield == NULL || __db_yield() != 0))
				__db_sleep(0, 1);
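
			/*
			 * The thread doing the I/O holds the buffer mutex
			 * until it completes; acquiring and immediately
			 * releasing the mutex is how we block until then.
			 */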
			LOCKBUFFER(dbmp, bhp);
			/* Wait for I/O to finish... */
			UNLOCKBUFFER(dbmp, bhp);
			LOCKREGION(dbmp);
		}

		/*
		 * BH_TRASH --
		 * The contents of the buffer are garbage.  Shouldn't happen,
		 * and this read is likely to fail, but might as well try.
		 */
		if (F_ISSET(bhp, BH_TRASH))
			goto reread;

		/*
		 * BH_CALLPGIN --
		 * The buffer was converted so it could be written, and the
		 * contents need to be converted again.
		 */
		if (F_ISSET(bhp, BH_CALLPGIN)) {
			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
				goto err;
			F_CLR(bhp, BH_CALLPGIN);
		}

		++mp->stat.st_cache_hit;
		++mfp->stat.st_cache_hit;
		*(void **)addrp = bhp->buf;
		goto done;
	}

alloc:	/* Allocate new buffer header and data space. */
	if ((ret = __memp_ralloc(dbmp, sizeof(BH) -
	    sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
		goto err;
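
	/*
	 * (The BH structure ends in a one-byte buf[] array, hence the
	 * "sizeof(BH) - sizeof(u_int8_t)" above: a single allocation holds
	 * the header and the page data that follows it.)
	 */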

#ifdef DIAGNOSTIC
	if ((ALIGNTYPE)bhp->buf & (sizeof(size_t) - 1)) {
		__db_err(dbmp->dbenv,
		    "Internal error: BH data NOT size_t aligned.");
		ret = EINVAL;
		goto err;
	}
#endif

	/* Initialize the BH fields. */
	memset(bhp, 0, sizeof(BH));
	LOCKINIT(dbmp, &bhp->mutex);
	bhp->ref = 1;
	bhp->pgno = *pgnoaddr;
	bhp->mf_offset = mf_offset;

	/*
	 * Prepend the buffer header to the head of the appropriate MPOOL
	 * bucket hash list.  Append the buffer header to the tail of the
	 * MPOOL LRU chain.
	 */
	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);

	/*
	 * If we created the page, zero it out and continue.
	 *
	 * !!!
	 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
	 * If DB_MPOOL_CREATE is used, then the application's pgin function
	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
	 * it can detect all of its page creates, and not bother.
	 *
	 * Otherwise, read the page into memory, optionally creating it if
	 * DB_MPOOL_CREATE is set.
	 */
	if (LF_ISSET(DB_MPOOL_NEW)) {
		if (mfp->clear_len == 0)
			memset(bhp->buf, 0, mfp->stat.st_pagesize);
		else {
			memset(bhp->buf, 0, mfp->clear_len);
#ifdef DIAGNOSTIC
			memset(bhp->buf + mfp->clear_len, 0xff,
			    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		}

		++mp->stat.st_page_create;
		++mfp->stat.st_page_create;
	} else {
		/*
		 * It's possible for the read function to fail, which means
		 * that we fail as well.  Note, the __memp_pgread() function
		 * discards the region lock, so the buffer must be pinned
		 * down so that it cannot move and its contents are unchanged.
		 */
reread:		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE))) != 0) {
			/*
			 * !!!
			 * Discard the buffer unless another thread is waiting
			 * on our I/O to complete.  Regardless, the header has
			 * the BH_TRASH flag set.
			 */
			if (bhp->ref == 1)
				__memp_bhfree(dbmp, mfp, bhp, 1);
			goto err;
		}

		++mp->stat.st_cache_miss;
		++mfp->stat.st_cache_miss;
	}

	/*
	 * If we're returning a page after our current notion of the last-page,
	 * update our information.  Note, there's no way to un-instantiate this
	 * page, it's going to exist whether it's returned to us dirty or not.
	 */
	if (bhp->pgno > mfp->last_pgno)
		mfp->last_pgno = bhp->pgno;
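
	/* The buffer enters the pool clean; it is marked dirty later, if
	 * ever, by a memp_fput() or memp_fset() call with DB_MPOOL_DIRTY. */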
	++mp->stat.st_page_clean;
	*(void **)addrp = bhp->buf;

done:	/* Update the chain search statistics. */
	if (st_hsearch) {
		++mp->stat.st_hash_searches;
		if (st_hsearch > mp->stat.st_hash_longest)
			mp->stat.st_hash_longest = st_hsearch;
		mp->stat.st_hash_examined += st_hsearch;
	}

	UNLOCKREGION(dbmp);
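
	/*
	 * Count the page as pinned by this file handle; memp_fput()
	 * decrements pinref when the caller releases the page.
	 */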
	LOCKHANDLE(dbmp, dbmfp->mutexp);
	++dbmfp->pinref;
	UNLOCKHANDLE(dbmp, dbmfp->mutexp);

	return (0);

err:	/* Discard our reference. */
	if (b_incr)
		--bhp->ref;
	UNLOCKREGION(dbmp);

	*(void **)addrp = NULL;
	return (ret);
}
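
/*
 * Usage sketch (editor's note, not part of the original source): a caller
 * pins a page with memp_fget() and must release it with memp_fput().  This
 * assumes "dbmfp" is a DB_MPOOLFILE handle already opened via memp_fopen();
 * pass 0 instead of DB_MPOOL_DIRTY to memp_fput() if the page was only read.
 *
 *	db_pgno_t pgno;
 *	void *page;
 *	int ret;
 *
 *	pgno = 2;
 *	if ((ret = memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &page)) != 0)
 *		return (ret);
 *	... read or modify the page in place ...
 *	if ((ret = memp_fput(dbmfp, page, DB_MPOOL_DIRTY)) != 0)
 *		return (ret);
 */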