/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
static const char sccsid[] = "@(#)mp_fget.c	10.53 (Sleepycat) 11/16/98";
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"
/*
 * memp_fget --
 *	Get a page from the file.
 */
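/*
 * A typical call, as a sketch only (the DB_MPOOLFILE handle would come from
 * memp_fopen(); the flag choice here is illustrative, not from this file):
 *
 *	db_pgno_t pgno;
 *	void *addr;
 *	int ret;
 *
 *	pgno = 0;
 *	if ((ret = memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
 *		return (ret);
 *	...operate on the page...
 *	(void)memp_fput(dbmfp, addr, DB_MPOOL_DIRTY);
 */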
int
memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	size_t bucket, mf_offset;
	u_int32_t st_hsearch;
	int b_incr, first, ret;

	dbmp = dbmfp->dbmp;
	mp = dbmp->mp;
	mfp = dbmfp->mfp;
	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(dbmp->dbenv,
		    "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
			break;
		default:
			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
		}
	}
	/*
	 * XXX
	 * We want to switch threads as often as possible.  Yield every time
	 * we get a new page to ensure contention.
	 */
	if (DB_GLOBAL(db_pageyield))
		__os_yield(1);
	/* Initialize remaining local variables. */
	mf_offset = R_OFFSET(dbmp, mfp);
	bhp = NULL;
	st_hsearch = 0;
	b_incr = ret = 0;
	/* Determine the hash bucket where this page will live. */
	bucket = BUCKET(mp, mf_offset, *pgnoaddr);

	LOCKREGION(dbmp);
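	/*
	 * (BUCKET() is presumably a simple hash of the file's region offset
	 * and the page number, modulo the hash table size: the (mf_offset,
	 * pgno) pair is what uniquely names a page in a pool shared by
	 * multiple files.)
	 */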
	/*
	 * Check for the last or last + 1 page requests.
	 *
	 * Examine and update the file's last_pgno value.  We don't care if
	 * the last_pgno value immediately changes due to another thread --
	 * at this instant in time, the value is correct.  We do increment the
	 * current last_pgno value if the thread is asking for a new page,
	 * however, to ensure that two threads creating pages don't get the
	 * same page.
	 */
	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
		if (LF_ISSET(DB_MPOOL_NEW))
			++mfp->last_pgno;
		*pgnoaddr = mfp->last_pgno;
		bucket = BUCKET(mp, mf_offset, mfp->last_pgno);

		if (LF_ISSET(DB_MPOOL_NEW))
			goto alloc;
	}
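	/*
	 * A DB_MPOOL_NEW page cannot already be present in the pool or the
	 * mmap'd region, which is why it jumps straight to the allocation
	 * code and skips the lookups below.
	 */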
	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * XXX
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
		if (*pgnoaddr > mfp->orig_last_pgno) {
			/*
			 * !!!
			 * See the comment above about non-existent pages and
			 * the hash access method.
			 */
			if (!LF_ISSET(DB_MPOOL_CREATE)) {
				__db_err(dbmp->dbenv,
				    "%s: page %lu doesn't exist",
				    __memp_fn(dbmfp), (u_long)*pgnoaddr);
				ret = EINVAL;
				goto err;
			}
		} else {
			*(void **)addrp = R_ADDR(dbmfp,
			    *pgnoaddr * mfp->stat.st_pagesize);
			++mp->stat.st_map;
			++mfp->stat.st_map;
			goto done;
		}
	}
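	/*
	 * Note the fast path above hands back a pointer directly into the
	 * mmap'd region: no buffer header is allocated, nothing is copied,
	 * and the page never appears on the hash chains.
	 */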
	/* Search the hash chain for the page. */
	for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;
		/* Check for a reference count overflow. */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbmp->dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			goto err;
		}
		/*
		 * Increment the reference count.  We may discard the region
		 * lock as we evaluate and/or read the buffer, so we need to
		 * ensure that it doesn't move and that its contents remain
		 * unchanged.
		 */
		++bhp->ref;
		b_incr = 1;

		/*
		 * Any buffer we find might be trouble.
		 *
		 * BH_LOCKED --
		 * I/O is in progress.  Because we've incremented the buffer
		 * reference count, we know the buffer can't move.  Unlock
		 * the region lock, wait for the I/O to complete, and reacquire
		 * the region lock.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
			UNLOCKREGION(dbmp);
			/*
			 * Explicitly yield the processor if it's not the first
			 * pass through this loop -- if we don't, we might end
			 * up running to the end of our CPU quantum as we will
			 * simply be swapping between the two locks.
			 */
			if (!first)
				__os_yield(1);
			LOCKBUFFER(dbmp, bhp);
			/* Wait for I/O to finish... */
			UNLOCKBUFFER(dbmp, bhp);

			LOCKREGION(dbmp);
		}
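		/*
		 * The lock/unlock pair above is the entire wait: acquiring
		 * the buffer mutex blocks until the thread doing the I/O
		 * drops it, and we release it immediately because waiting
		 * was the only point.  (This assumes LOCKBUFFER and
		 * UNLOCKBUFFER wrap the per-buffer mutex initialized below.)
		 */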
		/*
		 * BH_TRASH --
		 * The contents of the buffer are garbage.  Shouldn't happen,
		 * and this read is likely to fail, but might as well try.
		 */
		if (F_ISSET(bhp, BH_TRASH))
			goto reread;
		/*
		 * BH_CALLPGIN --
		 * The buffer was converted so it could be written, and the
		 * contents need to be converted again.
		 */
		if (F_ISSET(bhp, BH_CALLPGIN)) {
			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
				goto err;
			F_CLR(bhp, BH_CALLPGIN);
		}
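		/*
		 * (Presumably __memp_pg() with a final argument of 1 runs the
		 * application's pgin callback -- e.g. byte-order conversion --
		 * restoring the in-memory format before the page is returned.)
		 */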
		++mp->stat.st_cache_hit;
		++mfp->stat.st_cache_hit;
		*(void **)addrp = bhp->buf;
		goto done;
	}
alloc:	/* Allocate new buffer header and data space. */
	if ((ret = __memp_alloc(dbmp,
	    sizeof(BH) - sizeof(u_int8_t) + mfp->stat.st_pagesize,
	    NULL, &bhp)) != 0)
		goto err;
#ifdef DIAGNOSTIC
	if ((ALIGNTYPE)bhp->buf & (sizeof(size_t) - 1)) {
		__db_err(dbmp->dbenv,
		    "Internal error: BH data NOT size_t aligned.");
		ret = EINVAL;
		goto err;
	}
#endif
	/* Initialize the BH fields. */
	memset(bhp, 0, sizeof(BH));
	LOCKINIT(dbmp, &bhp->mutex);
	bhp->ref = 1;
	bhp->pgno = *pgnoaddr;
	bhp->mf_offset = mf_offset;
	/*
	 * Prepend the bucket header to the head of the appropriate MPOOL
	 * bucket hash list.  Append the bucket header to the tail of the
	 * MPOOL LRU chain.
	 */
	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
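	/*
	 * Inserting at the head of the hash chain means just-faulted pages
	 * are found first by later searches; appending to mp->bhq presumably
	 * keeps that queue in LRU order for buffer reclamation.
	 */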
	/*
	 * If we created the page, zero it out and continue.
	 *
	 * !!!
	 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
	 * If DB_MPOOL_CREATE is used, then the application's pgin function
	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
	 * it can detect all of its page creates, and not bother.
	 *
	 * Otherwise, read the page into memory, optionally creating it if
	 * DB_MPOOL_CREATE is set.
	 */
	if (LF_ISSET(DB_MPOOL_NEW)) {
		if (mfp->clear_len == 0)
			memset(bhp->buf, 0, mfp->stat.st_pagesize);
		else {
			memset(bhp->buf, 0, mfp->clear_len);
#ifdef DIAGNOSTIC
			memset(bhp->buf + mfp->clear_len, 0xdb,
			    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		}
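		/*
		 * (The 0xdb fill is presumably a debugging aid: only
		 * clear_len bytes are guaranteed zero, so in diagnostic
		 * builds the remainder of the page is filled with a
		 * recognizable pattern to expose reads of bytes the
		 * page's creator never initialized.)
		 */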
		++mp->stat.st_page_create;
		++mfp->stat.st_page_create;
	} else {
		/*
		 * It's possible for the read function to fail, which means
		 * that we fail as well.  Note, the __memp_pgread() function
		 * discards the region lock, so the buffer must be pinned
		 * down so that it cannot move and its contents are unchanged.
		 */
reread:		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE))) != 0) {
			/*
			 * !!!
			 * Discard the buffer unless another thread is waiting
			 * on our I/O to complete.  Regardless, the header has
			 * the BH_TRASH flag set.
			 */
			if (bhp->ref == 1)
				__memp_bhfree(dbmp, mfp, bhp, 1);
			goto err;
		}
		++mp->stat.st_cache_miss;
		++mfp->stat.st_cache_miss;
	}
	/*
	 * If we're returning a page after our current notion of the last-page,
	 * update our information.  Note, there's no way to un-instantiate this
	 * page, it's going to exist whether it's returned to us dirty or not.
	 */
	if (bhp->pgno > mfp->last_pgno)
		mfp->last_pgno = bhp->pgno;
	++mp->stat.st_page_clean;
	*(void **)addrp = bhp->buf;
done:	/* Update the chain search statistics. */
	if (st_hsearch != 0) {
		++mp->stat.st_hash_searches;
		if (st_hsearch > mp->stat.st_hash_longest)
			mp->stat.st_hash_longest = st_hsearch;
		mp->stat.st_hash_examined += st_hsearch;
	}
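	/*
	 * st_hsearch counted the buffer headers examined during the
	 * hash-chain walk, so the three counters above give the number of
	 * searches, the longest chain walked, and the total headers
	 * examined, from which an average chain length can be derived.
	 */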
	UNLOCKREGION(dbmp);
	return (0);

err:	/* Discard our reference. */
	if (b_incr)
		--bhp->ref;

	UNLOCKREGION(dbmp);
	*(void **)addrp = NULL;
	return (ret);
}