ext/lsm1/lsm_file.c

   1 /*
   2 ** 2011-08-26
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 **
  13 ** NORMAL DATABASE FILE FORMAT
  14 **
  15 ** The following database file format concepts are used by the code in
  16 ** this file to read and write the database file.
  17 **
  18 ** Pages:
  19 **
  20 **   A database file is divided into pages. The first 8KB of the file consists
  21 **   of two 4KB meta-pages. The meta-page size is not configurable. The
  22 **   remainder of the file is made up of database pages. The default database
  23 **   page size is 4KB. Database pages are aligned to page-size boundaries,
  24 **   so if the database page size is larger than 8KB there is a gap between
  25 **   the end of the meta pages and the start of the database pages.
  26 **
**   Database pages are numbered based on their position in the file. Page N
**   begins at byte offset ((N-1)*pgsz). This means that page 1 does not
**   exist - since it would always overlap with the meta pages. If the
**   page-size is (say) 512 bytes, then the first usable page in the database
**   is page 17 (byte offset 8192, immediately after the two meta pages).
  32 **
**   It is assumed that the first two meta pages and the data that follows
**   them are located on different disk sectors, so that if a power failure
**   occurs while writing to a meta page there is no risk of damage to the
**   other meta page or any other part of the database file. TODO: This may
**   need to be revisited.
  38 **
  39 ** Blocks:
  40 **
  41 **   The database file is also divided into blocks. The default block size is
  42 **   1MB. When writing to the database file, an attempt is made to write data
  43 **   in contiguous block-sized chunks.
  44 **
  45 **   The first and last page on each block are special in that they are 4
  46 **   bytes smaller than all other pages. This is because the last four bytes
  47 **   of space on the first and last pages of each block are reserved for
  48 **   pointers to other blocks (i.e. a 32-bit block number).
  49 **
  50 ** Runs:
  51 **
  52 **   A run is a sequence of pages that the upper layer uses to store a
  53 **   sorted array of database keys (and accompanying data - values, FC
  54 **   pointers and so on). Given a page within a run, it is possible to
  55 **   navigate to the next page in the run as follows:
  56 **
  57 **     a) if the current page is not the last in a block, the next page
  58 **        in the run is located immediately after the current page, OR
  59 **
  60 **     b) if the current page is the last page in a block, the next page
  61 **        in the run is the first page on the block identified by the
  62 **        block pointer stored in the last 4 bytes of the current block.
  63 **
  64 **   It is possible to navigate to the previous page in a similar fashion,
  65 **   using the block pointer embedded in the last 4 bytes of the first page
  66 **   of each block as required.
  67 **
  68 **   The upper layer is responsible for identifying by page number the
  69 **   first and last page of any run that it needs to navigate - there are
  70 **   no "end-of-run" markers stored or identified by this layer. This is
  71 **   necessary as clients reading different database snapshots may access
  72 **   different subsets of a run.
  73 **
  74 ** THE LOG FILE
  75 **
  76 ** This file opens and closes the log file. But it does not contain any
  77 ** logic related to the log file format. Instead, it exports the following
  78 ** functions that are used by the code in lsm_log.c to read and write the
  79 ** log file:
  80 **
  81 **     lsmFsOpenLog
  82 **     lsmFsWriteLog
  83 **     lsmFsSyncLog
  84 **     lsmFsReadLog
  85 **     lsmFsTruncateLog
  86 **     lsmFsCloseAndDeleteLog
  87 **
  88 ** COMPRESSED DATABASE FILE FORMAT
  89 **
  90 ** The compressed database file format is very similar to the normal format.
  91 ** The file still begins with two 4KB meta-pages (which are never compressed).
  92 ** It is still divided into blocks.
  93 **
  94 ** The first and last four bytes of each block are reserved for 32-bit
  95 ** pointer values. Similar to the way four bytes are carved from the end of
  96 ** the first and last page of each block in uncompressed databases. From
  97 ** the point of view of the upper layer, all pages are the same size - this
  98 ** is different from the uncompressed format where the first and last pages
  99 ** on each block are 4 bytes smaller than the others.
 100 **
 101 ** Pages are stored in variable length compressed form, as follows:
 102 **
 103 **     * 3-byte size field containing the size of the compressed page image
 104 **       in bytes. The most significant bit of each byte of the size field
 105 **       is always set. The remaining 7 bits are used to store a 21-bit
 106 **       integer value (in big-endian order - the first byte in the field
 107 **       contains the most significant 7 bits). Since the maximum allowed
 108 **       size of a compressed page image is (2^17 - 1) bytes, there are
 109 **       actually 4 unused bits in the size field.
 110 **
 111 **       In other words, if the size of the compressed page image is nSz,
 112 **       the header can be serialized as follows:
 113 **
 114 **         u8 aHdr[3]
 115 **         aHdr[0] = 0x80 | (u8)(nSz >> 14);
 116 **         aHdr[1] = 0x80 | (u8)(nSz >>  7);
 117 **         aHdr[2] = 0x80 | (u8)(nSz >>  0);
 118 **
 119 **     * Compressed page image.
 120 **
 121 **     * A second copy of the 3-byte record header.
 122 **
 123 ** A page number is a byte offset into the database file. So the smallest
 124 ** possible page number is 8192 (immediately after the two meta-pages).
 125 ** The first and root page of a segment are identified by a page number
 126 ** corresponding to the byte offset of the first byte in the corresponding
 127 ** page record. The last page of a segment is identified by the byte offset
 128 ** of the last byte in its record.
 129 **
 130 ** Unlike uncompressed pages, compressed page records may span blocks.
 131 **
 132 ** Sometimes, in order to avoid touching sectors that contain synced data
 133 ** when writing, it is necessary to insert unused space between compressed
 134 ** page records. This can be done as follows:
 135 **
 136 **     * For less than 6 bytes of empty space, the first and last byte
 137 **       of the free space contain the total number of free bytes. For
 138 **       example:
 139 **
 140 **         Block of 4 free bytes: 0x04 0x?? 0x?? 0x04
 141 **         Block of 2 free bytes: 0x02 0x02
 142 **         A single free byte:    0x01
 143 **
 144 **     * For 6 or more bytes of empty space, a record similar to a
 145 **       compressed page record is added to the segment. A padding record
 146 **       is distinguished from a compressed page record by the most
 147 **       significant bit of the second byte of the size field, which is
 148 **       cleared instead of set.
 149 */
 150 #include "lsmInt.h"
 151
 152 #include <sys/types.h>
 153 #include <sys/stat.h>
 154 #include <fcntl.h>
 155
 156 /*
 157 ** File-system object. Each database connection allocates a single instance
 158 ** of the following structure. It is used for all access to the database and
 159 ** log files.
 160 **
 161 ** The database file may be accessed via two methods - using mmap() or using
 162 ** read() and write() calls. In the general case both methods are used - a
 163 ** prefix of the file is mapped into memory and the remainder accessed using
 164 ** read() and write(). This is helpful when accessing very large files (or
 165 ** files that may grow very large during the lifetime of a database
 166 ** connection) on systems with 32-bit address spaces. However, it also requires
 167 ** that this object manage two distinct types of Page objects simultaneously -
 168 ** those that carry pointers to the mapped file and those that carry arrays
 169 ** populated by read() calls.
 170 **
** pFree:
**   The head of a singly-linked list containing currently unused Page
**   structures suitable for use as mmap-page handles. Connected by the
**   Page.pFreeNext pointers.
 175 **
** pMapped:
**   The head of a singly-linked list that contains all pages that currently
**   carry pointers to the mapped region. This is used if the region is
**   ever remapped - the pointers carried by existing pages can be adjusted
**   to account for the remapping. Connected by the Page.pMappedNext pointers.
 181 **
 182 ** pWaiting:
 183 **   When the upper layer wishes to append a new b-tree page to a segment,
 184 **   it allocates a Page object that carries a malloc'd block of memory -
 185 **   regardless of the mmap-related configuration. The page is not assigned
 186 **   a page number at first. When the upper layer has finished constructing
 187 **   the page contents, it calls lsmFsPagePersist() to assign a page number
 188 **   to it. At this point it is likely that N pages have been written to the
 189 **   segment, the (N+1)th page is still outstanding and the b-tree page is
 190 **   assigned page number (N+2). To avoid writing page (N+2) before page
 191 **   (N+1), the recently completed b-tree page is held in the singly linked
 192 **   list headed by pWaiting until page (N+1) has been written.
 193 **
 194 **   Function lsmFsFlushWaiting() is responsible for eventually writing
 195 **   waiting pages to disk.
 196 **
 197 ** apHash/nHash:
 198 **   Hash table used to store all Page objects that carry malloc'd arrays,
 199 **   except those b-tree pages that have not yet been assigned page numbers.
 200 **   Once they have been assigned page numbers - they are added to this
 201 **   hash table.
 202 **
 203 **   Hash table overflow chains are connected using the Page.pHashNext
 204 **   pointers.
 205 **
 206 ** pLruFirst, pLruLast:
 207 **   The first and last entries in a doubly-linked list of pages. This
 208 **   list contains all pages with malloc'd data that are present in the
 209 **   hash table and have a ref-count of zero.
 210 */
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Log file name (zDb with "-log" appended) */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nMetaRwSize;                /* Read/written size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;              /* Used after lsm_close() to link into list */
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file (NULL if not currently open) */
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};
 255
 256 /*
 257 ** Database page handle.
 258 **
 259 ** pSeg:
 260 **   When lsmFsSortedAppend() is called on a compressed database, the new
 261 **   page is not assigned a page number or location in the database file
 262 **   immediately. Instead, these are assigned by the lsmFsPagePersist() call
 263 **   right before it writes the compressed page image to disk.
 264 **
 265 **   The lsmFsSortedAppend() function sets the pSeg pointer to point to the
 266 **   segment that the new page will be a part of. It is unset by
 267 **   lsmFsPagePersist() after the page is written to disk.
 268 */
struct Page {
  u8 *aData;                      /* Buffer containing page data; released with
                                  ** lsmFree() only if PAGE_FREE is set */
  int nData;                      /* Bytes of usable data at aData[] */
  Pgno iPg;                       /* Page number */
  int nRef;                       /* Number of outstanding references */
  int flags;                      /* Combination of PAGE_XXX flags */
  Page *pHashNext;                /* Next page in hash table slot */
  Page *pLruNext;                 /* Next page in LRU list */
  Page *pLruPrev;                 /* Previous page in LRU list */
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};
 290
/*
** Meta-data page handle. There are two meta-data pages at the start of
** the database file, each FileSystem.nMetasize bytes in size.
*/
struct MetaPage {
  int iPg;                        /* Meta-page number: either 1 or 2 */
  int bWrite;                     /* Write back to db file on release */
  u8 *aData;                      /* Pointer to buffer holding the page data */
  FileSystem *pFS;                /* FileSystem that owns this page */
};
 301
/*
** Values for Page.flags (bitmask; see struct Page).
*/
#define PAGE_DIRTY   0x00000001   /* Set if page is dirty */
#define PAGE_FREE    0x00000002   /* Set if Page.aData requires lsmFree() */
#define PAGE_HASPREV 0x00000004   /* Set if page is first on uncomp. block */
 308
/*
** Number of pgsz byte pages omitted from the start of block 1. The start
** of block 1 contains two 4096 byte meta pages (8192 bytes in total).
**
** The result is never less than 1 (assuming LSM_MAX yields the larger of
** its two arguments): 8192/pgsz is integer division, so it evaluates to 0
** for any page size larger than 8192 bytes.
*/
#define BLOCK1_HDR_SIZE(pgsz)  LSM_MAX(1, 8192/(pgsz))
 314
 315 /*
 316 ** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt()
 317 ** to catch IO errors (any error returned by a VFS method).
 318 */
 319 #ifndef NDEBUG
 320 static void lsmIoerrBkpt(void){
 321   static int nErr = 0;
 322   nErr++;
 323 }
 324 static int IOERR_WRAPPER(int rc){
 325   if( rc!=LSM_OK ) lsmIoerrBkpt();
 326   return rc;
 327 }
 328 #else
 329 # define IOERR_WRAPPER(rc) (rc)
 330 #endif
 331
#ifdef NDEBUG
/* When NDEBUG is defined the check expands to nothing. */
# define assert_lists_are_ok(x)
#else
/* Forward declaration; only referenced by the disabled checks below. */
static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash);

/*
** Sanity check for the LRU list of file-system pFS. The entire body is
** currently compiled out by "#if 0", so this function does nothing even
** when NDEBUG is not defined.
*/
static void assert_lists_are_ok(FileSystem *pFS){
#if 0
  Page *p;

  assert( pFS->nMapLimit>=0 );

  /* Check that all pages in the LRU list have nRef==0, pointers to buffers
  ** in heap memory, and corresponding entries in the hash table.  */
  for(p=pFS->pLruFirst; p; p=p->pLruNext){
    assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
    assert( p==pFS->pLruLast || p->pLruNext!=0 );
    assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
    assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
    assert( p->nRef==0 );
    assert( p->flags & PAGE_FREE );
    assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
  }
#endif
}
#endif
 357
 358 /*
 359 ** Wrappers around the VFS methods of the lsm_env object:
 360 **
 361 **     lsmEnvOpen()
 362 **     lsmEnvRead()
 363 **     lsmEnvWrite()
 364 **     lsmEnvSync()
 365 **     lsmEnvSectorSize()
 366 **     lsmEnvClose()
 367 **     lsmEnvTruncate()
 368 **     lsmEnvUnlink()
 369 **     lsmEnvRemap()
 370 */
 371 int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){
 372   return pEnv->xOpen(pEnv, zFile, flags, ppNew);
 373 }
 374
 375 static int lsmEnvRead(
 376   lsm_env *pEnv,
 377   lsm_file *pFile,
 378   lsm_i64 iOff,
 379   void *pRead,
 380   int nRead
 381 ){
 382   return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) );
 383 }
 384
 385 static int lsmEnvWrite(
 386   lsm_env *pEnv,
 387   lsm_file *pFile,
 388   lsm_i64 iOff,
 389   const void *pWrite,
 390   int nWrite
 391 ){
 392   return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) );
 393 }
 394
 395 static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
 396   return IOERR_WRAPPER( pEnv->xSync(pFile) );
 397 }
 398
 399 static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
 400   return pEnv->xSectorSize(pFile);
 401 }
 402
 403 int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
 404   return IOERR_WRAPPER( pEnv->xClose(pFile) );
 405 }
 406
 407 static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
 408   return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) );
 409 }
 410
 411 static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
 412   return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) );
 413 }
 414
 415 static int lsmEnvRemap(
 416   lsm_env *pEnv,
 417   lsm_file *pFile,
 418   i64 szMin,
 419   void **ppMap,
 420   i64 *pszMap
 421 ){
 422   return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
 423 }
 424
 425 int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
 426   if( pFile==0 ) return LSM_OK;
 427   return pEnv->xLock(pFile, iLock, eLock);
 428 }
 429
 430 int lsmEnvTestLock(
 431   lsm_env *pEnv,
 432   lsm_file *pFile,
 433   int iLock,
 434   int nLock,
 435   int eLock
 436 ){
 437   return pEnv->xTestLock(pFile, iLock, nLock, eLock);
 438 }
 439
 440 int lsmEnvShmMap(
 441   lsm_env *pEnv,
 442   lsm_file *pFile,
 443   int iChunk,
 444   int sz,
 445   void **ppOut
 446 ){
 447   return pEnv->xShmMap(pFile, iChunk, sz, ppOut);
 448 }
 449
 450 void lsmEnvShmBarrier(lsm_env *pEnv){
 451   pEnv->xShmBarrier();
 452 }
 453
 454 void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
 455   pEnv->xShmUnmap(pFile, bDel);
 456 }
 457
 458 void lsmEnvSleep(lsm_env *pEnv, int nUs){
 459   pEnv->xSleep(pEnv, nUs);
 460 }
 461
 462
 463 /*
 464 ** Write the contents of string buffer pStr into the log file, starting at
 465 ** offset iOff.
 466 */
 467 int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
 468   assert( pFS->fdLog );
 469   return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
 470 }
 471
 472 /*
 473 ** fsync() the log file.
 474 */
 475 int lsmFsSyncLog(FileSystem *pFS){
 476   assert( pFS->fdLog );
 477   return lsmEnvSync(pFS->pEnv, pFS->fdLog);
 478 }
 479
 480 /*
 481 ** Read nRead bytes of data starting at offset iOff of the log file. Append
 482 ** the results to string buffer pStr.
 483 */
 484 int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
 485   int rc;                         /* Return code */
 486   assert( pFS->fdLog );
 487   rc = lsmStringExtend(pStr, nRead);
 488   if( rc==LSM_OK ){
 489     rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
 490     pStr->n += nRead;
 491   }
 492   return rc;
 493 }
 494
 495 /*
 496 ** Truncate the log file to nByte bytes in size.
 497 */
 498 int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){
 499   if( pFS->fdLog==0 ) return LSM_OK;
 500   return lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte);
 501 }
 502
 503 /*
 504 ** Truncate the db file to nByte bytes in size.
 505 */
 506 int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){
 507   if( pFS->fdDb==0 ) return LSM_OK;
 508   return lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte);
 509 }
 510
 511 /*
 512 ** Close the log file. Then delete it from the file-system. This function
 513 ** is called during database shutdown only.
 514 */
 515 int lsmFsCloseAndDeleteLog(FileSystem *pFS){
 516   char *zDel;
 517
 518   if( pFS->fdLog ){
 519     lsmEnvClose(pFS->pEnv, pFS->fdLog );
 520     pFS->fdLog = 0;
 521   }
 522
 523   zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
 524   if( zDel ){
 525     lsmEnvUnlink(pFS->pEnv, zDel);
 526     lsmFree(pFS->pEnv, zDel);
 527   }
 528   return LSM_OK;
 529 }
 530
 531 /*
 532 ** Return true if page iReal of the database should be accessed using mmap.
 533 ** False otherwise.
 534 */
 535 static int fsMmapPage(FileSystem *pFS, Pgno iReal){
 536   return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit);
 537 }
 538
 539 /*
 540 ** Given that there are currently nHash slots in the hash table, return
 541 ** the hash key for file iFile, page iPg.
 542 */
 543 static int fsHashKey(int nHash, Pgno iPg){
 544   return (iPg % nHash);
 545 }
 546
 547 /*
 548 ** This is a helper function for lsmFsOpen(). It opens a single file on
 549 ** disk (either the database or log file).
 550 */
 551 static lsm_file *fsOpenFile(
 552   FileSystem *pFS,                /* File system object */
 553   int bReadonly,                  /* True to open this file read-only */
 554   int bLog,                       /* True for log, false for db */
 555   int *pRc                        /* IN/OUT: Error code */
 556 ){
 557   lsm_file *pFile = 0;
 558   if( *pRc==LSM_OK ){
 559     int flags = (bReadonly ? LSM_OPEN_READONLY : 0);
 560     const char *zPath = (bLog ? pFS->zLog : pFS->zDb);
 561
 562     *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile);
 563   }
 564   return pFile;
 565 }
 566
 567 /*
 568 ** If it is not already open, this function opens the log file. It returns
 569 ** LSM_OK if successful (or if the log file was already open) or an LSM
 570 ** error code otherwise.
 571 **
 572 ** The log file must be opened before any of the following may be called:
 573 **
 574 **     lsmFsWriteLog
 575 **     lsmFsSyncLog
 576 **     lsmFsReadLog
 577 */
 578 int lsmFsOpenLog(lsm_db *db, int *pbOpen){
 579   int rc = LSM_OK;
 580   FileSystem *pFS = db->pFS;
 581
 582   if( 0==pFS->fdLog ){
 583     pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc);
 584
 585     if( rc==LSM_IOERR_NOENT && db->bReadonly ){
 586       rc = LSM_OK;
 587     }
 588   }
 589
 590   if( pbOpen ) *pbOpen = (pFS->fdLog!=0);
 591   return rc;
 592 }
 593
 594 /*
 595 ** Close the log file, if it is open.
 596 */
 597 void lsmFsCloseLog(lsm_db *db){
 598   FileSystem *pFS = db->pFS;
 599   if( pFS->fdLog ){
 600     lsmEnvClose(pFS->pEnv, pFS->fdLog);
 601     pFS->fdLog = 0;
 602   }
 603 }
 604
 605 /*
 606 ** Open a connection to a database stored within the file-system.
 607 **
 608 ** If parameter bReadonly is true, then open a read-only file-descriptor
 609 ** on the database file. It is possible that bReadonly will be false even
 610 ** if the user requested that pDb be opened read-only. This is because the
 611 ** file-descriptor may later on be recycled by a read-write connection.
 612 ** If the db file can be opened for read-write access, it always is. Parameter
 613 ** bReadonly is only ever true if it has already been determined that the
 614 ** db can only be opened for read-only access.
 615 **
 616 ** Return LSM_OK if successful or an lsm error code otherwise.
 617 */
 618 int lsmFsOpen(
 619   lsm_db *pDb,                    /* Database connection to open fd for */
 620   const char *zDb,                /* Full path to database file */
 621   int bReadonly                   /* True to open db file read-only */
 622 ){
 623   FileSystem *pFS;
 624   int rc = LSM_OK;
 625   int nDb = strlen(zDb);
 626   int nByte;
 627
 628   assert( pDb->pFS==0 );
 629   assert( pDb->pWorker==0 && pDb->pClient==0 );
 630
 631   nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
 632   pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
 633   if( pFS ){
 634     LsmFile *pLsmFile;
 635     pFS->zDb = (char *)&pFS[1];
 636     pFS->zLog = &pFS->zDb[nDb+1];
 637     pFS->nPagesize = LSM_DFLT_PAGE_SIZE;
 638     pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE;
 639     pFS->nMetasize = LSM_META_PAGE_SIZE;
 640     pFS->nMetaRwSize = LSM_META_RW_PAGE_SIZE;
 641     pFS->pDb = pDb;
 642     pFS->pEnv = pDb->pEnv;
 643
 644     /* Make a copy of the database and log file names. */
 645     memcpy(pFS->zDb, zDb, nDb+1);
 646     memcpy(pFS->zLog, zDb, nDb);
 647     memcpy(&pFS->zLog[nDb], "-log", 5);
 648
 649     /* Allocate the hash-table here. At some point, it should be changed
 650     ** so that it can grow dynamicly. */
 651     pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 652     pFS->nHash = 4096;
 653     pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
 654
 655     /* Open the database file */
 656     pLsmFile = lsmDbRecycleFd(pDb);
 657     if( pLsmFile ){
 658       pFS->pLsmFile = pLsmFile;
 659       pFS->fdDb = pLsmFile->pFile;
 660       memset(pLsmFile, 0, sizeof(LsmFile));
 661     }else{
 662       pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
 663       if( rc==LSM_OK ){
 664         pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc);
 665       }
 666     }
 667
 668     if( rc!=LSM_OK ){
 669       lsmFsClose(pFS);
 670       pFS = 0;
 671     }else{
 672       pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb);
 673     }
 674   }
 675
 676   pDb->pFS = pFS;
 677   return rc;
 678 }
 679
 680 /*
 681 ** Configure the file-system object according to the current values of
 682 ** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options.
 683 */
 684 int lsmFsConfigure(lsm_db *db){
 685   FileSystem *pFS = db->pFS;
 686   if( pFS ){
 687     lsm_env *pEnv = pFS->pEnv;
 688     Page *pPg;
 689
 690     assert( pFS->nOut==0 );
 691     assert( pFS->pWaiting==0 );
 692     assert( pFS->pMapped==0 );
 693
 694     /* Reset any compression/decompression buffers already allocated */
 695     lsmFree(pEnv, pFS->aIBuffer);
 696     lsmFree(pEnv, pFS->aOBuffer);
 697     pFS->nBuffer = 0;
 698
 699     /* Unmap the file, if it is currently mapped */
 700     if( pFS->pMap ){
 701       lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
 702       pFS->nMapLimit = 0;
 703     }
 704
 705     /* Free all allocated page structures */
 706     pPg = pFS->pLruFirst;
 707     while( pPg ){
 708       Page *pNext = pPg->pLruNext;
 709       assert( pPg->flags & PAGE_FREE );
 710       lsmFree(pEnv, pPg->aData);
 711       lsmFree(pEnv, pPg);
 712       pPg = pNext;
 713     }
 714
 715     pPg = pFS->pFree;
 716     while( pPg ){
 717       Page *pNext = pPg->pFreeNext;
 718       lsmFree(pEnv, pPg);
 719       pPg = pNext;
 720     }
 721
 722     /* Zero pointers that point to deleted page objects */
 723     pFS->nCacheAlloc = 0;
 724     pFS->pLruFirst = 0;
 725     pFS->pLruLast = 0;
 726     pFS->pFree = 0;
 727     if( pFS->apHash ){
 728       memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
 729     }
 730
 731     /* Configure the FileSystem object */
 732     if( db->compress.xCompress ){
 733       pFS->pCompress = &db->compress;
 734       pFS->nMapLimit = 0;
 735     }else{
 736       pFS->pCompress = 0;
 737       if( db->iMmap==1 ){
 738         /* Unlimited */
 739         pFS->nMapLimit = (i64)1 << 60;
 740       }else{
 741         /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */
 742         pFS->nMapLimit = (i64)db->iMmap * 1024;
 743       }
 744     }
 745   }
 746
 747   return LSM_OK;
 748 }
 749
 750 /*
 751 ** Close and destroy a FileSystem object.
 752 */
 753 void lsmFsClose(FileSystem *pFS){
 754   if( pFS ){
 755     Page *pPg;
 756     lsm_env *pEnv = pFS->pEnv;
 757
 758     assert( pFS->nOut==0 );
 759     pPg = pFS->pLruFirst;
 760     while( pPg ){
 761       Page *pNext = pPg->pLruNext;
 762       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 763       lsmFree(pEnv, pPg);
 764       pPg = pNext;
 765     }
 766
 767     pPg = pFS->pFree;
 768     while( pPg ){
 769       Page *pNext = pPg->pFreeNext;
 770       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 771       lsmFree(pEnv, pPg);
 772       pPg = pNext;
 773     }
 774
 775     if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
 776     if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
 777     lsmFree(pEnv, pFS->pLsmFile);
 778     lsmFree(pEnv, pFS->apHash);
 779     lsmFree(pEnv, pFS->aIBuffer);
 780     lsmFree(pEnv, pFS->aOBuffer);
 781     lsmFree(pEnv, pFS);
 782   }
 783 }
 784
 785 /*
 786 ** This function is called when closing a database handle (i.e. lsm_close())
 787 ** if there exist other connections to the same database within this process.
 788 ** In that case the file-descriptor open on the database file is not closed
 789 ** when the FileSystem object is destroyed, as this would cause any POSIX
 790 ** locks held by the other connections to be silently dropped (see "man close"
 791 ** for details). Instead, the file-descriptor is stored in a list by the
 792 ** lsm_shared.c module until it is either closed or reused.
 793 **
** This function returns a pointer to an object that can be linked into
** the list described above. The returned object now 'owns' the database
** file descriptor, so that when the FileSystem object is destroyed, it
** will not be closed.
 798 **
 799 ** This function may be called at most once in the life-time of a
 800 ** FileSystem object. The results of any operations involving the database
 801 ** file descriptor are undefined once this function has been called.
 802 **
 803 ** None of this is necessary on non-POSIX systems. But we do it anyway in
 804 ** the name of using as similar code as possible on all platforms.
 805 */
 806 LsmFile *lsmFsDeferClose(FileSystem *pFS){
 807   LsmFile *p = pFS->pLsmFile;
 808   assert( p->pNext==0 );
 809   p->pFile = pFS->fdDb;
 810   pFS->fdDb = 0;
 811   pFS->pLsmFile = 0;
 812   return p;
 813 }
 814
 815 /*
 816 ** Allocate a buffer and populate it with the output of the xFileid()
 817 ** method of the database file handle. If successful, set *ppId to point
 818 ** to the buffer and *pnId to the number of bytes in the buffer and return
 819 ** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM
 820 ** error code.
 821 */
 822 int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){
 823   lsm_env *pEnv = pDb->pEnv;
 824   FileSystem *pFS = pDb->pFS;
 825   int rc;
 826   int nId = 0;
 827   void *pId;
 828
 829   rc = pEnv->xFileid(pFS->fdDb, 0, &nId);
 830   pId = lsmMallocZeroRc(pEnv, nId, &rc);
 831   if( rc==LSM_OK ) rc = pEnv->xFileid(pFS->fdDb, pId, &nId);
 832
 833   if( rc!=LSM_OK ){
 834     lsmFree(pEnv, pId);
 835     pId = 0;
 836     nId = 0;
 837   }
 838
 839   *ppId = pId;
 840   *pnId = nId;
 841   return rc;
 842 }
 843
 844 /*
 845 ** Return the nominal page-size used by this file-system. Actual pages
 846 ** may be smaller or larger than this value.
 847 */
 848 int lsmFsPageSize(FileSystem *pFS){
 849   return pFS->nPagesize;
 850 }
 851
 852 /*
 853 ** Return the block-size used by this file-system.
 854 */
 855 int lsmFsBlockSize(FileSystem *pFS){
 856   return pFS->nBlocksize;
 857 }
 858
 859 /*
 860 ** Configure the nominal page-size used by this file-system. Actual
 861 ** pages may be smaller or larger than this value.
 862 */
 863 void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){
 864   pFS->nPagesize = nPgsz;
 865   pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 866 }
 867
 868 /*
 869 ** Configure the block-size used by this file-system.
 870 */
 871 void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){
 872   pFS->nBlocksize = nBlocksize;
 873 }
 874
 875 /*
 876 ** Return the page number of the first page on block iBlock. Blocks are
 877 ** numbered starting from 1.
 878 **
 879 ** For a compressed database, page numbers are byte offsets. The first
 880 ** page on each block is the byte offset immediately following the 4-byte
 881 ** "previous block" pointer at the start of each block.
 882 */
 883 static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){
 884   Pgno iPg;
 885   if( pFS->pCompress ){
 886     if( iBlock==1 ){
 887       iPg = pFS->nMetasize * 2 + 4;
 888     }else{
 889       iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4;
 890     }
 891   }else{
 892     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 893     if( iBlock==1 ){
 894       iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize);
 895     }else{
 896       iPg = 1 + (iBlock-1) * nPagePerBlock;
 897     }
 898   }
 899   return iPg;
 900 }
 901
 902 /*
 903 ** Return the page number of the last page on block iBlock. Blocks are
 904 ** numbered starting from 1.
 905 **
 906 ** For a compressed database, page numbers are byte offsets. The first
 907 ** page on each block is the byte offset of the byte immediately before
 908 ** the 4-byte "next block" pointer at the end of each block.
 909 */
 910 static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){
 911   if( pFS->pCompress ){
 912     return pFS->nBlocksize * (Pgno)iBlock - 1 - 4;
 913   }else{
 914     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 915     return iBlock * nPagePerBlock;
 916   }
 917 }
 918
 919 /*
 920 ** Return the block number of the block that page iPg is located on.
 921 ** Blocks are numbered starting from 1.
 922 */
 923 static int fsPageToBlock(FileSystem *pFS, Pgno iPg){
 924   if( pFS->pCompress ){
 925     return (int)((iPg / pFS->nBlocksize) + 1);
 926   }else{
 927     return (int)(1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)));
 928   }
 929 }
 930
 931 /*
 932 ** Return true if page iPg is the last page on its block.
 933 **
 934 ** This function is only called in non-compressed database mode.
 935 */
 936 static int fsIsLast(FileSystem *pFS, Pgno iPg){
 937   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 938   assert( !pFS->pCompress );
 939   return ( iPg && (iPg % nPagePerBlock)==0 );
 940 }
 941
 942 /*
 943 ** Return true if page iPg is the first page on its block.
 944 **
 945 ** This function is only called in non-compressed database mode.
 946 */
 947 static int fsIsFirst(FileSystem *pFS, Pgno iPg){
 948   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 949   assert( !pFS->pCompress );
 950   return ( (iPg % nPagePerBlock)==1
 951         || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1))
 952   );
 953 }
 954
 955 /*
 956 ** Given a page reference, return a pointer to the buffer containing the
 957 ** pages contents. If parameter pnData is not NULL, set *pnData to the size
 958 ** of the buffer in bytes before returning.
 959 */
 960 u8 *lsmFsPageData(Page *pPage, int *pnData){
 961   if( pnData ){
 962     *pnData = pPage->nData;
 963   }
 964   return pPage->aData;
 965 }
 966
 967 /*
 968 ** Return the page number of a page.
 969 */
 970 Pgno lsmFsPageNumber(Page *pPage){
 971   /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
 972   return pPage ? pPage->iPg : 0;
 973 }
 974
 975 /*
 976 ** Page pPg is currently part of the LRU list belonging to pFS. Remove
 977 ** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
 978 ** operation.
 979 */
 980 static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){
 981   assert( pPg->pLruNext || pPg==pFS->pLruLast );
 982   assert( pPg->pLruPrev || pPg==pFS->pLruFirst );
 983   if( pPg->pLruNext ){
 984     pPg->pLruNext->pLruPrev = pPg->pLruPrev;
 985   }else{
 986     pFS->pLruLast = pPg->pLruPrev;
 987   }
 988   if( pPg->pLruPrev ){
 989     pPg->pLruPrev->pLruNext = pPg->pLruNext;
 990   }else{
 991     pFS->pLruFirst = pPg->pLruNext;
 992   }
 993   pPg->pLruPrev = 0;
 994   pPg->pLruNext = 0;
 995 }
 996
 997 /*
 998 ** Page pPg is not currently part of the LRU list belonging to pFS. Add it.
 999 */
1000 static void fsPageAddToLru(FileSystem *pFS, Page *pPg){
1001   assert( pPg->pLruNext==0 && pPg->pLruPrev==0 );
1002   pPg->pLruPrev = pFS->pLruLast;
1003   if( pPg->pLruPrev ){
1004     pPg->pLruPrev->pLruNext = pPg;
1005   }else{
1006     pFS->pLruFirst = pPg;
1007   }
1008   pFS->pLruLast = pPg;
1009 }
1010
1011 /*
1012 ** Page pPg is currently stored in the apHash/nHash hash table. Remove it.
1013 */
1014 static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
1015   int iHash;
1016   Page **pp;
1017
1018   iHash = fsHashKey(pFS->nHash, pPg->iPg);
1019   for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
1020   *pp = pPg->pHashNext;
1021   pPg->pHashNext = 0;
1022 }
1023
1024 /*
1025 ** Free a Page object allocated by fsPageBuffer().
1026 */
1027 static void fsPageBufferFree(Page *pPg){
1028   pPg->pFS->nCacheAlloc--;
1029   lsmFree(pPg->pFS->pEnv, pPg->aData);
1030   lsmFree(pPg->pFS->pEnv, pPg);
1031 }
1032
1033
1034 /*
1035 ** Purge the cache of all non-mmap pages with nRef==0.
1036 */
1037 void lsmFsPurgeCache(FileSystem *pFS){
1038   Page *pPg;
1039
1040   pPg = pFS->pLruFirst;
1041   while( pPg ){
1042     Page *pNext = pPg->pLruNext;
1043     assert( pPg->flags & PAGE_FREE );
1044     fsPageRemoveFromHash(pFS, pPg);
1045     fsPageBufferFree(pPg);
1046     pPg = pNext;
1047   }
1048   pFS->pLruFirst = 0;
1049   pFS->pLruLast = 0;
1050
1051   assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
1052 }
1053
1054 /*
1055 ** Search the hash-table for page iPg. If an entry is round, return a pointer
1056 ** to it. Otherwise, return NULL.
1057 **
1058 ** Either way, if argument piHash is not NULL set *piHash to the hash slot
1059 ** number that page iPg would be stored in before returning.
1060 */
1061 static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash){
1062   Page *p;                        /* Return value */
1063   int iHash = fsHashKey(pFS->nHash, iPg);
1064
1065   if( piHash ) *piHash = iHash;
1066   for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
1067     if( p->iPg==iPg) break;
1068   }
1069   return p;
1070 }
1071
1072 /*
1073 ** Allocate and return a non-mmap Page object. If there are already
1074 ** nCacheMax such Page objects outstanding, try to recycle an existing
1075 ** Page instead.
1076 */
1077 static int fsPageBuffer(
1078   FileSystem *pFS,
1079   Page **ppOut
1080 ){
1081   int rc = LSM_OK;
1082   Page *pPage = 0;
1083   if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
1084     /* Allocate a new Page object */
1085     pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
1086     if( !pPage ){
1087       rc = LSM_NOMEM_BKPT;
1088     }else{
1089       pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
1090       if( !pPage->aData ){
1091         lsmFree(pFS->pEnv, pPage);
1092         rc = LSM_NOMEM_BKPT;
1093         pPage = 0;
1094       }else{
1095         pFS->nCacheAlloc++;
1096       }
1097     }
1098   }else{
1099     /* Reuse an existing Page object */
1100     u8 *aData;
1101     pPage = pFS->pLruFirst;
1102     aData = pPage->aData;
1103     fsPageRemoveFromLru(pFS, pPage);
1104     fsPageRemoveFromHash(pFS, pPage);
1105
1106     memset(pPage, 0, sizeof(Page));
1107     pPage->aData = aData;
1108   }
1109
1110   if( pPage ){
1111     pPage->flags = PAGE_FREE;
1112   }
1113   *ppOut = pPage;
1114   return rc;
1115 }
1116
1117 /*
1118 ** Assuming *pRc is initially LSM_OK, attempt to ensure that the
1119 ** memory-mapped region is at least iSz bytes in size. If it is not already,
1120 ** iSz bytes in size, extend it and update the pointers associated with any
1121 ** outstanding Page objects.
1122 **
1123 ** If *pRc is not LSM_OK when this function is called, it is a no-op.
1124 ** Otherwise, *pRc is set to an lsm error code if an error occurs, or
1125 ** left unmodified otherwise.
1126 **
1127 ** This function is never called in compressed database mode.
1128 */
1129 static void fsGrowMapping(
1130   FileSystem *pFS,                /* File system object */
1131   i64 iSz,                        /* Minimum size to extend mapping to */
1132   int *pRc                        /* IN/OUT: Error code */
1133 ){
1134   assert( pFS->pCompress==0 );
1135   assert( PAGE_HASPREV==4 );
1136
1137   if( *pRc==LSM_OK && iSz>pFS->nMap ){
1138     int rc;
1139     u8 *aOld = pFS->pMap;
1140     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
1141     if( rc==LSM_OK && pFS->pMap!=aOld ){
1142       Page *pFix;
1143       i64 iOff = (u8 *)pFS->pMap - aOld;
1144       for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){
1145         pFix->aData += iOff;
1146       }
1147       lsmSortedRemap(pFS->pDb);
1148     }
1149     *pRc = rc;
1150   }
1151 }
1152
1153 /*
1154 ** fsync() the database file.
1155 */
1156 int lsmFsSyncDb(FileSystem *pFS, int nBlock){
1157   return lsmEnvSync(pFS->pEnv, pFS->fdDb);
1158 }
1159
1160 /*
1161 ** If block iBlk has been redirected according to the redirections in the
1162 ** object passed as the first argument, return the destination block to
1163 ** which it is redirected. Otherwise, return a copy of iBlk.
1164 */
1165 static int fsRedirectBlock(Redirect *p, int iBlk){
1166   if( p ){
1167     int i;
1168     for(i=0; i<p->n; i++){
1169       if( iBlk==p->a[i].iFrom ) return p->a[i].iTo;
1170     }
1171   }
1172   assert( iBlk!=0 );
1173   return iBlk;
1174 }
1175
1176 /*
1177 ** If page iPg has been redirected according to the redirections in the
1178 ** object passed as the second argument, return the destination page to
1179 ** which it is redirected. Otherwise, return a copy of iPg.
1180 */
1181 Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){
1182   Pgno iReal = iPg;
1183
1184   if( pRedir ){
1185     const int nPagePerBlock = (
1186         pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
1187     );
1188     int iBlk = fsPageToBlock(pFS, iPg);
1189     int i;
1190     for(i=0; i<pRedir->n; i++){
1191       int iFrom = pRedir->a[i].iFrom;
1192       if( iFrom>iBlk ) break;
1193       if( iFrom==iBlk ){
1194         int iTo = pRedir->a[i].iTo;
1195         iReal = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
1196         if( iTo==1 ){
1197           iReal += (fsFirstPageOnBlock(pFS, 1)-1);
1198         }
1199         break;
1200       }
1201     }
1202   }
1203
1204   assert( iReal!=0 );
1205   return iReal;
1206 }
1207
1208 /* Required by the circular fsBlockNext<->fsPageGet dependency. */
1209 static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);
1210
1211 /*
1212 ** Parameter iBlock is a database file block. This function reads the value
1213 ** stored in the blocks "next block" pointer and stores it in *piNext.
1214 ** LSM_OK is returned if everything is successful, or an LSM error code
1215 ** otherwise.
1216 */
1217 static int fsBlockNext(
1218   FileSystem *pFS,                /* File-system object handle */
1219   Segment *pSeg,                  /* Use this segment for block redirects */
1220   int iBlock,                     /* Read field from this block */
1221   int *piNext                     /* OUT: Next block in linked list */
1222 ){
1223   int rc;
1224   int iRead;                      /* Read block from here */
1225
1226   if( pSeg ){
1227     iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
1228   }else{
1229     iRead = iBlock;
1230   }
1231
1232   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1233   if( pFS->pCompress ){
1234     i64 iOff;                     /* File offset to read data from */
1235     u8 aNext[4];                  /* 4-byte pointer read from db file */
1236
1237     iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
1238     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
1239     if( rc==LSM_OK ){
1240       *piNext = (int)lsmGetU32(aNext);
1241     }
1242   }else{
1243     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
1244     Page *pLast;
1245     rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0);
1246     if( rc==LSM_OK ){
1247       *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
1248       lsmFsPageRelease(pLast);
1249     }
1250   }
1251
1252   if( pSeg ){
1253     *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext);
1254   }
1255   return rc;
1256 }
1257
1258 /*
1259 ** Return the page number of the last page on the same block as page iPg.
1260 */
1261 Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){
1262   return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg));
1263 }
1264
1265 /*
1266 ** Read nData bytes of data from offset iOff of the database file into
1267 ** buffer aData. If this means reading past the end of a block, follow
1268 ** the block pointer to the next block and continue reading.
1269 **
1270 ** Offset iOff is an absolute offset - not subject to any block redirection.
1271 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1272 **
1273 ** This function is only called in compressed database mode.
1274 */
1275 static int fsReadData(
1276   FileSystem *pFS,                /* File-system handle */
1277   Segment *pSeg,                  /* Block redirection */
1278   i64 iOff,                       /* Read data from this offset */
1279   u8 *aData,                      /* Buffer to read data into */
1280   int nData                       /* Number of bytes to read */
1281 ){
1282   i64 iEob;                       /* End of block */
1283   int nRead;
1284   int rc;
1285
1286   assert( pFS->pCompress );
1287
1288   iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1;
1289   nRead = (int)LSM_MIN(iEob - iOff, nData);
1290
1291   rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead);
1292   if( rc==LSM_OK && nRead!=nData ){
1293     int iBlk;
1294
1295     rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1296     if( rc==LSM_OK ){
1297       i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk);
1298       rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead);
1299     }
1300   }
1301
1302   return rc;
1303 }
1304
1305 /*
1306 ** Parameter iBlock is a database file block. This function reads the value
1307 ** stored in the blocks "previous block" pointer and stores it in *piPrev.
1308 ** LSM_OK is returned if everything is successful, or an LSM error code
1309 ** otherwise.
1310 */
1311 static int fsBlockPrev(
1312   FileSystem *pFS,                /* File-system object handle */
1313   Segment *pSeg,                  /* Use this segment for block redirects */
1314   int iBlock,                     /* Read field from this block */
1315   int *piPrev                     /* OUT: Previous block in linked list */
1316 ){
1317   int rc = LSM_OK;                /* Return code */
1318
1319   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1320   assert( iBlock>0 );
1321
1322   if( pFS->pCompress ){
1323     i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
1324     u8 aPrev[4];                  /* 4-byte pointer read from db file */
1325     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
1326     if( rc==LSM_OK ){
1327       Redirect *pRedir = (pSeg ? pSeg->pRedirect : 0);
1328       *piPrev = fsRedirectBlock(pRedir, (int)lsmGetU32(aPrev));
1329     }
1330   }else{
1331     assert( 0 );
1332   }
1333   return rc;
1334 }
1335
1336 /*
1337 ** Encode and decode routines for record size fields.
1338 */
1339 static void putRecordSize(u8 *aBuf, int nByte, int bFree){
1340   aBuf[0] = (u8)(nByte >> 14) | 0x80;
1341   aBuf[1] = ((u8)(nByte >>  7) & 0x7F) | (bFree ? 0x00 : 0x80);
1342   aBuf[2] = (u8)nByte | 0x80;
1343 }
1344 static int getRecordSize(u8 *aBuf, int *pbFree){
1345   int nByte;
1346   nByte  = (aBuf[0] & 0x7F) << 14;
1347   nByte += (aBuf[1] & 0x7F) << 7;
1348   nByte += (aBuf[2] & 0x7F);
1349   *pbFree = !(aBuf[1] & 0x80);
1350   return nByte;
1351 }
1352
1353 /*
1354 ** Subtract iSub from database file offset iOff and set *piRes to the
1355 ** result. If doing so means passing the start of a block, follow the
1356 ** block pointer stored in the first 4 bytes of the block.
1357 **
1358 ** Offset iOff is an absolute offset - not subject to any block redirection.
1359 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1360 **
1361 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1362 */
1363 static int fsSubtractOffset(
1364   FileSystem *pFS,
1365   Segment *pSeg,
1366   i64 iOff,
1367   int iSub,
1368   i64 *piRes
1369 ){
1370   i64 iStart;
1371   int iBlk = 0;
1372   int rc;
1373
1374   assert( pFS->pCompress );
1375
1376   iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff));
1377   if( (iOff-iSub)>=iStart ){
1378     *piRes = (iOff-iSub);
1379     return LSM_OK;
1380   }
1381
1382   rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1383   *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1);
1384   return rc;
1385 }
1386
1387 /*
1388 ** Add iAdd to database file offset iOff and set *piRes to the
1389 ** result. If doing so means passing the end of a block, follow the
1390 ** block pointer stored in the last 4 bytes of the block.
1391 **
1392 ** Offset iOff is an absolute offset - not subject to any block redirection.
1393 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1394 **
1395 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1396 */
1397 static int fsAddOffset(
1398   FileSystem *pFS,
1399   Segment *pSeg,
1400   i64 iOff,
1401   int iAdd,
1402   i64 *piRes
1403 ){
1404   i64 iEob;
1405   int iBlk;
1406   int rc;
1407
1408   assert( pFS->pCompress );
1409
1410   iEob = fsLastPageOnPagesBlock(pFS, iOff);
1411   if( (iOff+iAdd)<=iEob ){
1412     *piRes = (iOff+iAdd);
1413     return LSM_OK;
1414   }
1415
1416   rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1417   *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
1418   return rc;
1419 }
1420
1421 /*
1422 ** If it is not already allocated, allocate either the FileSystem.aOBuffer (if
1423 ** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return
1424 ** LSM_OK if successful if the attempt to allocate memory fails.
1425 */
1426 static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
1427   u8 **pp;                        /* Pointer to either aIBuffer or aOBuffer */
1428
1429   assert( pFS->pCompress );
1430
1431   /* If neither buffer has been allocated, figure out how large they
1432   ** should be. Store this value in FileSystem.nBuffer.  */
1433   if( pFS->nBuffer==0 ){
1434     assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
1435     pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
1436     if( pFS->nBuffer<(pFS->szSector+6) ){
1437       pFS->nBuffer = pFS->szSector+6;
1438     }
1439   }
1440
1441   pp = (bWrite ? &pFS->aOBuffer : &pFS->aIBuffer);
1442   if( *pp==0 ){
1443     *pp = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
1444     if( *pp==0 ) return LSM_NOMEM_BKPT;
1445   }
1446
1447   return LSM_OK;
1448 }
1449
1450 /*
1451 ** This function is only called in compressed database mode. It reads and
1452 ** uncompresses the compressed data for page pPg from the database and
1453 ** populates the pPg->aData[] buffer and pPg->nCompress field.
1454 **
1455 ** It is possible that instead of a page record, there is free space
1456 ** at offset pPg->iPgno. In this case no data is read from the file, but
1457 ** output variable *pnSpace is set to the total number of free bytes.
1458 **
1459 ** LSM_OK is returned if successful, or an LSM error code otherwise.
1460 */
1461 static int fsReadPagedata(
1462   FileSystem *pFS,                /* File-system handle */
1463   Segment *pSeg,                  /* pPg is part of this segment */
1464   Page *pPg,                      /* Page to read and uncompress data for */
1465   int *pnSpace                    /* OUT: Total bytes of free space */
1466 ){
1467   lsm_compress *p = pFS->pCompress;
1468   i64 iOff = pPg->iPg;
1469   u8 aSz[3];
1470   int rc;
1471
1472   assert( p && pPg->nCompress==0 );
1473
1474   if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;
1475
1476   rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
1477
1478   if( rc==LSM_OK ){
1479     int bFree;
1480     if( aSz[0] & 0x80 ){
1481       pPg->nCompress = (int)getRecordSize(aSz, &bFree);
1482     }else{
1483       pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
1484       bFree = 1;
1485     }
1486     if( bFree ){
1487       if( pnSpace ){
1488         *pnSpace = pPg->nCompress + sizeof(aSz)*2;
1489       }else{
1490         rc = LSM_CORRUPT_BKPT;
1491       }
1492     }else{
1493       rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);
1494       if( rc==LSM_OK ){
1495         if( pPg->nCompress>pFS->nBuffer ){
1496           rc = LSM_CORRUPT_BKPT;
1497         }else{
1498           rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
1499         }
1500         if( rc==LSM_OK ){
1501           int n = pFS->nPagesize;
1502           rc = p->xUncompress(p->pCtx,
1503               (char *)pPg->aData, &n,
1504               (const char *)pFS->aIBuffer, pPg->nCompress
1505           );
1506           if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){
1507             rc = LSM_CORRUPT_BKPT;
1508           }
1509         }
1510       }
1511     }
1512   }
1513   return rc;
1514 }
1515
1516 /*
1517 ** Return a handle for a database page.
1518 **
1519 ** If this file-system object is accessing a compressed database it may be
1520 ** that there is no page record at database file offset iPg. Instead, there
1521 ** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
1522 ** to the total number of free bytes before returning.
1523 **
1524 ** If no error occurs, LSM_OK is returned. Otherwise, an lsm error code.
1525 */
1526 static int fsPageGet(
1527   FileSystem *pFS,                /* File-system handle */
1528   Segment *pSeg,                  /* Block redirection to use (or NULL) */
1529   Pgno iPg,                       /* Page id */
1530   int noContent,                  /* True to not load content from disk */
1531   Page **ppPg,                    /* OUT: New page handle */
1532   int *pnSpace                    /* OUT: Bytes of free space */
1533 ){
1534   Page *p;
1535   int iHash;
1536   int rc = LSM_OK;
1537
1538   /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is
1539   ** not NULL, and the block containing iPg has been redirected, then iReal
1540   ** is the page number after redirection.  */
1541   Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);
1542
1543   assert_lists_are_ok(pFS);
1544   assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
1545   assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
1546   *ppPg = 0;
1547
1548   /* Search the hash-table for the page */
1549   p = fsPageFindInHash(pFS, iReal, &iHash);
1550
1551   if( p ){
1552     assert( p->flags & PAGE_FREE );
1553     if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
1554   }else{
1555
1556     if( fsMmapPage(pFS, iReal) ){
1557       i64 iEnd = (i64)iReal * pFS->nPagesize;
1558       fsGrowMapping(pFS, iEnd, &rc);
1559       if( rc!=LSM_OK ) return rc;
1560
1561       if( pFS->pFree ){
1562         p = pFS->pFree;
1563         pFS->pFree = p->pFreeNext;
1564         assert( p->nRef==0 );
1565       }else{
1566         p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
1567         if( rc ) return rc;
1568         p->pFS = pFS;
1569       }
1570       p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
1571       p->iPg = iReal;
1572
1573       /* This page now carries a pointer to the mapping. Link it in to
1574       ** the FileSystem.pMapped list.  */
1575       assert( p->pMappedNext==0 );
1576       p->pMappedNext = pFS->pMapped;
1577       pFS->pMapped = p;
1578
1579       assert( pFS->pCompress==0 );
1580       assert( (p->flags & PAGE_FREE)==0 );
1581     }else{
1582       rc = fsPageBuffer(pFS, &p);
1583       if( rc==LSM_OK ){
1584         int nSpace = 0;
1585         p->iPg = iReal;
1586         p->nRef = 0;
1587         p->pFS = pFS;
1588         assert( p->flags==0 || p->flags==PAGE_FREE );
1589
1590 #ifdef LSM_DEBUG
1591         memset(p->aData, 0x56, pFS->nPagesize);
1592 #endif
1593         assert( p->pLruNext==0 && p->pLruPrev==0 );
1594         if( noContent==0 ){
1595           if( pFS->pCompress ){
1596             rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
1597           }else{
1598             int nByte = pFS->nPagesize;
1599             i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
1600             rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
1601           }
1602           pFS->nRead++;
1603         }
1604
1605         /* If the xRead() call was successful (or not attempted), link the
1606         ** page into the page-cache hash-table. Otherwise, if it failed,
1607         ** free the buffer. */
1608         if( rc==LSM_OK && nSpace==0 ){
1609           p->pHashNext = pFS->apHash[iHash];
1610           pFS->apHash[iHash] = p;
1611         }else{
1612           fsPageBufferFree(p);
1613           p = 0;
1614           if( pnSpace ) *pnSpace = nSpace;
1615         }
1616       }
1617     }
1618
1619     assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
1620          || (rc!=LSM_OK && p==0)
1621     );
1622   }
1623
1624   if( rc==LSM_OK && p ){
1625     if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
1626       p->nData = pFS->nPagesize - 4;
1627       if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
1628         p->aData += 4;
1629         p->flags |= PAGE_HASPREV;
1630       }
1631     }else{
1632       p->nData = pFS->nPagesize;
1633     }
1634     pFS->nOut += (p->nRef==0);
1635     p->nRef++;
1636   }
1637   *ppPg = p;
1638   return rc;
1639 }
1640
1641 /*
1642 ** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
1643 ** page iMeta of the database file. If no error occurs, store the id value
1644 ** in *piVal and return LSM_OK. Otherwise, return an LSM error code and leave
1645 ** *piVal unmodified.
1646 **
1647 ** If a checkpointer connection is currently updating meta-page iMeta, or an
1648 ** earlier checkpointer crashed while doing so, the value read into *piVal
1649 ** may be garbage. It is the callers responsibility to deal with this.
1650 */
1651 int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
1652   FileSystem *pFS = db->pFS;
1653   int rc = LSM_OK;
1654
1655   assert( iMeta==1 || iMeta==2 );
1656   if( pFS->nMapLimit>0 ){
1657     fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
1658     if( rc==LSM_OK ){
1659       *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
1660     }
1661   }else{
1662     MetaPage *pMeta = 0;
1663     rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
1664     if( rc==LSM_OK ){
1665       *piVal = (i64)lsmGetU64(pMeta->aData);
1666       lsmFsMetaPageRelease(pMeta);
1667     }
1668   }
1669
1670   return rc;
1671 }
1672
1673
1674 /*
1675 ** Return true if the first or last page of segment pRun falls between iFirst
1676 ** and iLast, inclusive, and pRun is not equal to pIgnore.
1677 */
1678 static int fsRunEndsBetween(
1679   Segment *pRun,
1680   Segment *pIgnore,
1681   Pgno iFirst,
1682   Pgno iLast
1683 ){
1684   return (pRun!=pIgnore && (
1685         (pRun->iFirst>=iFirst && pRun->iFirst<=iLast)
1686      || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast)
1687   ));
1688 }
1689
1690 /*
1691 ** Return true if level pLevel contains a segment other than pIgnore for
1692 ** which the first or last page is between iFirst and iLast, inclusive.
1693 */
1694 static int fsLevelEndsBetween(
1695   Level *pLevel,
1696   Segment *pIgnore,
1697   Pgno iFirst,
1698   Pgno iLast
1699 ){
1700   int i;
1701
1702   if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
1703     return 1;
1704   }
1705   for(i=0; i<pLevel->nRight; i++){
1706     if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
1707       return 1;
1708     }
1709   }
1710
1711   return 0;
1712 }
1713
1714 /*
1715 ** Block iBlk is no longer in use by segment pIgnore. If it is not in use
1716 ** by any other segment, move it to the free block list.
1717 */
1718 static int fsFreeBlock(
1719   FileSystem *pFS,                /* File system object */
1720   Snapshot *pSnapshot,            /* Worker snapshot */
1721   Segment *pIgnore,               /* Ignore this run when searching */
1722   int iBlk                        /* Block number of block to free */
1723 ){
1724   int rc = LSM_OK;                /* Return code */
1725   Pgno iFirst;                    /* First page on block iBlk */
1726   Pgno iLast;                     /* Last page on block iBlk */
1727   Level *pLevel;                  /* Used to iterate through levels */
1728
1729   int iIn;                        /* Used to iterate through append points */
1730   int iOut = 0;                   /* Used to output append points */
1731   Pgno *aApp = pSnapshot->aiAppend;
1732
1733   iFirst = fsFirstPageOnBlock(pFS, iBlk);
1734   iLast = fsLastPageOnBlock(pFS, iBlk);
1735
1736   /* Check if any other run in the snapshot has a start or end page
1737   ** within this block. If there is such a run, return early. */
1738   for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
1739     if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
1740       return LSM_OK;
1741     }
1742   }
1743
1744   /* Remove any entries that lie on this block from the append-list. */
1745   for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
1746     if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
1747       aApp[iOut++] = aApp[iIn];
1748     }
1749   }
1750   while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
1751
1752   if( rc==LSM_OK ){
1753     rc = lsmBlockFree(pFS->pDb, iBlk);
1754   }
1755   return rc;
1756 }
1757
1758 /*
1759 ** Delete or otherwise recycle the blocks currently occupied by run pDel.
1760 */
1761 int lsmFsSortedDelete(
1762   FileSystem *pFS,
1763   Snapshot *pSnapshot,
1764   int bZero,                      /* True to zero the Segment structure */
1765   Segment *pDel
1766 ){
1767   if( pDel->iFirst ){
1768     int rc = LSM_OK;
1769
1770     int iBlk;
1771     int iLastBlk;
1772
1773     iBlk = fsPageToBlock(pFS, pDel->iFirst);
1774     iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
1775
1776     /* Mark all blocks currently used by this sorted run as free */
1777     while( iBlk && rc==LSM_OK ){
1778       int iNext = 0;
1779       if( iBlk!=iLastBlk ){
1780         rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
1781       }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
1782         break;
1783       }
1784       rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);
1785       iBlk = iNext;
1786     }
1787
1788     if( pDel->pRedirect ){
1789       assert( pDel->pRedirect==&pSnapshot->redirect );
1790       pSnapshot->redirect.n = 0;
1791     }
1792
1793     if( bZero ) memset(pDel, 0, sizeof(Segment));
1794   }
1795   return LSM_OK;
1796 }
1797
1798 /*
1799 ** aPgno is an array containing nPgno page numbers. Return the smallest page
1800 ** number from the array that falls on block iBlk. Or, if none of the pages
1801 ** in aPgno[] fall on block iBlk, return 0.
1802 */
1803 static Pgno firstOnBlock(FileSystem *pFS, int iBlk, Pgno *aPgno, int nPgno){
1804   Pgno iRet = 0;
1805   int i;
1806   for(i=0; i<nPgno; i++){
1807     Pgno iPg = aPgno[i];
1808     if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){
1809       iRet = iPg;
1810     }
1811   }
1812   return iRet;
1813 }
1814
1815 #ifndef NDEBUG
1816 /*
1817 ** Return true if page iPg, which is a part of segment p, lies on
1818 ** a redirected block.
1819 */
1820 static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){
1821   return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg));
1822 }
1823
1824 /*
1825 ** Return true if the second argument is not NULL and any of the first
1826 ** last or root pages lie on a redirected block.
1827 */
1828 static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
1829   return (p && (
1830       fsPageRedirects(pFS, p, p->iFirst)
1831    || fsPageRedirects(pFS, p, p->iRoot)
1832    || fsPageRedirects(pFS, p, p->iLastPg)
1833   ));
1834 }
1835 #endif
1836
1837 /*
1838 ** Argument aPgno is an array of nPgno page numbers. All pages belong to
1839 ** the segment pRun. This function gobbles from the start of the run to the
1840 ** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
1841 ** the new first page of the run).
1842 */
1843 void lsmFsGobble(
1844   lsm_db *pDb,
1845   Segment *pRun,
1846   Pgno *aPgno,
1847   int nPgno
1848 ){
1849   int rc = LSM_OK;
1850   FileSystem *pFS = pDb->pFS;
1851   Snapshot *pSnapshot = pDb->pWorker;
1852   int iBlk;
1853
1854   assert( pRun->nSize>0 );
1855   assert( 0==fsSegmentRedirects(pFS, pRun) );
1856   assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
1857
1858   iBlk = fsPageToBlock(pFS, pRun->iFirst);
1859   pRun->nSize += (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1860
1861   while( rc==LSM_OK ){
1862     int iNext = 0;
1863     Pgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
1864     if( iFirst ){
1865       pRun->iFirst = iFirst;
1866       break;
1867     }
1868     rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
1869     if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
1870     pRun->nSize -= (int)(
1871         1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
1872     );
1873     iBlk = iNext;
1874   }
1875
1876   pRun->nSize -= (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1877   assert( pRun->nSize>0 );
1878 }
1879
1880 /*
1881 ** This function is only used in compressed database mode.
1882 **
1883 ** Argument iPg is the page number (byte offset) of a page within segment
1884 ** pSeg. The page record, including all headers, is nByte bytes in size.
1885 ** Before returning, set *piNext to the page number of the next page in
1886 ** the segment, or to zero if iPg is the last.
1887 **
1888 ** In other words, do:
1889 **
1890 **   *piNext = iPg + nByte;
1891 **
1892 ** But take block overflow and redirection into account.
1893 */
1894 static int fsNextPageOffset(
1895   FileSystem *pFS,                /* File system object */
1896   Segment *pSeg,                  /* Segment to move within */
1897   Pgno iPg,                       /* Offset of current page */
1898   int nByte,                      /* Size of current page including headers */
1899   Pgno *piNext                    /* OUT: Offset of next page. Or zero (EOF) */
1900 ){
1901   Pgno iNext;
1902   int rc;
1903
1904   assert( pFS->pCompress );
1905
1906   rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iNext);
1907   if( pSeg && iNext==pSeg->iLastPg ){
1908     iNext = 0;
1909   }else if( rc==LSM_OK ){
1910     rc = fsAddOffset(pFS, pSeg, iNext, 1, &iNext);
1911   }
1912
1913   *piNext = iNext;
1914   return rc;
1915 }
1916
1917 /*
1918 ** This function is only used in compressed database mode.
1919 **
1920 ** Argument iPg is the page number of a pagethat appears in segment pSeg.
1921 ** This function determines the page number of the previous page in the
1922 ** same run. *piPrev is set to the previous page number before returning.
1923 **
1924 ** LSM_OK is returned if no error occurs. Otherwise, an lsm error code.
1925 ** If any value other than LSM_OK is returned, then the final value of
1926 ** *piPrev is undefined.
1927 */
1928 static int fsGetPageBefore(
1929   FileSystem *pFS,
1930   Segment *pSeg,
1931   Pgno iPg,
1932   Pgno *piPrev
1933 ){
1934   u8 aSz[3];
1935   int rc;
1936   i64 iRead;
1937
1938   assert( pFS->pCompress );
1939
1940   rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iRead);
1941   if( rc==LSM_OK ) rc = fsReadData(pFS, pSeg, iRead, aSz, sizeof(aSz));
1942
1943   if( rc==LSM_OK ){
1944     int bFree;
1945     int nSz;
1946     if( aSz[2] & 0x80 ){
1947       nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2;
1948     }else{
1949       nSz = (int)(aSz[2] & 0x7F);
1950       bFree = 1;
1951     }
1952     rc = fsSubtractOffset(pFS, pSeg, iPg, nSz, piPrev);
1953   }
1954
1955   return rc;
1956 }
1957
1958 /*
1959 ** The first argument to this function is a valid reference to a database
1960 ** file page that is part of a sorted run. If parameter eDir is -1, this
1961 ** function attempts to locate and load the previous page in the same run.
1962 ** Or, if eDir is +1, it attempts to find the next page in the same run.
1963 ** The results of passing an eDir value other than positive or negative one
1964 ** are undefined.
1965 **
1966 ** If parameter pRun is not NULL then it must point to the run that page
1967 ** pPg belongs to. In this case, if pPg is the first or last page of the
1968 ** run, and the request is for the previous or next page, respectively,
1969 ** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
1970 ** is assumed that the next or previous page, as requested, exists.
1971 **
1972 ** If the previous/next page does exist and is successfully loaded, *ppNext
1973 ** is set to point to it and LSM_OK is returned. Otherwise, if an error
1974 ** occurs, *ppNext is set to NULL and and lsm error code returned.
1975 **
1976 ** Page references returned by this function should be released by the
1977 ** caller using lsmFsPageRelease().
1978 */
1979 int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
1980   int rc = LSM_OK;
1981   FileSystem *pFS = pPg->pFS;
1982   Pgno iPg = pPg->iPg;
1983
1984   assert( 0==fsSegmentRedirects(pFS, pRun) );
1985   if( pFS->pCompress ){
1986     int nSpace = pPg->nCompress + 2*3;
1987
1988     do {
1989       if( eDir>0 ){
1990         rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
1991       }else{
1992         if( iPg==pRun->iFirst ){
1993           iPg = 0;
1994         }else{
1995           rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
1996         }
1997       }
1998
1999       nSpace = 0;
2000       if( iPg!=0 ){
2001         rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
2002         assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
2003       }else{
2004         *ppNext = 0;
2005       }
2006     }while( nSpace>0 && rc==LSM_OK );
2007
2008   }else{
2009     Redirect *pRedir = pRun ? pRun->pRedirect : 0;
2010     assert( eDir==1 || eDir==-1 );
2011     if( eDir<0 ){
2012       if( pRun && iPg==pRun->iFirst ){
2013         *ppNext = 0;
2014         return LSM_OK;
2015       }else if( fsIsFirst(pFS, iPg) ){
2016         assert( pPg->flags & PAGE_HASPREV );
2017         iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
2018       }else{
2019         iPg--;
2020       }
2021     }else{
2022       if( pRun ){
2023         if( iPg==pRun->iLastPg ){
2024           *ppNext = 0;
2025           return LSM_OK;
2026         }
2027       }
2028
2029       if( fsIsLast(pFS, iPg) ){
2030         int iBlk = fsRedirectBlock(
2031             pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
2032         );
2033         iPg = fsFirstPageOnBlock(pFS, iBlk);
2034       }else{
2035         iPg++;
2036       }
2037     }
2038     rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
2039   }
2040
2041   return rc;
2042 }
2043
2044 /*
2045 ** This function is called when creating a new segment to determine if the
2046 ** first part of it can be written following an existing segment on an
2047 ** already allocated block. If it is possible, the page number of the first
2048 ** page to use for the new segment is returned. Otherwise zero.
2049 **
2050 ** If argument pLvl is not NULL, then this function will not attempt to
2051 ** start the new segment immediately following any segment that is part
2052 ** of the right-hand-side of pLvl.
2053 */
2054 static Pgno findAppendPoint(FileSystem *pFS, Level *pLvl){
2055   int i;
2056   Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
2057   Pgno iRet = 0;
2058
2059   for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
2060     if( (iRet = aiAppend[i]) ){
2061       if( pLvl ){
2062         int iBlk = fsPageToBlock(pFS, iRet);
2063         int j;
2064         for(j=0; iRet && j<pLvl->nRight; j++){
2065           if( fsPageToBlock(pFS, pLvl->aRhs[j].iLastPg)==iBlk ){
2066             iRet = 0;
2067           }
2068         }
2069       }
2070       if( iRet ) aiAppend[i] = 0;
2071     }
2072   }
2073   return iRet;
2074 }
2075
2076 /*
2077 ** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
2078 ** return a pointer to it. The page is writable until either
2079 ** lsmFsPagePersist() is called on it or the ref-count drops to zero.
2080 */
2081 int lsmFsSortedAppend(
2082   FileSystem *pFS,
2083   Snapshot *pSnapshot,
2084   Level *pLvl,
2085   int bDefer,
2086   Page **ppOut
2087 ){
2088   int rc = LSM_OK;
2089   Page *pPg = 0;
2090   Pgno iApp = 0;
2091   Pgno iNext = 0;
2092   Segment *p = &pLvl->lhs;
2093   Pgno iPrev = p->iLastPg;
2094
2095   *ppOut = 0;
2096   assert( p->pRedirect==0 );
2097
2098   if( pFS->pCompress || bDefer ){
2099     /* In compressed database mode the page is not assigned a page number
2100     ** or location in the database file at this point. This will be done
2101     ** by the lsmFsPagePersist() call.  */
2102     rc = fsPageBuffer(pFS, &pPg);
2103     if( rc==LSM_OK ){
2104       pPg->pFS = pFS;
2105       pPg->pSeg = p;
2106       pPg->iPg = 0;
2107       pPg->flags |= PAGE_DIRTY;
2108       pPg->nData = pFS->nPagesize;
2109       assert( pPg->aData );
2110       if( pFS->pCompress==0 ) pPg->nData -= 4;
2111
2112       pPg->nRef = 1;
2113       pFS->nOut++;
2114     }
2115   }else{
2116     if( iPrev==0 ){
2117       iApp = findAppendPoint(pFS, pLvl);
2118     }else if( fsIsLast(pFS, iPrev) ){
2119       int iNext;
2120       rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNext);
2121       if( rc!=LSM_OK ) return rc;
2122       iApp = fsFirstPageOnBlock(pFS, iNext);
2123     }else{
2124       iApp = iPrev + 1;
2125     }
2126
2127     /* If this is the first page allocated, or if the page allocated is the
2128     ** last in the block, also allocate the next block here.  */
2129     if( iApp==0 || fsIsLast(pFS, iApp) ){
2130       int iNew;                     /* New block number */
2131
2132       rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
2133       if( rc!=LSM_OK ) return rc;
2134       if( iApp==0 ){
2135         iApp = fsFirstPageOnBlock(pFS, iNew);
2136       }else{
2137         iNext = fsFirstPageOnBlock(pFS, iNew);
2138       }
2139     }
2140
2141     /* Grab the new page. */
2142     pPg = 0;
2143     rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
2144     assert( rc==LSM_OK || pPg==0 );
2145
2146     /* If this is the first or last page of a block, fill in the pointer
2147      ** value at the end of the new page. */
2148     if( rc==LSM_OK ){
2149       p->nSize++;
2150       p->iLastPg = iApp;
2151       if( p->iFirst==0 ) p->iFirst = iApp;
2152       pPg->flags |= PAGE_DIRTY;
2153
2154       if( fsIsLast(pFS, iApp) ){
2155         lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
2156       }else if( fsIsFirst(pFS, iApp) ){
2157         lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
2158       }
2159     }
2160   }
2161
2162   *ppOut = pPg;
2163   return rc;
2164 }
2165
2166 /*
2167 ** Mark the segment passed as the second argument as finished. Once a segment
2168 ** is marked as finished it is not possible to append any further pages to
2169 ** it.
2170 **
2171 ** Return LSM_OK if successful or an lsm error code if an error occurs.
2172 */
2173 int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
2174   int rc = LSM_OK;
2175   if( p && p->iLastPg ){
2176     assert( p->pRedirect==0 );
2177
2178     /* Check if the last page of this run happens to be the last of a block.
2179     ** If it is, then an extra block has already been allocated for this run.
2180     ** Shift this extra block back to the free-block list.
2181     **
2182     ** Otherwise, add the first free page in the last block used by the run
2183     ** to the lAppend list.
2184     */
2185     if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
2186       int i;
2187       Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
2188       for(i=0; i<LSM_APPLIST_SZ; i++){
2189         if( aiAppend[i]==0 ){
2190           aiAppend[i] = p->iLastPg+1;
2191           break;
2192         }
2193       }
2194     }else if( pFS->pCompress==0 ){
2195       Page *pLast;
2196       rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
2197       if( rc==LSM_OK ){
2198         int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
2199         lsmBlockRefree(pFS->pDb, iBlk);
2200         lsmFsPageRelease(pLast);
2201       }
2202     }else{
2203       int iBlk = 0;
2204       rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
2205       if( rc==LSM_OK ){
2206         lsmBlockRefree(pFS->pDb, iBlk);
2207       }
2208     }
2209   }
2210   return rc;
2211 }
2212
2213 /*
2214 ** Obtain a reference to page number iPg.
2215 **
2216 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2217 */
2218 int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, Pgno iPg, Page **ppPg){
2219   return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2220 }
2221
2222 /*
2223 ** Obtain a reference to the last page in the segment passed as the
2224 ** second argument.
2225 **
2226 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2227 */
2228 int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
2229   int rc;
2230   Pgno iPg = pSeg->iLastPg;
2231   if( pFS->pCompress ){
2232     int nSpace;
2233     iPg++;
2234     do {
2235       nSpace = 0;
2236       rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
2237       if( rc==LSM_OK ){
2238         rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
2239       }
2240     }while( rc==LSM_OK && nSpace>0 );
2241
2242   }else{
2243     rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2244   }
2245   return rc;
2246 }
2247
2248 /*
2249 ** Return a reference to meta-page iPg. If successful, LSM_OK is returned
2250 ** and *ppPg populated with the new page reference. The reference should
2251 ** be released by the caller using lsmFsPageRelease().
2252 **
2253 ** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error
2254 ** code is returned.
2255 */
2256 int lsmFsMetaPageGet(
2257   FileSystem *pFS,                /* File-system connection */
2258   int bWrite,                     /* True for write access, false for read */
2259   int iPg,                        /* Either 1 or 2 */
2260   MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
2261 ){
2262   int rc = LSM_OK;
2263   MetaPage *pPg;
2264   assert( iPg==1 || iPg==2 );
2265
2266   pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
2267
2268   if( pPg ){
2269     i64 iOff = (iPg-1) * pFS->nMetasize;
2270     if( pFS->nMapLimit>0 ){
2271       fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
2272       pPg->aData = (u8 *)(pFS->pMap) + iOff;
2273     }else{
2274       pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
2275       if( rc==LSM_OK && bWrite==0 ){
2276         rc = lsmEnvRead(
2277             pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetaRwSize
2278         );
2279       }
2280 #ifndef NDEBUG
2281       /* pPg->aData causes an uninitialized access via a downstreadm write().
2282          After discussion on this list, this memory should not, for performance
2283          reasons, be memset. However, tracking down "real" misuse is more
2284          difficult with this "false" positive, so it is set when NDEBUG.
2285       */
2286       else if( rc==LSM_OK ){
2287         memset( pPg->aData, 0x77, pFS->nMetasize );
2288       }
2289 #endif
2290     }
2291
2292     if( rc!=LSM_OK ){
2293       if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
2294       lsmFree(pFS->pEnv, pPg);
2295       pPg = 0;
2296     }else{
2297       pPg->iPg = iPg;
2298       pPg->bWrite = bWrite;
2299       pPg->pFS = pFS;
2300     }
2301   }
2302
2303   *ppPg = pPg;
2304   return rc;
2305 }
2306
2307 /*
2308 ** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
2309 */
2310 int lsmFsMetaPageRelease(MetaPage *pPg){
2311   int rc = LSM_OK;
2312   if( pPg ){
2313     FileSystem *pFS = pPg->pFS;
2314
2315     if( pFS->nMapLimit==0 ){
2316       if( pPg->bWrite ){
2317         i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
2318         int nWrite = pFS->nMetaRwSize;
2319         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
2320       }
2321       lsmFree(pFS->pEnv, pPg->aData);
2322     }
2323
2324     lsmFree(pFS->pEnv, pPg);
2325   }
2326   return rc;
2327 }
2328
2329 /*
2330 ** Return a pointer to a buffer containing the data associated with the
2331 ** meta-page passed as the first argument. If parameter pnData is not NULL,
2332 ** set *pnData to the size of the meta-page in bytes before returning.
2333 */
2334 u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
2335   if( pnData ) *pnData = pPg->pFS->nMetaRwSize;
2336   return pPg->aData;
2337 }
2338
2339 /*
2340 ** Return true if page is currently writable. This is used in assert()
2341 ** statements only.
2342 */
2343 #ifndef NDEBUG
2344 int lsmFsPageWritable(Page *pPg){
2345   return (pPg->flags & PAGE_DIRTY) ? 1 : 0;
2346 }
2347 #endif
2348
2349 /*
2350 ** This is called when block iFrom is being redirected to iTo. If page
2351 ** number (*piPg) lies on block iFrom, then calculate the equivalent
2352 ** page on block iTo and set *piPg to this value before returning.
2353 */
2354 static void fsMovePage(
2355   FileSystem *pFS,                /* File system object */
2356   int iTo,                        /* Destination block */
2357   int iFrom,                      /* Source block */
2358   Pgno *piPg                      /* IN/OUT: Page number */
2359 ){
2360   Pgno iPg = *piPg;
2361   if( iFrom==fsPageToBlock(pFS, iPg) ){
2362     const int nPagePerBlock = (
2363         pFS->pCompress ? pFS ->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
2364     );
2365     *piPg = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
2366   }
2367 }
2368
2369 /*
2370 ** Copy the contents of block iFrom to block iTo.
2371 **
2372 ** It is safe to assume that there are no outstanding references to pages
2373 ** on block iTo. And that block iFrom is not currently being written. In
2374 ** other words, the data can be read and written directly.
2375 */
2376 int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
2377   Snapshot *p = pFS->pDb->pWorker;
2378   int rc = LSM_OK;
2379   int i;
2380   i64 nMap;
2381
2382   i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
2383   i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
2384
2385   assert( iTo!=1 );
2386   assert( iFrom>iTo );
2387
2388   /* Grow the mapping as required. */
2389   nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
2390   fsGrowMapping(pFS, nMap, &rc);
2391
2392   if( rc==LSM_OK ){
2393     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
2394     int nSz = pFS->nPagesize;
2395     u8 *aBuf = 0;
2396     u8 *aData = 0;
2397
2398     for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
2399       i64 iOff = iFromOff + i*nSz;
2400
2401       /* Set aData to point to a buffer containing the from page */
2402       if( (iOff+nSz)<=pFS->nMapLimit ){
2403         u8 *aMap = (u8 *)(pFS->pMap);
2404         aData = &aMap[iOff];
2405       }else{
2406         if( aBuf==0 ){
2407           aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
2408           if( aBuf==0 ) break;
2409         }
2410         aData = aBuf;
2411         rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2412       }
2413
2414       /* Copy aData to the to page */
2415       if( rc==LSM_OK ){
2416         iOff = iToOff + i*nSz;
2417         if( (iOff+nSz)<=pFS->nMapLimit ){
2418           u8 *aMap = (u8 *)(pFS->pMap);
2419           memcpy(&aMap[iOff], aData, nSz);
2420         }else{
2421           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2422         }
2423       }
2424     }
2425     lsmFree(pFS->pEnv, aBuf);
2426     lsmFsPurgeCache(pFS);
2427   }
2428
2429   /* Update append-point list if necessary */
2430   for(i=0; i<LSM_APPLIST_SZ; i++){
2431     fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
2432   }
2433
2434   /* Update the Segment structure itself */
2435   fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
2436   fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
2437   fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
2438
2439   return rc;
2440 }
2441
2442 /*
2443 ** Append raw data to a segment. Return the database file offset that the
2444 ** data is written to (this may be used as the page number if the data
2445 ** being appended is a new page record).
2446 **
2447 ** This function is only used in compressed database mode.
2448 */
2449 static Pgno fsAppendData(
2450   FileSystem *pFS,                /* File-system handle */
2451   Segment *pSeg,                  /* Segment to append to */
2452   const u8 *aData,                /* Buffer containing data to write */
2453   int nData,                      /* Size of buffer aData[] in bytes */
2454   int *pRc                        /* IN/OUT: Error code */
2455 ){
2456   Pgno iRet = 0;
2457   int rc = *pRc;
2458   assert( pFS->pCompress );
2459   if( rc==LSM_OK ){
2460     int nRem = 0;
2461     int nWrite = 0;
2462     Pgno iLastOnBlock;
2463     Pgno iApp = pSeg->iLastPg+1;
2464
2465     /* If this is the first data written into the segment, find an append-point
2466     ** or allocate a new block.  */
2467     if( iApp==1 ){
2468       pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
2469       if( iApp==0 ){
2470         int iBlk;
2471         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2472         pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
2473       }
2474     }
2475     iRet = iApp;
2476
2477     /* Write as much data as is possible at iApp (usually all of it). */
2478     iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
2479     if( rc==LSM_OK ){
2480       int nSpace = (int)(iLastOnBlock - iApp + 1);
2481       nWrite = LSM_MIN(nData, nSpace);
2482       nRem = nData - nWrite;
2483       assert( nWrite>=0 );
2484       if( nWrite!=0 ){
2485         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
2486       }
2487       iApp += nWrite;
2488     }
2489
2490     /* If required, allocate a new block and write the rest of the data
2491     ** into it. Set the next and previous block pointers to link the new
2492     ** block to the old.  */
2493     assert( nRem<=0 || (iApp-1)==iLastOnBlock );
2494     if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
2495       u8 aPtr[4];                 /* Space to serialize a u32 */
2496       int iBlk;                   /* New block number */
2497
2498       if( nWrite>0 ){
2499         /* Allocate a new block. */
2500         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2501
2502         /* Set the "next" pointer on the old block */
2503         if( rc==LSM_OK ){
2504           assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
2505           lsmPutU32(aPtr, iBlk);
2506           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
2507         }
2508
2509         /* Set the "prev" pointer on the new block */
2510         if( rc==LSM_OK ){
2511           Pgno iWrite;
2512           lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
2513           iWrite = fsFirstPageOnBlock(pFS, iBlk);
2514           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
2515           if( nRem>0 ) iApp = iWrite;
2516         }
2517       }else{
2518         /* The next block is already allocated. */
2519         assert( nRem>0 );
2520         assert( pSeg->pRedirect==0 );
2521         rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
2522         iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
2523       }
2524
2525       /* Write the remaining data into the new block */
2526       if( rc==LSM_OK && nRem>0 ){
2527         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
2528         iApp += nRem;
2529       }
2530     }
2531
2532     pSeg->iLastPg = iApp-1;
2533     *pRc = rc;
2534   }
2535
2536   return iRet;
2537 }
2538
2539 /*
2540 ** This function is only called in compressed database mode. It
2541 ** compresses the contents of page pPg and writes the result to the
2542 ** buffer at pFS->aOBuffer. The size of the compressed data is stored in
2543 ** pPg->nCompress.
2544 **
2545 ** If buffer pFS->aOBuffer[] has not been allocated then this function
2546 ** allocates it. If this fails, LSM_NOMEM is returned. Otherwise, LSM_OK.
2547 */
2548 static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
2549   lsm_compress *p = pFS->pCompress;
2550
2551   if( fsAllocateBuffer(pFS, 1) ) return LSM_NOMEM;
2552   assert( pPg->nData==pFS->nPagesize );
2553
2554   pPg->nCompress = pFS->nBuffer;
2555   return p->xCompress(p->pCtx,
2556       (char *)pFS->aOBuffer, &pPg->nCompress,
2557       (const char *)pPg->aData, pPg->nData
2558   );
2559 }
2560
2561 /*
2562 ** Append a new page to segment pSeg. Set output variable *piNew to the
2563 ** page number of the new page before returning.
2564 **
2565 ** If the new page is the last on its block, then the 'next' block that
2566 ** will be used by the segment is allocated here too. In this case output
2567 ** variable *piNext is set to the block number of the next block.
2568 **
2569 ** If the new page is the first on its block but not the first in the
2570 ** entire segment, set output variable *piPrev to the block number of
2571 ** the previous block in the segment.
2572 **
2573 ** LSM_OK is returned if successful, or an lsm error code otherwise. If
2574 ** any value other than LSM_OK is returned, then the final value of all
2575 ** output variables is undefined.
2576 */
2577 static int fsAppendPage(
2578   FileSystem *pFS,
2579   Segment *pSeg,
2580   Pgno *piNew,
2581   int *piPrev,
2582   int *piNext
2583 ){
2584   Pgno iPrev = pSeg->iLastPg;
2585   int rc;
2586   assert( iPrev!=0 );
2587
2588   *piPrev = 0;
2589   *piNext = 0;
2590
2591   if( fsIsLast(pFS, iPrev) ){
2592     /* Grab the first page on the next block (which has already be
2593     ** allocated). In this case set *piPrev to tell the caller to set
2594     ** the "previous block" pointer in the first 4 bytes of the page.
2595     */
2596     int iNext;
2597     int iBlk = fsPageToBlock(pFS, iPrev);
2598     assert( pSeg->pRedirect==0 );
2599     rc = fsBlockNext(pFS, 0, iBlk, &iNext);
2600     if( rc!=LSM_OK ) return rc;
2601     *piNew = fsFirstPageOnBlock(pFS, iNext);
2602     *piPrev = iBlk;
2603   }else{
2604     *piNew = iPrev+1;
2605     if( fsIsLast(pFS, *piNew) ){
2606       /* Allocate the next block here. */
2607       int iBlk;
2608       rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2609       if( rc!=LSM_OK ) return rc;
2610       *piNext = iBlk;
2611     }
2612   }
2613
2614   pSeg->nSize++;
2615   pSeg->iLastPg = *piNew;
2616   return LSM_OK;
2617 }
2618
2619 /*
2620 ** Flush all pages in the FileSystem.pWaiting list to disk.
2621 */
2622 void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){
2623   int rc = *pRc;
2624   Page *pPg;
2625
2626   pPg = pFS->pWaiting;
2627   pFS->pWaiting = 0;
2628
2629   while( pPg ){
2630     Page *pNext = pPg->pWaitingNext;
2631     if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg);
2632     assert( pPg->nRef==1 );
2633     lsmFsPageRelease(pPg);
2634     pPg = pNext;
2635   }
2636   *pRc = rc;
2637 }
2638
2639 /*
2640 ** If there exists a hash-table entry associated with page iPg, remove it.
2641 */
2642 static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
2643   Page *p;
2644   int iHash = fsHashKey(pFS->nHash, iPg);
2645
2646   for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext);
2647
2648   if( p ){
2649     assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 );
2650     fsPageRemoveFromHash(pFS, p);
2651     p->iPg = 0;
2652     iHash = fsHashKey(pFS->nHash, 0);
2653     p->pHashNext = pFS->apHash[iHash];
2654     pFS->apHash[iHash] = p;
2655   }
2656 }
2657
2658 /*
2659 ** If the page passed as an argument is dirty, update the database file
2660 ** (or mapping of the database file) with its current contents and mark
2661 ** the page as clean.
2662 **
2663 ** Return LSM_OK if the operation is a success, or an LSM error code
2664 ** otherwise.
2665 */
2666 int lsmFsPagePersist(Page *pPg){
2667   int rc = LSM_OK;
2668   if( pPg && (pPg->flags & PAGE_DIRTY) ){
2669     FileSystem *pFS = pPg->pFS;
2670
2671     if( pFS->pCompress ){
2672       int iHash;                  /* Hash key of assigned page number */
2673       u8 aSz[3];                  /* pPg->nCompress as a 24-bit big-endian */
2674       assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 );
2675
2676       /* Compress the page image. */
2677       rc = fsCompressIntoBuffer(pFS, pPg);
2678
2679       /* Serialize the compressed size into buffer aSz[] */
2680       putRecordSize(aSz, pPg->nCompress, 0);
2681
2682       /* Write the serialized page record into the database file. */
2683       pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
2684       fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc);
2685       fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
2686
2687       /* Now that it has a page number, insert the page into the hash table */
2688       iHash = fsHashKey(pFS->nHash, pPg->iPg);
2689       pPg->pHashNext = pFS->apHash[iHash];
2690       pFS->apHash[iHash] = pPg;
2691
2692       pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;
2693
2694       pPg->flags &= ~PAGE_DIRTY;
2695       pFS->nWrite++;
2696     }else{
2697
2698       if( pPg->iPg==0 ){
2699         /* No page number has been assigned yet. This occurs with pages used
2700         ** in the b-tree hierarchy. They were not assigned page numbers when
2701         ** they were created as doing so would cause this call to
2702         ** lsmFsPagePersist() to write an out-of-order page. Instead a page
2703         ** number is assigned here so that the page data will be appended
2704         ** to the current segment.
2705         */
2706         Page **pp;
2707         int iPrev = 0;
2708         int iNext = 0;
2709         int iHash;
2710
2711         assert( pPg->pSeg->iFirst );
2712         assert( pPg->flags & PAGE_FREE );
2713         assert( (pPg->flags & PAGE_HASPREV)==0 );
2714         assert( pPg->nData==pFS->nPagesize-4 );
2715
2716         rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
2717         if( rc!=LSM_OK ) return rc;
2718
2719         assert( pPg->flags & PAGE_FREE );
2720         iHash = fsHashKey(pFS->nHash, pPg->iPg);
2721         fsRemoveHashEntry(pFS, pPg->iPg);
2722         pPg->pHashNext = pFS->apHash[iHash];
2723         pFS->apHash[iHash] = pPg;
2724         assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );
2725
2726         if( iPrev ){
2727           assert( iNext==0 );
2728           memmove(&pPg->aData[4], pPg->aData, pPg->nData);
2729           lsmPutU32(pPg->aData, iPrev);
2730           pPg->flags |= PAGE_HASPREV;
2731           pPg->aData += 4;
2732         }else if( iNext ){
2733           assert( iPrev==0 );
2734           lsmPutU32(&pPg->aData[pPg->nData], iNext);
2735         }else{
2736           int nData = pPg->nData;
2737           pPg->nData += 4;
2738           lsmSortedExpandBtreePage(pPg, nData);
2739         }
2740
2741         pPg->nRef++;
2742         for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
2743         *pp = pPg;
2744         assert( pPg->pWaitingNext==0 );
2745
2746       }else{
2747         i64 iOff;                   /* Offset to write within database file */
2748
2749         iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
2750         if( fsMmapPage(pFS, pPg->iPg)==0 ){
2751           u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
2752           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
2753         }else if( pPg->flags & PAGE_FREE ){
2754           fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
2755           if( rc==LSM_OK ){
2756             u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
2757             u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
2758             memcpy(aTo, aFrom, pFS->nPagesize);
2759             lsmFree(pFS->pEnv, aFrom);
2760             pFS->nCacheAlloc--;
2761             pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
2762             pPg->flags &= ~PAGE_FREE;
2763             fsPageRemoveFromHash(pFS, pPg);
2764             pPg->pMappedNext = pFS->pMapped;
2765             pFS->pMapped = pPg;
2766           }
2767         }
2768
2769         lsmFsFlushWaiting(pFS, &rc);
2770         pPg->flags &= ~PAGE_DIRTY;
2771         pFS->nWrite++;
2772       }
2773     }
2774   }
2775
2776   return rc;
2777 }
2778
2779 /*
2780 ** For non-compressed databases, this function is a no-op. For compressed
2781 ** databases, it adds a padding record to the segment passed as the third
2782 ** argument.
2783 **
2784 ** The size of the padding records is selected so that the last byte
2785 ** written is the last byte of a disk sector. This means that if a
2786 ** snapshot is taken and checkpointed, subsequent worker processes will
2787 ** not write to any sector that contains checkpointed data.
2788 */
2789 int lsmFsSortedPadding(
2790   FileSystem *pFS,
2791   Snapshot *pSnapshot,
2792   Segment *pSeg
2793 ){
2794   int rc = LSM_OK;
2795   if( pFS->pCompress ){
2796     Pgno iLast2;
2797     Pgno iLast = pSeg->iLastPg;     /* Current last page of segment */
2798     int nPad;                       /* Bytes of padding required */
2799     u8 aSz[3];
2800
2801     iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1;
2802     assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) );
2803     nPad = (int)(iLast2 - iLast);
2804
2805     if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){
2806       nPad -= 4;
2807     }
2808     assert( nPad>=0 );
2809
2810     if( nPad>=6 ){
2811       pSeg->nSize += nPad;
2812       nPad -= 6;
2813       putRecordSize(aSz, nPad, 1);
2814       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2815       memset(pFS->aOBuffer, 0, nPad);
2816       fsAppendData(pFS, pSeg, pFS->aOBuffer, nPad, &rc);
2817       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2818     }else if( nPad>0 ){
2819       u8 aBuf[5] = {0,0,0,0,0};
2820       aBuf[0] = (u8)nPad;
2821       aBuf[nPad-1] = (u8)nPad;
2822       fsAppendData(pFS, pSeg, aBuf, nPad, &rc);
2823     }
2824
2825     assert( rc!=LSM_OK
2826         || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg)
2827         || ((pSeg->iLastPg + 1) % pFS->szSector)==0
2828     );
2829   }
2830
2831   return rc;
2832 }
2833
2834
2835 /*
2836 ** Increment the reference count on the page object passed as the first
2837 ** argument.
2838 */
2839 void lsmFsPageRef(Page *pPg){
2840   if( pPg ){
2841     pPg->nRef++;
2842   }
2843 }
2844
2845 /*
2846 ** Release a page-reference obtained using fsPageGet().
2847 */
2848 int lsmFsPageRelease(Page *pPg){
2849   int rc = LSM_OK;
2850   if( pPg ){
2851     assert( pPg->nRef>0 );
2852     pPg->nRef--;
2853     if( pPg->nRef==0 ){
2854       FileSystem *pFS = pPg->pFS;
2855       rc = lsmFsPagePersist(pPg);
2856       pFS->nOut--;
2857
2858       assert( pPg->pFS->pCompress
2859            || fsIsFirst(pPg->pFS, pPg->iPg)==0
2860            || (pPg->flags & PAGE_HASPREV)
2861       );
2862       pPg->aData -= (pPg->flags & PAGE_HASPREV);
2863       pPg->flags &= ~PAGE_HASPREV;
2864
2865       if( (pPg->flags & PAGE_FREE)==0 ){
2866         /* Removed from mapped list */
2867         Page **pp;
2868         for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext);
2869         *pp = pPg->pMappedNext;
2870         pPg->pMappedNext = 0;
2871
2872         /* Add to free list */
2873         pPg->pFreeNext = pFS->pFree;
2874         pFS->pFree = pPg;
2875       }else{
2876         fsPageAddToLru(pFS, pPg);
2877       }
2878     }
2879   }
2880
2881   return rc;
2882 }
2883
2884 /*
2885 ** Return the total number of pages read from the database file.
2886 */
2887 int lsmFsNRead(FileSystem *pFS){ return pFS->nRead; }
2888
2889 /*
2890 ** Return the total number of pages written to the database file.
2891 */
2892 int lsmFsNWrite(FileSystem *pFS){ return pFS->nWrite; }
2893
2894 /*
2895 ** Return a copy of the environment pointer used by the file-system object.
2896 */
2897 lsm_env *lsmFsEnv(FileSystem *pFS){
2898   return pFS->pEnv;
2899 }
2900
2901 /*
2902 ** Return a copy of the environment pointer used by the file-system object
2903 ** to which this page belongs.
2904 */
2905 lsm_env *lsmPageEnv(Page *pPg) {
2906   return pPg->pFS->pEnv;
2907 }
2908
2909 /*
2910 ** Return a pointer to the file-system object associated with the Page
2911 ** passed as the only argument.
2912 */
2913 FileSystem *lsmPageFS(Page *pPg){
2914   return pPg->pFS;
2915 }
2916
2917 /*
2918 ** Return the sector-size as reported by the log file handle.
2919 */
2920 int lsmFsSectorSize(FileSystem *pFS){
2921   return pFS->szSector;
2922 }
2923
2924 /*
2925 ** Helper function for lsmInfoArrayStructure().
2926 */
2927 static Segment *startsWith(Segment *pRun, Pgno iFirst){
2928   return (iFirst==pRun->iFirst) ? pRun : 0;
2929 }
2930
2931 /*
2932 ** Return the segment that starts with page iFirst, if any. If no such segment
2933 ** can be found, return NULL.
2934 */
2935 static Segment *findSegment(Snapshot *pWorker, Pgno iFirst){
2936   Level *pLvl;                    /* Used to iterate through db levels */
2937   Segment *pSeg = 0;              /* Pointer to segment to return */
2938
2939   for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){
2940     if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){
2941       int i;
2942       for(i=0; i<pLvl->nRight; i++){
2943         if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break;
2944       }
2945     }
2946   }
2947
2948   return pSeg;
2949 }
2950
2951 /*
2952 ** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request.
2953 ** If successful, *pzOut is set to point to a nul-terminated string
2954 ** containing the array structure and LSM_OK is returned. The caller should
2955 ** eventually free the string using lsmFree().
2956 **
2957 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
2958 */
2959 int lsmInfoArrayStructure(
2960   lsm_db *pDb,
2961   int bBlock,                     /* True for block numbers only */
2962   Pgno iFirst,
2963   char **pzOut
2964 ){
2965   int rc = LSM_OK;
2966   Snapshot *pWorker;              /* Worker snapshot */
2967   Segment *pArray = 0;            /* Array to report on */
2968   int bUnlock = 0;
2969
2970   *pzOut = 0;
2971   if( iFirst==0 ) return LSM_ERROR;
2972
2973   /* Obtain the worker snapshot */
2974   pWorker = pDb->pWorker;
2975   if( !pWorker ){
2976     rc = lsmBeginWork(pDb);
2977     if( rc!=LSM_OK ) return rc;
2978     pWorker = pDb->pWorker;
2979     bUnlock = 1;
2980   }
2981
2982   /* Search for the array that starts on page iFirst */
2983   pArray = findSegment(pWorker, iFirst);
2984
2985   if( pArray==0 ){
2986     /* Could not find the requested array. This is an error. */
2987     rc = LSM_ERROR;
2988   }else{
2989     FileSystem *pFS = pDb->pFS;
2990     LsmString str;
2991     int iBlk;
2992     int iLastBlk;
2993
2994     iBlk = fsPageToBlock(pFS, pArray->iFirst);
2995     iLastBlk = fsPageToBlock(pFS, pArray->iLastPg);
2996
2997     lsmStringInit(&str, pDb->pEnv);
2998     if( bBlock ){
2999       lsmStringAppendf(&str, "%d", iBlk);
3000       while( iBlk!=iLastBlk ){
3001         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3002         lsmStringAppendf(&str, " %d", iBlk);
3003       }
3004     }else{
3005       lsmStringAppendf(&str, "%d", pArray->iFirst);
3006       while( iBlk!=iLastBlk ){
3007         lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk));
3008         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3009         lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
3010       }
3011       lsmStringAppendf(&str, " %d", pArray->iLastPg);
3012     }
3013
3014     *pzOut = str.z;
3015   }
3016
3017   if( bUnlock ){
3018     int rcwork = LSM_BUSY;
3019     lsmFinishWork(pDb, 0, &rcwork);
3020   }
3021   return rc;
3022 }
3023
3024 int lsmFsSegmentContainsPg(
3025   FileSystem *pFS,
3026   Segment *pSeg,
3027   Pgno iPg,
3028   int *pbRes
3029 ){
3030   Redirect *pRedir = pSeg->pRedirect;
3031   int rc = LSM_OK;
3032   int iBlk;
3033   int iLastBlk;
3034   int iPgBlock;                   /* Block containing page iPg */
3035
3036   iPgBlock = fsPageToBlock(pFS, pSeg->iFirst);
3037   iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst));
3038   iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg));
3039
3040   while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){
3041     rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3042   }
3043
3044   *pbRes = (iBlk==iPgBlock);
3045   return rc;
3046 }
3047
3048 /*
3049 ** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request.
3050 ** If successful, *pzOut is set to point to a nul-terminated string
3051 ** containing the array structure and LSM_OK is returned. The caller should
3052 ** eventually free the string using lsmFree().
3053 **
3054 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
3055 */
3056 int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut){
3057   int rc = LSM_OK;
3058   Snapshot *pWorker;              /* Worker snapshot */
3059   Segment *pSeg = 0;              /* Array to report on */
3060   int bUnlock = 0;
3061
3062   *pzOut = 0;
3063   if( iFirst==0 ) return LSM_ERROR;
3064
3065   /* Obtain the worker snapshot */
3066   pWorker = pDb->pWorker;
3067   if( !pWorker ){
3068     rc = lsmBeginWork(pDb);
3069     if( rc!=LSM_OK ) return rc;
3070     pWorker = pDb->pWorker;
3071     bUnlock = 1;
3072   }
3073
3074   /* Search for the array that starts on page iFirst */
3075   pSeg = findSegment(pWorker, iFirst);
3076
3077   if( pSeg==0 ){
3078     /* Could not find the requested array. This is an error. */
3079     rc = LSM_ERROR;
3080   }else{
3081     Page *pPg = 0;
3082     FileSystem *pFS = pDb->pFS;
3083     LsmString str;
3084
3085     lsmStringInit(&str, pDb->pEnv);
3086     rc = lsmFsDbPageGet(pFS, pSeg, iFirst, &pPg);
3087     while( rc==LSM_OK && pPg ){
3088       Page *pNext = 0;
3089       lsmStringAppendf(&str, " %lld", lsmFsPageNumber(pPg));
3090       rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
3091       lsmFsPageRelease(pPg);
3092       pPg = pNext;
3093     }
3094
3095     if( rc!=LSM_OK ){
3096       lsmFree(pDb->pEnv, str.z);
3097     }else{
3098       *pzOut = str.z;
3099     }
3100   }
3101
3102   if( bUnlock ){
3103     int rcwork = LSM_BUSY;
3104     lsmFinishWork(pDb, 0, &rcwork);
3105   }
3106   return rc;
3107 }
3108
/*
** The following macros are used by the integrity-check code. Associated with
** each block in the database is an 8-bit mask (the entry in the aUsed[]
** array; the entry for block N is aUsed[N-1]). As the integrity-check
** meanders through the database, it sets the following bits to indicate how
** each block is used.
**
** INTEGRITY_CHECK_FIRST_PG:
**   First page of block is in use by sorted run.
**
** INTEGRITY_CHECK_LAST_PG:
**   Last page of block is in use by sorted run.
**
** INTEGRITY_CHECK_USED:
**   At least one page of the block is in use by a sorted run.
**
** INTEGRITY_CHECK_FREE:
**   The free block list contains an entry corresponding to this block.
**   checkFreelistCb() asserts that the entry is zero before storing this
**   value, so it is never combined with the other three bits.
*/
#define INTEGRITY_CHECK_FIRST_PG 0x01
#define INTEGRITY_CHECK_LAST_PG  0x02
#define INTEGRITY_CHECK_USED     0x04
#define INTEGRITY_CHECK_FREE     0x08
3131
3132 /*
3133 ** Helper function for lsmFsIntegrityCheck()
3134 */
3135 static void checkBlocks(
3136   FileSystem *pFS,
3137   Segment *pSeg,
3138   int bExtra,                     /* If true, count the "next" block if any */
3139   int nUsed,
3140   u8 *aUsed
3141 ){
3142   if( pSeg ){
3143     if( pSeg && pSeg->nSize>0 ){
3144       int rc;
3145       int iBlk;                   /* Current block (during iteration) */
3146       int iLastBlk;               /* Last block of segment */
3147       int iFirstBlk;              /* First block of segment */
3148       int bLastIsLastOnBlock;     /* True iLast is the last on its block */
3149
3150       assert( 0==fsSegmentRedirects(pFS, pSeg) );
3151       iBlk = iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst);
3152       iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg);
3153
3154       bLastIsLastOnBlock = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg);
3155       assert( iBlk>0 );
3156
3157       do {
3158         /* iBlk is a part of this sorted run. */
3159         aUsed[iBlk-1] |= INTEGRITY_CHECK_USED;
3160
3161         /* If the first page of this block is also part of the segment,
3162         ** set the flag to indicate that the first page of iBlk is in use.
3163         */
3164         if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){
3165           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_FIRST_PG)==0 );
3166           aUsed[iBlk-1] |= INTEGRITY_CHECK_FIRST_PG;
3167         }
3168
3169         /* Unless the sorted run finishes before the last page on this block,
3170         ** the last page of this block is also in use.  */
3171         if( iBlk!=iLastBlk || bLastIsLastOnBlock ){
3172           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_LAST_PG)==0 );
3173           aUsed[iBlk-1] |= INTEGRITY_CHECK_LAST_PG;
3174         }
3175
3176         /* Special case. The sorted run being scanned is the output run of
3177         ** a level currently undergoing an incremental merge. The sorted
3178         ** run ends on the last page of iBlk, but the next block has already
3179         ** been allocated. So mark it as in use as well.  */
3180         if( iBlk==iLastBlk && bLastIsLastOnBlock && bExtra ){
3181           int iExtra = 0;
3182           rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra);
3183           assert( rc==LSM_OK );
3184
3185           assert( aUsed[iExtra-1]==0 );
3186           aUsed[iExtra-1] |= INTEGRITY_CHECK_USED;
3187           aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG;
3188           aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG;
3189         }
3190
3191         /* Move on to the next block in the sorted run. Or set iBlk to zero
3192         ** in order to break out of the loop if this was the last block in
3193         ** the run.  */
3194         if( iBlk==iLastBlk ){
3195           iBlk = 0;
3196         }else{
3197           rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3198           assert( rc==LSM_OK );
3199         }
3200       }while( iBlk );
3201     }
3202   }
3203 }
3204
3205 typedef struct CheckFreelistCtx CheckFreelistCtx;
3206 struct CheckFreelistCtx {
3207   u8 *aUsed;
3208   int nBlock;
3209 };
3210 static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
3211   CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx;
3212
3213   assert( iBlk>=1 );
3214   assert( iBlk<=p->nBlock );
3215   assert( p->aUsed[iBlk-1]==0 );
3216   p->aUsed[iBlk-1] = INTEGRITY_CHECK_FREE;
3217   return 0;
3218 }
3219
3220 /*
3221 ** This function checks that all blocks in the database file are accounted
3222 ** for. For each block, exactly one of the following must be true:
3223 **
3224 **   + the block is part of a sorted run, or
3225 **   + the block is on the free-block list
3226 **
3227 ** This function also checks that there are no references to blocks with
3228 ** out-of-range block numbers.
3229 **
3230 ** If no errors are found, non-zero is returned. If an error is found, an
3231 ** assert() fails.
3232 */
3233 int lsmFsIntegrityCheck(lsm_db *pDb){
3234   CheckFreelistCtx ctx;
3235   FileSystem *pFS = pDb->pFS;
3236   int i;
3237   int rc;
3238   Freelist freelist = {0, 0, 0};
3239   u8 *aUsed;
3240   Level *pLevel;
3241   Snapshot *pWorker = pDb->pWorker;
3242   int nBlock = pWorker->nBlock;
3243
3244 #if 0
3245   static int nCall = 0;
3246   nCall++;
3247   printf("%d calls\n", nCall);
3248 #endif
3249
3250   aUsed = lsmMallocZero(pDb->pEnv, nBlock);
3251   if( aUsed==0 ){
3252     /* Malloc has failed. Since this function is only called within debug
3253     ** builds, this probably means the user is running an OOM injection test.
3254     ** Regardless, it will not be possible to run the integrity-check at this
3255     ** time, so assume the database is Ok and return non-zero. */
3256     return 1;
3257   }
3258
3259   for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
3260     int i;
3261     checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
3262     for(i=0; i<pLevel->nRight; i++){
3263       checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed);
3264     }
3265   }
3266
3267   /* Mark all blocks in the free-list as used */
3268   ctx.aUsed = aUsed;
3269   ctx.nBlock = nBlock;
3270   rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx);
3271
3272   if( rc==LSM_OK ){
3273     for(i=0; i<nBlock; i++) assert( aUsed[i]!=0 );
3274   }
3275
3276   lsmFree(pDb->pEnv, aUsed);
3277   lsmFree(pDb->pEnv, freelist.aEntry);
3278
3279   return 1;
3280 }
3281
3282 #ifndef NDEBUG
3283 /*
3284 ** Return true if pPg happens to be the last page in segment pSeg. Or false
3285 ** otherwise. This function is only invoked as part of assert() conditions.
3286 */
3287 int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){
3288   if( pPg->pFS->pCompress ){
3289     Pgno iNext = 0;
3290     int rc;
3291     rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext);
3292     return (rc!=LSM_OK || iNext==0);
3293   }
3294   return (pPg->iPg==pSeg->iLastPg);
3295 }
3296 #endif