ext/lsm1/lsm_file.c

   1 /*
   2 ** 2011-08-26
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 **
  13 ** NORMAL DATABASE FILE FORMAT
  14 **
  15 ** The following database file format concepts are used by the code in
  16 ** this file to read and write the database file.
  17 **
  18 ** Pages:
  19 **
  20 **   A database file is divided into pages. The first 8KB of the file consists
  21 **   of two 4KB meta-pages. The meta-page size is not configurable. The
  22 **   remainder of the file is made up of database pages. The default database
  23 **   page size is 4KB. Database pages are aligned to page-size boundaries,
  24 **   so if the database page size is larger than 8KB there is a gap between
  25 **   the end of the meta pages and the start of the database pages.
  26 **
  27 **   Database pages are numbered based on their position in the file. Page N
  28 **   begins at byte offset ((N-1)*pgsz). This means that page 1 does not
  29 **   exist - since it would always overlap with the meta pages. If the
  30 **   page-size is (say) 512 bytes, then the first usable page in the database
  31 **   is page 33.
  32 **
**   It is assumed that the first two meta pages and the data that follows
**   them are located on different disk sectors, so that if a power failure
**   occurs while writing to a meta page there is no risk of damage to the
**   other meta page or any other part of the database file. TODO: This may
**   need to be revisited.
  38 **
  39 ** Blocks:
  40 **
  41 **   The database file is also divided into blocks. The default block size is
  42 **   1MB. When writing to the database file, an attempt is made to write data
  43 **   in contiguous block-sized chunks.
  44 **
  45 **   The first and last page on each block are special in that they are 4
  46 **   bytes smaller than all other pages. This is because the last four bytes
  47 **   of space on the first and last pages of each block are reserved for
  48 **   pointers to other blocks (i.e. a 32-bit block number).
  49 **
  50 ** Runs:
  51 **
  52 **   A run is a sequence of pages that the upper layer uses to store a
  53 **   sorted array of database keys (and accompanying data - values, FC
  54 **   pointers and so on). Given a page within a run, it is possible to
  55 **   navigate to the next page in the run as follows:
  56 **
  57 **     a) if the current page is not the last in a block, the next page
  58 **        in the run is located immediately after the current page, OR
  59 **
  60 **     b) if the current page is the last page in a block, the next page
  61 **        in the run is the first page on the block identified by the
  62 **        block pointer stored in the last 4 bytes of the current block.
  63 **
  64 **   It is possible to navigate to the previous page in a similar fashion,
  65 **   using the block pointer embedded in the last 4 bytes of the first page
  66 **   of each block as required.
  67 **
  68 **   The upper layer is responsible for identifying by page number the
  69 **   first and last page of any run that it needs to navigate - there are
  70 **   no "end-of-run" markers stored or identified by this layer. This is
  71 **   necessary as clients reading different database snapshots may access
  72 **   different subsets of a run.
  73 **
  74 ** THE LOG FILE
  75 **
  76 ** This file opens and closes the log file. But it does not contain any
  77 ** logic related to the log file format. Instead, it exports the following
  78 ** functions that are used by the code in lsm_log.c to read and write the
  79 ** log file:
  80 **
  81 **     lsmFsOpenLog
  82 **     lsmFsWriteLog
  83 **     lsmFsSyncLog
  84 **     lsmFsReadLog
  85 **     lsmFsTruncateLog
  86 **     lsmFsCloseAndDeleteLog
  87 **
  88 ** COMPRESSED DATABASE FILE FORMAT
  89 **
  90 ** The compressed database file format is very similar to the normal format.
  91 ** The file still begins with two 4KB meta-pages (which are never compressed).
  92 ** It is still divided into blocks.
  93 **
  94 ** The first and last four bytes of each block are reserved for 32-bit
  95 ** pointer values. Similar to the way four bytes are carved from the end of
  96 ** the first and last page of each block in uncompressed databases. From
  97 ** the point of view of the upper layer, all pages are the same size - this
  98 ** is different from the uncompressed format where the first and last pages
  99 ** on each block are 4 bytes smaller than the others.
 100 **
 101 ** Pages are stored in variable length compressed form, as follows:
 102 **
 103 **     * 3-byte size field containing the size of the compressed page image
 104 **       in bytes. The most significant bit of each byte of the size field
 105 **       is always set. The remaining 7 bits are used to store a 21-bit
 106 **       integer value (in big-endian order - the first byte in the field
 107 **       contains the most significant 7 bits). Since the maximum allowed
 108 **       size of a compressed page image is (2^17 - 1) bytes, there are
 109 **       actually 4 unused bits in the size field.
 110 **
 111 **       In other words, if the size of the compressed page image is nSz,
 112 **       the header can be serialized as follows:
 113 **
 114 **         u8 aHdr[3]
 115 **         aHdr[0] = 0x80 | (u8)(nSz >> 14);
 116 **         aHdr[1] = 0x80 | (u8)(nSz >>  7);
 117 **         aHdr[2] = 0x80 | (u8)(nSz >>  0);
 118 **
 119 **     * Compressed page image.
 120 **
 121 **     * A second copy of the 3-byte record header.
 122 **
 123 ** A page number is a byte offset into the database file. So the smallest
 124 ** possible page number is 8192 (immediately after the two meta-pages).
 125 ** The first and root page of a segment are identified by a page number
 126 ** corresponding to the byte offset of the first byte in the corresponding
 127 ** page record. The last page of a segment is identified by the byte offset
 128 ** of the last byte in its record.
 129 **
 130 ** Unlike uncompressed pages, compressed page records may span blocks.
 131 **
 132 ** Sometimes, in order to avoid touching sectors that contain synced data
 133 ** when writing, it is necessary to insert unused space between compressed
 134 ** page records. This can be done as follows:
 135 **
 136 **     * For less than 6 bytes of empty space, the first and last byte
 137 **       of the free space contain the total number of free bytes. For
 138 **       example:
 139 **
 140 **         Block of 4 free bytes: 0x04 0x?? 0x?? 0x04
 141 **         Block of 2 free bytes: 0x02 0x02
 142 **         A single free byte:    0x01
 143 **
 144 **     * For 6 or more bytes of empty space, a record similar to a
 145 **       compressed page record is added to the segment. A padding record
 146 **       is distinguished from a compressed page record by the most
 147 **       significant bit of the second byte of the size field, which is
 148 **       cleared instead of set.
 149 */
 150 #include "lsmInt.h"
 151
 152 #include <sys/types.h>
 153 #include <sys/stat.h>
 154 #include <fcntl.h>
 155
 156 /*
 157 ** File-system object. Each database connection allocates a single instance
 158 ** of the following structure. It is used for all access to the database and
 159 ** log files.
 160 **
 161 ** The database file may be accessed via two methods - using mmap() or using
 162 ** read() and write() calls. In the general case both methods are used - a
 163 ** prefix of the file is mapped into memory and the remainder accessed using
 164 ** read() and write(). This is helpful when accessing very large files (or
 165 ** files that may grow very large during the lifetime of a database
 166 ** connection) on systems with 32-bit address spaces. However, it also requires
 167 ** that this object manage two distinct types of Page objects simultaneously -
 168 ** those that carry pointers to the mapped file and those that carry arrays
 169 ** populated by read() calls.
 170 **
 171 ** pFree:
**   The head of a singly-linked list containing currently unused Page
**   structures suitable for use as mmap-page handles. Connected by the
**   Page.pFreeNext pointers.
 175 **
 176 ** pMapped:
**   The head of a singly-linked list that contains all pages that currently
**   carry pointers to the mapped region. This is used if the region is
**   ever remapped - the pointers carried by existing pages can be adjusted
**   to account for the remapping. Connected by the Page.pMappedNext pointers.
 181 **
 182 ** pWaiting:
 183 **   When the upper layer wishes to append a new b-tree page to a segment,
 184 **   it allocates a Page object that carries a malloc'd block of memory -
 185 **   regardless of the mmap-related configuration. The page is not assigned
 186 **   a page number at first. When the upper layer has finished constructing
 187 **   the page contents, it calls lsmFsPagePersist() to assign a page number
 188 **   to it. At this point it is likely that N pages have been written to the
 189 **   segment, the (N+1)th page is still outstanding and the b-tree page is
 190 **   assigned page number (N+2). To avoid writing page (N+2) before page
 191 **   (N+1), the recently completed b-tree page is held in the singly linked
 192 **   list headed by pWaiting until page (N+1) has been written.
 193 **
 194 **   Function lsmFsFlushWaiting() is responsible for eventually writing
 195 **   waiting pages to disk.
 196 **
 197 ** apHash/nHash:
 198 **   Hash table used to store all Page objects that carry malloc'd arrays,
 199 **   except those b-tree pages that have not yet been assigned page numbers.
 200 **   Once they have been assigned page numbers - they are added to this
 201 **   hash table.
 202 **
 203 **   Hash table overflow chains are connected using the Page.pHashNext
 204 **   pointers.
 205 **
 206 ** pLruFirst, pLruLast:
 207 **   The first and last entries in a doubly-linked list of pages. This
 208 **   list contains all pages with malloc'd data that are present in the
 209 **   hash table and have a ref-count of zero.
 210 */
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name (stored in the same
                                  ** allocation, directly after this struct) */
  char *zLog;                     /* Log file name: zDb with "-log" appended */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nMetaRwSize;                /* Read/written size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;              /* Used after lsm_close() to link into list */
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file (0 while the log is not open) */
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};
 255
 256 /*
 257 ** Database page handle.
 258 **
 259 ** pSeg:
 260 **   When lsmFsSortedAppend() is called on a compressed database, the new
 261 **   page is not assigned a page number or location in the database file
 262 **   immediately. Instead, these are assigned by the lsmFsPagePersist() call
 263 **   right before it writes the compressed page image to disk.
 264 **
 265 **   The lsmFsSortedAppend() function sets the pSeg pointer to point to the
 266 **   segment that the new page will be a part of. It is unset by
 267 **   lsmFsPagePersist() after the page is written to disk.
 268 */
struct Page {
  u8 *aData;                      /* Buffer containing page data */
  int nData;                      /* Bytes of usable data at aData[] */
  Pgno iPg;                       /* Page number */
  int nRef;                       /* Number of outstanding references */
  int flags;                      /* Combination of PAGE_XXX flags
                                  ** (PAGE_DIRTY, PAGE_FREE, PAGE_HASPREV) */
  Page *pHashNext;                /* Next page in hash table slot (overflow
                                  ** chain of a FileSystem.apHash[] entry) */
  Page *pLruNext;                 /* Next page in LRU list */
  Page *pLruPrev;                 /* Previous page in LRU list */
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};
 290
 291 /*
 292 ** Meta-data page handle. There are two meta-data pages at the start of
 293 ** the database file, each FileSystem.nMetasize bytes in size.
 294 */
struct MetaPage {
  int iPg;                        /* Either 1 or 2 (which of the two
                                  ** meta-data pages this handle refers to) */
  int bWrite;                     /* Write back to db file on release */
  u8 *aData;                      /* Pointer to buffer */
  FileSystem *pFS;                /* FileSystem that owns this page */
};
 301
/*
** Values for Page.flags. These are single-bit masks and may be OR'd
** together.
*/
#define PAGE_DIRTY   0x00000001   /* Set if page is dirty */
#define PAGE_FREE    0x00000002   /* Set if Page.aData requires lsmFree() */
#define PAGE_HASPREV 0x00000004   /* Set if page is first on uncomp. block */
 308
/*
** Number of pgsz byte pages omitted from the start of block 1. The start
** of block 1 contains two 4096 byte meta pages (8192 bytes in total).
**
** The result is never less than 1 (assuming LSM_MAX() yields the larger of
** its two arguments - it is defined outside this file): for page sizes
** greater than 8192 bytes the integer division yields 0, and the
** LSM_MAX(1, ...) term raises that to a single page.
*/
#define BLOCK1_HDR_SIZE(pgsz)  LSM_MAX(1, 8192/(pgsz))
 314
 315 /*
 316 ** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt()
 317 ** to catch IO errors (any error returned by a VFS method).
 318 */
 319 #ifndef NDEBUG
 320 static void lsmIoerrBkpt(void){
 321   static int nErr = 0;
 322   nErr++;
 323 }
 324 static int IOERR_WRAPPER(int rc){
 325   if( rc!=LSM_OK ) lsmIoerrBkpt();
 326   return rc;
 327 }
 328 #else
 329 # define IOERR_WRAPPER(rc) (rc)
 330 #endif
 331
/*
** Consistency check for the LRU list of a FileSystem object.
**
** When NDEBUG is defined this is an empty macro. Otherwise it is a function,
** but its body is currently disabled with "#if 0", so calling it has no
** effect in either configuration. If re-enabled, the checks walk the
** LRU list and verify the list linkage, that each entry has nRef==0 and
** the PAGE_FREE flag set, and that the entry can be found in the hash table.
*/
#ifdef NDEBUG
# define assert_lists_are_ok(x)
#else
/* Forward declaration - referenced only by the disabled checks below. */
static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash);

static void assert_lists_are_ok(FileSystem *pFS){
#if 0
  Page *p;

  assert( pFS->nMapLimit>=0 );

  /* Check that all pages in the LRU list have nRef==0, pointers to buffers
  ** in heap memory, and corresponding entries in the hash table.  */
  for(p=pFS->pLruFirst; p; p=p->pLruNext){
    assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
    assert( p==pFS->pLruLast || p->pLruNext!=0 );
    assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
    assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
    assert( p->nRef==0 );
    assert( p->flags & PAGE_FREE );
    assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
  }
#endif
}
#endif
 357
 358 /*
 359 ** Wrappers around the VFS methods of the lsm_env object:
 360 **
 361 **     lsmEnvOpen()
 362 **     lsmEnvRead()
 363 **     lsmEnvWrite()
 364 **     lsmEnvSync()
 365 **     lsmEnvSectorSize()
 366 **     lsmEnvClose()
 367 **     lsmEnvTruncate()
 368 **     lsmEnvUnlink()
 369 **     lsmEnvRemap()
 370 */
 371 int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){
 372   return pEnv->xOpen(pEnv, zFile, flags, ppNew);
 373 }
 374
 375 static int lsmEnvRead(
 376   lsm_env *pEnv,
 377   lsm_file *pFile,
 378   lsm_i64 iOff,
 379   void *pRead,
 380   int nRead
 381 ){
 382   return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) );
 383 }
 384
 385 static int lsmEnvWrite(
 386   lsm_env *pEnv,
 387   lsm_file *pFile,
 388   lsm_i64 iOff,
 389   const void *pWrite,
 390   int nWrite
 391 ){
 392   return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) );
 393 }
 394
 395 static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
 396   return IOERR_WRAPPER( pEnv->xSync(pFile) );
 397 }
 398
 399 static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
 400   return pEnv->xSectorSize(pFile);
 401 }
 402
 403 int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
 404   return IOERR_WRAPPER( pEnv->xClose(pFile) );
 405 }
 406
 407 static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
 408   return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) );
 409 }
 410
 411 static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
 412   return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) );
 413 }
 414
 415 static int lsmEnvRemap(
 416   lsm_env *pEnv,
 417   lsm_file *pFile,
 418   i64 szMin,
 419   void **ppMap,
 420   i64 *pszMap
 421 ){
 422   return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
 423 }
 424
 425 int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
 426   if( pFile==0 ) return LSM_OK;
 427   return pEnv->xLock(pFile, iLock, eLock);
 428 }
 429
 430 int lsmEnvTestLock(
 431   lsm_env *pEnv,
 432   lsm_file *pFile,
 433   int iLock,
 434   int nLock,
 435   int eLock
 436 ){
 437   return pEnv->xTestLock(pFile, iLock, nLock, eLock);
 438 }
 439
 440 int lsmEnvShmMap(
 441   lsm_env *pEnv,
 442   lsm_file *pFile,
 443   int iChunk,
 444   int sz,
 445   void **ppOut
 446 ){
 447   return pEnv->xShmMap(pFile, iChunk, sz, ppOut);
 448 }
 449
 450 void lsmEnvShmBarrier(lsm_env *pEnv){
 451   pEnv->xShmBarrier();
 452 }
 453
 454 void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
 455   pEnv->xShmUnmap(pFile, bDel);
 456 }
 457
 458 void lsmEnvSleep(lsm_env *pEnv, int nUs){
 459   pEnv->xSleep(pEnv, nUs);
 460 }
 461
 462
 463 /*
 464 ** Write the contents of string buffer pStr into the log file, starting at
 465 ** offset iOff.
 466 */
 467 int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
 468   assert( pFS->fdLog );
 469   return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
 470 }
 471
 472 /*
 473 ** fsync() the log file.
 474 */
 475 int lsmFsSyncLog(FileSystem *pFS){
 476   assert( pFS->fdLog );
 477   return lsmEnvSync(pFS->pEnv, pFS->fdLog);
 478 }
 479
 480 /*
 481 ** Read nRead bytes of data starting at offset iOff of the log file. Append
 482 ** the results to string buffer pStr.
 483 */
 484 int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
 485   int rc;                         /* Return code */
 486   assert( pFS->fdLog );
 487   rc = lsmStringExtend(pStr, nRead);
 488   if( rc==LSM_OK ){
 489     rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
 490     pStr->n += nRead;
 491   }
 492   return rc;
 493 }
 494
 495 /*
 496 ** Truncate the log file to nByte bytes in size.
 497 */
 498 int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){
 499   if( pFS->fdLog==0 ) return LSM_OK;
 500   return lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte);
 501 }
 502
 503 /*
 504 ** Truncate the db file to nByte bytes in size.
 505 */
 506 int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){
 507   if( pFS->fdDb==0 ) return LSM_OK;
 508   return lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte);
 509 }
 510
 511 /*
 512 ** Close the log file. Then delete it from the file-system. This function
 513 ** is called during database shutdown only.
 514 */
 515 int lsmFsCloseAndDeleteLog(FileSystem *pFS){
 516   char *zDel;
 517
 518   if( pFS->fdLog ){
 519     lsmEnvClose(pFS->pEnv, pFS->fdLog );
 520     pFS->fdLog = 0;
 521   }
 522
 523   zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
 524   if( zDel ){
 525     lsmEnvUnlink(pFS->pEnv, zDel);
 526     lsmFree(pFS->pEnv, zDel);
 527   }
 528   return LSM_OK;
 529 }
 530
 531 /*
 532 ** Return true if page iReal of the database should be accessed using mmap.
 533 ** False otherwise.
 534 */
 535 static int fsMmapPage(FileSystem *pFS, Pgno iReal){
 536   return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit);
 537 }
 538
 539 /*
** Given that there are currently nHash slots in the hash table, return
** the hash key (slot index) for page iPg.
 542 */
 543 static int fsHashKey(int nHash, Pgno iPg){
 544   return (iPg % nHash);
 545 }
 546
 547 /*
 548 ** This is a helper function for lsmFsOpen(). It opens a single file on
 549 ** disk (either the database or log file).
 550 */
 551 static lsm_file *fsOpenFile(
 552   FileSystem *pFS,                /* File system object */
 553   int bReadonly,                  /* True to open this file read-only */
 554   int bLog,                       /* True for log, false for db */
 555   int *pRc                        /* IN/OUT: Error code */
 556 ){
 557   lsm_file *pFile = 0;
 558   if( *pRc==LSM_OK ){
 559     int flags = (bReadonly ? LSM_OPEN_READONLY : 0);
 560     const char *zPath = (bLog ? pFS->zLog : pFS->zDb);
 561
 562     *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile);
 563   }
 564   return pFile;
 565 }
 566
 567 /*
 568 ** If it is not already open, this function opens the log file. It returns
 569 ** LSM_OK if successful (or if the log file was already open) or an LSM
 570 ** error code otherwise.
 571 **
 572 ** The log file must be opened before any of the following may be called:
 573 **
 574 **     lsmFsWriteLog
 575 **     lsmFsSyncLog
 576 **     lsmFsReadLog
 577 */
 578 int lsmFsOpenLog(lsm_db *db, int *pbOpen){
 579   int rc = LSM_OK;
 580   FileSystem *pFS = db->pFS;
 581
 582   if( 0==pFS->fdLog ){
 583     pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc);
 584
 585     if( rc==LSM_IOERR_NOENT && db->bReadonly ){
 586       rc = LSM_OK;
 587     }
 588   }
 589
 590   if( pbOpen ) *pbOpen = (pFS->fdLog!=0);
 591   return rc;
 592 }
 593
 594 /*
 595 ** Close the log file, if it is open.
 596 */
 597 void lsmFsCloseLog(lsm_db *db){
 598   FileSystem *pFS = db->pFS;
 599   if( pFS->fdLog ){
 600     lsmEnvClose(pFS->pEnv, pFS->fdLog);
 601     pFS->fdLog = 0;
 602   }
 603 }
 604
 605 /*
 606 ** Open a connection to a database stored within the file-system.
 607 **
 608 ** If parameter bReadonly is true, then open a read-only file-descriptor
 609 ** on the database file. It is possible that bReadonly will be false even
 610 ** if the user requested that pDb be opened read-only. This is because the
 611 ** file-descriptor may later on be recycled by a read-write connection.
 612 ** If the db file can be opened for read-write access, it always is. Parameter
 613 ** bReadonly is only ever true if it has already been determined that the
 614 ** db can only be opened for read-only access.
 615 **
 616 ** Return LSM_OK if successful or an lsm error code otherwise.
 617 */
 618 int lsmFsOpen(
 619   lsm_db *pDb,                    /* Database connection to open fd for */
 620   const char *zDb,                /* Full path to database file */
 621   int bReadonly                   /* True to open db file read-only */
 622 ){
 623   FileSystem *pFS;
 624   int rc = LSM_OK;
 625   int nDb = strlen(zDb);
 626   int nByte;
 627
 628   assert( pDb->pFS==0 );
 629   assert( pDb->pWorker==0 && pDb->pClient==0 );
 630
 631   nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
 632   pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
 633   if( pFS ){
 634     LsmFile *pLsmFile;
 635     pFS->zDb = (char *)&pFS[1];
 636     pFS->zLog = &pFS->zDb[nDb+1];
 637     pFS->nPagesize = LSM_DFLT_PAGE_SIZE;
 638     pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE;
 639     pFS->nMetasize = LSM_META_PAGE_SIZE;
 640     pFS->nMetaRwSize = LSM_META_RW_PAGE_SIZE;
 641     pFS->pDb = pDb;
 642     pFS->pEnv = pDb->pEnv;
 643
 644     /* Make a copy of the database and log file names. */
 645     memcpy(pFS->zDb, zDb, nDb+1);
 646     memcpy(pFS->zLog, zDb, nDb);
 647     memcpy(&pFS->zLog[nDb], "-log", 5);
 648
 649     /* Allocate the hash-table here. At some point, it should be changed
 650     ** so that it can grow dynamicly. */
 651     pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 652     pFS->nHash = 4096;
 653     pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
 654
 655     /* Open the database file */
 656     pLsmFile = lsmDbRecycleFd(pDb);
 657     if( pLsmFile ){
 658       pFS->pLsmFile = pLsmFile;
 659       pFS->fdDb = pLsmFile->pFile;
 660       memset(pLsmFile, 0, sizeof(LsmFile));
 661     }else{
 662       pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
 663       if( rc==LSM_OK ){
 664         pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc);
 665       }
 666     }
 667
 668     if( rc!=LSM_OK ){
 669       lsmFsClose(pFS);
 670       pFS = 0;
 671     }else{
 672       pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb);
 673     }
 674   }
 675
 676   pDb->pFS = pFS;
 677   return rc;
 678 }
 679
 680 /*
 681 ** Configure the file-system object according to the current values of
 682 ** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options.
 683 */
 684 int lsmFsConfigure(lsm_db *db){
 685   FileSystem *pFS = db->pFS;
 686   if( pFS ){
 687     lsm_env *pEnv = pFS->pEnv;
 688     Page *pPg;
 689
 690     assert( pFS->nOut==0 );
 691     assert( pFS->pWaiting==0 );
 692     assert( pFS->pMapped==0 );
 693
 694     /* Reset any compression/decompression buffers already allocated */
 695     lsmFree(pEnv, pFS->aIBuffer);
 696     lsmFree(pEnv, pFS->aOBuffer);
 697     pFS->nBuffer = 0;
 698
 699     /* Unmap the file, if it is currently mapped */
 700     if( pFS->pMap ){
 701       lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
 702       pFS->nMapLimit = 0;
 703     }
 704
 705     /* Free all allocated page structures */
 706     pPg = pFS->pLruFirst;
 707     while( pPg ){
 708       Page *pNext = pPg->pLruNext;
 709       assert( pPg->flags & PAGE_FREE );
 710       lsmFree(pEnv, pPg->aData);
 711       lsmFree(pEnv, pPg);
 712       pPg = pNext;
 713     }
 714
 715     pPg = pFS->pFree;
 716     while( pPg ){
 717       Page *pNext = pPg->pFreeNext;
 718       lsmFree(pEnv, pPg);
 719       pPg = pNext;
 720     }
 721
 722     /* Zero pointers that point to deleted page objects */
 723     pFS->nCacheAlloc = 0;
 724     pFS->pLruFirst = 0;
 725     pFS->pLruLast = 0;
 726     pFS->pFree = 0;
 727     if( pFS->apHash ){
 728       memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
 729     }
 730
 731     /* Configure the FileSystem object */
 732     if( db->compress.xCompress ){
 733       pFS->pCompress = &db->compress;
 734       pFS->nMapLimit = 0;
 735     }else{
 736       pFS->pCompress = 0;
 737       if( db->iMmap==1 ){
 738         /* Unlimited */
 739         pFS->nMapLimit = (i64)1 << 60;
 740       }else{
 741         /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */
 742         pFS->nMapLimit = (i64)db->iMmap * 1024;
 743       }
 744     }
 745   }
 746
 747   return LSM_OK;
 748 }
 749
 750 /*
 751 ** Close and destroy a FileSystem object.
 752 */
 753 void lsmFsClose(FileSystem *pFS){
 754   if( pFS ){
 755     Page *pPg;
 756     lsm_env *pEnv = pFS->pEnv;
 757
 758     assert( pFS->nOut==0 );
 759     pPg = pFS->pLruFirst;
 760     while( pPg ){
 761       Page *pNext = pPg->pLruNext;
 762       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 763       lsmFree(pEnv, pPg);
 764       pPg = pNext;
 765     }
 766
 767     pPg = pFS->pFree;
 768     while( pPg ){
 769       Page *pNext = pPg->pFreeNext;
 770       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 771       lsmFree(pEnv, pPg);
 772       pPg = pNext;
 773     }
 774
 775     if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
 776     if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
 777     lsmFree(pEnv, pFS->pLsmFile);
 778     lsmFree(pEnv, pFS->apHash);
 779     lsmFree(pEnv, pFS->aIBuffer);
 780     lsmFree(pEnv, pFS->aOBuffer);
 781     lsmFree(pEnv, pFS);
 782   }
 783 }
 784
 785 /*
 786 ** This function is called when closing a database handle (i.e. lsm_close())
 787 ** if there exist other connections to the same database within this process.
 788 ** In that case the file-descriptor open on the database file is not closed
 789 ** when the FileSystem object is destroyed, as this would cause any POSIX
 790 ** locks held by the other connections to be silently dropped (see "man close"
 791 ** for details). Instead, the file-descriptor is stored in a list by the
 792 ** lsm_shared.c module until it is either closed or reused.
 793 **
** This function returns a pointer to an object that can be linked into
** the list described above. The returned object now 'owns' the database
** file descriptor, so that when the FileSystem object is destroyed, it
** will not be closed.
 798 **
 799 ** This function may be called at most once in the life-time of a
 800 ** FileSystem object. The results of any operations involving the database
 801 ** file descriptor are undefined once this function has been called.
 802 **
 803 ** None of this is necessary on non-POSIX systems. But we do it anyway in
 804 ** the name of using as similar code as possible on all platforms.
 805 */
 806 LsmFile *lsmFsDeferClose(FileSystem *pFS){
 807   LsmFile *p = pFS->pLsmFile;
 808   assert( p->pNext==0 );
 809   p->pFile = pFS->fdDb;
 810   pFS->fdDb = 0;
 811   pFS->pLsmFile = 0;
 812   return p;
 813 }
 814
 815 /*
 816 ** Allocate a buffer and populate it with the output of the xFileid()
 817 ** method of the database file handle. If successful, set *ppId to point
 818 ** to the buffer and *pnId to the number of bytes in the buffer and return
 819 ** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM
 820 ** error code.
 821 */
 822 int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){
 823   lsm_env *pEnv = pDb->pEnv;
 824   FileSystem *pFS = pDb->pFS;
 825   int rc;
 826   int nId = 0;
 827   void *pId;
 828
 829   rc = pEnv->xFileid(pFS->fdDb, 0, &nId);
 830   pId = lsmMallocZeroRc(pEnv, nId, &rc);
 831   if( rc==LSM_OK ) rc = pEnv->xFileid(pFS->fdDb, pId, &nId);
 832
 833   if( rc!=LSM_OK ){
 834     lsmFree(pEnv, pId);
 835     pId = 0;
 836     nId = 0;
 837   }
 838
 839   *ppId = pId;
 840   *pnId = nId;
 841   return rc;
 842 }
 843
 844 /*
 845 ** Return the nominal page-size used by this file-system. Actual pages
 846 ** may be smaller or larger than this value.
 847 */
 848 int lsmFsPageSize(FileSystem *pFS){
 849   return pFS->nPagesize;
 850 }
 851
 852 /*
 853 ** Return the block-size used by this file-system.
 854 */
 855 int lsmFsBlockSize(FileSystem *pFS){
 856   return pFS->nBlocksize;
 857 }
 858
 859 /*
 860 ** Configure the nominal page-size used by this file-system. Actual
 861 ** pages may be smaller or larger than this value.
 862 */
 863 void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){
 864   pFS->nPagesize = nPgsz;
 865   pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 866 }
 867
 868 /*
 869 ** Configure the block-size used by this file-system.
 870 */
 871 void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){
 872   pFS->nBlocksize = nBlocksize;
 873 }
 874
 875 /*
 876 ** Return the page number of the first page on block iBlock. Blocks are
 877 ** numbered starting from 1.
 878 **
 879 ** For a compressed database, page numbers are byte offsets. The first
 880 ** page on each block is the byte offset immediately following the 4-byte
 881 ** "previous block" pointer at the start of each block.
 882 */
 883 static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){
 884   Pgno iPg;
 885   if( pFS->pCompress ){
 886     if( iBlock==1 ){
 887       iPg = pFS->nMetasize * 2 + 4;
 888     }else{
 889       iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4;
 890     }
 891   }else{
 892     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 893     if( iBlock==1 ){
 894       iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize);
 895     }else{
 896       iPg = 1 + (iBlock-1) * nPagePerBlock;
 897     }
 898   }
 899   return iPg;
 900 }
 901
 902 /*
 903 ** Return the page number of the last page on block iBlock. Blocks are
 904 ** numbered starting from 1.
 905 **
 906 ** For a compressed database, page numbers are byte offsets. The first
 907 ** page on each block is the byte offset of the byte immediately before
 908 ** the 4-byte "next block" pointer at the end of each block.
 909 */
 910 static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){
 911   if( pFS->pCompress ){
 912     return pFS->nBlocksize * (Pgno)iBlock - 1 - 4;
 913   }else{
 914     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 915     return iBlock * nPagePerBlock;
 916   }
 917 }
 918
 919 /*
 920 ** Return the block number of the block that page iPg is located on.
 921 ** Blocks are numbered starting from 1.
 922 */
 923 static int fsPageToBlock(FileSystem *pFS, Pgno iPg){
 924   if( pFS->pCompress ){
 925     return (int)((iPg / pFS->nBlocksize) + 1);
 926   }else{
 927     return (int)(1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)));
 928   }
 929 }
 930
 931 /*
 932 ** Return true if page iPg is the last page on its block.
 933 **
 934 ** This function is only called in non-compressed database mode.
 935 */
 936 static int fsIsLast(FileSystem *pFS, Pgno iPg){
 937   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 938   assert( !pFS->pCompress );
 939   return ( iPg && (iPg % nPagePerBlock)==0 );
 940 }
 941
 942 /*
 943 ** Return true if page iPg is the first page on its block.
 944 **
 945 ** This function is only called in non-compressed database mode.
 946 */
 947 static int fsIsFirst(FileSystem *pFS, Pgno iPg){
 948   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 949   assert( !pFS->pCompress );
 950   return ( (iPg % nPagePerBlock)==1
 951         || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1))
 952   );
 953 }
 954
 955 /*
 956 ** Given a page reference, return a pointer to the buffer containing the
 957 ** pages contents. If parameter pnData is not NULL, set *pnData to the size
 958 ** of the buffer in bytes before returning.
 959 */
 960 u8 *lsmFsPageData(Page *pPage, int *pnData){
 961   if( pnData ){
 962     *pnData = pPage->nData;
 963   }
 964   return pPage->aData;
 965 }
 966
 967 /*
 968 ** Return the page number of a page.
 969 */
 970 Pgno lsmFsPageNumber(Page *pPage){
 971   /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
 972   return pPage ? pPage->iPg : 0;
 973 }
 974
 975 /*
 976 ** Page pPg is currently part of the LRU list belonging to pFS. Remove
 977 ** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
 978 ** operation.
 979 */
 980 static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){
 981   assert( pPg->pLruNext || pPg==pFS->pLruLast );
 982   assert( pPg->pLruPrev || pPg==pFS->pLruFirst );
 983   if( pPg->pLruNext ){
 984     pPg->pLruNext->pLruPrev = pPg->pLruPrev;
 985   }else{
 986     pFS->pLruLast = pPg->pLruPrev;
 987   }
 988   if( pPg->pLruPrev ){
 989     pPg->pLruPrev->pLruNext = pPg->pLruNext;
 990   }else{
 991     pFS->pLruFirst = pPg->pLruNext;
 992   }
 993   pPg->pLruPrev = 0;
 994   pPg->pLruNext = 0;
 995 }
 996
 997 /*
 998 ** Page pPg is not currently part of the LRU list belonging to pFS. Add it.
 999 */
1000 static void fsPageAddToLru(FileSystem *pFS, Page *pPg){
1001   assert( pPg->pLruNext==0 && pPg->pLruPrev==0 );
1002   pPg->pLruPrev = pFS->pLruLast;
1003   if( pPg->pLruPrev ){
1004     pPg->pLruPrev->pLruNext = pPg;
1005   }else{
1006     pFS->pLruFirst = pPg;
1007   }
1008   pFS->pLruLast = pPg;
1009 }
1010
1011 /*
1012 ** Page pPg is currently stored in the apHash/nHash hash table. Remove it.
1013 */
1014 static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
1015   int iHash;
1016   Page **pp;
1017
1018   iHash = fsHashKey(pFS->nHash, pPg->iPg);
1019   for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
1020   *pp = pPg->pHashNext;
1021   pPg->pHashNext = 0;
1022 }
1023
1024 /*
1025 ** Free a Page object allocated by fsPageBuffer().
1026 */
1027 static void fsPageBufferFree(Page *pPg){
1028   pPg->pFS->nCacheAlloc--;
1029   lsmFree(pPg->pFS->pEnv, pPg->aData);
1030   lsmFree(pPg->pFS->pEnv, pPg);
1031 }
1032
1033
1034 /*
1035 ** Purge the cache of all non-mmap pages with nRef==0.
1036 */
1037 void lsmFsPurgeCache(FileSystem *pFS){
1038   Page *pPg;
1039
1040   pPg = pFS->pLruFirst;
1041   while( pPg ){
1042     Page *pNext = pPg->pLruNext;
1043     assert( pPg->flags & PAGE_FREE );
1044     fsPageRemoveFromHash(pFS, pPg);
1045     fsPageBufferFree(pPg);
1046     pPg = pNext;
1047   }
1048   pFS->pLruFirst = 0;
1049   pFS->pLruLast = 0;
1050
1051   assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
1052 }
1053
1054 /*
1055 ** Search the hash-table for page iPg. If an entry is round, return a pointer
1056 ** to it. Otherwise, return NULL.
1057 **
1058 ** Either way, if argument piHash is not NULL set *piHash to the hash slot
1059 ** number that page iPg would be stored in before returning.
1060 */
1061 static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash){
1062   Page *p;                        /* Return value */
1063   int iHash = fsHashKey(pFS->nHash, iPg);
1064
1065   if( piHash ) *piHash = iHash;
1066   for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
1067     if( p->iPg==iPg) break;
1068   }
1069   return p;
1070 }
1071
1072 /*
1073 ** Allocate and return a non-mmap Page object. If there are already
1074 ** nCacheMax such Page objects outstanding, try to recycle an existing
1075 ** Page instead.
1076 */
1077 static int fsPageBuffer(
1078   FileSystem *pFS,
1079   Page **ppOut
1080 ){
1081   int rc = LSM_OK;
1082   Page *pPage = 0;
1083   if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
1084     /* Allocate a new Page object */
1085     pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
1086     if( !pPage ){
1087       rc = LSM_NOMEM_BKPT;
1088     }else{
1089       pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
1090       if( !pPage->aData ){
1091         lsmFree(pFS->pEnv, pPage);
1092         rc = LSM_NOMEM_BKPT;
1093         pPage = 0;
1094       }else{
1095         pFS->nCacheAlloc++;
1096       }
1097     }
1098   }else{
1099     /* Reuse an existing Page object */
1100     u8 *aData;
1101     pPage = pFS->pLruFirst;
1102     aData = pPage->aData;
1103     fsPageRemoveFromLru(pFS, pPage);
1104     fsPageRemoveFromHash(pFS, pPage);
1105
1106     memset(pPage, 0, sizeof(Page));
1107     pPage->aData = aData;
1108   }
1109
1110   if( pPage ){
1111     pPage->flags = PAGE_FREE;
1112   }
1113   *ppOut = pPage;
1114   return rc;
1115 }
1116
1117 /*
1118 ** Assuming *pRc is initially LSM_OK, attempt to ensure that the
1119 ** memory-mapped region is at least iSz bytes in size. If it is not already,
1120 ** iSz bytes in size, extend it and update the pointers associated with any
1121 ** outstanding Page objects.
1122 **
1123 ** If *pRc is not LSM_OK when this function is called, it is a no-op.
1124 ** Otherwise, *pRc is set to an lsm error code if an error occurs, or
1125 ** left unmodified otherwise.
1126 **
1127 ** This function is never called in compressed database mode.
1128 */
1129 static void fsGrowMapping(
1130   FileSystem *pFS,                /* File system object */
1131   i64 iSz,                        /* Minimum size to extend mapping to */
1132   int *pRc                        /* IN/OUT: Error code */
1133 ){
1134   assert( pFS->pCompress==0 );
1135   assert( PAGE_HASPREV==4 );
1136
1137   if( *pRc==LSM_OK && iSz>pFS->nMap ){
1138     int rc;
1139     u8 *aOld = pFS->pMap;
1140     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
1141     if( rc==LSM_OK && pFS->pMap!=aOld ){
1142       Page *pFix;
1143       i64 iOff = (u8 *)pFS->pMap - aOld;
1144       for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){
1145         pFix->aData += iOff;
1146       }
1147       lsmSortedRemap(pFS->pDb);
1148     }
1149     *pRc = rc;
1150   }
1151 }
1152
1153 /*
1154 ** If it is mapped, unmap the database file.
1155 */
1156 int lsmFsUnmap(FileSystem *pFS){
1157   int rc = LSM_OK;
1158   if( pFS ){
1159     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
1160   }
1161   return rc;
1162 }
1163
1164 /*
1165 ** fsync() the database file.
1166 */
1167 int lsmFsSyncDb(FileSystem *pFS, int nBlock){
1168   return lsmEnvSync(pFS->pEnv, pFS->fdDb);
1169 }
1170
1171 /*
1172 ** If block iBlk has been redirected according to the redirections in the
1173 ** object passed as the first argument, return the destination block to
1174 ** which it is redirected. Otherwise, return a copy of iBlk.
1175 */
1176 static int fsRedirectBlock(Redirect *p, int iBlk){
1177   if( p ){
1178     int i;
1179     for(i=0; i<p->n; i++){
1180       if( iBlk==p->a[i].iFrom ) return p->a[i].iTo;
1181     }
1182   }
1183   assert( iBlk!=0 );
1184   return iBlk;
1185 }
1186
1187 /*
1188 ** If page iPg has been redirected according to the redirections in the
1189 ** object passed as the second argument, return the destination page to
1190 ** which it is redirected. Otherwise, return a copy of iPg.
1191 */
1192 Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){
1193   Pgno iReal = iPg;
1194
1195   if( pRedir ){
1196     const int nPagePerBlock = (
1197         pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
1198     );
1199     int iBlk = fsPageToBlock(pFS, iPg);
1200     int i;
1201     for(i=0; i<pRedir->n; i++){
1202       int iFrom = pRedir->a[i].iFrom;
1203       if( iFrom>iBlk ) break;
1204       if( iFrom==iBlk ){
1205         int iTo = pRedir->a[i].iTo;
1206         iReal = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
1207         if( iTo==1 ){
1208           iReal += (fsFirstPageOnBlock(pFS, 1)-1);
1209         }
1210         break;
1211       }
1212     }
1213   }
1214
1215   assert( iReal!=0 );
1216   return iReal;
1217 }
1218
1219 /* Required by the circular fsBlockNext<->fsPageGet dependency. */
1220 static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);
1221
1222 /*
1223 ** Parameter iBlock is a database file block. This function reads the value
1224 ** stored in the blocks "next block" pointer and stores it in *piNext.
1225 ** LSM_OK is returned if everything is successful, or an LSM error code
1226 ** otherwise.
1227 */
1228 static int fsBlockNext(
1229   FileSystem *pFS,                /* File-system object handle */
1230   Segment *pSeg,                  /* Use this segment for block redirects */
1231   int iBlock,                     /* Read field from this block */
1232   int *piNext                     /* OUT: Next block in linked list */
1233 ){
1234   int rc;
1235   int iRead;                      /* Read block from here */
1236
1237   if( pSeg ){
1238     iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
1239   }else{
1240     iRead = iBlock;
1241   }
1242
1243   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1244   if( pFS->pCompress ){
1245     i64 iOff;                     /* File offset to read data from */
1246     u8 aNext[4];                  /* 4-byte pointer read from db file */
1247
1248     iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
1249     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
1250     if( rc==LSM_OK ){
1251       *piNext = (int)lsmGetU32(aNext);
1252     }
1253   }else{
1254     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
1255     Page *pLast;
1256     rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0);
1257     if( rc==LSM_OK ){
1258       *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
1259       lsmFsPageRelease(pLast);
1260     }
1261   }
1262
1263   if( pSeg ){
1264     *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext);
1265   }
1266   return rc;
1267 }
1268
1269 /*
1270 ** Return the page number of the last page on the same block as page iPg.
1271 */
1272 Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){
1273   return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg));
1274 }
1275
1276 /*
1277 ** Read nData bytes of data from offset iOff of the database file into
1278 ** buffer aData. If this means reading past the end of a block, follow
1279 ** the block pointer to the next block and continue reading.
1280 **
1281 ** Offset iOff is an absolute offset - not subject to any block redirection.
1282 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1283 **
1284 ** This function is only called in compressed database mode.
1285 */
1286 static int fsReadData(
1287   FileSystem *pFS,                /* File-system handle */
1288   Segment *pSeg,                  /* Block redirection */
1289   i64 iOff,                       /* Read data from this offset */
1290   u8 *aData,                      /* Buffer to read data into */
1291   int nData                       /* Number of bytes to read */
1292 ){
1293   i64 iEob;                       /* End of block */
1294   int nRead;
1295   int rc;
1296
1297   assert( pFS->pCompress );
1298
1299   iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1;
1300   nRead = (int)LSM_MIN(iEob - iOff, nData);
1301
1302   rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead);
1303   if( rc==LSM_OK && nRead!=nData ){
1304     int iBlk;
1305
1306     rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1307     if( rc==LSM_OK ){
1308       i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk);
1309       rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead);
1310     }
1311   }
1312
1313   return rc;
1314 }
1315
1316 /*
1317 ** Parameter iBlock is a database file block. This function reads the value
1318 ** stored in the blocks "previous block" pointer and stores it in *piPrev.
1319 ** LSM_OK is returned if everything is successful, or an LSM error code
1320 ** otherwise.
1321 */
1322 static int fsBlockPrev(
1323   FileSystem *pFS,                /* File-system object handle */
1324   Segment *pSeg,                  /* Use this segment for block redirects */
1325   int iBlock,                     /* Read field from this block */
1326   int *piPrev                     /* OUT: Previous block in linked list */
1327 ){
1328   int rc = LSM_OK;                /* Return code */
1329
1330   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1331   assert( iBlock>0 );
1332
1333   if( pFS->pCompress ){
1334     i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
1335     u8 aPrev[4];                  /* 4-byte pointer read from db file */
1336     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
1337     if( rc==LSM_OK ){
1338       Redirect *pRedir = (pSeg ? pSeg->pRedirect : 0);
1339       *piPrev = fsRedirectBlock(pRedir, (int)lsmGetU32(aPrev));
1340     }
1341   }else{
1342     assert( 0 );
1343   }
1344   return rc;
1345 }
1346
1347 /*
1348 ** Encode and decode routines for record size fields.
1349 */
1350 static void putRecordSize(u8 *aBuf, int nByte, int bFree){
1351   aBuf[0] = (u8)(nByte >> 14) | 0x80;
1352   aBuf[1] = ((u8)(nByte >>  7) & 0x7F) | (bFree ? 0x00 : 0x80);
1353   aBuf[2] = (u8)nByte | 0x80;
1354 }
1355 static int getRecordSize(u8 *aBuf, int *pbFree){
1356   int nByte;
1357   nByte  = (aBuf[0] & 0x7F) << 14;
1358   nByte += (aBuf[1] & 0x7F) << 7;
1359   nByte += (aBuf[2] & 0x7F);
1360   *pbFree = !(aBuf[1] & 0x80);
1361   return nByte;
1362 }
1363
1364 /*
1365 ** Subtract iSub from database file offset iOff and set *piRes to the
1366 ** result. If doing so means passing the start of a block, follow the
1367 ** block pointer stored in the first 4 bytes of the block.
1368 **
1369 ** Offset iOff is an absolute offset - not subject to any block redirection.
1370 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1371 **
1372 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1373 */
1374 static int fsSubtractOffset(
1375   FileSystem *pFS,
1376   Segment *pSeg,
1377   i64 iOff,
1378   int iSub,
1379   i64 *piRes
1380 ){
1381   i64 iStart;
1382   int iBlk = 0;
1383   int rc;
1384
1385   assert( pFS->pCompress );
1386
1387   iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff));
1388   if( (iOff-iSub)>=iStart ){
1389     *piRes = (iOff-iSub);
1390     return LSM_OK;
1391   }
1392
1393   rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1394   *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1);
1395   return rc;
1396 }
1397
1398 /*
1399 ** Add iAdd to database file offset iOff and set *piRes to the
1400 ** result. If doing so means passing the end of a block, follow the
1401 ** block pointer stored in the last 4 bytes of the block.
1402 **
1403 ** Offset iOff is an absolute offset - not subject to any block redirection.
1404 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1405 **
1406 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1407 */
1408 static int fsAddOffset(
1409   FileSystem *pFS,
1410   Segment *pSeg,
1411   i64 iOff,
1412   int iAdd,
1413   i64 *piRes
1414 ){
1415   i64 iEob;
1416   int iBlk;
1417   int rc;
1418
1419   assert( pFS->pCompress );
1420
1421   iEob = fsLastPageOnPagesBlock(pFS, iOff);
1422   if( (iOff+iAdd)<=iEob ){
1423     *piRes = (iOff+iAdd);
1424     return LSM_OK;
1425   }
1426
1427   rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1428   *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
1429   return rc;
1430 }
1431
1432 /*
1433 ** If it is not already allocated, allocate either the FileSystem.aOBuffer (if
1434 ** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return
1435 ** LSM_OK if successful if the attempt to allocate memory fails.
1436 */
1437 static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
1438   u8 **pp;                        /* Pointer to either aIBuffer or aOBuffer */
1439
1440   assert( pFS->pCompress );
1441
1442   /* If neither buffer has been allocated, figure out how large they
1443   ** should be. Store this value in FileSystem.nBuffer.  */
1444   if( pFS->nBuffer==0 ){
1445     assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
1446     pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
1447     if( pFS->nBuffer<(pFS->szSector+6) ){
1448       pFS->nBuffer = pFS->szSector+6;
1449     }
1450   }
1451
1452   pp = (bWrite ? &pFS->aOBuffer : &pFS->aIBuffer);
1453   if( *pp==0 ){
1454     *pp = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
1455     if( *pp==0 ) return LSM_NOMEM_BKPT;
1456   }
1457
1458   return LSM_OK;
1459 }
1460
1461 /*
1462 ** This function is only called in compressed database mode. It reads and
1463 ** uncompresses the compressed data for page pPg from the database and
1464 ** populates the pPg->aData[] buffer and pPg->nCompress field.
1465 **
1466 ** It is possible that instead of a page record, there is free space
1467 ** at offset pPg->iPgno. In this case no data is read from the file, but
1468 ** output variable *pnSpace is set to the total number of free bytes.
1469 **
1470 ** LSM_OK is returned if successful, or an LSM error code otherwise.
1471 */
1472 static int fsReadPagedata(
1473   FileSystem *pFS,                /* File-system handle */
1474   Segment *pSeg,                  /* pPg is part of this segment */
1475   Page *pPg,                      /* Page to read and uncompress data for */
1476   int *pnSpace                    /* OUT: Total bytes of free space */
1477 ){
1478   lsm_compress *p = pFS->pCompress;
1479   i64 iOff = pPg->iPg;
1480   u8 aSz[3];
1481   int rc;
1482
1483   assert( p && pPg->nCompress==0 );
1484
1485   if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;
1486
1487   rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
1488
1489   if( rc==LSM_OK ){
1490     int bFree;
1491     if( aSz[0] & 0x80 ){
1492       pPg->nCompress = (int)getRecordSize(aSz, &bFree);
1493     }else{
1494       pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
1495       bFree = 1;
1496     }
1497     if( bFree ){
1498       if( pnSpace ){
1499         *pnSpace = pPg->nCompress + sizeof(aSz)*2;
1500       }else{
1501         rc = LSM_CORRUPT_BKPT;
1502       }
1503     }else{
1504       rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);
1505       if( rc==LSM_OK ){
1506         if( pPg->nCompress>pFS->nBuffer ){
1507           rc = LSM_CORRUPT_BKPT;
1508         }else{
1509           rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
1510         }
1511         if( rc==LSM_OK ){
1512           int n = pFS->nPagesize;
1513           rc = p->xUncompress(p->pCtx,
1514               (char *)pPg->aData, &n,
1515               (const char *)pFS->aIBuffer, pPg->nCompress
1516           );
1517           if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){
1518             rc = LSM_CORRUPT_BKPT;
1519           }
1520         }
1521       }
1522     }
1523   }
1524   return rc;
1525 }
1526
/*
** Return a handle for a database page.
**
** The page number iPg is first passed through the redirections of pSeg
** (if any). The resulting page is then looked up in the page hash-table.
** If it is not found there, a Page object is created for it: either one
** that points into the memory mapping (when fsMmapPage() says so), or a
** buffered one obtained from fsPageBuffer() and, unless noContent is
** true, populated from the database file.
**
** If this file-system object is accessing a compressed database it may be
** that there is no page record at database file offset iPg. Instead, there
** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
** to the total number of free bytes before returning.
**
** On success the reference count of the returned page is incremented.
**
** If no error occurs, LSM_OK is returned. Otherwise, an lsm error code.
*/
static int fsPageGet(
  FileSystem *pFS,                /* File-system handle */
  Segment *pSeg,                  /* Block redirection to use (or NULL) */
  Pgno iPg,                       /* Page id */
  int noContent,                  /* True to not load content from disk */
  Page **ppPg,                    /* OUT: New page handle */
  int *pnSpace                    /* OUT: Bytes of free space */
){
  Page *p;
  int iHash;
  int rc = LSM_OK;

  /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is
  ** not NULL, and the block containing iPg has been redirected, then iReal
  ** is the page number after redirection.  */
  Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);

  assert_lists_are_ok(pFS);
  assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
  assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
  *ppPg = 0;

  /* Search the hash-table for the page */
  p = fsPageFindInHash(pFS, iReal, &iHash);

  if( p ){
    /* Cache hit. A page with no outstanding references is on the LRU
    ** list; take it off, as it is about to be referenced again. */
    assert( p->flags & PAGE_FREE );
    if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
  }else{

    if( fsMmapPage(pFS, iReal) ){
      /* Memory-mapped page: make sure the mapping extends to the end of
      ** page iReal, then point a Page object directly into it. */
      i64 iEnd = (i64)iReal * pFS->nPagesize;
      fsGrowMapping(pFS, iEnd, &rc);
      if( rc!=LSM_OK ) return rc;

      /* Reuse a Page object from the pFS->pFree list if there is one,
      ** otherwise allocate a new zeroed object. */
      if( pFS->pFree ){
        p = pFS->pFree;
        pFS->pFree = p->pFreeNext;
        assert( p->nRef==0 );
      }else{
        p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
        if( rc ) return rc;
        p->pFS = pFS;
      }
      p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
      p->iPg = iReal;

      /* This page now carries a pointer to the mapping. Link it in to
      ** the FileSystem.pMapped list.  */
      assert( p->pMappedNext==0 );
      p->pMappedNext = pFS->pMapped;
      pFS->pMapped = p;

      assert( pFS->pCompress==0 );
      assert( (p->flags & PAGE_FREE)==0 );
    }else{
      /* Buffered page: obtain a (possibly recycled) Page object with its
      ** own data buffer. */
      rc = fsPageBuffer(pFS, &p);
      if( rc==LSM_OK ){
        int nSpace = 0;
        p->iPg = iReal;
        p->nRef = 0;
        p->pFS = pFS;
        assert( p->flags==0 || p->flags==PAGE_FREE );

#ifdef LSM_DEBUG
        memset(p->aData, 0x56, pFS->nPagesize);
#endif
        assert( p->pLruNext==0 && p->pLruPrev==0 );
        if( noContent==0 ){
          /* Load the page content: via the decompression path for a
          ** compressed database, or with a plain read of one page
          ** otherwise. Either way the read counter is incremented. */
          if( pFS->pCompress ){
            rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
          }else{
            int nByte = pFS->nPagesize;
            i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
            rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
          }
          pFS->nRead++;
        }

        /* If the xRead() call was successful (or not attempted), link the
        ** page into the page-cache hash-table. Otherwise, if it failed,
        ** free the buffer. The buffer is also freed if a free-space
        ** record was found instead of a page (nSpace!=0); in that case
        ** the amount of free space is reported through *pnSpace. */
        if( rc==LSM_OK && nSpace==0 ){
          p->pHashNext = pFS->apHash[iHash];
          pFS->apHash[iHash] = p;
        }else{
          fsPageBufferFree(p);
          p = 0;
          if( pnSpace ) *pnSpace = nSpace;
        }
      }
    }

    assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
         || (rc!=LSM_OK && p==0)
    );
  }

  if( rc==LSM_OK && p ){
    /* In a non-compressed database the first and last page of each block
    ** expose 4 bytes less than the page size. For a first page that is
    ** not yet referenced, aData is additionally advanced by 4 bytes and
    ** PAGE_HASPREV is set. */
    if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
      p->nData = pFS->nPagesize - 4;
      if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
        p->aData += 4;
        p->flags |= PAGE_HASPREV;
      }
    }else{
      p->nData = pFS->nPagesize;
    }
    /* nOut counts pages with at least one outstanding reference. */
    pFS->nOut += (p->nRef==0);
    p->nRef++;
  }
  *ppPg = p;
  return rc;
}
1651
1652 /*
1653 ** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
1654 ** page iMeta of the database file. If no error occurs, store the id value
1655 ** in *piVal and return LSM_OK. Otherwise, return an LSM error code and leave
1656 ** *piVal unmodified.
1657 **
1658 ** If a checkpointer connection is currently updating meta-page iMeta, or an
1659 ** earlier checkpointer crashed while doing so, the value read into *piVal
1660 ** may be garbage. It is the callers responsibility to deal with this.
1661 */
1662 int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
1663   FileSystem *pFS = db->pFS;
1664   int rc = LSM_OK;
1665
1666   assert( iMeta==1 || iMeta==2 );
1667   if( pFS->nMapLimit>0 ){
1668     fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
1669     if( rc==LSM_OK ){
1670       *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
1671     }
1672   }else{
1673     MetaPage *pMeta = 0;
1674     rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
1675     if( rc==LSM_OK ){
1676       *piVal = (i64)lsmGetU64(pMeta->aData);
1677       lsmFsMetaPageRelease(pMeta);
1678     }
1679   }
1680
1681   return rc;
1682 }
1683
1684
1685 /*
1686 ** Return true if the first or last page of segment pRun falls between iFirst
1687 ** and iLast, inclusive, and pRun is not equal to pIgnore.
1688 */
1689 static int fsRunEndsBetween(
1690   Segment *pRun,
1691   Segment *pIgnore,
1692   Pgno iFirst,
1693   Pgno iLast
1694 ){
1695   return (pRun!=pIgnore && (
1696         (pRun->iFirst>=iFirst && pRun->iFirst<=iLast)
1697      || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast)
1698   ));
1699 }
1700
1701 /*
1702 ** Return true if level pLevel contains a segment other than pIgnore for
1703 ** which the first or last page is between iFirst and iLast, inclusive.
1704 */
1705 static int fsLevelEndsBetween(
1706   Level *pLevel,
1707   Segment *pIgnore,
1708   Pgno iFirst,
1709   Pgno iLast
1710 ){
1711   int i;
1712
1713   if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
1714     return 1;
1715   }
1716   for(i=0; i<pLevel->nRight; i++){
1717     if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
1718       return 1;
1719     }
1720   }
1721
1722   return 0;
1723 }
1724
1725 /*
1726 ** Block iBlk is no longer in use by segment pIgnore. If it is not in use
1727 ** by any other segment, move it to the free block list.
1728 */
1729 static int fsFreeBlock(
1730   FileSystem *pFS,                /* File system object */
1731   Snapshot *pSnapshot,            /* Worker snapshot */
1732   Segment *pIgnore,               /* Ignore this run when searching */
1733   int iBlk                        /* Block number of block to free */
1734 ){
1735   int rc = LSM_OK;                /* Return code */
1736   Pgno iFirst;                    /* First page on block iBlk */
1737   Pgno iLast;                     /* Last page on block iBlk */
1738   Level *pLevel;                  /* Used to iterate through levels */
1739
1740   int iIn;                        /* Used to iterate through append points */
1741   int iOut = 0;                   /* Used to output append points */
1742   Pgno *aApp = pSnapshot->aiAppend;
1743
1744   iFirst = fsFirstPageOnBlock(pFS, iBlk);
1745   iLast = fsLastPageOnBlock(pFS, iBlk);
1746
1747   /* Check if any other run in the snapshot has a start or end page
1748   ** within this block. If there is such a run, return early. */
1749   for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
1750     if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
1751       return LSM_OK;
1752     }
1753   }
1754
1755   /* Remove any entries that lie on this block from the append-list. */
1756   for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
1757     if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
1758       aApp[iOut++] = aApp[iIn];
1759     }
1760   }
1761   while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
1762
1763   if( rc==LSM_OK ){
1764     rc = lsmBlockFree(pFS->pDb, iBlk);
1765   }
1766   return rc;
1767 }
1768
1769 /*
1770 ** Delete or otherwise recycle the blocks currently occupied by run pDel.
1771 */
1772 int lsmFsSortedDelete(
1773   FileSystem *pFS,
1774   Snapshot *pSnapshot,
1775   int bZero,                      /* True to zero the Segment structure */
1776   Segment *pDel
1777 ){
1778   if( pDel->iFirst ){
1779     int rc = LSM_OK;
1780
1781     int iBlk;
1782     int iLastBlk;
1783
1784     iBlk = fsPageToBlock(pFS, pDel->iFirst);
1785     iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
1786
1787     /* Mark all blocks currently used by this sorted run as free */
1788     while( iBlk && rc==LSM_OK ){
1789       int iNext = 0;
1790       if( iBlk!=iLastBlk ){
1791         rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
1792       }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
1793         break;
1794       }
1795       rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);
1796       iBlk = iNext;
1797     }
1798
1799     if( pDel->pRedirect ){
1800       assert( pDel->pRedirect==&pSnapshot->redirect );
1801       pSnapshot->redirect.n = 0;
1802     }
1803
1804     if( bZero ) memset(pDel, 0, sizeof(Segment));
1805   }
1806   return LSM_OK;
1807 }
1808
1809 /*
1810 ** aPgno is an array containing nPgno page numbers. Return the smallest page
1811 ** number from the array that falls on block iBlk. Or, if none of the pages
1812 ** in aPgno[] fall on block iBlk, return 0.
1813 */
1814 static Pgno firstOnBlock(FileSystem *pFS, int iBlk, Pgno *aPgno, int nPgno){
1815   Pgno iRet = 0;
1816   int i;
1817   for(i=0; i<nPgno; i++){
1818     Pgno iPg = aPgno[i];
1819     if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){
1820       iRet = iPg;
1821     }
1822   }
1823   return iRet;
1824 }
1825
1826 #ifndef NDEBUG
1827 /*
1828 ** Return true if page iPg, which is a part of segment p, lies on
1829 ** a redirected block.
1830 */
1831 static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){
1832   return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg));
1833 }
1834
1835 /*
1836 ** Return true if the second argument is not NULL and any of the first
1837 ** last or root pages lie on a redirected block.
1838 */
1839 static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
1840   return (p && (
1841       fsPageRedirects(pFS, p, p->iFirst)
1842    || fsPageRedirects(pFS, p, p->iRoot)
1843    || fsPageRedirects(pFS, p, p->iLastPg)
1844   ));
1845 }
1846 #endif
1847
1848 /*
1849 ** Argument aPgno is an array of nPgno page numbers. All pages belong to
1850 ** the segment pRun. This function gobbles from the start of the run to the
1851 ** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
1852 ** the new first page of the run).
1853 */
1854 void lsmFsGobble(
1855   lsm_db *pDb,
1856   Segment *pRun,
1857   Pgno *aPgno,
1858   int nPgno
1859 ){
1860   int rc = LSM_OK;
1861   FileSystem *pFS = pDb->pFS;
1862   Snapshot *pSnapshot = pDb->pWorker;
1863   int iBlk;
1864
1865   assert( pRun->nSize>0 );
1866   assert( 0==fsSegmentRedirects(pFS, pRun) );
1867   assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
1868
1869   iBlk = fsPageToBlock(pFS, pRun->iFirst);
1870   pRun->nSize += (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1871
1872   while( rc==LSM_OK ){
1873     int iNext = 0;
1874     Pgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
1875     if( iFirst ){
1876       pRun->iFirst = iFirst;
1877       break;
1878     }
1879     rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
1880     if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
1881     pRun->nSize -= (int)(
1882         1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
1883     );
1884     iBlk = iNext;
1885   }
1886
1887   pRun->nSize -= (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1888   assert( pRun->nSize>0 );
1889 }
1890
1891 /*
1892 ** This function is only used in compressed database mode.
1893 **
1894 ** Argument iPg is the page number (byte offset) of a page within segment
1895 ** pSeg. The page record, including all headers, is nByte bytes in size.
1896 ** Before returning, set *piNext to the page number of the next page in
1897 ** the segment, or to zero if iPg is the last.
1898 **
1899 ** In other words, do:
1900 **
1901 **   *piNext = iPg + nByte;
1902 **
1903 ** But take block overflow and redirection into account.
1904 */
1905 static int fsNextPageOffset(
1906   FileSystem *pFS,                /* File system object */
1907   Segment *pSeg,                  /* Segment to move within */
1908   Pgno iPg,                       /* Offset of current page */
1909   int nByte,                      /* Size of current page including headers */
1910   Pgno *piNext                    /* OUT: Offset of next page. Or zero (EOF) */
1911 ){
1912   Pgno iNext;
1913   int rc;
1914
1915   assert( pFS->pCompress );
1916
1917   rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iNext);
1918   if( pSeg && iNext==pSeg->iLastPg ){
1919     iNext = 0;
1920   }else if( rc==LSM_OK ){
1921     rc = fsAddOffset(pFS, pSeg, iNext, 1, &iNext);
1922   }
1923
1924   *piNext = iNext;
1925   return rc;
1926 }
1927
1928 /*
1929 ** This function is only used in compressed database mode.
1930 **
1931 ** Argument iPg is the page number of a pagethat appears in segment pSeg.
1932 ** This function determines the page number of the previous page in the
1933 ** same run. *piPrev is set to the previous page number before returning.
1934 **
1935 ** LSM_OK is returned if no error occurs. Otherwise, an lsm error code.
1936 ** If any value other than LSM_OK is returned, then the final value of
1937 ** *piPrev is undefined.
1938 */
1939 static int fsGetPageBefore(
1940   FileSystem *pFS,
1941   Segment *pSeg,
1942   Pgno iPg,
1943   Pgno *piPrev
1944 ){
1945   u8 aSz[3];
1946   int rc;
1947   i64 iRead;
1948
1949   assert( pFS->pCompress );
1950
1951   rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iRead);
1952   if( rc==LSM_OK ) rc = fsReadData(pFS, pSeg, iRead, aSz, sizeof(aSz));
1953
1954   if( rc==LSM_OK ){
1955     int bFree;
1956     int nSz;
1957     if( aSz[2] & 0x80 ){
1958       nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2;
1959     }else{
1960       nSz = (int)(aSz[2] & 0x7F);
1961       bFree = 1;
1962     }
1963     rc = fsSubtractOffset(pFS, pSeg, iPg, nSz, piPrev);
1964   }
1965
1966   return rc;
1967 }
1968
1969 /*
1970 ** The first argument to this function is a valid reference to a database
1971 ** file page that is part of a sorted run. If parameter eDir is -1, this
1972 ** function attempts to locate and load the previous page in the same run.
1973 ** Or, if eDir is +1, it attempts to find the next page in the same run.
1974 ** The results of passing an eDir value other than positive or negative one
1975 ** are undefined.
1976 **
1977 ** If parameter pRun is not NULL then it must point to the run that page
1978 ** pPg belongs to. In this case, if pPg is the first or last page of the
1979 ** run, and the request is for the previous or next page, respectively,
1980 ** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
1981 ** is assumed that the next or previous page, as requested, exists.
1982 **
1983 ** If the previous/next page does exist and is successfully loaded, *ppNext
1984 ** is set to point to it and LSM_OK is returned. Otherwise, if an error
1985 ** occurs, *ppNext is set to NULL and and lsm error code returned.
1986 **
1987 ** Page references returned by this function should be released by the
1988 ** caller using lsmFsPageRelease().
1989 */
1990 int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
1991   int rc = LSM_OK;
1992   FileSystem *pFS = pPg->pFS;
1993   Pgno iPg = pPg->iPg;
1994
1995   assert( 0==fsSegmentRedirects(pFS, pRun) );
1996   if( pFS->pCompress ){
1997     int nSpace = pPg->nCompress + 2*3;
1998
1999     do {
2000       if( eDir>0 ){
2001         rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
2002       }else{
2003         if( iPg==pRun->iFirst ){
2004           iPg = 0;
2005         }else{
2006           rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
2007         }
2008       }
2009
2010       nSpace = 0;
2011       if( iPg!=0 ){
2012         rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
2013         assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
2014       }else{
2015         *ppNext = 0;
2016       }
2017     }while( nSpace>0 && rc==LSM_OK );
2018
2019   }else{
2020     Redirect *pRedir = pRun ? pRun->pRedirect : 0;
2021     assert( eDir==1 || eDir==-1 );
2022     if( eDir<0 ){
2023       if( pRun && iPg==pRun->iFirst ){
2024         *ppNext = 0;
2025         return LSM_OK;
2026       }else if( fsIsFirst(pFS, iPg) ){
2027         assert( pPg->flags & PAGE_HASPREV );
2028         iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
2029       }else{
2030         iPg--;
2031       }
2032     }else{
2033       if( pRun ){
2034         if( iPg==pRun->iLastPg ){
2035           *ppNext = 0;
2036           return LSM_OK;
2037         }
2038       }
2039
2040       if( fsIsLast(pFS, iPg) ){
2041         int iBlk = fsRedirectBlock(
2042             pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
2043         );
2044         iPg = fsFirstPageOnBlock(pFS, iBlk);
2045       }else{
2046         iPg++;
2047       }
2048     }
2049     rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
2050   }
2051
2052   return rc;
2053 }
2054
2055 /*
2056 ** This function is called when creating a new segment to determine if the
2057 ** first part of it can be written following an existing segment on an
2058 ** already allocated block. If it is possible, the page number of the first
2059 ** page to use for the new segment is returned. Otherwise zero.
2060 **
2061 ** If argument pLvl is not NULL, then this function will not attempt to
2062 ** start the new segment immediately following any segment that is part
2063 ** of the right-hand-side of pLvl.
2064 */
2065 static Pgno findAppendPoint(FileSystem *pFS, Level *pLvl){
2066   int i;
2067   Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
2068   Pgno iRet = 0;
2069
2070   for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
2071     if( (iRet = aiAppend[i]) ){
2072       if( pLvl ){
2073         int iBlk = fsPageToBlock(pFS, iRet);
2074         int j;
2075         for(j=0; iRet && j<pLvl->nRight; j++){
2076           if( fsPageToBlock(pFS, pLvl->aRhs[j].iLastPg)==iBlk ){
2077             iRet = 0;
2078           }
2079         }
2080       }
2081       if( iRet ) aiAppend[i] = 0;
2082     }
2083   }
2084   return iRet;
2085 }
2086
2087 /*
2088 ** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
2089 ** return a pointer to it. The page is writable until either
2090 ** lsmFsPagePersist() is called on it or the ref-count drops to zero.
2091 */
2092 int lsmFsSortedAppend(
2093   FileSystem *pFS,
2094   Snapshot *pSnapshot,
2095   Level *pLvl,
2096   int bDefer,
2097   Page **ppOut
2098 ){
2099   int rc = LSM_OK;
2100   Page *pPg = 0;
2101   Pgno iApp = 0;
2102   Pgno iNext = 0;
2103   Segment *p = &pLvl->lhs;
2104   Pgno iPrev = p->iLastPg;
2105
2106   *ppOut = 0;
2107   assert( p->pRedirect==0 );
2108
2109   if( pFS->pCompress || bDefer ){
2110     /* In compressed database mode the page is not assigned a page number
2111     ** or location in the database file at this point. This will be done
2112     ** by the lsmFsPagePersist() call.  */
2113     rc = fsPageBuffer(pFS, &pPg);
2114     if( rc==LSM_OK ){
2115       pPg->pFS = pFS;
2116       pPg->pSeg = p;
2117       pPg->iPg = 0;
2118       pPg->flags |= PAGE_DIRTY;
2119       pPg->nData = pFS->nPagesize;
2120       assert( pPg->aData );
2121       if( pFS->pCompress==0 ) pPg->nData -= 4;
2122
2123       pPg->nRef = 1;
2124       pFS->nOut++;
2125     }
2126   }else{
2127     if( iPrev==0 ){
2128       iApp = findAppendPoint(pFS, pLvl);
2129     }else if( fsIsLast(pFS, iPrev) ){
2130       int iNext2;
2131       rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNext2);
2132       if( rc!=LSM_OK ) return rc;
2133       iApp = fsFirstPageOnBlock(pFS, iNext2);
2134     }else{
2135       iApp = iPrev + 1;
2136     }
2137
2138     /* If this is the first page allocated, or if the page allocated is the
2139     ** last in the block, also allocate the next block here.  */
2140     if( iApp==0 || fsIsLast(pFS, iApp) ){
2141       int iNew;                     /* New block number */
2142
2143       rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
2144       if( rc!=LSM_OK ) return rc;
2145       if( iApp==0 ){
2146         iApp = fsFirstPageOnBlock(pFS, iNew);
2147       }else{
2148         iNext = fsFirstPageOnBlock(pFS, iNew);
2149       }
2150     }
2151
2152     /* Grab the new page. */
2153     pPg = 0;
2154     rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
2155     assert( rc==LSM_OK || pPg==0 );
2156
2157     /* If this is the first or last page of a block, fill in the pointer
2158      ** value at the end of the new page. */
2159     if( rc==LSM_OK ){
2160       p->nSize++;
2161       p->iLastPg = iApp;
2162       if( p->iFirst==0 ) p->iFirst = iApp;
2163       pPg->flags |= PAGE_DIRTY;
2164
2165       if( fsIsLast(pFS, iApp) ){
2166         lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
2167       }else if( fsIsFirst(pFS, iApp) ){
2168         lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
2169       }
2170     }
2171   }
2172
2173   *ppOut = pPg;
2174   return rc;
2175 }
2176
2177 /*
2178 ** Mark the segment passed as the second argument as finished. Once a segment
2179 ** is marked as finished it is not possible to append any further pages to
2180 ** it.
2181 **
2182 ** Return LSM_OK if successful or an lsm error code if an error occurs.
2183 */
2184 int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
2185   int rc = LSM_OK;
2186   if( p && p->iLastPg ){
2187     assert( p->pRedirect==0 );
2188
2189     /* Check if the last page of this run happens to be the last of a block.
2190     ** If it is, then an extra block has already been allocated for this run.
2191     ** Shift this extra block back to the free-block list.
2192     **
2193     ** Otherwise, add the first free page in the last block used by the run
2194     ** to the lAppend list.
2195     */
2196     if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
2197       int i;
2198       Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
2199       for(i=0; i<LSM_APPLIST_SZ; i++){
2200         if( aiAppend[i]==0 ){
2201           aiAppend[i] = p->iLastPg+1;
2202           break;
2203         }
2204       }
2205     }else if( pFS->pCompress==0 ){
2206       Page *pLast;
2207       rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
2208       if( rc==LSM_OK ){
2209         int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
2210         lsmBlockRefree(pFS->pDb, iBlk);
2211         lsmFsPageRelease(pLast);
2212       }
2213     }else{
2214       int iBlk = 0;
2215       rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
2216       if( rc==LSM_OK ){
2217         lsmBlockRefree(pFS->pDb, iBlk);
2218       }
2219     }
2220   }
2221   return rc;
2222 }
2223
2224 /*
2225 ** Obtain a reference to page number iPg.
2226 **
2227 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2228 */
2229 int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, Pgno iPg, Page **ppPg){
2230   return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2231 }
2232
2233 /*
2234 ** Obtain a reference to the last page in the segment passed as the
2235 ** second argument.
2236 **
2237 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2238 */
2239 int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
2240   int rc;
2241   Pgno iPg = pSeg->iLastPg;
2242   if( pFS->pCompress ){
2243     int nSpace;
2244     iPg++;
2245     do {
2246       nSpace = 0;
2247       rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
2248       if( rc==LSM_OK ){
2249         rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
2250       }
2251     }while( rc==LSM_OK && nSpace>0 );
2252
2253   }else{
2254     rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2255   }
2256   return rc;
2257 }
2258
2259 /*
2260 ** Return a reference to meta-page iPg. If successful, LSM_OK is returned
2261 ** and *ppPg populated with the new page reference. The reference should
2262 ** be released by the caller using lsmFsPageRelease().
2263 **
2264 ** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error
2265 ** code is returned.
2266 */
2267 int lsmFsMetaPageGet(
2268   FileSystem *pFS,                /* File-system connection */
2269   int bWrite,                     /* True for write access, false for read */
2270   int iPg,                        /* Either 1 or 2 */
2271   MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
2272 ){
2273   int rc = LSM_OK;
2274   MetaPage *pPg;
2275   assert( iPg==1 || iPg==2 );
2276
2277   pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
2278
2279   if( pPg ){
2280     i64 iOff = (iPg-1) * pFS->nMetasize;
2281     if( pFS->nMapLimit>0 ){
2282       fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
2283       pPg->aData = (u8 *)(pFS->pMap) + iOff;
2284     }else{
2285       pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
2286       if( rc==LSM_OK && bWrite==0 ){
2287         rc = lsmEnvRead(
2288             pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetaRwSize
2289         );
2290       }
2291 #ifndef NDEBUG
2292       /* pPg->aData causes an uninitialized access via a downstreadm write().
2293          After discussion on this list, this memory should not, for performance
2294          reasons, be memset. However, tracking down "real" misuse is more
2295          difficult with this "false" positive, so it is set when NDEBUG.
2296       */
2297       else if( rc==LSM_OK ){
2298         memset( pPg->aData, 0x77, pFS->nMetasize );
2299       }
2300 #endif
2301     }
2302
2303     if( rc!=LSM_OK ){
2304       if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
2305       lsmFree(pFS->pEnv, pPg);
2306       pPg = 0;
2307     }else{
2308       pPg->iPg = iPg;
2309       pPg->bWrite = bWrite;
2310       pPg->pFS = pFS;
2311     }
2312   }
2313
2314   *ppPg = pPg;
2315   return rc;
2316 }
2317
2318 /*
2319 ** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
2320 */
2321 int lsmFsMetaPageRelease(MetaPage *pPg){
2322   int rc = LSM_OK;
2323   if( pPg ){
2324     FileSystem *pFS = pPg->pFS;
2325
2326     if( pFS->nMapLimit==0 ){
2327       if( pPg->bWrite ){
2328         i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
2329         int nWrite = pFS->nMetaRwSize;
2330         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
2331       }
2332       lsmFree(pFS->pEnv, pPg->aData);
2333     }
2334
2335     lsmFree(pFS->pEnv, pPg);
2336   }
2337   return rc;
2338 }
2339
2340 /*
2341 ** Return a pointer to a buffer containing the data associated with the
2342 ** meta-page passed as the first argument. If parameter pnData is not NULL,
2343 ** set *pnData to the size of the meta-page in bytes before returning.
2344 */
2345 u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
2346   if( pnData ) *pnData = pPg->pFS->nMetaRwSize;
2347   return pPg->aData;
2348 }
2349
2350 /*
2351 ** Return true if page is currently writable. This is used in assert()
2352 ** statements only.
2353 */
2354 #ifndef NDEBUG
2355 int lsmFsPageWritable(Page *pPg){
2356   return (pPg->flags & PAGE_DIRTY) ? 1 : 0;
2357 }
2358 #endif
2359
2360 /*
2361 ** This is called when block iFrom is being redirected to iTo. If page
2362 ** number (*piPg) lies on block iFrom, then calculate the equivalent
2363 ** page on block iTo and set *piPg to this value before returning.
2364 */
2365 static void fsMovePage(
2366   FileSystem *pFS,                /* File system object */
2367   int iTo,                        /* Destination block */
2368   int iFrom,                      /* Source block */
2369   Pgno *piPg                      /* IN/OUT: Page number */
2370 ){
2371   Pgno iPg = *piPg;
2372   if( iFrom==fsPageToBlock(pFS, iPg) ){
2373     const int nPagePerBlock = (
2374         pFS->pCompress ? pFS ->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
2375     );
2376     *piPg = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
2377   }
2378 }
2379
2380 /*
2381 ** Copy the contents of block iFrom to block iTo.
2382 **
2383 ** It is safe to assume that there are no outstanding references to pages
2384 ** on block iTo. And that block iFrom is not currently being written. In
2385 ** other words, the data can be read and written directly.
2386 */
2387 int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
2388   Snapshot *p = pFS->pDb->pWorker;
2389   int rc = LSM_OK;
2390   int i;
2391   i64 nMap;
2392
2393   i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
2394   i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
2395
2396   assert( iTo!=1 );
2397   assert( iFrom>iTo );
2398
2399   /* Grow the mapping as required. */
2400   nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
2401   fsGrowMapping(pFS, nMap, &rc);
2402
2403   if( rc==LSM_OK ){
2404     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
2405     int nSz = pFS->nPagesize;
2406     u8 *aBuf = 0;
2407     u8 *aData = 0;
2408
2409     for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
2410       i64 iOff = iFromOff + i*nSz;
2411
2412       /* Set aData to point to a buffer containing the from page */
2413       if( (iOff+nSz)<=pFS->nMapLimit ){
2414         u8 *aMap = (u8 *)(pFS->pMap);
2415         aData = &aMap[iOff];
2416       }else{
2417         if( aBuf==0 ){
2418           aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
2419           if( aBuf==0 ) break;
2420         }
2421         aData = aBuf;
2422         rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2423       }
2424
2425       /* Copy aData to the to page */
2426       if( rc==LSM_OK ){
2427         iOff = iToOff + i*nSz;
2428         if( (iOff+nSz)<=pFS->nMapLimit ){
2429           u8 *aMap = (u8 *)(pFS->pMap);
2430           memcpy(&aMap[iOff], aData, nSz);
2431         }else{
2432           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2433         }
2434       }
2435     }
2436     lsmFree(pFS->pEnv, aBuf);
2437     lsmFsPurgeCache(pFS);
2438   }
2439
2440   /* Update append-point list if necessary */
2441   for(i=0; i<LSM_APPLIST_SZ; i++){
2442     fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
2443   }
2444
2445   /* Update the Segment structure itself */
2446   fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
2447   fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
2448   fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
2449
2450   return rc;
2451 }
2452
2453 /*
2454 ** Append raw data to a segment. Return the database file offset that the
2455 ** data is written to (this may be used as the page number if the data
2456 ** being appended is a new page record).
2457 **
2458 ** This function is only used in compressed database mode.
2459 */
2460 static Pgno fsAppendData(
2461   FileSystem *pFS,                /* File-system handle */
2462   Segment *pSeg,                  /* Segment to append to */
2463   const u8 *aData,                /* Buffer containing data to write */
2464   int nData,                      /* Size of buffer aData[] in bytes */
2465   int *pRc                        /* IN/OUT: Error code */
2466 ){
2467   Pgno iRet = 0;
2468   int rc = *pRc;
2469   assert( pFS->pCompress );
2470   if( rc==LSM_OK ){
2471     int nRem = 0;
2472     int nWrite = 0;
2473     Pgno iLastOnBlock;
2474     Pgno iApp = pSeg->iLastPg+1;
2475
2476     /* If this is the first data written into the segment, find an append-point
2477     ** or allocate a new block.  */
2478     if( iApp==1 ){
2479       pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
2480       if( iApp==0 ){
2481         int iBlk;
2482         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2483         pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
2484       }
2485     }
2486     iRet = iApp;
2487
2488     /* Write as much data as is possible at iApp (usually all of it). */
2489     iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
2490     if( rc==LSM_OK ){
2491       int nSpace = (int)(iLastOnBlock - iApp + 1);
2492       nWrite = LSM_MIN(nData, nSpace);
2493       nRem = nData - nWrite;
2494       assert( nWrite>=0 );
2495       if( nWrite!=0 ){
2496         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
2497       }
2498       iApp += nWrite;
2499     }
2500
2501     /* If required, allocate a new block and write the rest of the data
2502     ** into it. Set the next and previous block pointers to link the new
2503     ** block to the old.  */
2504     assert( nRem<=0 || (iApp-1)==iLastOnBlock );
2505     if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
2506       u8 aPtr[4];                 /* Space to serialize a u32 */
2507       int iBlk;                   /* New block number */
2508
2509       if( nWrite>0 ){
2510         /* Allocate a new block. */
2511         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2512
2513         /* Set the "next" pointer on the old block */
2514         if( rc==LSM_OK ){
2515           assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
2516           lsmPutU32(aPtr, iBlk);
2517           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
2518         }
2519
2520         /* Set the "prev" pointer on the new block */
2521         if( rc==LSM_OK ){
2522           Pgno iWrite;
2523           lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
2524           iWrite = fsFirstPageOnBlock(pFS, iBlk);
2525           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
2526           if( nRem>0 ) iApp = iWrite;
2527         }
2528       }else{
2529         /* The next block is already allocated. */
2530         assert( nRem>0 );
2531         assert( pSeg->pRedirect==0 );
2532         rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
2533         iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
2534       }
2535
2536       /* Write the remaining data into the new block */
2537       if( rc==LSM_OK && nRem>0 ){
2538         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
2539         iApp += nRem;
2540       }
2541     }
2542
2543     pSeg->iLastPg = iApp-1;
2544     *pRc = rc;
2545   }
2546
2547   return iRet;
2548 }
2549
2550 /*
2551 ** This function is only called in compressed database mode. It
2552 ** compresses the contents of page pPg and writes the result to the
2553 ** buffer at pFS->aOBuffer. The size of the compressed data is stored in
2554 ** pPg->nCompress.
2555 **
2556 ** If buffer pFS->aOBuffer[] has not been allocated then this function
2557 ** allocates it. If this fails, LSM_NOMEM is returned. Otherwise, LSM_OK.
2558 */
2559 static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
2560   lsm_compress *p = pFS->pCompress;
2561
2562   if( fsAllocateBuffer(pFS, 1) ) return LSM_NOMEM;
2563   assert( pPg->nData==pFS->nPagesize );
2564
2565   pPg->nCompress = pFS->nBuffer;
2566   return p->xCompress(p->pCtx,
2567       (char *)pFS->aOBuffer, &pPg->nCompress,
2568       (const char *)pPg->aData, pPg->nData
2569   );
2570 }
2571
2572 /*
2573 ** Append a new page to segment pSeg. Set output variable *piNew to the
2574 ** page number of the new page before returning.
2575 **
2576 ** If the new page is the last on its block, then the 'next' block that
2577 ** will be used by the segment is allocated here too. In this case output
2578 ** variable *piNext is set to the block number of the next block.
2579 **
2580 ** If the new page is the first on its block but not the first in the
2581 ** entire segment, set output variable *piPrev to the block number of
2582 ** the previous block in the segment.
2583 **
2584 ** LSM_OK is returned if successful, or an lsm error code otherwise. If
2585 ** any value other than LSM_OK is returned, then the final value of all
2586 ** output variables is undefined.
2587 */
2588 static int fsAppendPage(
2589   FileSystem *pFS,
2590   Segment *pSeg,
2591   Pgno *piNew,
2592   int *piPrev,
2593   int *piNext
2594 ){
2595   Pgno iPrev = pSeg->iLastPg;
2596   int rc;
2597   assert( iPrev!=0 );
2598
2599   *piPrev = 0;
2600   *piNext = 0;
2601
2602   if( fsIsLast(pFS, iPrev) ){
2603     /* Grab the first page on the next block (which has already be
2604     ** allocated). In this case set *piPrev to tell the caller to set
2605     ** the "previous block" pointer in the first 4 bytes of the page.
2606     */
2607     int iNext;
2608     int iBlk = fsPageToBlock(pFS, iPrev);
2609     assert( pSeg->pRedirect==0 );
2610     rc = fsBlockNext(pFS, 0, iBlk, &iNext);
2611     if( rc!=LSM_OK ) return rc;
2612     *piNew = fsFirstPageOnBlock(pFS, iNext);
2613     *piPrev = iBlk;
2614   }else{
2615     *piNew = iPrev+1;
2616     if( fsIsLast(pFS, *piNew) ){
2617       /* Allocate the next block here. */
2618       int iBlk;
2619       rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2620       if( rc!=LSM_OK ) return rc;
2621       *piNext = iBlk;
2622     }
2623   }
2624
2625   pSeg->nSize++;
2626   pSeg->iLastPg = *piNew;
2627   return LSM_OK;
2628 }
2629
2630 /*
2631 ** Flush all pages in the FileSystem.pWaiting list to disk.
2632 */
2633 void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){
2634   int rc = *pRc;
2635   Page *pPg;
2636
2637   pPg = pFS->pWaiting;
2638   pFS->pWaiting = 0;
2639
2640   while( pPg ){
2641     Page *pNext = pPg->pWaitingNext;
2642     if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg);
2643     assert( pPg->nRef==1 );
2644     lsmFsPageRelease(pPg);
2645     pPg = pNext;
2646   }
2647   *pRc = rc;
2648 }
2649
2650 /*
2651 ** If there exists a hash-table entry associated with page iPg, remove it.
2652 */
2653 static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
2654   Page *p;
2655   int iHash = fsHashKey(pFS->nHash, iPg);
2656
2657   for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext);
2658
2659   if( p ){
2660     assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 );
2661     fsPageRemoveFromHash(pFS, p);
2662     p->iPg = 0;
2663     iHash = fsHashKey(pFS->nHash, 0);
2664     p->pHashNext = pFS->apHash[iHash];
2665     pFS->apHash[iHash] = p;
2666   }
2667 }
2668
2669 /*
2670 ** If the page passed as an argument is dirty, update the database file
2671 ** (or mapping of the database file) with its current contents and mark
2672 ** the page as clean.
2673 **
2674 ** Return LSM_OK if the operation is a success, or an LSM error code
2675 ** otherwise.
2676 */
2677 int lsmFsPagePersist(Page *pPg){
2678   int rc = LSM_OK;
2679   if( pPg && (pPg->flags & PAGE_DIRTY) ){
2680     FileSystem *pFS = pPg->pFS;
2681
2682     if( pFS->pCompress ){
2683       int iHash;                  /* Hash key of assigned page number */
2684       u8 aSz[3];                  /* pPg->nCompress as a 24-bit big-endian */
2685       assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 );
2686
2687       /* Compress the page image. */
2688       rc = fsCompressIntoBuffer(pFS, pPg);
2689
2690       /* Serialize the compressed size into buffer aSz[] */
2691       putRecordSize(aSz, pPg->nCompress, 0);
2692
2693       /* Write the serialized page record into the database file. */
2694       pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
2695       fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc);
2696       fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
2697
2698       /* Now that it has a page number, insert the page into the hash table */
2699       iHash = fsHashKey(pFS->nHash, pPg->iPg);
2700       pPg->pHashNext = pFS->apHash[iHash];
2701       pFS->apHash[iHash] = pPg;
2702
2703       pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;
2704
2705       pPg->flags &= ~PAGE_DIRTY;
2706       pFS->nWrite++;
2707     }else{
2708
2709       if( pPg->iPg==0 ){
2710         /* No page number has been assigned yet. This occurs with pages used
2711         ** in the b-tree hierarchy. They were not assigned page numbers when
2712         ** they were created as doing so would cause this call to
2713         ** lsmFsPagePersist() to write an out-of-order page. Instead a page
2714         ** number is assigned here so that the page data will be appended
2715         ** to the current segment.
2716         */
2717         Page **pp;
2718         int iPrev = 0;
2719         int iNext = 0;
2720         int iHash;
2721
2722         assert( pPg->pSeg->iFirst );
2723         assert( pPg->flags & PAGE_FREE );
2724         assert( (pPg->flags & PAGE_HASPREV)==0 );
2725         assert( pPg->nData==pFS->nPagesize-4 );
2726
2727         rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
2728         if( rc!=LSM_OK ) return rc;
2729
2730         assert( pPg->flags & PAGE_FREE );
2731         iHash = fsHashKey(pFS->nHash, pPg->iPg);
2732         fsRemoveHashEntry(pFS, pPg->iPg);
2733         pPg->pHashNext = pFS->apHash[iHash];
2734         pFS->apHash[iHash] = pPg;
2735         assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );
2736
2737         if( iPrev ){
2738           assert( iNext==0 );
2739           memmove(&pPg->aData[4], pPg->aData, pPg->nData);
2740           lsmPutU32(pPg->aData, iPrev);
2741           pPg->flags |= PAGE_HASPREV;
2742           pPg->aData += 4;
2743         }else if( iNext ){
2744           assert( iPrev==0 );
2745           lsmPutU32(&pPg->aData[pPg->nData], iNext);
2746         }else{
2747           int nData = pPg->nData;
2748           pPg->nData += 4;
2749           lsmSortedExpandBtreePage(pPg, nData);
2750         }
2751
2752         pPg->nRef++;
2753         for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
2754         *pp = pPg;
2755         assert( pPg->pWaitingNext==0 );
2756
2757       }else{
2758         i64 iOff;                   /* Offset to write within database file */
2759
2760         iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
2761         if( fsMmapPage(pFS, pPg->iPg)==0 ){
2762           u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
2763           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
2764         }else if( pPg->flags & PAGE_FREE ){
2765           fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
2766           if( rc==LSM_OK ){
2767             u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
2768             u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
2769             memcpy(aTo, aFrom, pFS->nPagesize);
2770             lsmFree(pFS->pEnv, aFrom);
2771             pFS->nCacheAlloc--;
2772             pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
2773             pPg->flags &= ~PAGE_FREE;
2774             fsPageRemoveFromHash(pFS, pPg);
2775             pPg->pMappedNext = pFS->pMapped;
2776             pFS->pMapped = pPg;
2777           }
2778         }
2779
2780         lsmFsFlushWaiting(pFS, &rc);
2781         pPg->flags &= ~PAGE_DIRTY;
2782         pFS->nWrite++;
2783       }
2784     }
2785   }
2786
2787   return rc;
2788 }
2789
2790 /*
2791 ** For non-compressed databases, this function is a no-op. For compressed
2792 ** databases, it adds a padding record to the segment passed as the third
2793 ** argument.
2794 **
2795 ** The size of the padding records is selected so that the last byte
2796 ** written is the last byte of a disk sector. This means that if a
2797 ** snapshot is taken and checkpointed, subsequent worker processes will
2798 ** not write to any sector that contains checkpointed data.
2799 */
2800 int lsmFsSortedPadding(
2801   FileSystem *pFS,
2802   Snapshot *pSnapshot,
2803   Segment *pSeg
2804 ){
2805   int rc = LSM_OK;
2806   if( pFS->pCompress && pSeg->iFirst ){
2807     Pgno iLast2;
2808     Pgno iLast = pSeg->iLastPg;     /* Current last page of segment */
2809     int nPad;                       /* Bytes of padding required */
2810     u8 aSz[3];
2811
2812     iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1;
2813     assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) );
2814     nPad = (int)(iLast2 - iLast);
2815
2816     if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){
2817       nPad -= 4;
2818     }
2819     assert( nPad>=0 );
2820
2821     if( nPad>=6 ){
2822       pSeg->nSize += nPad;
2823       nPad -= 6;
2824       putRecordSize(aSz, nPad, 1);
2825       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2826       memset(pFS->aOBuffer, 0, nPad);
2827       fsAppendData(pFS, pSeg, pFS->aOBuffer, nPad, &rc);
2828       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2829     }else if( nPad>0 ){
2830       u8 aBuf[5] = {0,0,0,0,0};
2831       aBuf[0] = (u8)nPad;
2832       aBuf[nPad-1] = (u8)nPad;
2833       fsAppendData(pFS, pSeg, aBuf, nPad, &rc);
2834     }
2835
2836     assert( rc!=LSM_OK
2837         || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg)
2838         || ((pSeg->iLastPg + 1) % pFS->szSector)==0
2839     );
2840   }
2841
2842   return rc;
2843 }
2844
2845
2846 /*
2847 ** Increment the reference count on the page object passed as the first
2848 ** argument.
2849 */
2850 void lsmFsPageRef(Page *pPg){
2851   if( pPg ){
2852     pPg->nRef++;
2853   }
2854 }
2855
2856 /*
2857 ** Release a page-reference obtained using fsPageGet().
2858 */
2859 int lsmFsPageRelease(Page *pPg){
2860   int rc = LSM_OK;
2861   if( pPg ){
2862     assert( pPg->nRef>0 );
2863     pPg->nRef--;
2864     if( pPg->nRef==0 ){
2865       FileSystem *pFS = pPg->pFS;
2866       rc = lsmFsPagePersist(pPg);
2867       pFS->nOut--;
2868
2869       assert( pPg->pFS->pCompress
2870            || fsIsFirst(pPg->pFS, pPg->iPg)==0
2871            || (pPg->flags & PAGE_HASPREV)
2872       );
2873       pPg->aData -= (pPg->flags & PAGE_HASPREV);
2874       pPg->flags &= ~PAGE_HASPREV;
2875
2876       if( (pPg->flags & PAGE_FREE)==0 ){
2877         /* Removed from mapped list */
2878         Page **pp;
2879         for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext);
2880         *pp = pPg->pMappedNext;
2881         pPg->pMappedNext = 0;
2882
2883         /* Add to free list */
2884         pPg->pFreeNext = pFS->pFree;
2885         pFS->pFree = pPg;
2886       }else{
2887         fsPageAddToLru(pFS, pPg);
2888       }
2889     }
2890   }
2891
2892   return rc;
2893 }
2894
2895 /*
2896 ** Return the total number of pages read from the database file.
2897 */
2898 int lsmFsNRead(FileSystem *pFS){ return pFS->nRead; }
2899
2900 /*
2901 ** Return the total number of pages written to the database file.
2902 */
2903 int lsmFsNWrite(FileSystem *pFS){ return pFS->nWrite; }
2904
2905 /*
2906 ** Return a copy of the environment pointer used by the file-system object.
2907 */
2908 lsm_env *lsmFsEnv(FileSystem *pFS){
2909   return pFS->pEnv;
2910 }
2911
2912 /*
2913 ** Return a copy of the environment pointer used by the file-system object
2914 ** to which this page belongs.
2915 */
2916 lsm_env *lsmPageEnv(Page *pPg) {
2917   return pPg->pFS->pEnv;
2918 }
2919
2920 /*
2921 ** Return a pointer to the file-system object associated with the Page
2922 ** passed as the only argument.
2923 */
2924 FileSystem *lsmPageFS(Page *pPg){
2925   return pPg->pFS;
2926 }
2927
2928 /*
2929 ** Return the sector-size as reported by the log file handle.
2930 */
2931 int lsmFsSectorSize(FileSystem *pFS){
2932   return pFS->szSector;
2933 }
2934
2935 /*
2936 ** Helper function for lsmInfoArrayStructure().
2937 */
2938 static Segment *startsWith(Segment *pRun, Pgno iFirst){
2939   return (iFirst==pRun->iFirst) ? pRun : 0;
2940 }
2941
2942 /*
2943 ** Return the segment that starts with page iFirst, if any. If no such segment
2944 ** can be found, return NULL.
2945 */
2946 static Segment *findSegment(Snapshot *pWorker, Pgno iFirst){
2947   Level *pLvl;                    /* Used to iterate through db levels */
2948   Segment *pSeg = 0;              /* Pointer to segment to return */
2949
2950   for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){
2951     if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){
2952       int i;
2953       for(i=0; i<pLvl->nRight; i++){
2954         if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break;
2955       }
2956     }
2957   }
2958
2959   return pSeg;
2960 }
2961
2962 /*
2963 ** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request.
2964 ** If successful, *pzOut is set to point to a nul-terminated string
2965 ** containing the array structure and LSM_OK is returned. The caller should
2966 ** eventually free the string using lsmFree().
2967 **
2968 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
2969 */
2970 int lsmInfoArrayStructure(
2971   lsm_db *pDb,
2972   int bBlock,                     /* True for block numbers only */
2973   Pgno iFirst,
2974   char **pzOut
2975 ){
2976   int rc = LSM_OK;
2977   Snapshot *pWorker;              /* Worker snapshot */
2978   Segment *pArray = 0;            /* Array to report on */
2979   int bUnlock = 0;
2980
2981   *pzOut = 0;
2982   if( iFirst==0 ) return LSM_ERROR;
2983
2984   /* Obtain the worker snapshot */
2985   pWorker = pDb->pWorker;
2986   if( !pWorker ){
2987     rc = lsmBeginWork(pDb);
2988     if( rc!=LSM_OK ) return rc;
2989     pWorker = pDb->pWorker;
2990     bUnlock = 1;
2991   }
2992
2993   /* Search for the array that starts on page iFirst */
2994   pArray = findSegment(pWorker, iFirst);
2995
2996   if( pArray==0 ){
2997     /* Could not find the requested array. This is an error. */
2998     rc = LSM_ERROR;
2999   }else{
3000     FileSystem *pFS = pDb->pFS;
3001     LsmString str;
3002     int iBlk;
3003     int iLastBlk;
3004
3005     iBlk = fsPageToBlock(pFS, pArray->iFirst);
3006     iLastBlk = fsPageToBlock(pFS, pArray->iLastPg);
3007
3008     lsmStringInit(&str, pDb->pEnv);
3009     if( bBlock ){
3010       lsmStringAppendf(&str, "%d", iBlk);
3011       while( iBlk!=iLastBlk ){
3012         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3013         lsmStringAppendf(&str, " %d", iBlk);
3014       }
3015     }else{
3016       lsmStringAppendf(&str, "%d", pArray->iFirst);
3017       while( iBlk!=iLastBlk ){
3018         lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk));
3019         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3020         lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
3021       }
3022       lsmStringAppendf(&str, " %d", pArray->iLastPg);
3023     }
3024
3025     *pzOut = str.z;
3026   }
3027
3028   if( bUnlock ){
3029     int rcwork = LSM_BUSY;
3030     lsmFinishWork(pDb, 0, &rcwork);
3031   }
3032   return rc;
3033 }
3034
3035 int lsmFsSegmentContainsPg(
3036   FileSystem *pFS,
3037   Segment *pSeg,
3038   Pgno iPg,
3039   int *pbRes
3040 ){
3041   Redirect *pRedir = pSeg->pRedirect;
3042   int rc = LSM_OK;
3043   int iBlk;
3044   int iLastBlk;
3045   int iPgBlock;                   /* Block containing page iPg */
3046
3047   iPgBlock = fsPageToBlock(pFS, pSeg->iFirst);
3048   iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst));
3049   iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg));
3050
3051   while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){
3052     rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3053   }
3054
3055   *pbRes = (iBlk==iPgBlock);
3056   return rc;
3057 }
3058
3059 /*
3060 ** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request.
3061 ** If successful, *pzOut is set to point to a nul-terminated string
3062 ** containing the array structure and LSM_OK is returned. The caller should
3063 ** eventually free the string using lsmFree().
3064 **
3065 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
3066 */
3067 int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut){
3068   int rc = LSM_OK;
3069   Snapshot *pWorker;              /* Worker snapshot */
3070   Segment *pSeg = 0;              /* Array to report on */
3071   int bUnlock = 0;
3072
3073   *pzOut = 0;
3074   if( iFirst==0 ) return LSM_ERROR;
3075
3076   /* Obtain the worker snapshot */
3077   pWorker = pDb->pWorker;
3078   if( !pWorker ){
3079     rc = lsmBeginWork(pDb);
3080     if( rc!=LSM_OK ) return rc;
3081     pWorker = pDb->pWorker;
3082     bUnlock = 1;
3083   }
3084
3085   /* Search for the array that starts on page iFirst */
3086   pSeg = findSegment(pWorker, iFirst);
3087
3088   if( pSeg==0 ){
3089     /* Could not find the requested array. This is an error. */
3090     rc = LSM_ERROR;
3091   }else{
3092     Page *pPg = 0;
3093     FileSystem *pFS = pDb->pFS;
3094     LsmString str;
3095
3096     lsmStringInit(&str, pDb->pEnv);
3097     rc = lsmFsDbPageGet(pFS, pSeg, iFirst, &pPg);
3098     while( rc==LSM_OK && pPg ){
3099       Page *pNext = 0;
3100       lsmStringAppendf(&str, " %lld", lsmFsPageNumber(pPg));
3101       rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
3102       lsmFsPageRelease(pPg);
3103       pPg = pNext;
3104     }
3105
3106     if( rc!=LSM_OK ){
3107       lsmFree(pDb->pEnv, str.z);
3108     }else{
3109       *pzOut = str.z;
3110     }
3111   }
3112
3113   if( bUnlock ){
3114     int rcwork = LSM_BUSY;
3115     lsmFinishWork(pDb, 0, &rcwork);
3116   }
3117   return rc;
3118 }
3119
3120 /*
3121 ** The following macros are used by the integrity-check code. Associated with
3122 ** each block in the database is an 8-bit bit mask (the entry in the aUsed[]
3123 ** array). As the integrity-check meanders through the database, it sets the
3124 ** following bits to indicate how each block is used.
3125 **
3126 ** INTEGRITY_CHECK_FIRST_PG:
3127 **   First page of block is in use by sorted run.
3128 **
3129 ** INTEGRITY_CHECK_LAST_PG:
3130 **   Last page of block is in use by sorted run.
3131 **
3132 ** INTEGRITY_CHECK_USED:
3133 **   At least one page of the block is in use by a sorted run.
3134 **
3135 ** INTEGRITY_CHECK_FREE:
3136 **   The free block list contains an entry corresponding to this block.
3137 */
#define INTEGRITY_CHECK_FIRST_PG (1<<0)  /* First page of block is in use */
#define INTEGRITY_CHECK_LAST_PG  (1<<1)  /* Last page of block is in use */
#define INTEGRITY_CHECK_USED     (1<<2)  /* Some page of block is in use */
#define INTEGRITY_CHECK_FREE     (1<<3)  /* Block is on the free list */
3142
3143 /*
3144 ** Helper function for lsmFsIntegrityCheck()
3145 */
3146 static void checkBlocks(
3147   FileSystem *pFS,
3148   Segment *pSeg,
3149   int bExtra,                     /* If true, count the "next" block if any */
3150   int nUsed,
3151   u8 *aUsed
3152 ){
3153   if( pSeg ){
3154     if( pSeg && pSeg->nSize>0 ){
3155       int rc;
3156       int iBlk;                   /* Current block (during iteration) */
3157       int iLastBlk;               /* Last block of segment */
3158       int iFirstBlk;              /* First block of segment */
3159       int bLastIsLastOnBlock;     /* True iLast is the last on its block */
3160
3161       assert( 0==fsSegmentRedirects(pFS, pSeg) );
3162       iBlk = iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst);
3163       iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg);
3164
3165       bLastIsLastOnBlock = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg);
3166       assert( iBlk>0 );
3167
3168       do {
3169         /* iBlk is a part of this sorted run. */
3170         aUsed[iBlk-1] |= INTEGRITY_CHECK_USED;
3171
3172         /* If the first page of this block is also part of the segment,
3173         ** set the flag to indicate that the first page of iBlk is in use.
3174         */
3175         if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){
3176           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_FIRST_PG)==0 );
3177           aUsed[iBlk-1] |= INTEGRITY_CHECK_FIRST_PG;
3178         }
3179
3180         /* Unless the sorted run finishes before the last page on this block,
3181         ** the last page of this block is also in use.  */
3182         if( iBlk!=iLastBlk || bLastIsLastOnBlock ){
3183           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_LAST_PG)==0 );
3184           aUsed[iBlk-1] |= INTEGRITY_CHECK_LAST_PG;
3185         }
3186
3187         /* Special case. The sorted run being scanned is the output run of
3188         ** a level currently undergoing an incremental merge. The sorted
3189         ** run ends on the last page of iBlk, but the next block has already
3190         ** been allocated. So mark it as in use as well.  */
3191         if( iBlk==iLastBlk && bLastIsLastOnBlock && bExtra ){
3192           int iExtra = 0;
3193           rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra);
3194           assert( rc==LSM_OK );
3195
3196           assert( aUsed[iExtra-1]==0 );
3197           aUsed[iExtra-1] |= INTEGRITY_CHECK_USED;
3198           aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG;
3199           aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG;
3200         }
3201
3202         /* Move on to the next block in the sorted run. Or set iBlk to zero
3203         ** in order to break out of the loop if this was the last block in
3204         ** the run.  */
3205         if( iBlk==iLastBlk ){
3206           iBlk = 0;
3207         }else{
3208           rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3209           assert( rc==LSM_OK );
3210         }
3211       }while( iBlk );
3212     }
3213   }
3214 }
3215
3216 typedef struct CheckFreelistCtx CheckFreelistCtx;
3217 struct CheckFreelistCtx {
3218   u8 *aUsed;
3219   int nBlock;
3220 };
3221 static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
3222   CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx;
3223
3224   assert( iBlk>=1 );
3225   assert( iBlk<=p->nBlock );
3226   assert( p->aUsed[iBlk-1]==0 );
3227   p->aUsed[iBlk-1] = INTEGRITY_CHECK_FREE;
3228   return 0;
3229 }
3230
3231 /*
3232 ** This function checks that all blocks in the database file are accounted
3233 ** for. For each block, exactly one of the following must be true:
3234 **
3235 **   + the block is part of a sorted run, or
3236 **   + the block is on the free-block list
3237 **
3238 ** This function also checks that there are no references to blocks with
3239 ** out-of-range block numbers.
3240 **
3241 ** If no errors are found, non-zero is returned. If an error is found, an
3242 ** assert() fails.
3243 */
3244 int lsmFsIntegrityCheck(lsm_db *pDb){
3245   CheckFreelistCtx ctx;
3246   FileSystem *pFS = pDb->pFS;
3247   int i;
3248   int rc;
3249   Freelist freelist = {0, 0, 0};
3250   u8 *aUsed;
3251   Level *pLevel;
3252   Snapshot *pWorker = pDb->pWorker;
3253   int nBlock = pWorker->nBlock;
3254
3255 #if 0
3256   static int nCall = 0;
3257   nCall++;
3258   printf("%d calls\n", nCall);
3259 #endif
3260
3261   aUsed = lsmMallocZero(pDb->pEnv, nBlock);
3262   if( aUsed==0 ){
3263     /* Malloc has failed. Since this function is only called within debug
3264     ** builds, this probably means the user is running an OOM injection test.
3265     ** Regardless, it will not be possible to run the integrity-check at this
3266     ** time, so assume the database is Ok and return non-zero. */
3267     return 1;
3268   }
3269
3270   for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
3271     int j;
3272     checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
3273     for(j=0; j<pLevel->nRight; j++){
3274       checkBlocks(pFS, &pLevel->aRhs[j], 0, nBlock, aUsed);
3275     }
3276   }
3277
3278   /* Mark all blocks in the free-list as used */
3279   ctx.aUsed = aUsed;
3280   ctx.nBlock = nBlock;
3281   rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx);
3282
3283   if( rc==LSM_OK ){
3284     for(i=0; i<nBlock; i++) assert( aUsed[i]!=0 );
3285   }
3286
3287   lsmFree(pDb->pEnv, aUsed);
3288   lsmFree(pDb->pEnv, freelist.aEntry);
3289
3290   return 1;
3291 }
3292
3293 #ifndef NDEBUG
3294 /*
3295 ** Return true if pPg happens to be the last page in segment pSeg. Or false
3296 ** otherwise. This function is only invoked as part of assert() conditions.
3297 */
3298 int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){
3299   if( pPg->pFS->pCompress ){
3300     Pgno iNext = 0;
3301     int rc;
3302     rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext);
3303     return (rc!=LSM_OK || iNext==0);
3304   }
3305   return (pPg->iPg==pSeg->iLastPg);
3306 }
3307 #endif