ext/lsm1/lsm_file.c

   1 /*
   2 ** 2011-08-26
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 **
  13 ** NORMAL DATABASE FILE FORMAT
  14 **
  15 ** The following database file format concepts are used by the code in
  16 ** this file to read and write the database file.
  17 **
  18 ** Pages:
  19 **
  20 **   A database file is divided into pages. The first 8KB of the file consists
  21 **   of two 4KB meta-pages. The meta-page size is not configurable. The
  22 **   remainder of the file is made up of database pages. The default database
  23 **   page size is 4KB. Database pages are aligned to page-size boundaries,
  24 **   so if the database page size is larger than 8KB there is a gap between
  25 **   the end of the meta pages and the start of the database pages.
  26 **
  27 **   Database pages are numbered based on their position in the file. Page N
  28 **   begins at byte offset ((N-1)*pgsz). This means that page 1 does not
  29 **   exist - since it would always overlap with the meta pages. If the
  30 **   page-size is (say) 512 bytes, then the first usable page in the database
  31 **   is page 33.
  32 **
**   It is assumed that the first two meta pages and the data that follows
**   them are located on different disk sectors, so that if a power failure
**   occurs while writing to a meta page there is no risk of damage to the
**   other meta page or any other part of the database file. TODO: This may
**   need to be revisited.
  38 **
  39 ** Blocks:
  40 **
  41 **   The database file is also divided into blocks. The default block size is
  42 **   1MB. When writing to the database file, an attempt is made to write data
  43 **   in contiguous block-sized chunks.
  44 **
  45 **   The first and last page on each block are special in that they are 4
  46 **   bytes smaller than all other pages. This is because the last four bytes
  47 **   of space on the first and last pages of each block are reserved for
  48 **   pointers to other blocks (i.e. a 32-bit block number).
  49 **
  50 ** Runs:
  51 **
  52 **   A run is a sequence of pages that the upper layer uses to store a
  53 **   sorted array of database keys (and accompanying data - values, FC
  54 **   pointers and so on). Given a page within a run, it is possible to
  55 **   navigate to the next page in the run as follows:
  56 **
  57 **     a) if the current page is not the last in a block, the next page
  58 **        in the run is located immediately after the current page, OR
  59 **
  60 **     b) if the current page is the last page in a block, the next page
  61 **        in the run is the first page on the block identified by the
  62 **        block pointer stored in the last 4 bytes of the current block.
  63 **
  64 **   It is possible to navigate to the previous page in a similar fashion,
  65 **   using the block pointer embedded in the last 4 bytes of the first page
  66 **   of each block as required.
  67 **
  68 **   The upper layer is responsible for identifying by page number the
  69 **   first and last page of any run that it needs to navigate - there are
  70 **   no "end-of-run" markers stored or identified by this layer. This is
  71 **   necessary as clients reading different database snapshots may access
  72 **   different subsets of a run.
  73 **
  74 ** THE LOG FILE
  75 **
  76 ** This file opens and closes the log file. But it does not contain any
  77 ** logic related to the log file format. Instead, it exports the following
  78 ** functions that are used by the code in lsm_log.c to read and write the
  79 ** log file:
  80 **
  81 **     lsmFsOpenLog
  82 **     lsmFsWriteLog
  83 **     lsmFsSyncLog
  84 **     lsmFsReadLog
  85 **     lsmFsTruncateLog
  86 **     lsmFsCloseAndDeleteLog
  87 **
  88 ** COMPRESSED DATABASE FILE FORMAT
  89 **
  90 ** The compressed database file format is very similar to the normal format.
  91 ** The file still begins with two 4KB meta-pages (which are never compressed).
  92 ** It is still divided into blocks.
  93 **
  94 ** The first and last four bytes of each block are reserved for 32-bit
  95 ** pointer values. Similar to the way four bytes are carved from the end of
  96 ** the first and last page of each block in uncompressed databases. From
  97 ** the point of view of the upper layer, all pages are the same size - this
  98 ** is different from the uncompressed format where the first and last pages
  99 ** on each block are 4 bytes smaller than the others.
 100 **
 101 ** Pages are stored in variable length compressed form, as follows:
 102 **
 103 **     * 3-byte size field containing the size of the compressed page image
 104 **       in bytes. The most significant bit of each byte of the size field
 105 **       is always set. The remaining 7 bits are used to store a 21-bit
 106 **       integer value (in big-endian order - the first byte in the field
 107 **       contains the most significant 7 bits). Since the maximum allowed
 108 **       size of a compressed page image is (2^17 - 1) bytes, there are
 109 **       actually 4 unused bits in the size field.
 110 **
 111 **       In other words, if the size of the compressed page image is nSz,
 112 **       the header can be serialized as follows:
 113 **
 114 **         u8 aHdr[3]
 115 **         aHdr[0] = 0x80 | (u8)(nSz >> 14);
 116 **         aHdr[1] = 0x80 | (u8)(nSz >>  7);
 117 **         aHdr[2] = 0x80 | (u8)(nSz >>  0);
 118 **
 119 **     * Compressed page image.
 120 **
 121 **     * A second copy of the 3-byte record header.
 122 **
 123 ** A page number is a byte offset into the database file. So the smallest
 124 ** possible page number is 8192 (immediately after the two meta-pages).
 125 ** The first and root page of a segment are identified by a page number
 126 ** corresponding to the byte offset of the first byte in the corresponding
 127 ** page record. The last page of a segment is identified by the byte offset
 128 ** of the last byte in its record.
 129 **
 130 ** Unlike uncompressed pages, compressed page records may span blocks.
 131 **
 132 ** Sometimes, in order to avoid touching sectors that contain synced data
 133 ** when writing, it is necessary to insert unused space between compressed
 134 ** page records. This can be done as follows:
 135 **
 136 **     * For less than 6 bytes of empty space, the first and last byte
 137 **       of the free space contain the total number of free bytes. For
 138 **       example:
 139 **
 140 **         Block of 4 free bytes: 0x04 0x?? 0x?? 0x04
 141 **         Block of 2 free bytes: 0x02 0x02
 142 **         A single free byte:    0x01
 143 **
 144 **     * For 6 or more bytes of empty space, a record similar to a
 145 **       compressed page record is added to the segment. A padding record
 146 **       is distinguished from a compressed page record by the most
 147 **       significant bit of the second byte of the size field, which is
 148 **       cleared instead of set.
 149 */
 150 #include "lsmInt.h"
 151
 152 #include <sys/types.h>
 153 #include <sys/stat.h>
 154 #include <fcntl.h>
 155
 156 /*
 157 ** File-system object. Each database connection allocates a single instance
 158 ** of the following structure. It is used for all access to the database and
 159 ** log files.
 160 **
 161 ** The database file may be accessed via two methods - using mmap() or using
 162 ** read() and write() calls. In the general case both methods are used - a
 163 ** prefix of the file is mapped into memory and the remainder accessed using
 164 ** read() and write(). This is helpful when accessing very large files (or
 165 ** files that may grow very large during the lifetime of a database
 166 ** connection) on systems with 32-bit address spaces. However, it also requires
 167 ** that this object manage two distinct types of Page objects simultaneously -
 168 ** those that carry pointers to the mapped file and those that carry arrays
 169 ** populated by read() calls.
 170 **
** pFree:
**   The head of a singly-linked list containing currently unused Page
**   structures suitable for use as mmap-page handles. Connected by the
**   Page.pFreeNext pointers.
 175 **
** pMapped:
**   The head of a singly-linked list that contains all pages that currently
**   carry pointers to the mapped region. This is used if the region is
**   ever remapped - the pointers carried by existing pages can be adjusted
**   to account for the remapping. Connected by the Page.pMappedNext pointers.
 181 **
 182 ** pWaiting:
 183 **   When the upper layer wishes to append a new b-tree page to a segment,
 184 **   it allocates a Page object that carries a malloc'd block of memory -
 185 **   regardless of the mmap-related configuration. The page is not assigned
 186 **   a page number at first. When the upper layer has finished constructing
 187 **   the page contents, it calls lsmFsPagePersist() to assign a page number
 188 **   to it. At this point it is likely that N pages have been written to the
 189 **   segment, the (N+1)th page is still outstanding and the b-tree page is
 190 **   assigned page number (N+2). To avoid writing page (N+2) before page
 191 **   (N+1), the recently completed b-tree page is held in the singly linked
 192 **   list headed by pWaiting until page (N+1) has been written.
 193 **
 194 **   Function lsmFsFlushWaiting() is responsible for eventually writing
 195 **   waiting pages to disk.
 196 **
 197 ** apHash/nHash:
 198 **   Hash table used to store all Page objects that carry malloc'd arrays,
 199 **   except those b-tree pages that have not yet been assigned page numbers.
 200 **   Once they have been assigned page numbers - they are added to this
 201 **   hash table.
 202 **
 203 **   Hash table overflow chains are connected using the Page.pHashNext
 204 **   pointers.
 205 **
 206 ** pLruFirst, pLruLast:
 207 **   The first and last entries in a doubly-linked list of pages. This
 208 **   list contains all pages with malloc'd data that are present in the
 209 **   hash table and have a ref-count of zero.
 210 */
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  /* zDb and zLog are stored in the same allocation as this structure,
  ** immediately after it (see lsmFsOpen()). They are not freed separately. */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Log file name (zDb with "-log" appended) */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nMetaRwSize;                /* Read/written size of meta pages in bytes */
  i64 nPagesize;                  /* Database page-size in bytes */
  i64 nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;              /* Used after lsm_close() to link into list */
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file (0 if not currently open) */
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};
 255
 256 /*
 257 ** Database page handle.
 258 **
 259 ** pSeg:
 260 **   When lsmFsSortedAppend() is called on a compressed database, the new
 261 **   page is not assigned a page number or location in the database file
 262 **   immediately. Instead, these are assigned by the lsmFsPagePersist() call
 263 **   right before it writes the compressed page image to disk.
 264 **
 265 **   The lsmFsSortedAppend() function sets the pSeg pointer to point to the
 266 **   segment that the new page will be a part of. It is unset by
 267 **   lsmFsPagePersist() after the page is written to disk.
 268 */
struct Page {
  u8 *aData;                      /* Buffer containing page data */
  int nData;                      /* Bytes of usable data at aData[] */
  LsmPgno iPg;                    /* Page number */
  int nRef;                       /* Number of outstanding references */
  int flags;                      /* Combination of PAGE_XXX flags */
  Page *pHashNext;                /* Next page in hash table slot */
  /* pLruNext/pLruPrev form the doubly-linked list whose ends are
  ** FileSystem.pLruFirst and FileSystem.pLruLast. */
  Page *pLruNext;                 /* Next page in LRU list */
  Page *pLruPrev;                 /* Previous page in LRU list */
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};
 290
/*
** Meta-data page handle. There are two meta-data pages at the start of
** the database file, each FileSystem.nMetasize bytes in size. A handle
** identifies one of the two pages by number and carries a pointer to a
** buffer for that page's content.
*/
struct MetaPage {
  int iPg;                        /* Either 1 or 2 */
  int bWrite;                     /* Write back to db file on release */
  u8 *aData;                      /* Pointer to buffer */
  FileSystem *pFS;                /* FileSystem that owns this page */
};
 301
/*
** Values for Page.flags. The field holds a bitwise combination of these.
*/
#define PAGE_DIRTY   0x00000001   /* Set if page is dirty */
#define PAGE_FREE    0x00000002   /* Set if Page.aData requires lsmFree() */
#define PAGE_HASPREV 0x00000004   /* Set if page is first on uncomp. block */
 308
/*
** Number of pgsz byte pages omitted from the start of block 1. The start
** of block 1 contains two 4096 byte meta pages (8192 bytes in total).
**
** Argument pgsz is used as a divisor and so must be non-zero. LSM_MAX()
** is defined outside this file; assuming it yields the larger of its two
** arguments, the result is never less than 1, even when pgsz is larger
** than 8192.
*/
#define BLOCK1_HDR_SIZE(pgsz)  LSM_MAX(1, 8192/(pgsz))
 314
 315 /*
 316 ** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt()
 317 ** to catch IO errors (any error returned by a VFS method).
 318 */
 319 #ifndef NDEBUG
 320 static void lsmIoerrBkpt(void){
 321   static int nErr = 0;
 322   nErr++;
 323 }
 324 static int IOERR_WRAPPER(int rc){
 325   if( rc!=LSM_OK ) lsmIoerrBkpt();
 326   return rc;
 327 }
 328 #else
 329 # define IOERR_WRAPPER(rc) (rc)
 330 #endif
 331
/*
** Debugging aid intended to verify the consistency of the LRU list of
** file-system object pFS. If NDEBUG is defined it is a macro that expands
** to nothing. Otherwise it is a function - but note that the body of the
** function is currently disabled using "#if 0", so that calling it has
** no effect in any build.
*/
#ifdef NDEBUG
# define assert_lists_are_ok(x)
#else
/* Forward declaration, required by the (disabled) checks below. */
static Page *fsPageFindInHash(FileSystem *pFS, LsmPgno iPg, int *piHash);

static void assert_lists_are_ok(FileSystem *pFS){
#if 0
  Page *p;

  assert( pFS->nMapLimit>=0 );

  /* Check that all pages in the LRU list have nRef==0, pointers to buffers
  ** in heap memory, and corresponding entries in the hash table.  */
  for(p=pFS->pLruFirst; p; p=p->pLruNext){
    assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
    assert( p==pFS->pLruLast || p->pLruNext!=0 );
    assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
    assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
    assert( p->nRef==0 );
    assert( p->flags & PAGE_FREE );
    assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
  }
#endif
}
#endif
 357
 358 /*
 359 ** Wrappers around the VFS methods of the lsm_env object:
 360 **
 361 **     lsmEnvOpen()
 362 **     lsmEnvRead()
 363 **     lsmEnvWrite()
 364 **     lsmEnvSync()
 365 **     lsmEnvSectorSize()
 366 **     lsmEnvClose()
 367 **     lsmEnvTruncate()
 368 **     lsmEnvUnlink()
 369 **     lsmEnvRemap()
 370 */
 371 int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){
 372   return pEnv->xOpen(pEnv, zFile, flags, ppNew);
 373 }
 374
 375 static int lsmEnvRead(
 376   lsm_env *pEnv,
 377   lsm_file *pFile,
 378   lsm_i64 iOff,
 379   void *pRead,
 380   int nRead
 381 ){
 382   return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) );
 383 }
 384
 385 static int lsmEnvWrite(
 386   lsm_env *pEnv,
 387   lsm_file *pFile,
 388   lsm_i64 iOff,
 389   const void *pWrite,
 390   int nWrite
 391 ){
 392   return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) );
 393 }
 394
 395 static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
 396   return IOERR_WRAPPER( pEnv->xSync(pFile) );
 397 }
 398
 399 static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
 400   return pEnv->xSectorSize(pFile);
 401 }
 402
 403 int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
 404   return IOERR_WRAPPER( pEnv->xClose(pFile) );
 405 }
 406
 407 static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
 408   return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) );
 409 }
 410
 411 static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
 412   return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) );
 413 }
 414
 415 static int lsmEnvRemap(
 416   lsm_env *pEnv,
 417   lsm_file *pFile,
 418   i64 szMin,
 419   void **ppMap,
 420   i64 *pszMap
 421 ){
 422   return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
 423 }
 424
 425 int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
 426   if( pFile==0 ) return LSM_OK;
 427   return pEnv->xLock(pFile, iLock, eLock);
 428 }
 429
 430 int lsmEnvTestLock(
 431   lsm_env *pEnv,
 432   lsm_file *pFile,
 433   int iLock,
 434   int nLock,
 435   int eLock
 436 ){
 437   return pEnv->xTestLock(pFile, iLock, nLock, eLock);
 438 }
 439
 440 int lsmEnvShmMap(
 441   lsm_env *pEnv,
 442   lsm_file *pFile,
 443   int iChunk,
 444   int sz,
 445   void **ppOut
 446 ){
 447   return pEnv->xShmMap(pFile, iChunk, sz, ppOut);
 448 }
 449
 450 void lsmEnvShmBarrier(lsm_env *pEnv){
 451   pEnv->xShmBarrier();
 452 }
 453
 454 void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
 455   pEnv->xShmUnmap(pFile, bDel);
 456 }
 457
 458 void lsmEnvSleep(lsm_env *pEnv, int nUs){
 459   pEnv->xSleep(pEnv, nUs);
 460 }
 461
 462
 463 /*
 464 ** Write the contents of string buffer pStr into the log file, starting at
 465 ** offset iOff.
 466 */
 467 int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
 468   assert( pFS->fdLog );
 469   return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
 470 }
 471
 472 /*
 473 ** fsync() the log file.
 474 */
 475 int lsmFsSyncLog(FileSystem *pFS){
 476   assert( pFS->fdLog );
 477   return lsmEnvSync(pFS->pEnv, pFS->fdLog);
 478 }
 479
 480 /*
 481 ** Read nRead bytes of data starting at offset iOff of the log file. Append
 482 ** the results to string buffer pStr.
 483 */
 484 int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
 485   int rc;                         /* Return code */
 486   assert( pFS->fdLog );
 487   rc = lsmStringExtend(pStr, nRead);
 488   if( rc==LSM_OK ){
 489     rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
 490     pStr->n += nRead;
 491   }
 492   return rc;
 493 }
 494
 495 /*
 496 ** Truncate the log file to nByte bytes in size.
 497 */
 498 int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){
 499   if( pFS->fdLog==0 ) return LSM_OK;
 500   return lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte);
 501 }
 502
 503 /*
 504 ** Truncate the db file to nByte bytes in size.
 505 */
 506 int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){
 507   if( pFS->fdDb==0 ) return LSM_OK;
 508   return lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte);
 509 }
 510
 511 /*
 512 ** Close the log file. Then delete it from the file-system. This function
 513 ** is called during database shutdown only.
 514 */
 515 int lsmFsCloseAndDeleteLog(FileSystem *pFS){
 516   char *zDel;
 517
 518   if( pFS->fdLog ){
 519     lsmEnvClose(pFS->pEnv, pFS->fdLog );
 520     pFS->fdLog = 0;
 521   }
 522
 523   zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
 524   if( zDel ){
 525     lsmEnvUnlink(pFS->pEnv, zDel);
 526     lsmFree(pFS->pEnv, zDel);
 527   }
 528   return LSM_OK;
 529 }
 530
 531 /*
 532 ** Return true if page iReal of the database should be accessed using mmap.
 533 ** False otherwise.
 534 */
 535 static int fsMmapPage(FileSystem *pFS, LsmPgno iReal){
 536   return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit);
 537 }
 538
 539 /*
 540 ** Given that there are currently nHash slots in the hash table, return
 541 ** the hash key for file iFile, page iPg.
 542 */
 543 static int fsHashKey(int nHash, LsmPgno iPg){
 544   return (iPg % nHash);
 545 }
 546
 547 /*
 548 ** This is a helper function for lsmFsOpen(). It opens a single file on
 549 ** disk (either the database or log file).
 550 */
 551 static lsm_file *fsOpenFile(
 552   FileSystem *pFS,                /* File system object */
 553   int bReadonly,                  /* True to open this file read-only */
 554   int bLog,                       /* True for log, false for db */
 555   int *pRc                        /* IN/OUT: Error code */
 556 ){
 557   lsm_file *pFile = 0;
 558   if( *pRc==LSM_OK ){
 559     int flags = (bReadonly ? LSM_OPEN_READONLY : 0);
 560     const char *zPath = (bLog ? pFS->zLog : pFS->zDb);
 561
 562     *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile);
 563   }
 564   return pFile;
 565 }
 566
 567 /*
 568 ** If it is not already open, this function opens the log file. It returns
 569 ** LSM_OK if successful (or if the log file was already open) or an LSM
 570 ** error code otherwise.
 571 **
 572 ** The log file must be opened before any of the following may be called:
 573 **
 574 **     lsmFsWriteLog
 575 **     lsmFsSyncLog
 576 **     lsmFsReadLog
 577 */
 578 int lsmFsOpenLog(lsm_db *db, int *pbOpen){
 579   int rc = LSM_OK;
 580   FileSystem *pFS = db->pFS;
 581
 582   if( 0==pFS->fdLog ){
 583     pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc);
 584
 585     if( rc==LSM_IOERR_NOENT && db->bReadonly ){
 586       rc = LSM_OK;
 587     }
 588   }
 589
 590   if( pbOpen ) *pbOpen = (pFS->fdLog!=0);
 591   return rc;
 592 }
 593
 594 /*
 595 ** Close the log file, if it is open.
 596 */
 597 void lsmFsCloseLog(lsm_db *db){
 598   FileSystem *pFS = db->pFS;
 599   if( pFS->fdLog ){
 600     lsmEnvClose(pFS->pEnv, pFS->fdLog);
 601     pFS->fdLog = 0;
 602   }
 603 }
 604
 605 /*
 606 ** Open a connection to a database stored within the file-system.
 607 **
 608 ** If parameter bReadonly is true, then open a read-only file-descriptor
 609 ** on the database file. It is possible that bReadonly will be false even
 610 ** if the user requested that pDb be opened read-only. This is because the
 611 ** file-descriptor may later on be recycled by a read-write connection.
 612 ** If the db file can be opened for read-write access, it always is. Parameter
 613 ** bReadonly is only ever true if it has already been determined that the
 614 ** db can only be opened for read-only access.
 615 **
 616 ** Return LSM_OK if successful or an lsm error code otherwise.
 617 */
 618 int lsmFsOpen(
 619   lsm_db *pDb,                    /* Database connection to open fd for */
 620   const char *zDb,                /* Full path to database file */
 621   int bReadonly                   /* True to open db file read-only */
 622 ){
 623   FileSystem *pFS;
 624   int rc = LSM_OK;
 625   int nDb = strlen(zDb);
 626   int nByte;
 627
 628   assert( pDb->pFS==0 );
 629   assert( pDb->pWorker==0 && pDb->pClient==0 );
 630
 631   nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
 632   pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
 633   if( pFS ){
 634     LsmFile *pLsmFile;
 635     pFS->zDb = (char *)&pFS[1];
 636     pFS->zLog = &pFS->zDb[nDb+1];
 637     pFS->nPagesize = LSM_DFLT_PAGE_SIZE;
 638     pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE;
 639     pFS->nMetasize = LSM_META_PAGE_SIZE;
 640     pFS->nMetaRwSize = LSM_META_RW_PAGE_SIZE;
 641     pFS->pDb = pDb;
 642     pFS->pEnv = pDb->pEnv;
 643
 644     /* Make a copy of the database and log file names. */
 645     memcpy(pFS->zDb, zDb, nDb+1);
 646     memcpy(pFS->zLog, zDb, nDb);
 647     memcpy(&pFS->zLog[nDb], "-log", 5);
 648
 649     /* Allocate the hash-table here. At some point, it should be changed
 650     ** so that it can grow dynamicly. */
 651     pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 652     pFS->nHash = 4096;
 653     pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
 654
 655     /* Open the database file */
 656     pLsmFile = lsmDbRecycleFd(pDb);
 657     if( pLsmFile ){
 658       pFS->pLsmFile = pLsmFile;
 659       pFS->fdDb = pLsmFile->pFile;
 660       memset(pLsmFile, 0, sizeof(LsmFile));
 661     }else{
 662       pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
 663       if( rc==LSM_OK ){
 664         pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc);
 665       }
 666     }
 667
 668     if( rc!=LSM_OK ){
 669       lsmFsClose(pFS);
 670       pFS = 0;
 671     }else{
 672       pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb);
 673     }
 674   }
 675
 676   pDb->pFS = pFS;
 677   return rc;
 678 }
 679
 680 /*
 681 ** Configure the file-system object according to the current values of
 682 ** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options.
 683 */
 684 int lsmFsConfigure(lsm_db *db){
 685   FileSystem *pFS = db->pFS;
 686   if( pFS ){
 687     lsm_env *pEnv = pFS->pEnv;
 688     Page *pPg;
 689
 690     assert( pFS->nOut==0 );
 691     assert( pFS->pWaiting==0 );
 692     assert( pFS->pMapped==0 );
 693
 694     /* Reset any compression/decompression buffers already allocated */
 695     lsmFree(pEnv, pFS->aIBuffer);
 696     lsmFree(pEnv, pFS->aOBuffer);
 697     pFS->nBuffer = 0;
 698
 699     /* Unmap the file, if it is currently mapped */
 700     if( pFS->pMap ){
 701       lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
 702       pFS->nMapLimit = 0;
 703     }
 704
 705     /* Free all allocated page structures */
 706     pPg = pFS->pLruFirst;
 707     while( pPg ){
 708       Page *pNext = pPg->pLruNext;
 709       assert( pPg->flags & PAGE_FREE );
 710       lsmFree(pEnv, pPg->aData);
 711       lsmFree(pEnv, pPg);
 712       pPg = pNext;
 713     }
 714
 715     pPg = pFS->pFree;
 716     while( pPg ){
 717       Page *pNext = pPg->pFreeNext;
 718       lsmFree(pEnv, pPg);
 719       pPg = pNext;
 720     }
 721
 722     /* Zero pointers that point to deleted page objects */
 723     pFS->nCacheAlloc = 0;
 724     pFS->pLruFirst = 0;
 725     pFS->pLruLast = 0;
 726     pFS->pFree = 0;
 727     if( pFS->apHash ){
 728       memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
 729     }
 730
 731     /* Configure the FileSystem object */
 732     if( db->compress.xCompress ){
 733       pFS->pCompress = &db->compress;
 734       pFS->nMapLimit = 0;
 735     }else{
 736       pFS->pCompress = 0;
 737       if( db->iMmap==1 ){
 738         /* Unlimited */
 739         pFS->nMapLimit = (i64)1 << 60;
 740       }else{
 741         /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */
 742         pFS->nMapLimit = (i64)db->iMmap * 1024;
 743       }
 744     }
 745   }
 746
 747   return LSM_OK;
 748 }
 749
 750 /*
 751 ** Close and destroy a FileSystem object.
 752 */
 753 void lsmFsClose(FileSystem *pFS){
 754   if( pFS ){
 755     Page *pPg;
 756     lsm_env *pEnv = pFS->pEnv;
 757
 758     assert( pFS->nOut==0 );
 759     pPg = pFS->pLruFirst;
 760     while( pPg ){
 761       Page *pNext = pPg->pLruNext;
 762       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 763       lsmFree(pEnv, pPg);
 764       pPg = pNext;
 765     }
 766
 767     pPg = pFS->pFree;
 768     while( pPg ){
 769       Page *pNext = pPg->pFreeNext;
 770       if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
 771       lsmFree(pEnv, pPg);
 772       pPg = pNext;
 773     }
 774
 775     if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
 776     if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
 777     lsmFree(pEnv, pFS->pLsmFile);
 778     lsmFree(pEnv, pFS->apHash);
 779     lsmFree(pEnv, pFS->aIBuffer);
 780     lsmFree(pEnv, pFS->aOBuffer);
 781     lsmFree(pEnv, pFS);
 782   }
 783 }
 784
 785 /*
 786 ** This function is called when closing a database handle (i.e. lsm_close())
 787 ** if there exist other connections to the same database within this process.
 788 ** In that case the file-descriptor open on the database file is not closed
 789 ** when the FileSystem object is destroyed, as this would cause any POSIX
 790 ** locks held by the other connections to be silently dropped (see "man close"
 791 ** for details). Instead, the file-descriptor is stored in a list by the
 792 ** lsm_shared.c module until it is either closed or reused.
 793 **
 794 ** This function returns a pointer to an object that can be linked into
 795 ** the list described above. The returned object now 'owns' the database
 796 ** file descriptr, so that when the FileSystem object is destroyed, it
 797 ** will not be closed.
 798 **
 799 ** This function may be called at most once in the life-time of a
 800 ** FileSystem object. The results of any operations involving the database
 801 ** file descriptor are undefined once this function has been called.
 802 **
 803 ** None of this is necessary on non-POSIX systems. But we do it anyway in
 804 ** the name of using as similar code as possible on all platforms.
 805 */
 806 LsmFile *lsmFsDeferClose(FileSystem *pFS){
 807   LsmFile *p = pFS->pLsmFile;
 808   assert( p->pNext==0 );
 809   p->pFile = pFS->fdDb;
 810   pFS->fdDb = 0;
 811   pFS->pLsmFile = 0;
 812   return p;
 813 }
 814
 815 /*
 816 ** Allocate a buffer and populate it with the output of the xFileid()
 817 ** method of the database file handle. If successful, set *ppId to point
 818 ** to the buffer and *pnId to the number of bytes in the buffer and return
 819 ** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM
 820 ** error code.
 821 */
 822 int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){
 823   lsm_env *pEnv = pDb->pEnv;
 824   FileSystem *pFS = pDb->pFS;
 825   int rc;
 826   int nId = 0;
 827   void *pId;
 828
 829   rc = pEnv->xFileid(pFS->fdDb, 0, &nId);
 830   pId = lsmMallocZeroRc(pEnv, nId, &rc);
 831   if( rc==LSM_OK ) rc = pEnv->xFileid(pFS->fdDb, pId, &nId);
 832
 833   if( rc!=LSM_OK ){
 834     lsmFree(pEnv, pId);
 835     pId = 0;
 836     nId = 0;
 837   }
 838
 839   *ppId = pId;
 840   *pnId = nId;
 841   return rc;
 842 }
 843
 844 /*
 845 ** Return the nominal page-size used by this file-system. Actual pages
 846 ** may be smaller or larger than this value.
 847 */
 848 int lsmFsPageSize(FileSystem *pFS){
 849   return pFS->nPagesize;
 850 }
 851
 852 /*
 853 ** Return the block-size used by this file-system.
 854 */
 855 int lsmFsBlockSize(FileSystem *pFS){
 856   return pFS->nBlocksize;
 857 }
 858
 859 /*
 860 ** Configure the nominal page-size used by this file-system. Actual
 861 ** pages may be smaller or larger than this value.
 862 */
 863 void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){
 864   pFS->nPagesize = nPgsz;
 865   pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
 866 }
 867
 868 /*
 869 ** Configure the block-size used by this file-system.
 870 */
 871 void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){
 872   pFS->nBlocksize = nBlocksize;
 873 }
 874
 875 /*
 876 ** Return the page number of the first page on block iBlock. Blocks are
 877 ** numbered starting from 1.
 878 **
 879 ** For a compressed database, page numbers are byte offsets. The first
 880 ** page on each block is the byte offset immediately following the 4-byte
 881 ** "previous block" pointer at the start of each block.
 882 */
 883 static LsmPgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){
 884   LsmPgno iPg;
 885   if( pFS->pCompress ){
 886     if( iBlock==1 ){
 887       iPg = pFS->nMetasize * 2 + 4;
 888     }else{
 889       iPg = pFS->nBlocksize * (LsmPgno)(iBlock-1) + 4;
 890     }
 891   }else{
 892     const i64 nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 893     if( iBlock==1 ){
 894       iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize);
 895     }else{
 896       iPg = 1 + (iBlock-1) * nPagePerBlock;
 897     }
 898   }
 899   return iPg;
 900 }
 901
 902 /*
 903 ** Return the page number of the last page on block iBlock. Blocks are
 904 ** numbered starting from 1.
 905 **
 906 ** For a compressed database, page numbers are byte offsets. The first
 907 ** page on each block is the byte offset of the byte immediately before
 908 ** the 4-byte "next block" pointer at the end of each block.
 909 */
 910 static LsmPgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){
 911   if( pFS->pCompress ){
 912     return pFS->nBlocksize * (LsmPgno)iBlock - 1 - 4;
 913   }else{
 914     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 915     return iBlock * nPagePerBlock;
 916   }
 917 }
 918
 919 /*
 920 ** Return the block number of the block that page iPg is located on.
 921 ** Blocks are numbered starting from 1.
 922 */
 923 static int fsPageToBlock(FileSystem *pFS, LsmPgno iPg){
 924   if( pFS->pCompress ){
 925     return (int)((iPg / pFS->nBlocksize) + 1);
 926   }else{
 927     return (int)(1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)));
 928   }
 929 }
 930
 931 /*
 932 ** Return true if page iPg is the last page on its block.
 933 **
 934 ** This function is only called in non-compressed database mode.
 935 */
 936 static int fsIsLast(FileSystem *pFS, LsmPgno iPg){
 937   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 938   assert( !pFS->pCompress );
 939   return ( iPg && (iPg % nPagePerBlock)==0 );
 940 }
 941
 942 /*
 943 ** Return true if page iPg is the first page on its block.
 944 **
 945 ** This function is only called in non-compressed database mode.
 946 */
 947 static int fsIsFirst(FileSystem *pFS, LsmPgno iPg){
 948   const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
 949   assert( !pFS->pCompress );
 950   return ( (iPg % nPagePerBlock)==1
 951         || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1))
 952   );
 953 }
 954
 955 /*
 956 ** Given a page reference, return a pointer to the buffer containing the
 957 ** pages contents. If parameter pnData is not NULL, set *pnData to the size
 958 ** of the buffer in bytes before returning.
 959 */
 960 u8 *lsmFsPageData(Page *pPage, int *pnData){
 961   if( pnData ){
 962     *pnData = pPage->nData;
 963   }
 964   return pPage->aData;
 965 }
 966
 967 /*
 968 ** Return the page number of a page.
 969 */
 970 LsmPgno lsmFsPageNumber(Page *pPage){
 971   /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
 972   return pPage ? pPage->iPg : 0;
 973 }
 974
 975 /*
 976 ** Page pPg is currently part of the LRU list belonging to pFS. Remove
 977 ** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
 978 ** operation.
 979 */
 980 static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){
 981   assert( pPg->pLruNext || pPg==pFS->pLruLast );
 982   assert( pPg->pLruPrev || pPg==pFS->pLruFirst );
 983   if( pPg->pLruNext ){
 984     pPg->pLruNext->pLruPrev = pPg->pLruPrev;
 985   }else{
 986     pFS->pLruLast = pPg->pLruPrev;
 987   }
 988   if( pPg->pLruPrev ){
 989     pPg->pLruPrev->pLruNext = pPg->pLruNext;
 990   }else{
 991     pFS->pLruFirst = pPg->pLruNext;
 992   }
 993   pPg->pLruPrev = 0;
 994   pPg->pLruNext = 0;
 995 }
 996
 997 /*
 998 ** Page pPg is not currently part of the LRU list belonging to pFS. Add it.
 999 */
1000 static void fsPageAddToLru(FileSystem *pFS, Page *pPg){
1001   assert( pPg->pLruNext==0 && pPg->pLruPrev==0 );
1002   pPg->pLruPrev = pFS->pLruLast;
1003   if( pPg->pLruPrev ){
1004     pPg->pLruPrev->pLruNext = pPg;
1005   }else{
1006     pFS->pLruFirst = pPg;
1007   }
1008   pFS->pLruLast = pPg;
1009 }
1010
1011 /*
1012 ** Page pPg is currently stored in the apHash/nHash hash table. Remove it.
1013 */
1014 static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
1015   int iHash;
1016   Page **pp;
1017
1018   iHash = fsHashKey(pFS->nHash, pPg->iPg);
1019   for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
1020   *pp = pPg->pHashNext;
1021   pPg->pHashNext = 0;
1022 }
1023
1024 /*
1025 ** Free a Page object allocated by fsPageBuffer().
1026 */
1027 static void fsPageBufferFree(Page *pPg){
1028   pPg->pFS->nCacheAlloc--;
1029   lsmFree(pPg->pFS->pEnv, pPg->aData);
1030   lsmFree(pPg->pFS->pEnv, pPg);
1031 }
1032
1033
1034 /*
1035 ** Purge the cache of all non-mmap pages with nRef==0.
1036 */
1037 void lsmFsPurgeCache(FileSystem *pFS){
1038   Page *pPg;
1039
1040   pPg = pFS->pLruFirst;
1041   while( pPg ){
1042     Page *pNext = pPg->pLruNext;
1043     assert( pPg->flags & PAGE_FREE );
1044     fsPageRemoveFromHash(pFS, pPg);
1045     fsPageBufferFree(pPg);
1046     pPg = pNext;
1047   }
1048   pFS->pLruFirst = 0;
1049   pFS->pLruLast = 0;
1050
1051   assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
1052 }
1053
1054 /*
1055 ** Search the hash-table for page iPg. If an entry is round, return a pointer
1056 ** to it. Otherwise, return NULL.
1057 **
1058 ** Either way, if argument piHash is not NULL set *piHash to the hash slot
1059 ** number that page iPg would be stored in before returning.
1060 */
1061 static Page *fsPageFindInHash(FileSystem *pFS, LsmPgno iPg, int *piHash){
1062   Page *p;                        /* Return value */
1063   int iHash = fsHashKey(pFS->nHash, iPg);
1064
1065   if( piHash ) *piHash = iHash;
1066   for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
1067     if( p->iPg==iPg) break;
1068   }
1069   return p;
1070 }
1071
1072 /*
1073 ** Allocate and return a non-mmap Page object. If there are already
1074 ** nCacheMax such Page objects outstanding, try to recycle an existing
1075 ** Page instead.
1076 */
1077 static int fsPageBuffer(
1078   FileSystem *pFS,
1079   Page **ppOut
1080 ){
1081   int rc = LSM_OK;
1082   Page *pPage = 0;
1083   if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
1084     /* Allocate a new Page object */
1085     pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
1086     if( !pPage ){
1087       rc = LSM_NOMEM_BKPT;
1088     }else{
1089       pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
1090       if( !pPage->aData ){
1091         lsmFree(pFS->pEnv, pPage);
1092         rc = LSM_NOMEM_BKPT;
1093         pPage = 0;
1094       }else{
1095         pFS->nCacheAlloc++;
1096       }
1097     }
1098   }else{
1099     /* Reuse an existing Page object */
1100     u8 *aData;
1101     pPage = pFS->pLruFirst;
1102     aData = pPage->aData;
1103     fsPageRemoveFromLru(pFS, pPage);
1104     fsPageRemoveFromHash(pFS, pPage);
1105
1106     memset(pPage, 0, sizeof(Page));
1107     pPage->aData = aData;
1108   }
1109
1110   if( pPage ){
1111     pPage->flags = PAGE_FREE;
1112   }
1113   *ppOut = pPage;
1114   return rc;
1115 }
1116
1117 /*
1118 ** Assuming *pRc is initially LSM_OK, attempt to ensure that the
1119 ** memory-mapped region is at least iSz bytes in size. If it is not already,
1120 ** iSz bytes in size, extend it and update the pointers associated with any
1121 ** outstanding Page objects.
1122 **
1123 ** If *pRc is not LSM_OK when this function is called, it is a no-op.
1124 ** Otherwise, *pRc is set to an lsm error code if an error occurs, or
1125 ** left unmodified otherwise.
1126 **
1127 ** This function is never called in compressed database mode.
1128 */
1129 static void fsGrowMapping(
1130   FileSystem *pFS,                /* File system object */
1131   i64 iSz,                        /* Minimum size to extend mapping to */
1132   int *pRc                        /* IN/OUT: Error code */
1133 ){
1134   assert( PAGE_HASPREV==4 );
1135
1136   if( *pRc==LSM_OK && iSz>pFS->nMap ){
1137     int rc;
1138     u8 *aOld = pFS->pMap;
1139     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
1140     if( rc==LSM_OK && pFS->pMap!=aOld ){
1141       Page *pFix;
1142       i64 iOff = (u8 *)pFS->pMap - aOld;
1143       for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){
1144         pFix->aData += iOff;
1145       }
1146       lsmSortedRemap(pFS->pDb);
1147     }
1148     *pRc = rc;
1149   }
1150 }
1151
1152 /*
1153 ** If it is mapped, unmap the database file.
1154 */
1155 int lsmFsUnmap(FileSystem *pFS){
1156   int rc = LSM_OK;
1157   if( pFS ){
1158     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
1159   }
1160   return rc;
1161 }
1162
1163 /*
1164 ** fsync() the database file.
1165 */
1166 int lsmFsSyncDb(FileSystem *pFS, int nBlock){
1167   return lsmEnvSync(pFS->pEnv, pFS->fdDb);
1168 }
1169
1170 /*
1171 ** If block iBlk has been redirected according to the redirections in the
1172 ** object passed as the first argument, return the destination block to
1173 ** which it is redirected. Otherwise, return a copy of iBlk.
1174 */
1175 static int fsRedirectBlock(Redirect *p, int iBlk){
1176   if( p ){
1177     int i;
1178     for(i=0; i<p->n; i++){
1179       if( iBlk==p->a[i].iFrom ) return p->a[i].iTo;
1180     }
1181   }
1182   assert( iBlk!=0 );
1183   return iBlk;
1184 }
1185
1186 /*
1187 ** If page iPg has been redirected according to the redirections in the
1188 ** object passed as the second argument, return the destination page to
1189 ** which it is redirected. Otherwise, return a copy of iPg.
1190 */
1191 LsmPgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, LsmPgno iPg){
1192   LsmPgno iReal = iPg;
1193
1194   if( pRedir ){
1195     const int nPagePerBlock = (
1196         pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
1197     );
1198     int iBlk = fsPageToBlock(pFS, iPg);
1199     int i;
1200     for(i=0; i<pRedir->n; i++){
1201       int iFrom = pRedir->a[i].iFrom;
1202       if( iFrom>iBlk ) break;
1203       if( iFrom==iBlk ){
1204         int iTo = pRedir->a[i].iTo;
1205         iReal = iPg - (LsmPgno)(iFrom - iTo) * nPagePerBlock;
1206         if( iTo==1 ){
1207           iReal += (fsFirstPageOnBlock(pFS, 1)-1);
1208         }
1209         break;
1210       }
1211     }
1212   }
1213
1214   assert( iReal!=0 );
1215   return iReal;
1216 }
1217
/* Forward declaration. fsBlockNext() calls fsPageGet(), while fsPageGet()
** reaches fsBlockNext() by way of fsReadPagedata() and fsReadData(), so
** fsPageGet() must be declared before its definition. */
static int fsPageGet(FileSystem *, Segment *, LsmPgno, int, Page **, int *);
1220
1221 /*
1222 ** Parameter iBlock is a database file block. This function reads the value
1223 ** stored in the blocks "next block" pointer and stores it in *piNext.
1224 ** LSM_OK is returned if everything is successful, or an LSM error code
1225 ** otherwise.
1226 */
1227 static int fsBlockNext(
1228   FileSystem *pFS,                /* File-system object handle */
1229   Segment *pSeg,                  /* Use this segment for block redirects */
1230   int iBlock,                     /* Read field from this block */
1231   int *piNext                     /* OUT: Next block in linked list */
1232 ){
1233   int rc;
1234   int iRead;                      /* Read block from here */
1235
1236   if( pSeg ){
1237     iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
1238   }else{
1239     iRead = iBlock;
1240   }
1241
1242   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1243   if( pFS->pCompress ){
1244     i64 iOff;                     /* File offset to read data from */
1245     u8 aNext[4];                  /* 4-byte pointer read from db file */
1246
1247     iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
1248     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
1249     if( rc==LSM_OK ){
1250       *piNext = (int)lsmGetU32(aNext);
1251     }
1252   }else{
1253     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
1254     Page *pLast;
1255     rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0);
1256     if( rc==LSM_OK ){
1257       *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
1258       lsmFsPageRelease(pLast);
1259     }
1260   }
1261
1262   if( pSeg ){
1263     *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext);
1264   }
1265   return rc;
1266 }
1267
1268 /*
1269 ** Return the page number of the last page on the same block as page iPg.
1270 */
1271 LsmPgno fsLastPageOnPagesBlock(FileSystem *pFS, LsmPgno iPg){
1272   return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg));
1273 }
1274
1275 /*
1276 ** Read nData bytes of data from offset iOff of the database file into
1277 ** buffer aData. If this means reading past the end of a block, follow
1278 ** the block pointer to the next block and continue reading.
1279 **
1280 ** Offset iOff is an absolute offset - not subject to any block redirection.
1281 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1282 **
1283 ** This function is only called in compressed database mode.
1284 */
1285 static int fsReadData(
1286   FileSystem *pFS,                /* File-system handle */
1287   Segment *pSeg,                  /* Block redirection */
1288   i64 iOff,                       /* Read data from this offset */
1289   u8 *aData,                      /* Buffer to read data into */
1290   int nData                       /* Number of bytes to read */
1291 ){
1292   i64 iEob;                       /* End of block */
1293   int nRead;
1294   int rc;
1295
1296   assert( pFS->pCompress );
1297
1298   iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1;
1299   nRead = (int)LSM_MIN(iEob - iOff, nData);
1300
1301   rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead);
1302   if( rc==LSM_OK && nRead!=nData ){
1303     int iBlk;
1304
1305     rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1306     if( rc==LSM_OK ){
1307       i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk);
1308       rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead);
1309     }
1310   }
1311
1312   return rc;
1313 }
1314
1315 /*
1316 ** Parameter iBlock is a database file block. This function reads the value
1317 ** stored in the blocks "previous block" pointer and stores it in *piPrev.
1318 ** LSM_OK is returned if everything is successful, or an LSM error code
1319 ** otherwise.
1320 */
1321 static int fsBlockPrev(
1322   FileSystem *pFS,                /* File-system object handle */
1323   Segment *pSeg,                  /* Use this segment for block redirects */
1324   int iBlock,                     /* Read field from this block */
1325   int *piPrev                     /* OUT: Previous block in linked list */
1326 ){
1327   int rc = LSM_OK;                /* Return code */
1328
1329   assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
1330   assert( iBlock>0 );
1331
1332   if( pFS->pCompress ){
1333     i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
1334     u8 aPrev[4];                  /* 4-byte pointer read from db file */
1335     rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
1336     if( rc==LSM_OK ){
1337       Redirect *pRedir = (pSeg ? pSeg->pRedirect : 0);
1338       *piPrev = fsRedirectBlock(pRedir, (int)lsmGetU32(aPrev));
1339     }
1340   }else{
1341     assert( 0 );
1342   }
1343   return rc;
1344 }
1345
1346 /*
1347 ** Encode and decode routines for record size fields.
1348 */
1349 static void putRecordSize(u8 *aBuf, int nByte, int bFree){
1350   aBuf[0] = (u8)(nByte >> 14) | 0x80;
1351   aBuf[1] = ((u8)(nByte >>  7) & 0x7F) | (bFree ? 0x00 : 0x80);
1352   aBuf[2] = (u8)nByte | 0x80;
1353 }
1354 static int getRecordSize(u8 *aBuf, int *pbFree){
1355   int nByte;
1356   nByte  = (aBuf[0] & 0x7F) << 14;
1357   nByte += (aBuf[1] & 0x7F) << 7;
1358   nByte += (aBuf[2] & 0x7F);
1359   *pbFree = !(aBuf[1] & 0x80);
1360   return nByte;
1361 }
1362
1363 /*
1364 ** Subtract iSub from database file offset iOff and set *piRes to the
1365 ** result. If doing so means passing the start of a block, follow the
1366 ** block pointer stored in the first 4 bytes of the block.
1367 **
1368 ** Offset iOff is an absolute offset - not subject to any block redirection.
1369 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1370 **
1371 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1372 */
1373 static int fsSubtractOffset(
1374   FileSystem *pFS,
1375   Segment *pSeg,
1376   i64 iOff,
1377   int iSub,
1378   i64 *piRes
1379 ){
1380   i64 iStart;
1381   int iBlk = 0;
1382   int rc;
1383
1384   assert( pFS->pCompress );
1385
1386   iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff));
1387   if( (iOff-iSub)>=iStart ){
1388     *piRes = (iOff-iSub);
1389     return LSM_OK;
1390   }
1391
1392   rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1393   *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1);
1394   return rc;
1395 }
1396
1397 /*
1398 ** Add iAdd to database file offset iOff and set *piRes to the
1399 ** result. If doing so means passing the end of a block, follow the
1400 ** block pointer stored in the last 4 bytes of the block.
1401 **
1402 ** Offset iOff is an absolute offset - not subject to any block redirection.
1403 ** However any block pointer followed is. Use pSeg->pRedirect in this case.
1404 **
1405 ** Return LSM_OK if successful or an lsm error code if an error occurs.
1406 */
1407 static int fsAddOffset(
1408   FileSystem *pFS,
1409   Segment *pSeg,
1410   i64 iOff,
1411   int iAdd,
1412   i64 *piRes
1413 ){
1414   i64 iEob;
1415   int iBlk;
1416   int rc;
1417
1418   assert( pFS->pCompress );
1419
1420   iEob = fsLastPageOnPagesBlock(pFS, iOff);
1421   if( (iOff+iAdd)<=iEob ){
1422     *piRes = (iOff+iAdd);
1423     return LSM_OK;
1424   }
1425
1426   rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
1427   *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
1428   return rc;
1429 }
1430
1431 /*
1432 ** If it is not already allocated, allocate either the FileSystem.aOBuffer (if
1433 ** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return
1434 ** LSM_OK if successful if the attempt to allocate memory fails.
1435 */
1436 static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
1437   u8 **pp;                        /* Pointer to either aIBuffer or aOBuffer */
1438
1439   assert( pFS->pCompress );
1440
1441   /* If neither buffer has been allocated, figure out how large they
1442   ** should be. Store this value in FileSystem.nBuffer.  */
1443   if( pFS->nBuffer==0 ){
1444     assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
1445     pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
1446     if( pFS->nBuffer<(pFS->szSector+6) ){
1447       pFS->nBuffer = pFS->szSector+6;
1448     }
1449   }
1450
1451   pp = (bWrite ? &pFS->aOBuffer : &pFS->aIBuffer);
1452   if( *pp==0 ){
1453     *pp = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
1454     if( *pp==0 ) return LSM_NOMEM_BKPT;
1455   }
1456
1457   return LSM_OK;
1458 }
1459
1460 /*
1461 ** This function is only called in compressed database mode. It reads and
1462 ** uncompresses the compressed data for page pPg from the database and
1463 ** populates the pPg->aData[] buffer and pPg->nCompress field.
1464 **
1465 ** It is possible that instead of a page record, there is free space
1466 ** at offset pPg->iPgno. In this case no data is read from the file, but
1467 ** output variable *pnSpace is set to the total number of free bytes.
1468 **
1469 ** LSM_OK is returned if successful, or an LSM error code otherwise.
1470 */
1471 static int fsReadPagedata(
1472   FileSystem *pFS,                /* File-system handle */
1473   Segment *pSeg,                  /* pPg is part of this segment */
1474   Page *pPg,                      /* Page to read and uncompress data for */
1475   int *pnSpace                    /* OUT: Total bytes of free space */
1476 ){
1477   lsm_compress *p = pFS->pCompress;
1478   i64 iOff = pPg->iPg;
1479   u8 aSz[3];
1480   int rc;
1481
1482   assert( p && pPg->nCompress==0 );
1483
1484   if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;
1485
1486   rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
1487
1488   if( rc==LSM_OK ){
1489     int bFree;
1490     if( aSz[0] & 0x80 ){
1491       pPg->nCompress = (int)getRecordSize(aSz, &bFree);
1492     }else{
1493       pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
1494       bFree = 1;
1495     }
1496     if( bFree ){
1497       if( pnSpace ){
1498         *pnSpace = pPg->nCompress + sizeof(aSz)*2;
1499       }else{
1500         rc = LSM_CORRUPT_BKPT;
1501       }
1502     }else{
1503       rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);
1504       if( rc==LSM_OK ){
1505         if( pPg->nCompress>pFS->nBuffer ){
1506           rc = LSM_CORRUPT_BKPT;
1507         }else{
1508           rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
1509         }
1510         if( rc==LSM_OK ){
1511           int n = pFS->nPagesize;
1512           rc = p->xUncompress(p->pCtx,
1513               (char *)pPg->aData, &n,
1514               (const char *)pFS->aIBuffer, pPg->nCompress
1515           );
1516           if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){
1517             rc = LSM_CORRUPT_BKPT;
1518           }
1519         }
1520       }
1521     }
1522   }
1523   return rc;
1524 }
1525
/*
** Return a handle for a database page.
**
** The page is looked up under its post-redirection page number. If it is
** already in the page-cache hash table, the existing Page object is
** reused. Otherwise a Page is created: either one that points directly
** into the memory mapping (when fsMmapPage() returns true for the page)
** or a heap-buffered one from fsPageBuffer(), whose content is read from
** the file unless noContent is true.
**
** If this file-system object is accessing a compressed database it may be
** that there is no page record at database file offset iPg. Instead, there
** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
** to the total number of free bytes before returning.
**
** On success the reference count of the returned page is incremented and
** Page.nData is set to the number of usable bytes on the page.
**
** If no error occurs, LSM_OK is returned. Otherwise, an lsm error code is
** returned and *ppPg is set to NULL.
*/
static int fsPageGet(
  FileSystem *pFS,                /* File-system handle */
  Segment *pSeg,                  /* Block redirection to use (or NULL) */
  LsmPgno iPg,                    /* Page id */
  int noContent,                  /* True to not load content from disk */
  Page **ppPg,                    /* OUT: New page handle */
  int *pnSpace                    /* OUT: Bytes of free space */
){
  Page *p;
  int iHash;
  int rc = LSM_OK;

  /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is
  ** not NULL, and the block containing iPg has been redirected, then iReal
  ** is the page number after redirection.  */
  LsmPgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);

  assert_lists_are_ok(pFS);
  assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
  assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
  *ppPg = 0;

  /* Search the hash-table for the page */
  p = fsPageFindInHash(pFS, iReal, &iHash);

  if( p ){
    /* Cache hit. A page with no outstanding references is on the LRU
    ** list; take it off, as it is about to be referenced again. */
    assert( p->flags & PAGE_FREE );
    if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
  }else{

    if( fsMmapPage(pFS, iReal) ){
      /* Memory-mapped page. Make sure the mapping extends to the end of
      ** page iReal before taking a pointer into it. */
      i64 iEnd = (i64)iReal * pFS->nPagesize;
      fsGrowMapping(pFS, iEnd, &rc);
      if( rc!=LSM_OK ) return rc;

      /* Reuse a Page object from the FileSystem.pFree list if there is
      ** one. Otherwise, allocate a new zeroed object. */
      if( pFS->pFree ){
        p = pFS->pFree;
        pFS->pFree = p->pFreeNext;
        assert( p->nRef==0 );
      }else{
        p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
        if( rc ) return rc;
        p->pFS = pFS;
      }
      p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
      p->iPg = iReal;

      /* This page now carries a pointer to the mapping. Link it in to
      ** the FileSystem.pMapped list.  */
      assert( p->pMappedNext==0 );
      p->pMappedNext = pFS->pMapped;
      pFS->pMapped = p;

      assert( pFS->pCompress==0 );
      assert( (p->flags & PAGE_FREE)==0 );
    }else{
      /* Heap-buffered page. Obtain a Page object with its own buffer. */
      rc = fsPageBuffer(pFS, &p);
      if( rc==LSM_OK ){
        int nSpace = 0;
        p->iPg = iReal;
        p->nRef = 0;
        p->pFS = pFS;
        assert( p->flags==0 || p->flags==PAGE_FREE );

#ifdef LSM_DEBUG
        memset(p->aData, 0x56, pFS->nPagesize);
#endif
        assert( p->pLruNext==0 && p->pLruPrev==0 );
        if( noContent==0 ){
          if( pFS->pCompress ){
            /* nSpace is set to non-zero if a free-space record is found
            ** instead of a page record. */
            rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
          }else{
            int nByte = pFS->nPagesize;
            i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
            rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
          }
          pFS->nRead++;
        }

        /* If the xRead() call was successful (or not attempted), link the
        ** page into the page-cache hash-table. Otherwise, if it failed,
        ** free the buffer. The buffer is also freed if free space was
        ** found where the page was expected, in which case the amount is
        ** reported via *pnSpace. */
        if( rc==LSM_OK && nSpace==0 ){
          p->pHashNext = pFS->apHash[iHash];
          pFS->apHash[iHash] = p;
        }else{
          fsPageBufferFree(p);
          p = 0;
          if( pnSpace ) *pnSpace = nSpace;
        }
      }
    }

    assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
         || (rc!=LSM_OK && p==0)
    );
  }

  if( rc==LSM_OK && p ){
    /* In non-compressed mode, the first and last pages of each block have
    ** 4 fewer usable bytes than other pages. For the first page on a block
    ** the data pointer is advanced past the leading 4 bytes and the
    ** PAGE_HASPREV flag set. This is done only when the page has no
    ** existing references, so the pointer is not adjusted a second time
    ** for a page that is already in use. */
    if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
      p->nData = pFS->nPagesize - 4;
      if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
        p->aData += 4;
        p->flags |= PAGE_HASPREV;
      }
    }else{
      p->nData = pFS->nPagesize;
    }
    /* FileSystem.nOut is incremented only when the page goes from zero
    ** references to one. */
    pFS->nOut += (p->nRef==0);
    p->nRef++;
  }
  *ppPg = p;
  return rc;
}
1650
1651 /*
1652 ** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
1653 ** page iMeta of the database file. If no error occurs, store the id value
1654 ** in *piVal and return LSM_OK. Otherwise, return an LSM error code and leave
1655 ** *piVal unmodified.
1656 **
1657 ** If a checkpointer connection is currently updating meta-page iMeta, or an
1658 ** earlier checkpointer crashed while doing so, the value read into *piVal
1659 ** may be garbage. It is the callers responsibility to deal with this.
1660 */
1661 int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
1662   FileSystem *pFS = db->pFS;
1663   int rc = LSM_OK;
1664
1665   assert( iMeta==1 || iMeta==2 );
1666   if( pFS->nMapLimit>0 ){
1667     fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
1668     if( rc==LSM_OK ){
1669       *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
1670     }
1671   }else{
1672     MetaPage *pMeta = 0;
1673     rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
1674     if( rc==LSM_OK ){
1675       *piVal = (i64)lsmGetU64(pMeta->aData);
1676       lsmFsMetaPageRelease(pMeta);
1677     }
1678   }
1679
1680   return rc;
1681 }
1682
1683
1684 /*
1685 ** Return true if the first or last page of segment pRun falls between iFirst
1686 ** and iLast, inclusive, and pRun is not equal to pIgnore.
1687 */
1688 static int fsRunEndsBetween(
1689   Segment *pRun,
1690   Segment *pIgnore,
1691   LsmPgno iFirst,
1692   LsmPgno iLast
1693 ){
1694   return (pRun!=pIgnore && (
1695         (pRun->iFirst>=iFirst && pRun->iFirst<=iLast)
1696      || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast)
1697   ));
1698 }
1699
1700 /*
1701 ** Return true if level pLevel contains a segment other than pIgnore for
1702 ** which the first or last page is between iFirst and iLast, inclusive.
1703 */
1704 static int fsLevelEndsBetween(
1705   Level *pLevel,
1706   Segment *pIgnore,
1707   LsmPgno iFirst,
1708   LsmPgno iLast
1709 ){
1710   int i;
1711
1712   if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
1713     return 1;
1714   }
1715   for(i=0; i<pLevel->nRight; i++){
1716     if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
1717       return 1;
1718     }
1719   }
1720
1721   return 0;
1722 }
1723
1724 /*
1725 ** Block iBlk is no longer in use by segment pIgnore. If it is not in use
1726 ** by any other segment, move it to the free block list.
1727 */
1728 static int fsFreeBlock(
1729   FileSystem *pFS,                /* File system object */
1730   Snapshot *pSnapshot,            /* Worker snapshot */
1731   Segment *pIgnore,               /* Ignore this run when searching */
1732   int iBlk                        /* Block number of block to free */
1733 ){
1734   int rc = LSM_OK;                /* Return code */
1735   LsmPgno iFirst;                 /* First page on block iBlk */
1736   LsmPgno iLast;                  /* Last page on block iBlk */
1737   Level *pLevel;                  /* Used to iterate through levels */
1738
1739   int iIn;                        /* Used to iterate through append points */
1740   int iOut = 0;                   /* Used to output append points */
1741   LsmPgno *aApp = pSnapshot->aiAppend;
1742
1743   iFirst = fsFirstPageOnBlock(pFS, iBlk);
1744   iLast = fsLastPageOnBlock(pFS, iBlk);
1745
1746   /* Check if any other run in the snapshot has a start or end page
1747   ** within this block. If there is such a run, return early. */
1748   for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
1749     if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
1750       return LSM_OK;
1751     }
1752   }
1753
1754   /* Remove any entries that lie on this block from the append-list. */
1755   for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
1756     if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
1757       aApp[iOut++] = aApp[iIn];
1758     }
1759   }
1760   while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
1761
1762   if( rc==LSM_OK ){
1763     rc = lsmBlockFree(pFS->pDb, iBlk);
1764   }
1765   return rc;
1766 }
1767
1768 /*
1769 ** Delete or otherwise recycle the blocks currently occupied by run pDel.
1770 ** A no-op if pDel->iFirst is zero. This function always returns LSM_OK. */
1771 int lsmFsSortedDelete(
1772   FileSystem *pFS,                /* File system object */
1773   Snapshot *pSnapshot,            /* Snapshot handed to fsFreeBlock() */
1774   int bZero,                      /* True to zero the Segment structure */
1775   Segment *pDel                   /* Run whose blocks are to be freed */
1776 ){
1777   if( pDel->iFirst ){
1778     int rc = LSM_OK;
1779
1780     int iBlk;                     /* Block currently being processed */
1781     int iLastBlk;                 /* Block that holds pDel->iLastPg */
1782
1783     iBlk = fsPageToBlock(pFS, pDel->iFirst);
1784     iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
1785
1786     /* Mark all blocks currently used by this sorted run as free */
1787     while( iBlk && rc==LSM_OK ){
1788       int iNext = 0;              /* Stays 0 on the last block: ends loop */
1789       if( iBlk!=iLastBlk ){
1790         rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
1791       }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
1792         break;                    /* Run ends mid-block: do not free it */
1793       }
1794       rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);  /* replaces prior rc */
1795       iBlk = iNext;
1796     }
1797
1798     if( pDel->pRedirect ){
1799       assert( pDel->pRedirect==&pSnapshot->redirect );
1800       pSnapshot->redirect.n = 0;
1801     }
1802
1803     if( bZero ) memset(pDel, 0, sizeof(Segment));
1804   }
1805   return LSM_OK;                  /* NOTE(review): rc is never propagated */
1806 }
1807
1808 /*
1809 ** Scan the nPgno page numbers in aPgno[] and return the lowest one that is
1810 ** located on block iBlk. If no entry of aPgno[] is located on block iBlk,
1811 ** the return value is 0.
1812 */
1813 static LsmPgno firstOnBlock(
1814   FileSystem *pFS,
1815   int iBlk,
1816   LsmPgno *aPgno,
1817   int nPgno
1818 ){
1819   LsmPgno iMin = 0;               /* Lowest matching page found so far */
1820   int k;
1821   for(k=0; k<nPgno; k++){
1822     LsmPgno iCand = aPgno[k];
1823     if( fsPageToBlock(pFS, iCand)!=iBlk ) continue;
1824     if( iMin==0 || iCand<iMin ) iMin = iCand;
1825     /* iMin==0 doubles as the "nothing found yet" marker */
1826   }
1827   return iMin;
1828 }
1829
1830 #ifndef NDEBUG
1831 /*
1832 ** Debug-build helper: report whether page iPg of segment p is mapped to a
1833 ** different page number by the segment's redirect. Page 0 never redirects.
1834 */
1835 static int fsPageRedirects(FileSystem *pFS, Segment *p, LsmPgno iPg){
1836   return iPg==0 ? 0 : (lsmFsRedirectPage(pFS, p->pRedirect, iPg)!=iPg);
1837 }
1838
1839 /*
1840 ** Return 1 if p is not NULL and at least one of its first, root or last
1841 ** pages lies on a redirected block. Return 0 otherwise.
1842 */
1843 static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
1844   if( p==0 ) return 0;
1845   if( fsPageRedirects(pFS, p, p->iFirst) ) return 1;
1846   if( fsPageRedirects(pFS, p, p->iRoot) ) return 1;
1847   if( fsPageRedirects(pFS, p, p->iLastPg) ) return 1;
1848   return 0;
1849 }
1850 #endif
1851
1852 /*
1853 ** Argument aPgno is an array of nPgno page numbers. All pages belong to
1854 ** the segment pRun. This function gobbles from the start of the run to the
1855 ** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
1856 ** the new first page of the run). Each block stepped over is handed to
1857 ** fsFreeBlock() and pRun->nSize is reduced. No error code is returned. */
1858 void lsmFsGobble(
1859   lsm_db *pDb,                    /* Supplies pDb->pFS and pDb->pWorker */
1860   Segment *pRun,                  /* Run to trim from the front */
1861   LsmPgno *aPgno,                 /* Page numbers that belong to pRun */
1862   int nPgno                       /* Number of entries in aPgno[] */
1863 ){
1864   int rc = LSM_OK;
1865   FileSystem *pFS = pDb->pFS;
1866   Snapshot *pSnapshot = pDb->pWorker;
1867   int iBlk;                       /* Block currently being examined */
1868
1869   assert( pRun->nSize>0 );
1870   assert( 0==fsSegmentRedirects(pFS, pRun) );
1871   assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
1872   /* Count the first block as if the run began on its first page. */
1873   iBlk = fsPageToBlock(pFS, pRun->iFirst);
1874   pRun->nSize += (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1875
1876   while( rc==LSM_OK ){
1877     int iNext = 0;
1878     LsmPgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
1879     if( iFirst ){
1880       pRun->iFirst = iFirst;
1881       break;                      /* Block iBlk holds a page to keep */
1882     }
1883     rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
1884     if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
1885     pRun->nSize -= (
1886         1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
1887     );
1888     iBlk = iNext;
1889   }
1890   /* Drop the pages on block iBlk that precede the new first page. */
1891   pRun->nSize -= (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
1892   assert( pRun->nSize>0 );
1893 }
1894
1895 /*
1896 ** This function is only used in compressed database mode.
1897 **
1898 ** Argument iPg is the page number (byte offset) of a page within segment
1899 ** pSeg. The page record, including all headers, is nByte bytes in size.
1900 ** Before returning, set *piNext to the page number of the next page in
1901 ** the segment, or to zero if iPg is the last.
1902 **
1903 ** In other words, do:
1904 **
1905 **   *piNext = iPg + nByte;
1906 **
1907 ** But take block overflow and redirection into account. The return value
1908 ** is the result code of the fsAddOffset() calls made. */
1909 static int fsNextPageOffset(
1910   FileSystem *pFS,                /* File system object */
1911   Segment *pSeg,                  /* Segment to move within */
1912   LsmPgno iPg,                    /* Offset of current page */
1913   int nByte,                      /* Size of current page including headers */
1914   LsmPgno *piNext                 /* OUT: Offset of next page. Or zero (EOF) */
1915 ){
1916   LsmPgno iNext;
1917   int rc;
1918
1919   assert( pFS->pCompress );
1920   /* NOTE(review): iNext is tested below even if fsAddOffset() failed. */
1921   rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iNext);  /* last byte */
1922   if( pSeg && iNext==pSeg->iLastPg ){
1923     iNext = 0;                    /* Record ends the segment: EOF */
1924   }else if( rc==LSM_OK ){
1925     rc = fsAddOffset(pFS, pSeg, iNext, 1, &iNext);
1926   }
1927
1928   *piNext = iNext;
1929   return rc;
1930 }
1931
1932 /*
1933 ** This function is only used in compressed database mode.
1934 **
1935 ** Argument iPg is the page number of a page that appears in segment pSeg.
1936 ** This function determines the page number of the previous page in the
1937 ** same run. *piPrev is set to the previous page number before returning.
1938 ** The distance to step back is decoded from the 3 bytes preceding iPg.
1939 **
1940 ** Returns LSM_OK if no error occurs. Otherwise, an lsm error code, in which
1941 ** case the final value of *piPrev is undefined.
1942 */
1943 static int fsGetPageBefore(
1944   FileSystem *pFS,                /* File system object */
1945   Segment *pSeg,                  /* Segment that contains page iPg */
1946   LsmPgno iPg,                    /* Page to find the predecessor of */
1947   LsmPgno *piPrev                 /* OUT: Page number of previous page */
1948 ){
1949   u8 aSz[3];                      /* The 3 bytes stored just before iPg */
1950   int rc;
1951   i64 iRead;                      /* Offset that aSz[] is read from */
1952
1953   assert( pFS->pCompress );
1954
1955   rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iRead);
1956   if( rc==LSM_OK ) rc = fsReadData(pFS, pSeg, iRead, aSz, sizeof(aSz));
1957   /* Bit 0x80 of aSz[2] selects how the step size is encoded. */
1958   if( rc==LSM_OK ){
1959     int bFree;                    /* Assigned below but not read here */
1960     int nSz;                      /* Number of bytes to step back from iPg */
1961     if( aSz[2] & 0x80 ){
1962       nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2;
1963     }else{
1964       nSz = (int)(aSz[2] & 0x7F);
1965       bFree = 1;
1966     }
1967     rc = fsSubtractOffset(pFS, pSeg, iPg, nSz, piPrev);
1968   }
1969
1970   return rc;
1971 }
1972
1973 /*
1974 ** The second argument to this function (pPg) is a valid reference to a
1975 ** database file page that is part of a sorted run. If eDir is -1, this
1976 ** function attempts to locate and load the previous page in the same run.
1977 ** Or, if eDir is +1, it attempts to find the next page in the same run.
1978 ** The results of passing an eDir value other than positive or negative one
1979 ** are undefined.
1980 **
1981 ** If parameter pRun is not NULL then it must point to the run that page
1982 ** pPg belongs to. In this case, if pPg is the first or last page of the
1983 ** run, and the request is for the previous or next page, respectively,
1984 ** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
1985 ** is assumed that the next or previous page, as requested, exists.
1986 **
1987 ** If the previous/next page does exist and is successfully loaded, *ppNext
1988 ** is set to point to it and LSM_OK is returned. Otherwise, if an error
1989 ** occurs, *ppNext is set to NULL and an lsm error code returned.
1990 **
1991 ** Page references returned by this function should be released by the
1992 ** caller using lsmFsPageRelease().
1993 */
1994 int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
1995   int rc = LSM_OK;
1996   FileSystem *pFS = pPg->pFS;
1997   LsmPgno iPg = pPg->iPg;
1998
1999   assert( 0==fsSegmentRedirects(pFS, pRun) );
2000   if( pFS->pCompress ){
2001     int nSpace = pPg->nCompress + 2*3;  /* Data + 2 size fields */
2002     /* Loop while fsPageGet() reports nSpace>0 instead of a page. */
2003     do {
2004       if( eDir>0 ){
2005         rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
2006       }else{
2007         if( iPg==pRun->iFirst ){  /* NOTE(review): pRun assumed non-NULL */
2008           iPg = 0;
2009         }else{
2010           rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
2011         }
2012       }
2013       /* iPg==0 means there is no further page in that direction. */
2014       nSpace = 0;
2015       if( iPg!=0 ){
2016         rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
2017         assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
2018       }else{
2019         *ppNext = 0;
2020       }
2021     }while( nSpace>0 && rc==LSM_OK );
2022
2023   }else{
2024     Redirect *pRedir = pRun ? pRun->pRedirect : 0;
2025     assert( eDir==1 || eDir==-1 );
2026     if( eDir<0 ){
2027       if( pRun && iPg==pRun->iFirst ){
2028         *ppNext = 0;
2029         return LSM_OK;
2030       }else if( fsIsFirst(pFS, iPg) ){  /* Follow prev-block pointer */
2031         assert( pPg->flags & PAGE_HASPREV );
2032         iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
2033       }else{
2034         iPg--;
2035       }
2036     }else{
2037       if( pRun ){
2038         if( iPg==pRun->iLastPg ){
2039           *ppNext = 0;
2040           return LSM_OK;
2041         }
2042       }
2043
2044       if( fsIsLast(pFS, iPg) ){   /* Follow next-block pointer */
2045         int iBlk = fsRedirectBlock(
2046             pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
2047         );
2048         iPg = fsFirstPageOnBlock(pFS, iBlk);
2049       }else{
2050         iPg++;
2051       }
2052     }
2053     rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
2054   }
2055
2056   return rc;
2057 }
2058
2059 /*
2060 ** Called when a new segment is created, to find out whether its first
2061 ** pages can be placed directly after an existing segment on a block that
2062 ** is already allocated. The append-point list of the worker snapshot is
2063 ** searched; the chosen entry is cleared from the list and returned as the
2064 ** first page number for the new segment. Zero is returned if none is usable.
2065 **
2066 ** If pLvl is not NULL, an append point is skipped when it lies on the same
2067 ** block as the last page of any right-hand-side segment of pLvl.
2068 */
2069 static LsmPgno findAppendPoint(FileSystem *pFS, Level *pLvl){
2070   LsmPgno *aList = pFS->pDb->pWorker->aiAppend;
2071   int iSlot;
2072   /* Try the slots from the highest index down; take the first usable one. */
2073   for(iSlot=LSM_APPLIST_SZ-1; iSlot>=0; iSlot--){
2074     LsmPgno iCand = aList[iSlot];
2075     int bRhs = 0;                 /* True if iCand's block ends an rhs run */
2076     if( iCand==0 ) continue;
2077     if( pLvl ){
2078       int iBlk = fsPageToBlock(pFS, iCand);
2079       int k;
2080       for(k=0; bRhs==0 && k<pLvl->nRight; k++){
2081         bRhs = (fsPageToBlock(pFS, pLvl->aRhs[k].iLastPg)==iBlk);
2082       }
2083     }
2084     if( bRhs ) continue;
2085     aList[iSlot] = 0;             /* Consume the append point */
2086     return iCand;
2087   }
2088   return 0;
2089 }
2090
2091 /*
2092 ** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
2093 ** return a pointer to it. The page is writable until either
2094 ** lsmFsPagePersist() is called on it or the ref-count drops to zero.
2095 */
2096 int lsmFsSortedAppend(
2097   FileSystem *pFS,                /* File system object */
2098   Snapshot *pSnapshot,            /* Not referenced by this function */
2099   Level *pLvl,                    /* Level whose lhs segment is extended */
2100   int bDefer,                     /* True to defer page number assignment */
2101   Page **ppOut                    /* OUT: The new page reference */
2102 ){
2103   int rc = LSM_OK;
2104   Page *pPg = 0;
2105   LsmPgno iApp = 0;               /* Page number chosen for the new page */
2106   LsmPgno iNext = 0;              /* First page of a block allocated here */
2107   Segment *p = &pLvl->lhs;
2108   LsmPgno iPrev = p->iLastPg;     /* Last page of the segment on entry */
2109
2110   *ppOut = 0;
2111   assert( p->pRedirect==0 );
2112
2113   if( pFS->pCompress || bDefer ){
2114     /* In compressed database mode, or if bDefer is set, the page is not
2115     ** assigned a page number or location in the database file at this
2116     ** point. This will be done by the lsmFsPagePersist() call.  */
2117     rc = fsPageBuffer(pFS, &pPg);
2118     if( rc==LSM_OK ){
2119       pPg->pFS = pFS;
2120       pPg->pSeg = p;
2121       pPg->iPg = 0;
2122       pPg->flags |= PAGE_DIRTY;
2123       pPg->nData = pFS->nPagesize;
2124       assert( pPg->aData );
2125       if( pFS->pCompress==0 ) pPg->nData -= 4;  /* Room for a block ptr */
2126
2127       pPg->nRef = 1;
2128       pFS->nOut++;
2129     }
2130   }else{
2131     if( iPrev==0 ){               /* Segment has no pages yet */
2132       iApp = findAppendPoint(pFS, pLvl);
2133     }else if( fsIsLast(pFS, iPrev) ){
2134       int iNext2;
2135       rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNext2);
2136       if( rc!=LSM_OK ) return rc;
2137       iApp = fsFirstPageOnBlock(pFS, iNext2);
2138     }else{
2139       iApp = iPrev + 1;
2140     }
2141
2142     /* If this is the first page allocated, or if the page allocated is the
2143     ** last in the block, also allocate the next block here.  */
2144     if( iApp==0 || fsIsLast(pFS, iApp) ){
2145       int iNew;                     /* New block number */
2146
2147       rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
2148       if( rc!=LSM_OK ) return rc;
2149       if( iApp==0 ){
2150         iApp = fsFirstPageOnBlock(pFS, iNew);
2151       }else{
2152         iNext = fsFirstPageOnBlock(pFS, iNew);
2153       }
2154     }
2155
2156     /* Grab the new page. */
2157     pPg = 0;
2158     rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
2159     assert( rc==LSM_OK || pPg==0 );
2160
2161     /* If this is the last page of a block, store the next-block number in
2162     ** its final 4 bytes. If the first, store the previous block at [-4]. */
2163     if( rc==LSM_OK ){
2164       p->nSize++;
2165       p->iLastPg = iApp;
2166       if( p->iFirst==0 ) p->iFirst = iApp;
2167       pPg->flags |= PAGE_DIRTY;
2168
2169       if( fsIsLast(pFS, iApp) ){
2170         lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
2171       }else if( fsIsFirst(pFS, iApp) ){
2172         lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
2173       }
2174     }
2175   }
2176
2177   *ppOut = pPg;
2178   return rc;
2179 }
2180
2181 /*
2182 ** Mark the segment passed as the second argument as finished. Once a segment
2183 ** is marked as finished it is not possible to append any further pages to
2184 ** it. A NULL segment, or one with iLastPg==0, is left untouched.
2185 **
2186 ** Return LSM_OK if successful or an lsm error code if an error occurs.
2187 */
2188 int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
2189   int rc = LSM_OK;
2190   if( p && p->iLastPg ){
2191     assert( p->pRedirect==0 );
2192
2193     /* Check if the last page of this run happens to be the last of a block.
2194     ** If it is, then an extra block has already been allocated for this run.
2195     ** Shift this extra block back to the free-block list.
2196     **
2197     ** Otherwise, add the first free page in the last block used by the run
2198     ** to the first empty slot of aiAppend[]. It is dropped if none is empty.
2199     */
2200     if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
2201       int i;
2202       LsmPgno *aiAppend = pFS->pDb->pWorker->aiAppend;
2203       for(i=0; i<LSM_APPLIST_SZ; i++){
2204         if( aiAppend[i]==0 ){
2205           aiAppend[i] = p->iLastPg+1;
2206           break;
2207         }
2208       }
2209     }else if( pFS->pCompress==0 ){  /* Next block: last 4 bytes of page */
2210       Page *pLast;
2211       rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
2212       if( rc==LSM_OK ){
2213         int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
2214         lsmBlockRefree(pFS->pDb, iBlk);
2215         lsmFsPageRelease(pLast);
2216       }
2217     }else{                        /* Compressed: ask fsBlockNext() */
2218       int iBlk = 0;
2219       rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
2220       if( rc==LSM_OK ){
2221         lsmBlockRefree(pFS->pDb, iBlk);
2222       }
2223     }
2224   }
2225   return rc;
2226 }
2227
2228 /*
2229 ** Obtain a reference to page number iPg, storing it in *ppPg. This is a
2230 ** thin wrapper that calls fsPageGet() with zero for arguments 4 and 6.
2231 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2232 */
2233 int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, LsmPgno iPg, Page **ppPg){
2234   return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2235 }
2236
2237 /*
2238 ** Obtain a reference to the last page in the segment passed as the
2239 ** second argument and store it in *ppPg. In compressed mode the page is
2240 ** found by stepping backwards from the byte after pSeg->iLastPg.
2241 ** Return LSM_OK if successful, or an lsm error code if an error occurs.
2242 */
2243 int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
2244   int rc;
2245   LsmPgno iPg = pSeg->iLastPg;
2246   if( pFS->pCompress ){
2247     int nSpace;
2248     iPg++;                        /* One past the segment's last byte */
2249     do {
2250       nSpace = 0;
2251       rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
2252       if( rc==LSM_OK ){
2253         rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
2254       }
2255     }while( rc==LSM_OK && nSpace>0 );  /* Repeat while nSpace is set >0 */
2256
2257   }else{
2258     rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
2259   }
2260   return rc;
2261 }
2262
2263 /*
2264 ** Return a reference to meta-page iPg. If successful, LSM_OK is returned
2265 ** and *ppPg populated with the new page reference. The reference should
2266 ** be released by the caller using lsmFsMetaPageRelease().
2267 **
2268 ** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error
2269 ** code is returned.
2270 */
2271 int lsmFsMetaPageGet(
2272   FileSystem *pFS,                /* File-system connection */
2273   int bWrite,                     /* True for write access, false for read */
2274   int iPg,                        /* Either 1 or 2 */
2275   MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
2276 ){
2277   int rc = LSM_OK;
2278   MetaPage *pPg;
2279   assert( iPg==1 || iPg==2 );
2280   /* Size the allocation from the object itself, not an unrelated type. */
2281   pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(*pPg), &rc);
2282
2283   if( pPg ){
2284     i64 iOff = (iPg-1) * pFS->nMetasize;
2285     if( pFS->nMapLimit>0 ){
2286       fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
2287       pPg->aData = (u8 *)(pFS->pMap) + iOff;
2288     }else{
2289       pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
2290       if( rc==LSM_OK && bWrite==0 ){
2291         rc = lsmEnvRead(
2292             pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetaRwSize
2293         );
2294       }
2295 #ifndef NDEBUG
2296       /* pPg->aData causes an uninitialized access via a downstream write().
2297          After discussion on this list, this memory should not, for performance
2298          reasons, be memset. However, tracking down "real" misuse is more
2299          difficult with this "false" positive, so it is set in debug builds
2300          (i.e. when NDEBUG is not defined). */
2301       else if( rc==LSM_OK ){
2302         memset( pPg->aData, 0x77, pFS->nMetasize );
2303       }
2304 #endif
2305     }
2306
2307     if( rc!=LSM_OK ){
2308       if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
2309       lsmFree(pFS->pEnv, pPg);
2310       pPg = 0;
2311     }else{
2312       pPg->iPg = iPg;
2313       pPg->bWrite = bWrite;
2314       pPg->pFS = pFS;
2315     }
2316   }
2317
2318   *ppPg = pPg;
2319   return rc;
2320 }
2321
2322 /* Give back a MetaPage handle that came from lsmFsMetaPageGet(). A NULL
2323 ** handle is ignored. In non-mmap mode a writable page is written to the
2324 ** database file first; the result of that write is the return value. */
2325 int lsmFsMetaPageRelease(MetaPage *pPg){
2326   FileSystem *pFS;
2327   int rc = LSM_OK;
2328   if( pPg==0 ) return LSM_OK;
2329   pFS = pPg->pFS;
2330   /* No mapping in use: the page owns a heap buffer. Flush it, free it. */
2331   if( pFS->nMapLimit==0 ){
2332     if( pPg->bWrite ){
2333       int nByte = pFS->nMetaRwSize;
2334       i64 iDest = 0;
2335       if( pPg->iPg==2 ) iDest = pFS->nMetasize;
2336       rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iDest, pPg->aData, nByte);
2337     }
2338     lsmFree(pFS->pEnv, pPg->aData);
2339   }
2340   lsmFree(pFS->pEnv, pPg);
2341   return rc;
2342 }
2343
2344 /*
2345 ** Return a pointer to a buffer containing the data associated with the
2346 ** meta-page passed as the first argument. If parameter pnData is not NULL,
2347 ** set *pnData to pFS->nMetaRwSize (the meta-page read/write size in bytes).
2348 */
2349 u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
2350   if( pnData ) *pnData = pPg->pFS->nMetaRwSize;
2351   return pPg->aData;
2352 }
2353
2354 /*
2355 ** Debug-only helper for use in assert() statements: report whether page
2356 ** pPg is currently writable, i.e. whether its PAGE_DIRTY flag is set.
2357 */
2358 #ifndef NDEBUG
2359 int lsmFsPageWritable(Page *pPg){
2360   return (pPg->flags & PAGE_DIRTY)!=0;
2361 }
2362 #endif
2363
2364 /*
2365 ** Helper used while block iFrom is redirected to block iTo. When the page
2366 ** number stored in *piPg belongs to block iFrom it is rewritten to name the
2367 ** page at the same position on block iTo; otherwise it is left untouched.
2368 */
2369 static void fsMovePage(
2370   FileSystem *pFS,                /* File system object */
2371   int iTo,                        /* Block the data moves to */
2372   int iFrom,                      /* Block the data moves from */
2373   LsmPgno *piPg                   /* IN/OUT: Page number to translate */
2374 ){
2375   int nPerBlk = pFS->nBlocksize;  /* Page numbers spanned by one block */
2376   if( fsPageToBlock(pFS, *piPg)!=iFrom ) return;
2377
2378   /* Compressed databases use byte offsets as page numbers, so a block
2379   ** spans nBlocksize of them. Otherwise it spans nBlocksize/nPagesize. */
2380   if( pFS->pCompress==0 ) nPerBlk = pFS->nBlocksize / pFS->nPagesize;
2381   *piPg -= (LsmPgno)(iFrom - iTo) * nPerBlk;
2382 }
2383
2384 /*
2385 ** Copy the contents of block iFrom to block iTo, then remap the append
2386 ** points and pSeg's iFirst/iLastPg/iRoot from block iFrom to block iTo.
2387 ** It is safe to assume that there are no outstanding references to pages
2388 ** on block iTo. And that block iFrom is not currently being written. In
2389 ** other words, the data can be read and written directly.
2390 */
2391 int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
2392   Snapshot *p = pFS->pDb->pWorker;  /* Owner of the aiAppend[] list */
2393   int rc = LSM_OK;
2394   int i;
2395   i64 nMap;                       /* Mapping size wanted, capped below */
2396
2397   i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;  /* Offset of iFrom */
2398   i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;      /* Offset of iTo */
2399
2400   assert( iTo!=1 );
2401   assert( iFrom>iTo );
2402
2403   /* Grow the mapping as required. */
2404   nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
2405   fsGrowMapping(pFS, nMap, &rc);
2406
2407   if( rc==LSM_OK ){
2408     const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
2409     int nSz = pFS->nPagesize;
2410     u8 *aBuf = 0;                 /* Allocated on first unmapped read */
2411     u8 *aData = 0;                /* Source bytes for the current page */
2412
2413     for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
2414       i64 iOff = iFromOff + i*nSz;
2415
2416       /* Set aData to point to a buffer containing the from page */
2417       if( (iOff+nSz)<=pFS->nMapLimit ){
2418         u8 *aMap = (u8 *)(pFS->pMap);
2419         aData = &aMap[iOff];
2420       }else{
2421         if( aBuf==0 ){
2422           aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
2423           if( aBuf==0 ) break;
2424         }
2425         aData = aBuf;
2426         rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2427       }
2428
2429       /* Copy aData to the to page */
2430       if( rc==LSM_OK ){
2431         iOff = iToOff + i*nSz;
2432         if( (iOff+nSz)<=pFS->nMapLimit ){
2433           u8 *aMap = (u8 *)(pFS->pMap);
2434           memcpy(&aMap[iOff], aData, nSz);
2435         }else{
2436           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
2437         }
2438       }
2439     }
2440     lsmFree(pFS->pEnv, aBuf);
2441     lsmFsPurgeCache(pFS);
2442   }
2443   /* NOTE(review): the fix-ups below run even when rc!=LSM_OK. */
2444   /* Update append-point list if necessary */
2445   for(i=0; i<LSM_APPLIST_SZ; i++){
2446     fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
2447   }
2448
2449   /* Update the Segment structure itself */
2450   fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
2451   fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
2452   fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
2453
2454   return rc;
2455 }
2456
2457 /*
2458 ** Append raw data to a segment. Return the database file offset that the
2459 ** data is written to (this may be used as the page number if the data
2460 ** being appended is a new page record). Does nothing and returns 0 if *pRc
2461 ** is not LSM_OK on entry. This function is only used in compressed
2462 ** database mode.
2463 */
2464 static LsmPgno fsAppendData(
2465   FileSystem *pFS,                /* File-system handle */
2466   Segment *pSeg,                  /* Segment to append to */
2467   const u8 *aData,                /* Buffer containing data to write */
2468   int nData,                      /* Size of buffer aData[] in bytes */
2469   int *pRc                        /* IN/OUT: Error code */
2470 ){
2471   LsmPgno iRet = 0;               /* Offset at which the data begins */
2472   int rc = *pRc;
2473   assert( pFS->pCompress );
2474   if( rc==LSM_OK ){
2475     int nRem = 0;                 /* Bytes left over for the next block */
2476     int nWrite = 0;               /* Bytes written to the current block */
2477     LsmPgno iLastOnBlock;
2478     LsmPgno iApp = pSeg->iLastPg+1;  /* Offset to write at */
2479
2480     /* If this is the first data written into the segment, find an append-point
2481     ** or allocate a new block.  */
2482     if( iApp==1 ){                /* i.e. pSeg->iLastPg==0 */
2483       pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
2484       if( iApp==0 ){
2485         int iBlk;                 /* NOTE(review): read below even on error */
2486         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2487         pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
2488       }
2489     }
2490     iRet = iApp;
2491
2492     /* Write as much data as is possible at iApp (usually all of it). */
2493     iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
2494     if( rc==LSM_OK ){
2495       int nSpace = (int)(iLastOnBlock - iApp + 1);
2496       nWrite = LSM_MIN(nData, nSpace);
2497       nRem = nData - nWrite;
2498       assert( nWrite>=0 );
2499       if( nWrite!=0 ){
2500         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
2501       }
2502       iApp += nWrite;
2503     }
2504
2505     /* If required, allocate a new block and write the rest of the data
2506     ** into it. Set the next and previous block pointers to link the new
2507     ** block to the old.  */
2508     assert( nRem<=0 || (iApp-1)==iLastOnBlock );
2509     if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
2510       u8 aPtr[4];                 /* Space to serialize a u32 */
2511       int iBlk;                   /* New block number */
2512
2513       if( nWrite>0 ){
2514         /* Allocate a new block. */
2515         rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2516
2517         /* Set the "next" pointer on the old block */
2518         if( rc==LSM_OK ){
2519           assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
2520           lsmPutU32(aPtr, iBlk);
2521           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
2522         }
2523
2524         /* Set the "prev" pointer on the new block */
2525         if( rc==LSM_OK ){
2526           LsmPgno iWrite;
2527           lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
2528           iWrite = fsFirstPageOnBlock(pFS, iBlk);
2529           rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
2530           if( nRem>0 ) iApp = iWrite;
2531         }
2532       }else{
2533         /* Next block already allocated; the data starts on it. */
2534         assert( nRem>0 );
2535         assert( pSeg->pRedirect==0 );
2536         rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
2537         iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
2538       }
2539
2540       /* Write the remaining data into the new block */
2541       if( rc==LSM_OK && nRem>0 ){
2542         rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
2543         iApp += nRem;
2544       }
2545     }
2546
2547     pSeg->iLastPg = iApp-1;       /* Last byte written */
2548     *pRc = rc;
2549   }
2550
2551   return iRet;
2552 }
2553
2554 /*
2555 ** Used in compressed database mode only. Run the configured compression
2556 ** hook over the content of page pPg, placing the output in pFS->aOBuffer
2557 ** and the compressed size in pPg->nCompress.
2558 **
2559 ** fsAllocateBuffer() is called first to make sure the buffer exists. If
2560 ** that call fails LSM_NOMEM is returned. Otherwise the value returned by
2561 ** the xCompress() hook is passed back to the caller.
2562 */
2563 static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
2564   lsm_compress *pHook = pFS->pCompress;
2565   int rc = LSM_NOMEM;
2566   if( fsAllocateBuffer(pFS, 1)==0 ){
2567     assert( pPg->nData==pFS->nPagesize );
2568     pPg->nCompress = pFS->nBuffer;
2569     rc = pHook->xCompress(pHook->pCtx,
2570         (char *)pFS->aOBuffer, &pPg->nCompress,
2571         (const char *)pPg->aData, pPg->nData);
2572   }
2573   return rc;
2574 }
2575
2576 /*
2577 ** Append a new page to segment pSeg. Set output variable *piNew to the
2578 ** page number of the new page before returning.
2579 **
2580 ** If the new page is the last on its block, then the 'next' block that
2581 ** will be used by the segment is allocated here too. In this case output
2582 ** variable *piNext is set to the block number of the next block.
2583 **
2584 ** If the new page is the first on its block but not the first in the
2585 ** entire segment, set output variable *piPrev to the block number of
2586 ** the previous block in the segment.
2587 **
2588 ** LSM_OK is returned if successful, or an lsm error code otherwise. If
2589 ** any value other than LSM_OK is returned, then the final value of all
2590 ** output variables is undefined.
2591 */
2592 static int fsAppendPage(
2593   FileSystem *pFS,                /* File system object */
2594   Segment *pSeg,                  /* Segment to append to (iLastPg!=0) */
2595   LsmPgno *piNew,                 /* OUT: Page number of the new page */
2596   int *piPrev,                    /* OUT: Previous block number, or 0 */
2597   int *piNext                     /* OUT: Newly allocated block, or 0 */
2598 ){
2599   LsmPgno iPrev = pSeg->iLastPg;
2600   int rc;
2601   assert( iPrev!=0 );
2602
2603   *piPrev = 0;
2604   *piNext = 0;
2605
2606   if( fsIsLast(pFS, iPrev) ){
2607     /* Grab the first page on the next block (which has already been
2608     ** allocated). In this case set *piPrev to tell the caller to set
2609     ** the "previous block" pointer in the first 4 bytes of the page.
2610     */
2611     int iNext;
2612     int iBlk = fsPageToBlock(pFS, iPrev);
2613     assert( pSeg->pRedirect==0 );
2614     rc = fsBlockNext(pFS, 0, iBlk, &iNext);
2615     if( rc!=LSM_OK ) return rc;
2616     *piNew = fsFirstPageOnBlock(pFS, iNext);
2617     *piPrev = iBlk;
2618   }else{
2619     *piNew = iPrev+1;
2620     if( fsIsLast(pFS, *piNew) ){
2621       /* Allocate the next block here. */
2622       int iBlk;
2623       rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
2624       if( rc!=LSM_OK ) return rc;
2625       *piNext = iBlk;
2626     }
2627   }
2628
2629   pSeg->nSize++;                  /* Segment grows by one page */
2630   pSeg->iLastPg = *piNew;
2631   return LSM_OK;
2632 }
2633
2634 /* Write every page queued on FileSystem.pWaiting to disk and release it.
2635 ** The list is emptied. Once an error is recorded in *pRc no further pages
2636 ** are persisted, but the remaining pages are still released. */
2637 void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){
2638   Page *pCur = pFS->pWaiting;     /* Head of the detached list */
2639   Page *pFollow;                  /* Saved before pCur is released */
2640   int rcFlush = *pRc;
2641   /* Detach the whole list up front, then walk it. */
2642   pFS->pWaiting = 0;
2643   for( ; pCur; pCur=pFollow){
2644     pFollow = pCur->pWaitingNext;
2645     if( rcFlush==LSM_OK ){
2646       rcFlush = lsmFsPagePersist(pCur);
2647     }
2648     assert( pCur->nRef==1 );
2649     lsmFsPageRelease(pCur);
2650   }
2651   *pRc = rcFlush;
2652 }
2653
2654 /*
2655 ** Look for a hash-table entry for page iPg. If one exists, unlink it and
2656 ** re-insert the same Page object under page number 0. */
2657 static void fsRemoveHashEntry(FileSystem *pFS, LsmPgno iPg){
2658   int iZero;                      /* Bucket used for page number 0 */
2659   Page *pHit = pFS->apHash[fsHashKey(pFS->nHash, iPg)];
2660
2661   /* Walk the bucket chain looking for an entry with this page number. */
2662   while( pHit && pHit->iPg!=iPg ) pHit = pHit->pHashNext;
2663   if( pHit==0 ) return;
2664
2665   assert( pHit->nRef==0 || (pHit->flags & PAGE_FREE)==0 );
2666   fsPageRemoveFromHash(pFS, pHit);
2667   pHit->iPg = 0;                  /* Re-file the page under number 0 */
2668   iZero = fsHashKey(pFS->nHash, 0);
2669   pHit->pHashNext = pFS->apHash[iZero];
2670   pFS->apHash[iZero] = pHit;
2671 }
2672
2673 /*
2674 ** If the page passed as an argument is dirty, update the database file
2675 ** (or mapping of the database file) with its current contents and mark
2676 ** the page as clean.
2677 **
2678 ** Return LSM_OK if the operation is a success, or an LSM error code
2679 ** otherwise.
2680 */
int lsmFsPagePersist(Page *pPg){
  int rc = LSM_OK;
  /* A NULL page, or a page without PAGE_DIRTY set, is left untouched and
  ** LSM_OK is returned. */
  if( pPg && (pPg->flags & PAGE_DIRTY) ){
    FileSystem *pFS = pPg->pFS;

    if( pFS->pCompress ){
      /* Compressed database. The page is appended to its segment as a
      ** record: a 3-byte size field, the compressed image held in
      ** pFS->aOBuffer, then the same 3-byte size field again. */
      int iHash;                  /* Hash key of assigned page number */
      u8 aSz[3];                  /* pPg->nCompress as a 24-bit big-endian */
      assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 );

      /* Compress the page image. */
      /* NOTE(review): rc is not tested before the appends below. Presumably
      ** fsAppendData() does nothing when *pRc is already an error code -
      ** confirm against its definition. */
      rc = fsCompressIntoBuffer(pFS, pPg);

      /* Serialize the compressed size into buffer aSz[] */
      putRecordSize(aSz, pPg->nCompress, 0);

      /* Write the serialized page record into the database file. */
      /* The value returned by the first append becomes the page number. */
      pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
      fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc);
      fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);

      /* Now that it has a page number, insert the page into the hash table */
      iHash = fsHashKey(pFS->nHash, pPg->iPg);
      pPg->pHashNext = pFS->apHash[iHash];
      pFS->apHash[iHash] = pPg;

      /* Account for both size fields plus the compressed payload. */
      pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;

      /* The dirty flag is cleared and the write counter bumped whatever
      ** the value of rc. */
      pPg->flags &= ~PAGE_DIRTY;
      pFS->nWrite++;
    }else{

      if( pPg->iPg==0 ){
        /* No page number has been assigned yet. This occurs with pages used
        ** in the b-tree hierarchy. They were not assigned page numbers when
        ** they were created as doing so would cause this call to
        ** lsmFsPagePersist() to write an out-of-order page. Instead a page
        ** number is assigned here so that the page data will be appended
        ** to the current segment.
        */
        Page **pp;
        int iPrev = 0;
        int iNext = 0;
        int iHash;

        assert( pPg->pSeg->iFirst );
        assert( pPg->flags & PAGE_FREE );
        assert( (pPg->flags & PAGE_HASPREV)==0 );
        assert( pPg->nData==pFS->nPagesize-4 );

        /* Allocate the page number. fsAppendPage() may also set iPrev or
        ** iNext; the asserts below expect at most one of them to be
        ** non-zero. */
        rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
        if( rc!=LSM_OK ) return rc;

        /* Link the page at the head of the hash bucket for its new number.
        ** fsRemoveHashEntry() is called first - presumably to discard any
        ** page already hashed under that number (confirm). */
        assert( pPg->flags & PAGE_FREE );
        iHash = fsHashKey(pFS->nHash, pPg->iPg);
        fsRemoveHashEntry(pFS, pPg->iPg);
        pPg->pHashNext = pFS->apHash[iHash];
        pFS->apHash[iHash] = pPg;
        assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );

        if( iPrev ){
          /* Shift the content up by 4 bytes, store iPrev in the 4 bytes
          ** that were vacated, and advance aData past them. */
          assert( iNext==0 );
          memmove(&pPg->aData[4], pPg->aData, pPg->nData);
          lsmPutU32(pPg->aData, iPrev);
          pPg->flags |= PAGE_HASPREV;
          pPg->aData += 4;
        }else if( iNext ){
          /* Store iNext in the 4 bytes that follow the page content. */
          assert( iPrev==0 );
          lsmPutU32(&pPg->aData[pPg->nData], iNext);
        }else{
          /* Neither pointer is needed: the page gains 4 usable bytes and
          ** the b-tree layer is told the previous size. */
          int nData = pPg->nData;
          pPg->nData += 4;
          lsmSortedExpandBtreePage(pPg, nData);
        }

        /* Take an extra reference and append the page to the tail of the
        ** pFS->pWaiting list. On this path nothing is written, PAGE_DIRTY
        ** stays set and pFS->nWrite is not incremented. */
        pPg->nRef++;
        for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
        *pp = pPg;
        assert( pPg->pWaitingNext==0 );

      }else{
        i64 iOff;                   /* Offset to write within database file */

        /* (pPg->flags & PAGE_HASPREV) is used below as a byte count. This
        ** matches the "aData += 4" above only if PAGE_HASPREV has the value
        ** 4 - presumably so; confirm against the flag definition. */
        iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
        if( fsMmapPage(pFS, pPg->iPg)==0 ){
          /* Write the whole page image at offset iOff with lsmEnvWrite(). */
          u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
        }else if( pPg->flags & PAGE_FREE ){
          /* The page still has PAGE_FREE set. Grow the mapping to cover
          ** it, copy the image into the mapping, free the old buffer,
          ** repoint aData into the mapping, and move the page from the
          ** hash table to the head of the pFS->pMapped list. */
          fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
          if( rc==LSM_OK ){
            u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
            u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
            memcpy(aTo, aFrom, pFS->nPagesize);
            lsmFree(pFS->pEnv, aFrom);
            pFS->nCacheAlloc--;
            pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
            pPg->flags &= ~PAGE_FREE;
            fsPageRemoveFromHash(pFS, pPg);
            pPg->pMappedNext = pFS->pMapped;
            pFS->pMapped = pPg;
          }
        }

        /* As in the compressed case, the dirty flag is cleared and the
        ** write counter bumped even if rc holds an error code. */
        lsmFsFlushWaiting(pFS, &rc);
        pPg->flags &= ~PAGE_DIRTY;
        pFS->nWrite++;
      }
    }
  }

  return rc;
}
2793
2794 /*
2795 ** For non-compressed databases, this function is a no-op. For compressed
2796 ** databases, it adds a padding record to the segment passed as the third
2797 ** argument.
2798 **
2799 ** The size of the padding records is selected so that the last byte
2800 ** written is the last byte of a disk sector. This means that if a
2801 ** snapshot is taken and checkpointed, subsequent worker processes will
2802 ** not write to any sector that contains checkpointed data.
2803 */
2804 int lsmFsSortedPadding(
2805   FileSystem *pFS,
2806   Snapshot *pSnapshot,
2807   Segment *pSeg
2808 ){
2809   int rc = LSM_OK;
2810   if( pFS->pCompress && pSeg->iFirst ){
2811     LsmPgno iLast2;
2812     LsmPgno iLast = pSeg->iLastPg;  /* Current last page of segment */
2813     int nPad;                       /* Bytes of padding required */
2814     u8 aSz[3];
2815
2816     iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1;
2817     assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) );
2818     nPad = (int)(iLast2 - iLast);
2819
2820     if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){
2821       nPad -= 4;
2822     }
2823     assert( nPad>=0 );
2824
2825     if( nPad>=6 ){
2826       pSeg->nSize += nPad;
2827       nPad -= 6;
2828       putRecordSize(aSz, nPad, 1);
2829       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2830       memset(pFS->aOBuffer, 0, nPad);
2831       fsAppendData(pFS, pSeg, pFS->aOBuffer, nPad, &rc);
2832       fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
2833     }else if( nPad>0 ){
2834       u8 aBuf[5] = {0,0,0,0,0};
2835       aBuf[0] = (u8)nPad;
2836       aBuf[nPad-1] = (u8)nPad;
2837       fsAppendData(pFS, pSeg, aBuf, nPad, &rc);
2838     }
2839
2840     assert( rc!=LSM_OK
2841         || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg)
2842         || ((pSeg->iLastPg + 1) % pFS->szSector)==0
2843     );
2844   }
2845
2846   return rc;
2847 }
2848
2849
2850 /*
2851 ** Increment the reference count on the page object passed as the first
2852 ** argument.
2853 */
2854 void lsmFsPageRef(Page *pPg){
2855   if( pPg ){
2856     pPg->nRef++;
2857   }
2858 }
2859
2860 /*
2861 ** Release a page-reference obtained using fsPageGet().
2862 */
2863 int lsmFsPageRelease(Page *pPg){
2864   int rc = LSM_OK;
2865   if( pPg ){
2866     assert( pPg->nRef>0 );
2867     pPg->nRef--;
2868     if( pPg->nRef==0 ){
2869       FileSystem *pFS = pPg->pFS;
2870       rc = lsmFsPagePersist(pPg);
2871       pFS->nOut--;
2872
2873       assert( pPg->pFS->pCompress
2874            || fsIsFirst(pPg->pFS, pPg->iPg)==0
2875            || (pPg->flags & PAGE_HASPREV)
2876       );
2877       pPg->aData -= (pPg->flags & PAGE_HASPREV);
2878       pPg->flags &= ~PAGE_HASPREV;
2879
2880       if( (pPg->flags & PAGE_FREE)==0 ){
2881         /* Removed from mapped list */
2882         Page **pp;
2883         for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext);
2884         *pp = pPg->pMappedNext;
2885         pPg->pMappedNext = 0;
2886
2887         /* Add to free list */
2888         pPg->pFreeNext = pFS->pFree;
2889         pFS->pFree = pPg;
2890       }else{
2891         fsPageAddToLru(pFS, pPg);
2892       }
2893     }
2894   }
2895
2896   return rc;
2897 }
2898
2899 /*
2900 ** Return the total number of pages read from the database file.
2901 */
2902 int lsmFsNRead(FileSystem *pFS){ return pFS->nRead; }
2903
2904 /*
2905 ** Return the total number of pages written to the database file.
2906 */
2907 int lsmFsNWrite(FileSystem *pFS){ return pFS->nWrite; }
2908
2909 /*
2910 ** Return a copy of the environment pointer used by the file-system object.
2911 */
2912 lsm_env *lsmFsEnv(FileSystem *pFS){
2913   return pFS->pEnv;
2914 }
2915
2916 /*
2917 ** Return a copy of the environment pointer used by the file-system object
2918 ** to which this page belongs.
2919 */
2920 lsm_env *lsmPageEnv(Page *pPg) {
2921   return pPg->pFS->pEnv;
2922 }
2923
2924 /*
2925 ** Return a pointer to the file-system object associated with the Page
2926 ** passed as the only argument.
2927 */
2928 FileSystem *lsmPageFS(Page *pPg){
2929   return pPg->pFS;
2930 }
2931
2932 /*
2933 ** Return the sector-size as reported by the log file handle.
2934 */
2935 int lsmFsSectorSize(FileSystem *pFS){
2936   return pFS->szSector;
2937 }
2938
2939 /*
2940 ** Helper function for lsmInfoArrayStructure().
2941 */
2942 static Segment *startsWith(Segment *pRun, LsmPgno iFirst){
2943   return (iFirst==pRun->iFirst) ? pRun : 0;
2944 }
2945
2946 /*
2947 ** Return the segment that starts with page iFirst, if any. If no such segment
2948 ** can be found, return NULL.
2949 */
2950 static Segment *findSegment(Snapshot *pWorker, LsmPgno iFirst){
2951   Level *pLvl;                    /* Used to iterate through db levels */
2952   Segment *pSeg = 0;              /* Pointer to segment to return */
2953
2954   for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){
2955     if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){
2956       int i;
2957       for(i=0; i<pLvl->nRight; i++){
2958         if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break;
2959       }
2960     }
2961   }
2962
2963   return pSeg;
2964 }
2965
2966 /*
2967 ** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request.
2968 ** If successful, *pzOut is set to point to a nul-terminated string
2969 ** containing the array structure and LSM_OK is returned. The caller should
2970 ** eventually free the string using lsmFree().
2971 **
2972 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
2973 */
2974 int lsmInfoArrayStructure(
2975   lsm_db *pDb,
2976   int bBlock,                     /* True for block numbers only */
2977   LsmPgno iFirst,
2978   char **pzOut
2979 ){
2980   int rc = LSM_OK;
2981   Snapshot *pWorker;              /* Worker snapshot */
2982   Segment *pArray = 0;            /* Array to report on */
2983   int bUnlock = 0;
2984
2985   *pzOut = 0;
2986   if( iFirst==0 ) return LSM_ERROR;
2987
2988   /* Obtain the worker snapshot */
2989   pWorker = pDb->pWorker;
2990   if( !pWorker ){
2991     rc = lsmBeginWork(pDb);
2992     if( rc!=LSM_OK ) return rc;
2993     pWorker = pDb->pWorker;
2994     bUnlock = 1;
2995   }
2996
2997   /* Search for the array that starts on page iFirst */
2998   pArray = findSegment(pWorker, iFirst);
2999
3000   if( pArray==0 ){
3001     /* Could not find the requested array. This is an error. */
3002     rc = LSM_ERROR;
3003   }else{
3004     FileSystem *pFS = pDb->pFS;
3005     LsmString str;
3006     int iBlk;
3007     int iLastBlk;
3008
3009     iBlk = fsPageToBlock(pFS, pArray->iFirst);
3010     iLastBlk = fsPageToBlock(pFS, pArray->iLastPg);
3011
3012     lsmStringInit(&str, pDb->pEnv);
3013     if( bBlock ){
3014       lsmStringAppendf(&str, "%d", iBlk);
3015       while( iBlk!=iLastBlk ){
3016         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3017         lsmStringAppendf(&str, " %d", iBlk);
3018       }
3019     }else{
3020       lsmStringAppendf(&str, "%d", pArray->iFirst);
3021       while( iBlk!=iLastBlk ){
3022         lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk));
3023         fsBlockNext(pFS, pArray, iBlk, &iBlk);
3024         lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
3025       }
3026       lsmStringAppendf(&str, " %d", pArray->iLastPg);
3027     }
3028
3029     *pzOut = str.z;
3030   }
3031
3032   if( bUnlock ){
3033     int rcwork = LSM_BUSY;
3034     lsmFinishWork(pDb, 0, &rcwork);
3035   }
3036   return rc;
3037 }
3038
3039 int lsmFsSegmentContainsPg(
3040   FileSystem *pFS,
3041   Segment *pSeg,
3042   LsmPgno iPg,
3043   int *pbRes
3044 ){
3045   Redirect *pRedir = pSeg->pRedirect;
3046   int rc = LSM_OK;
3047   int iBlk;
3048   int iLastBlk;
3049   int iPgBlock;                   /* Block containing page iPg */
3050
3051   iPgBlock = fsPageToBlock(pFS, pSeg->iFirst);
3052   iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst));
3053   iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg));
3054
3055   while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){
3056     rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3057   }
3058
3059   *pbRes = (iBlk==iPgBlock);
3060   return rc;
3061 }
3062
3063 /*
3064 ** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request.
3065 ** If successful, *pzOut is set to point to a nul-terminated string
3066 ** containing the array structure and LSM_OK is returned. The caller should
3067 ** eventually free the string using lsmFree().
3068 **
3069 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
3070 */
3071 int lsmInfoArrayPages(lsm_db *pDb, LsmPgno iFirst, char **pzOut){
3072   int rc = LSM_OK;
3073   Snapshot *pWorker;              /* Worker snapshot */
3074   Segment *pSeg = 0;              /* Array to report on */
3075   int bUnlock = 0;
3076
3077   *pzOut = 0;
3078   if( iFirst==0 ) return LSM_ERROR;
3079
3080   /* Obtain the worker snapshot */
3081   pWorker = pDb->pWorker;
3082   if( !pWorker ){
3083     rc = lsmBeginWork(pDb);
3084     if( rc!=LSM_OK ) return rc;
3085     pWorker = pDb->pWorker;
3086     bUnlock = 1;
3087   }
3088
3089   /* Search for the array that starts on page iFirst */
3090   pSeg = findSegment(pWorker, iFirst);
3091
3092   if( pSeg==0 ){
3093     /* Could not find the requested array. This is an error. */
3094     rc = LSM_ERROR;
3095   }else{
3096     Page *pPg = 0;
3097     FileSystem *pFS = pDb->pFS;
3098     LsmString str;
3099
3100     lsmStringInit(&str, pDb->pEnv);
3101     rc = lsmFsDbPageGet(pFS, pSeg, iFirst, &pPg);
3102     while( rc==LSM_OK && pPg ){
3103       Page *pNext = 0;
3104       lsmStringAppendf(&str, " %lld", lsmFsPageNumber(pPg));
3105       rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
3106       lsmFsPageRelease(pPg);
3107       pPg = pNext;
3108     }
3109
3110     if( rc!=LSM_OK ){
3111       lsmFree(pDb->pEnv, str.z);
3112     }else{
3113       *pzOut = str.z;
3114     }
3115   }
3116
3117   if( bUnlock ){
3118     int rcwork = LSM_BUSY;
3119     lsmFinishWork(pDb, 0, &rcwork);
3120   }
3121   return rc;
3122 }
3123
/*
** The following macros are used by the integrity-check code. Associated with
** each block in the database is an 8-bit bit mask (the entry in the aUsed[]
** array; the entry for block N is aUsed[N-1]). As the integrity-check
** meanders through the database, it sets the following bits to indicate how
** each block is used. The first three are set by checkBlocks(), the last
** by checkFreelistCb().
**
** INTEGRITY_CHECK_FIRST_PG:
**   First page of block is in use by sorted run.
**
** INTEGRITY_CHECK_LAST_PG:
**   Last page of block is in use by sorted run.
**
** INTEGRITY_CHECK_USED:
**   At least one page of the block is in use by a sorted run.
**
** INTEGRITY_CHECK_FREE:
**   The free block list contains an entry corresponding to this block.
*/
#define INTEGRITY_CHECK_FIRST_PG 0x01
#define INTEGRITY_CHECK_LAST_PG  0x02
#define INTEGRITY_CHECK_USED     0x04
#define INTEGRITY_CHECK_FREE     0x08
3146
3147 /*
3148 ** Helper function for lsmFsIntegrityCheck()
3149 */
3150 static void checkBlocks(
3151   FileSystem *pFS,
3152   Segment *pSeg,
3153   int bExtra,                     /* If true, count the "next" block if any */
3154   int nUsed,
3155   u8 *aUsed
3156 ){
3157   if( pSeg ){
3158     if( pSeg && pSeg->nSize>0 ){
3159       int rc;
3160       int iBlk;                   /* Current block (during iteration) */
3161       int iLastBlk;               /* Last block of segment */
3162       int iFirstBlk;              /* First block of segment */
3163       int bLastIsLastOnBlock;     /* True iLast is the last on its block */
3164
3165       assert( 0==fsSegmentRedirects(pFS, pSeg) );
3166       iBlk = iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst);
3167       iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg);
3168
3169       bLastIsLastOnBlock = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg);
3170       assert( iBlk>0 );
3171
3172       do {
3173         /* iBlk is a part of this sorted run. */
3174         aUsed[iBlk-1] |= INTEGRITY_CHECK_USED;
3175
3176         /* If the first page of this block is also part of the segment,
3177         ** set the flag to indicate that the first page of iBlk is in use.
3178         */
3179         if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){
3180           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_FIRST_PG)==0 );
3181           aUsed[iBlk-1] |= INTEGRITY_CHECK_FIRST_PG;
3182         }
3183
3184         /* Unless the sorted run finishes before the last page on this block,
3185         ** the last page of this block is also in use.  */
3186         if( iBlk!=iLastBlk || bLastIsLastOnBlock ){
3187           assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_LAST_PG)==0 );
3188           aUsed[iBlk-1] |= INTEGRITY_CHECK_LAST_PG;
3189         }
3190
3191         /* Special case. The sorted run being scanned is the output run of
3192         ** a level currently undergoing an incremental merge. The sorted
3193         ** run ends on the last page of iBlk, but the next block has already
3194         ** been allocated. So mark it as in use as well.  */
3195         if( iBlk==iLastBlk && bLastIsLastOnBlock && bExtra ){
3196           int iExtra = 0;
3197           rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra);
3198           assert( rc==LSM_OK );
3199
3200           assert( aUsed[iExtra-1]==0 );
3201           aUsed[iExtra-1] |= INTEGRITY_CHECK_USED;
3202           aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG;
3203           aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG;
3204         }
3205
3206         /* Move on to the next block in the sorted run. Or set iBlk to zero
3207         ** in order to break out of the loop if this was the last block in
3208         ** the run.  */
3209         if( iBlk==iLastBlk ){
3210           iBlk = 0;
3211         }else{
3212           rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
3213           assert( rc==LSM_OK );
3214         }
3215       }while( iBlk );
3216     }
3217   }
3218 }
3219
/*
** Context object that lsmFsIntegrityCheck() passes (as the void* argument)
** to checkFreelistCb() via lsmWalkFreelist().
*/
typedef struct CheckFreelistCtx CheckFreelistCtx;
struct CheckFreelistCtx {
  u8 *aUsed;                      /* Per-block flags; aUsed[N-1] is block N */
  int nBlock;                     /* Largest block number accepted */
};
3225 static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
3226   CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx;
3227
3228   assert( iBlk>=1 );
3229   assert( iBlk<=p->nBlock );
3230   assert( p->aUsed[iBlk-1]==0 );
3231   p->aUsed[iBlk-1] = INTEGRITY_CHECK_FREE;
3232   return 0;
3233 }
3234
3235 /*
3236 ** This function checks that all blocks in the database file are accounted
3237 ** for. For each block, exactly one of the following must be true:
3238 **
3239 **   + the block is part of a sorted run, or
3240 **   + the block is on the free-block list
3241 **
3242 ** This function also checks that there are no references to blocks with
3243 ** out-of-range block numbers.
3244 **
3245 ** If no errors are found, non-zero is returned. If an error is found, an
3246 ** assert() fails.
3247 */
3248 int lsmFsIntegrityCheck(lsm_db *pDb){
3249   CheckFreelistCtx ctx;
3250   FileSystem *pFS = pDb->pFS;
3251   int i;
3252   int rc;
3253   Freelist freelist = {0, 0, 0};
3254   u8 *aUsed;
3255   Level *pLevel;
3256   Snapshot *pWorker = pDb->pWorker;
3257   int nBlock = pWorker->nBlock;
3258
3259 #if 0
3260   static int nCall = 0;
3261   nCall++;
3262   printf("%d calls\n", nCall);
3263 #endif
3264
3265   aUsed = lsmMallocZero(pDb->pEnv, nBlock);
3266   if( aUsed==0 ){
3267     /* Malloc has failed. Since this function is only called within debug
3268     ** builds, this probably means the user is running an OOM injection test.
3269     ** Regardless, it will not be possible to run the integrity-check at this
3270     ** time, so assume the database is Ok and return non-zero. */
3271     return 1;
3272   }
3273
3274   for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
3275     int j;
3276     checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
3277     for(j=0; j<pLevel->nRight; j++){
3278       checkBlocks(pFS, &pLevel->aRhs[j], 0, nBlock, aUsed);
3279     }
3280   }
3281
3282   /* Mark all blocks in the free-list as used */
3283   ctx.aUsed = aUsed;
3284   ctx.nBlock = nBlock;
3285   rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx);
3286
3287   if( rc==LSM_OK ){
3288     for(i=0; i<nBlock; i++) assert( aUsed[i]!=0 );
3289   }
3290
3291   lsmFree(pDb->pEnv, aUsed);
3292   lsmFree(pDb->pEnv, freelist.aEntry);
3293
3294   return 1;
3295 }
3296
3297 #ifndef NDEBUG
3298 /*
3299 ** Return true if pPg happens to be the last page in segment pSeg. Or false
3300 ** otherwise. This function is only invoked as part of assert() conditions.
3301 */
3302 int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){
3303   if( pPg->pFS->pCompress ){
3304     LsmPgno iNext = 0;
3305     int rc;
3306     rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext);
3307     return (rc!=LSM_OK || iNext==0);
3308   }
3309   return (pPg->iPg==pSeg->iLastPg);
3310 }
3311 #endif