src/btree.c

   1 /*
   2 ** 2004 April 6
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 ** This file implements an external (disk-based) database using BTrees.
  13 ** See the header comment on "btreeInt.h" for additional information.
  14 ** Including a description of file format and an overview of operation.
  15 */
  16 #include "btreeInt.h"
  17
  18 /*
  19 ** The header string that appears at the beginning of every
  20 ** SQLite database.
  21 */
  22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
  23
  24 /*
  25 ** Set this global variable to 1 to enable tracing using the TRACE
  26 ** macro.
  27 */
  28 #if 0
  29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
  30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
  31 #else
  32 # define TRACE(X)
  33 #endif
  34
  35 /*
  36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
  37 ** But if the value is zero, make it 65536.
  38 **
  39 ** This routine is used to extract the "offset to cell content area" value
  40 ** from the header of a btree page.  If the page size is 65536 and the page
  41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
  42 ** This routine makes the necessary adjustment to 65536.
  43 */
  44 #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
  45
  46 /*
  47 ** Values passed as the 5th argument to allocateBtreePage()
  48 */
  49 #define BTALLOC_ANY   0           /* Allocate any page */
  50 #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
  51 #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
  52
  53 /*
  54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
  55 ** defined, or 0 if it is. For example:
  56 **
  57 **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
  58 */
  59 #ifndef SQLITE_OMIT_AUTOVACUUM
  60 #define IfNotOmitAV(expr) (expr)
  61 #else
  62 #define IfNotOmitAV(expr) 0
  63 #endif
  64
  65 #ifndef SQLITE_OMIT_SHARED_CACHE
  66 /*
  67 ** A list of BtShared objects that are eligible for participation
  68 ** in shared cache.  This variable has file scope during normal builds,
  69 ** but the test harness needs to access it so we make it global for
  70 ** test builds.
  71 **
  72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
  73 */
  74 #ifdef SQLITE_TEST
  75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
  76 #else
  77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
  78 #endif
  79 #endif /* SQLITE_OMIT_SHARED_CACHE */
  80
  81 #ifndef SQLITE_OMIT_SHARED_CACHE
  82 /*
  83 ** Enable or disable the shared pager and schema features.
  84 **
  85 ** This routine has no effect on existing database connections.
  86 ** The shared cache setting effects only future calls to
  87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
  88 */
  89 int sqlite3_enable_shared_cache(int enable){
  90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
  91   return SQLITE_OK;
  92 }
  93 #endif
  94
  95
  96
  97 #ifdef SQLITE_OMIT_SHARED_CACHE
  98   /*
  99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
 100   ** and clearAllSharedCacheTableLocks()
 101   ** manipulate entries in the BtShared.pLock linked list used to store
 102   ** shared-cache table level locks. If the library is compiled with the
 103   ** shared-cache feature disabled, then there is only ever one user
 104   ** of each BtShared structure and so this locking is not necessary.
 105   ** So define the lock related functions as no-ops.
 106   */
 107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
 108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
 109   #define clearAllSharedCacheTableLocks(a)
 110   #define downgradeAllSharedCacheTableLocks(a)
 111   #define hasSharedCacheTableLock(a,b,c,d) 1
 112   #define hasReadConflicts(a, b) 0
 113 #endif
 114
 115 /*
 116 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
 117 ** (MemPage*) as an argument. The (MemPage*) must not be NULL.
 118 **
 119 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to
 120 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
 121 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
 122 ** with the page number and filename associated with the (MemPage*).
 123 */
 124 #ifdef SQLITE_DEBUG
 125 int corruptPageError(int lineno, MemPage *p){
 126   char *zMsg = sqlite3_mprintf("database corruption page %d of %s",
 127       (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
 128   );
 129   if( zMsg ){
 130     sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
 131   }
 132   sqlite3_free(zMsg);
 133   return SQLITE_CORRUPT_BKPT;
 134 }
 135 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage)
 136 #else
 137 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno)
 138 #endif
 139
 140 #ifndef SQLITE_OMIT_SHARED_CACHE
 141
 142 #ifdef SQLITE_DEBUG
 143 /*
 144 **** This function is only used as part of an assert() statement. ***
 145 **
 146 ** Check to see if pBtree holds the required locks to read or write to the
 147 ** table with root page iRoot.   Return 1 if it does and 0 if not.
 148 **
 149 ** For example, when writing to a table with root-page iRoot via
 150 ** Btree connection pBtree:
 151 **
 152 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
 153 **
 154 ** When writing to an index that resides in a sharable database, the
 155 ** caller should have first obtained a lock specifying the root page of
 156 ** the corresponding table. This makes things a bit more complicated,
 157 ** as this module treats each table as a separate structure. To determine
 158 ** the table corresponding to the index being written, this
 159 ** function has to search through the database schema.
 160 **
 161 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
 162 ** hold a write-lock on the schema table (root page 1). This is also
 163 ** acceptable.
 164 */
 165 static int hasSharedCacheTableLock(
 166   Btree *pBtree,         /* Handle that must hold lock */
 167   Pgno iRoot,            /* Root page of b-tree */
 168   int isIndex,           /* True if iRoot is the root of an index b-tree */
 169   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
 170 ){
 171   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
 172   Pgno iTab = 0;
 173   BtLock *pLock;
 174
 175   /* If this database is not shareable, or if the client is reading
 176   ** and has the read-uncommitted flag set, then no lock is required.
 177   ** Return true immediately.
 178   */
 179   if( (pBtree->sharable==0)
 180    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
 181   ){
 182     return 1;
 183   }
 184
 185   /* If the client is reading  or writing an index and the schema is
 186   ** not loaded, then it is too difficult to actually check to see if
 187   ** the correct locks are held.  So do not bother - just return true.
 188   ** This case does not come up very often anyhow.
 189   */
 190   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
 191     return 1;
 192   }
 193
 194   /* Figure out the root-page that the lock should be held on. For table
 195   ** b-trees, this is just the root page of the b-tree being read or
 196   ** written. For index b-trees, it is the root page of the associated
 197   ** table.  */
 198   if( isIndex ){
 199     HashElem *p;
 200     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
 201       Index *pIdx = (Index *)sqliteHashData(p);
 202       if( pIdx->tnum==(int)iRoot ){
 203         if( iTab ){
 204           /* Two or more indexes share the same root page.  There must
 205           ** be imposter tables.  So just return true.  The assert is not
 206           ** useful in that case. */
 207           return 1;
 208         }
 209         iTab = pIdx->pTable->tnum;
 210       }
 211     }
 212   }else{
 213     iTab = iRoot;
 214   }
 215
 216   /* Search for the required lock. Either a write-lock on root-page iTab, a
 217   ** write-lock on the schema table, or (if the client is reading) a
 218   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
 219   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
 220     if( pLock->pBtree==pBtree
 221      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
 222      && pLock->eLock>=eLockType
 223     ){
 224       return 1;
 225     }
 226   }
 227
 228   /* Failed to find the required lock. */
 229   return 0;
 230 }
 231 #endif /* SQLITE_DEBUG */
 232
 233 #ifdef SQLITE_DEBUG
 234 /*
 235 **** This function may be used as part of assert() statements only. ****
 236 **
 237 ** Return true if it would be illegal for pBtree to write into the
 238 ** table or index rooted at iRoot because other shared connections are
 239 ** simultaneously reading that same table or index.
 240 **
 241 ** It is illegal for pBtree to write if some other Btree object that
 242 ** shares the same BtShared object is currently reading or writing
 243 ** the iRoot table.  Except, if the other Btree object has the
 244 ** read-uncommitted flag set, then it is OK for the other object to
 245 ** have a read cursor.
 246 **
 247 ** For example, before writing to any part of the table or index
 248 ** rooted at page iRoot, one should call:
 249 **
 250 **    assert( !hasReadConflicts(pBtree, iRoot) );
 251 */
 252 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
 253   BtCursor *p;
 254   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
 255     if( p->pgnoRoot==iRoot
 256      && p->pBtree!=pBtree
 257      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
 258     ){
 259       return 1;
 260     }
 261   }
 262   return 0;
 263 }
 264 #endif    /* #ifdef SQLITE_DEBUG */
 265
 266 /*
 267 ** Query to see if Btree handle p may obtain a lock of type eLock
 268 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
 269 ** SQLITE_OK if the lock may be obtained (by calling
 270 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
 271 */
 272 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
 273   BtShared *pBt = p->pBt;
 274   BtLock *pIter;
 275
 276   assert( sqlite3BtreeHoldsMutex(p) );
 277   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
 278   assert( p->db!=0 );
 279   assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
 280
 281   /* If requesting a write-lock, then the Btree must have an open write
 282   ** transaction on this file. And, obviously, for this to be so there
 283   ** must be an open write transaction on the file itself.
 284   */
 285   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
 286   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
 287
 288   /* This routine is a no-op if the shared-cache is not enabled */
 289   if( !p->sharable ){
 290     return SQLITE_OK;
 291   }
 292
 293   /* If some other connection is holding an exclusive lock, the
 294   ** requested lock may not be obtained.
 295   */
 296   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
 297     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
 298     return SQLITE_LOCKED_SHAREDCACHE;
 299   }
 300
 301   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
 302     /* The condition (pIter->eLock!=eLock) in the following if(...)
 303     ** statement is a simplification of:
 304     **
 305     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
 306     **
 307     ** since we know that if eLock==WRITE_LOCK, then no other connection
 308     ** may hold a WRITE_LOCK on any table in this file (since there can
 309     ** only be a single writer).
 310     */
 311     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
 312     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
 313     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
 314       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
 315       if( eLock==WRITE_LOCK ){
 316         assert( p==pBt->pWriter );
 317         pBt->btsFlags |= BTS_PENDING;
 318       }
 319       return SQLITE_LOCKED_SHAREDCACHE;
 320     }
 321   }
 322   return SQLITE_OK;
 323 }
 324 #endif /* !SQLITE_OMIT_SHARED_CACHE */
 325
 326 #ifndef SQLITE_OMIT_SHARED_CACHE
 327 /*
 328 ** Add a lock on the table with root-page iTable to the shared-btree used
 329 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
 330 ** WRITE_LOCK.
 331 **
 332 ** This function assumes the following:
 333 **
 334 **   (a) The specified Btree object p is connected to a sharable
 335 **       database (one with the BtShared.sharable flag set), and
 336 **
 337 **   (b) No other Btree objects hold a lock that conflicts
 338 **       with the requested lock (i.e. querySharedCacheTableLock() has
 339 **       already been called and returned SQLITE_OK).
 340 **
 341 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
 342 ** is returned if a malloc attempt fails.
 343 */
 344 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
 345   BtShared *pBt = p->pBt;
 346   BtLock *pLock = 0;
 347   BtLock *pIter;
 348
 349   assert( sqlite3BtreeHoldsMutex(p) );
 350   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
 351   assert( p->db!=0 );
 352
 353   /* A connection with the read-uncommitted flag set will never try to
 354   ** obtain a read-lock using this function. The only read-lock obtained
 355   ** by a connection in read-uncommitted mode is on the sqlite_master
 356   ** table, and that lock is obtained in BtreeBeginTrans().  */
 357   assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
 358
 359   /* This function should only be called on a sharable b-tree after it
 360   ** has been determined that no other b-tree holds a conflicting lock.  */
 361   assert( p->sharable );
 362   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
 363
 364   /* First search the list for an existing lock on this table. */
 365   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
 366     if( pIter->iTable==iTable && pIter->pBtree==p ){
 367       pLock = pIter;
 368       break;
 369     }
 370   }
 371
 372   /* If the above search did not find a BtLock struct associating Btree p
 373   ** with table iTable, allocate one and link it into the list.
 374   */
 375   if( !pLock ){
 376     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
 377     if( !pLock ){
 378       return SQLITE_NOMEM_BKPT;
 379     }
 380     pLock->iTable = iTable;
 381     pLock->pBtree = p;
 382     pLock->pNext = pBt->pLock;
 383     pBt->pLock = pLock;
 384   }
 385
 386   /* Set the BtLock.eLock variable to the maximum of the current lock
 387   ** and the requested lock. This means if a write-lock was already held
 388   ** and a read-lock requested, we don't incorrectly downgrade the lock.
 389   */
 390   assert( WRITE_LOCK>READ_LOCK );
 391   if( eLock>pLock->eLock ){
 392     pLock->eLock = eLock;
 393   }
 394
 395   return SQLITE_OK;
 396 }
 397 #endif /* !SQLITE_OMIT_SHARED_CACHE */
 398
 399 #ifndef SQLITE_OMIT_SHARED_CACHE
 400 /*
 401 ** Release all the table locks (locks obtained via calls to
 402 ** the setSharedCacheTableLock() procedure) held by Btree object p.
 403 **
 404 ** This function assumes that Btree p has an open read or write
 405 ** transaction. If it does not, then the BTS_PENDING flag
 406 ** may be incorrectly cleared.
 407 */
 408 static void clearAllSharedCacheTableLocks(Btree *p){
 409   BtShared *pBt = p->pBt;
 410   BtLock **ppIter = &pBt->pLock;
 411
 412   assert( sqlite3BtreeHoldsMutex(p) );
 413   assert( p->sharable || 0==*ppIter );
 414   assert( p->inTrans>0 );
 415
 416   while( *ppIter ){
 417     BtLock *pLock = *ppIter;
 418     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
 419     assert( pLock->pBtree->inTrans>=pLock->eLock );
 420     if( pLock->pBtree==p ){
 421       *ppIter = pLock->pNext;
 422       assert( pLock->iTable!=1 || pLock==&p->lock );
 423       if( pLock->iTable!=1 ){
 424         sqlite3_free(pLock);
 425       }
 426     }else{
 427       ppIter = &pLock->pNext;
 428     }
 429   }
 430
 431   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
 432   if( pBt->pWriter==p ){
 433     pBt->pWriter = 0;
 434     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
 435   }else if( pBt->nTransaction==2 ){
 436     /* This function is called when Btree p is concluding its
 437     ** transaction. If there currently exists a writer, and p is not
 438     ** that writer, then the number of locks held by connections other
 439     ** than the writer must be about to drop to zero. In this case
 440     ** set the BTS_PENDING flag to 0.
 441     **
 442     ** If there is not currently a writer, then BTS_PENDING must
 443     ** be zero already. So this next line is harmless in that case.
 444     */
 445     pBt->btsFlags &= ~BTS_PENDING;
 446   }
 447 }
 448
 449 /*
 450 ** This function changes all write-locks held by Btree p into read-locks.
 451 */
 452 static void downgradeAllSharedCacheTableLocks(Btree *p){
 453   BtShared *pBt = p->pBt;
 454   if( pBt->pWriter==p ){
 455     BtLock *pLock;
 456     pBt->pWriter = 0;
 457     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
 458     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
 459       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
 460       pLock->eLock = READ_LOCK;
 461     }
 462   }
 463 }
 464
 465 #endif /* SQLITE_OMIT_SHARED_CACHE */
 466
 467 static void releasePage(MemPage *pPage);         /* Forward reference */
 468 static void releasePageOne(MemPage *pPage);      /* Forward reference */
 469 static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
 470
 471 /*
 472 ***** This routine is used inside of assert() only ****
 473 **
 474 ** Verify that the cursor holds the mutex on its BtShared
 475 */
 476 #ifdef SQLITE_DEBUG
 477 static int cursorHoldsMutex(BtCursor *p){
 478   return sqlite3_mutex_held(p->pBt->mutex);
 479 }
 480
 481 /* Verify that the cursor and the BtShared agree about what is the current
 482 ** database connetion. This is important in shared-cache mode. If the database
 483 ** connection pointers get out-of-sync, it is possible for routines like
 484 ** btreeInitPage() to reference an stale connection pointer that references a
 485 ** a connection that has already closed.  This routine is used inside assert()
 486 ** statements only and for the purpose of double-checking that the btree code
 487 ** does keep the database connection pointers up-to-date.
 488 */
 489 static int cursorOwnsBtShared(BtCursor *p){
 490   assert( cursorHoldsMutex(p) );
 491   return (p->pBtree->db==p->pBt->db);
 492 }
 493 #endif
 494
 495 /*
 496 ** Invalidate the overflow cache of the cursor passed as the first argument.
 497 ** on the shared btree structure pBt.
 498 */
 499 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl)
 500
 501 /*
 502 ** Invalidate the overflow page-list cache for all cursors opened
 503 ** on the shared btree structure pBt.
 504 */
 505 static void invalidateAllOverflowCache(BtShared *pBt){
 506   BtCursor *p;
 507   assert( sqlite3_mutex_held(pBt->mutex) );
 508   for(p=pBt->pCursor; p; p=p->pNext){
 509     invalidateOverflowCache(p);
 510   }
 511 }
 512
 513 #ifndef SQLITE_OMIT_INCRBLOB
 514 /*
 515 ** This function is called before modifying the contents of a table
 516 ** to invalidate any incrblob cursors that are open on the
 517 ** row or one of the rows being modified.
 518 **
 519 ** If argument isClearTable is true, then the entire contents of the
 520 ** table is about to be deleted. In this case invalidate all incrblob
 521 ** cursors open on any row within the table with root-page pgnoRoot.
 522 **
 523 ** Otherwise, if argument isClearTable is false, then the row with
 524 ** rowid iRow is being replaced or deleted. In this case invalidate
 525 ** only those incrblob cursors open on that specific row.
 526 */
 527 static void invalidateIncrblobCursors(
 528   Btree *pBtree,          /* The database file to check */
 529   Pgno pgnoRoot,          /* The table that might be changing */
 530   i64 iRow,               /* The rowid that might be changing */
 531   int isClearTable        /* True if all rows are being deleted */
 532 ){
 533   BtCursor *p;
 534   if( pBtree->hasIncrblobCur==0 ) return;
 535   assert( sqlite3BtreeHoldsMutex(pBtree) );
 536   pBtree->hasIncrblobCur = 0;
 537   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
 538     if( (p->curFlags & BTCF_Incrblob)!=0 ){
 539       pBtree->hasIncrblobCur = 1;
 540       if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
 541         p->eState = CURSOR_INVALID;
 542       }
 543     }
 544   }
 545 }
 546
 547 #else
 548   /* Stub function when INCRBLOB is omitted */
 549   #define invalidateIncrblobCursors(w,x,y,z)
 550 #endif /* SQLITE_OMIT_INCRBLOB */
 551
 552 /*
 553 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
 554 ** when a page that previously contained data becomes a free-list leaf
 555 ** page.
 556 **
 557 ** The BtShared.pHasContent bitvec exists to work around an obscure
 558 ** bug caused by the interaction of two useful IO optimizations surrounding
 559 ** free-list leaf pages:
 560 **
 561 **   1) When all data is deleted from a page and the page becomes
 562 **      a free-list leaf page, the page is not written to the database
 563 **      (as free-list leaf pages contain no meaningful data). Sometimes
 564 **      such a page is not even journalled (as it will not be modified,
 565 **      why bother journalling it?).
 566 **
 567 **   2) When a free-list leaf page is reused, its content is not read
 568 **      from the database or written to the journal file (why should it
 569 **      be, if it is not at all meaningful?).
 570 **
 571 ** By themselves, these optimizations work fine and provide a handy
 572 ** performance boost to bulk delete or insert operations. However, if
 573 ** a page is moved to the free-list and then reused within the same
 574 ** transaction, a problem comes up. If the page is not journalled when
 575 ** it is moved to the free-list and it is also not journalled when it
 576 ** is extracted from the free-list and reused, then the original data
 577 ** may be lost. In the event of a rollback, it may not be possible
 578 ** to restore the database to its original configuration.
 579 **
 580 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
 581 ** moved to become a free-list leaf page, the corresponding bit is
 582 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
 583 ** optimization 2 above is omitted if the corresponding bit is already
 584 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
 585 ** at the end of every transaction.
 586 */
 587 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
 588   int rc = SQLITE_OK;
 589   if( !pBt->pHasContent ){
 590     assert( pgno<=pBt->nPage );
 591     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
 592     if( !pBt->pHasContent ){
 593       rc = SQLITE_NOMEM_BKPT;
 594     }
 595   }
 596   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
 597     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
 598   }
 599   return rc;
 600 }
 601
 602 /*
 603 ** Query the BtShared.pHasContent vector.
 604 **
 605 ** This function is called when a free-list leaf page is removed from the
 606 ** free-list for reuse. It returns false if it is safe to retrieve the
 607 ** page from the pager layer with the 'no-content' flag set. True otherwise.
 608 */
 609 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
 610   Bitvec *p = pBt->pHasContent;
 611   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
 612 }
 613
 614 /*
 615 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
 616 ** invoked at the conclusion of each write-transaction.
 617 */
 618 static void btreeClearHasContent(BtShared *pBt){
 619   sqlite3BitvecDestroy(pBt->pHasContent);
 620   pBt->pHasContent = 0;
 621 }
 622
 623 /*
 624 ** Release all of the apPage[] pages for a cursor.
 625 */
 626 static void btreeReleaseAllCursorPages(BtCursor *pCur){
 627   int i;
 628   if( pCur->iPage>=0 ){
 629     for(i=0; i<pCur->iPage; i++){
 630       releasePageNotNull(pCur->apPage[i]);
 631     }
 632     releasePageNotNull(pCur->pPage);
 633     pCur->iPage = -1;
 634   }
 635 }
 636
 637 /*
 638 ** The cursor passed as the only argument must point to a valid entry
 639 ** when this function is called (i.e. have eState==CURSOR_VALID). This
 640 ** function saves the current cursor key in variables pCur->nKey and
 641 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
 642 ** code otherwise.
 643 **
 644 ** If the cursor is open on an intkey table, then the integer key
 645 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
 646 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
 647 ** set to point to a malloced buffer pCur->nKey bytes in size containing
 648 ** the key.
 649 */
 650 static int saveCursorKey(BtCursor *pCur){
 651   int rc = SQLITE_OK;
 652   assert( CURSOR_VALID==pCur->eState );
 653   assert( 0==pCur->pKey );
 654   assert( cursorHoldsMutex(pCur) );
 655
 656   if( pCur->curIntKey ){
 657     /* Only the rowid is required for a table btree */
 658     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
 659   }else{
 660     /* For an index btree, save the complete key content */
 661     void *pKey;
 662     pCur->nKey = sqlite3BtreePayloadSize(pCur);
 663     pKey = sqlite3Malloc( pCur->nKey );
 664     if( pKey ){
 665       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
 666       if( rc==SQLITE_OK ){
 667         pCur->pKey = pKey;
 668       }else{
 669         sqlite3_free(pKey);
 670       }
 671     }else{
 672       rc = SQLITE_NOMEM_BKPT;
 673     }
 674   }
 675   assert( !pCur->curIntKey || !pCur->pKey );
 676   return rc;
 677 }
 678
 679 /*
 680 ** Save the current cursor position in the variables BtCursor.nKey
 681 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
 682 **
 683 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
 684 ** prior to calling this routine.
 685 */
 686 static int saveCursorPosition(BtCursor *pCur){
 687   int rc;
 688
 689   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
 690   assert( 0==pCur->pKey );
 691   assert( cursorHoldsMutex(pCur) );
 692
 693   if( pCur->eState==CURSOR_SKIPNEXT ){
 694     pCur->eState = CURSOR_VALID;
 695   }else{
 696     pCur->skipNext = 0;
 697   }
 698
 699   rc = saveCursorKey(pCur);
 700   if( rc==SQLITE_OK ){
 701     btreeReleaseAllCursorPages(pCur);
 702     pCur->eState = CURSOR_REQUIRESEEK;
 703   }
 704
 705   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
 706   return rc;
 707 }
 708
 709 /* Forward reference */
 710 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
 711
 712 /*
 713 ** Save the positions of all cursors (except pExcept) that are open on
 714 ** the table with root-page iRoot.  "Saving the cursor position" means that
 715 ** the location in the btree is remembered in such a way that it can be
 716 ** moved back to the same spot after the btree has been modified.  This
 717 ** routine is called just before cursor pExcept is used to modify the
 718 ** table, for example in BtreeDelete() or BtreeInsert().
 719 **
 720 ** If there are two or more cursors on the same btree, then all such
 721 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
 722 ** routine enforces that rule.  This routine only needs to be called in
 723 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
 724 **
 725 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
 726 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
 727 ** pointless call to this routine.
 728 **
 729 ** Implementation note:  This routine merely checks to see if any cursors
 730 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
 731 ** event that cursors are in need to being saved.
 732 */
 733 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
 734   BtCursor *p;
 735   assert( sqlite3_mutex_held(pBt->mutex) );
 736   assert( pExcept==0 || pExcept->pBt==pBt );
 737   for(p=pBt->pCursor; p; p=p->pNext){
 738     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
 739   }
 740   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
 741   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
 742   return SQLITE_OK;
 743 }
 744
 745 /* This helper routine to saveAllCursors does the actual work of saving
 746 ** the cursors if and when a cursor is found that actually requires saving.
 747 ** The common case is that no cursors need to be saved, so this routine is
 748 ** broken out from its caller to avoid unnecessary stack pointer movement.
 749 */
 750 static int SQLITE_NOINLINE saveCursorsOnList(
 751   BtCursor *p,         /* The first cursor that needs saving */
 752   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
 753   BtCursor *pExcept    /* Do not save this cursor */
 754 ){
 755   do{
 756     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
 757       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
 758         int rc = saveCursorPosition(p);
 759         if( SQLITE_OK!=rc ){
 760           return rc;
 761         }
 762       }else{
 763         testcase( p->iPage>=0 );
 764         btreeReleaseAllCursorPages(p);
 765       }
 766     }
 767     p = p->pNext;
 768   }while( p );
 769   return SQLITE_OK;
 770 }
 771
 772 /*
 773 ** Clear the current cursor position.
 774 */
 775 void sqlite3BtreeClearCursor(BtCursor *pCur){
 776   assert( cursorHoldsMutex(pCur) );
 777   sqlite3_free(pCur->pKey);
 778   pCur->pKey = 0;
 779   pCur->eState = CURSOR_INVALID;
 780 }
 781
 782 /*
 783 ** In this version of BtreeMoveto, pKey is a packed index record
 784 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
 785 ** record and then call BtreeMovetoUnpacked() to do the work.
 786 */
 787 static int btreeMoveto(
 788   BtCursor *pCur,     /* Cursor open on the btree to be searched */
 789   const void *pKey,   /* Packed key if the btree is an index */
 790   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
 791   int bias,           /* Bias search to the high end */
 792   int *pRes           /* Write search results here */
 793 ){
 794   int rc;                    /* Status code */
 795   UnpackedRecord *pIdxKey;   /* Unpacked index key */
 796
 797   if( pKey ){
 798     assert( nKey==(i64)(int)nKey );
 799     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
 800     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
 801     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
 802     if( pIdxKey->nField==0 ){
 803       rc = SQLITE_CORRUPT_BKPT;
 804       goto moveto_done;
 805     }
 806   }else{
 807     pIdxKey = 0;
 808   }
 809   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
 810 moveto_done:
 811   if( pIdxKey ){
 812     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
 813   }
 814   return rc;
 815 }
 816
 817 /*
 818 ** Restore the cursor to the position it was in (or as close to as possible)
 819 ** when saveCursorPosition() was called. Note that this call deletes the
 820 ** saved position info stored by saveCursorPosition(), so there can be
 821 ** at most one effective restoreCursorPosition() call after each
 822 ** saveCursorPosition().
 823 */
 824 static int btreeRestoreCursorPosition(BtCursor *pCur){
 825   int rc;
 826   int skipNext;
 827   assert( cursorOwnsBtShared(pCur) );
 828   assert( pCur->eState>=CURSOR_REQUIRESEEK );
 829   if( pCur->eState==CURSOR_FAULT ){
 830     return pCur->skipNext;
 831   }
 832   pCur->eState = CURSOR_INVALID;
 833   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
 834   if( rc==SQLITE_OK ){
 835     sqlite3_free(pCur->pKey);
 836     pCur->pKey = 0;
 837     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
 838     pCur->skipNext |= skipNext;
 839     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
 840       pCur->eState = CURSOR_SKIPNEXT;
 841     }
 842   }
 843   return rc;
 844 }
 845
 846 #define restoreCursorPosition(p) \
 847   (p->eState>=CURSOR_REQUIRESEEK ? \
 848          btreeRestoreCursorPosition(p) : \
 849          SQLITE_OK)
 850
 851 /*
 852 ** Determine whether or not a cursor has moved from the position where
 853 ** it was last placed, or has been invalidated for any other reason.
 854 ** Cursors can move when the row they are pointing at is deleted out
 855 ** from under them, for example.  Cursor might also move if a btree
 856 ** is rebalanced.
 857 **
 858 ** Calling this routine with a NULL cursor pointer returns false.
 859 **
 860 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
 861 ** back to where it ought to be if this routine returns true.
 862 */
 863 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
 864   return pCur->eState!=CURSOR_VALID;
 865 }
 866
 867 /*
 868 ** Return a pointer to a fake BtCursor object that will always answer
 869 ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
 870 ** cursor returned must not be used with any other Btree interface.
 871 */
 872 BtCursor *sqlite3BtreeFakeValidCursor(void){
 873   static u8 fakeCursor = CURSOR_VALID;
 874   assert( offsetof(BtCursor, eState)==0 );
 875   return (BtCursor*)&fakeCursor;
 876 }
 877
 878 /*
 879 ** This routine restores a cursor back to its original position after it
 880 ** has been moved by some outside activity (such as a btree rebalance or
 881 ** a row having been deleted out from under the cursor).
 882 **
 883 ** On success, the *pDifferentRow parameter is false if the cursor is left
 884 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
 885 ** was pointing to has been deleted, forcing the cursor to point to some
 886 ** nearby row.
 887 **
 888 ** This routine should only be called for a cursor that just returned
 889 ** TRUE from sqlite3BtreeCursorHasMoved().
 890 */
 891 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
 892   int rc;
 893
 894   assert( pCur!=0 );
 895   assert( pCur->eState!=CURSOR_VALID );
 896   rc = restoreCursorPosition(pCur);
 897   if( rc ){
 898     *pDifferentRow = 1;
 899     return rc;
 900   }
 901   if( pCur->eState!=CURSOR_VALID ){
 902     *pDifferentRow = 1;
 903   }else{
 904     assert( pCur->skipNext==0 );
 905     *pDifferentRow = 0;
 906   }
 907   return SQLITE_OK;
 908 }
 909
 910 #ifdef SQLITE_ENABLE_CURSOR_HINTS
 911 /*
 912 ** Provide hints to the cursor.  The particular hint given (and the type
 913 ** and number of the varargs parameters) is determined by the eHintType
 914 ** parameter.  See the definitions of the BTREE_HINT_* macros for details.
 915 */
 916 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
 917   /* Used only by system that substitute their own storage engine */
 918 }
 919 #endif
 920
 921 /*
 922 ** Provide flag hints to the cursor.
 923 */
 924 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
 925   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
 926   pCur->hints = x;
 927 }
 928
 929
 930 #ifndef SQLITE_OMIT_AUTOVACUUM
 931 /*
 932 ** Given a page number of a regular database page, return the page
 933 ** number for the pointer-map page that contains the entry for the
 934 ** input page number.
 935 **
 936 ** Return 0 (not a valid page) for pgno==1 since there is
 937 ** no pointer map associated with page 1.  The integrity_check logic
 938 ** requires that ptrmapPageno(*,1)!=1.
 939 */
 940 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
 941   int nPagesPerMapPage;
 942   Pgno iPtrMap, ret;
 943   assert( sqlite3_mutex_held(pBt->mutex) );
 944   if( pgno<2 ) return 0;
 945   nPagesPerMapPage = (pBt->usableSize/5)+1;
 946   iPtrMap = (pgno-2)/nPagesPerMapPage;
 947   ret = (iPtrMap*nPagesPerMapPage) + 2;
 948   if( ret==PENDING_BYTE_PAGE(pBt) ){
 949     ret++;
 950   }
 951   return ret;
 952 }
 953
 954 /*
 955 ** Write an entry into the pointer map.
 956 **
 957 ** This routine updates the pointer map entry for page number 'key'
 958 ** so that it maps to type 'eType' and parent page number 'pgno'.
 959 **
 960 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
 961 ** a no-op.  If an error occurs, the appropriate error code is written
 962 ** into *pRC.
 963 */
 964 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
 965   DbPage *pDbPage;  /* The pointer map page */
 966   u8 *pPtrmap;      /* The pointer map data */
 967   Pgno iPtrmap;     /* The pointer map page number */
 968   int offset;       /* Offset in pointer map page */
 969   int rc;           /* Return code from subfunctions */
 970
 971   if( *pRC ) return;
 972
 973   assert( sqlite3_mutex_held(pBt->mutex) );
 974   /* The master-journal page number must never be used as a pointer map page */
 975   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
 976
 977   assert( pBt->autoVacuum );
 978   if( key==0 ){
 979     *pRC = SQLITE_CORRUPT_BKPT;
 980     return;
 981   }
 982   iPtrmap = PTRMAP_PAGENO(pBt, key);
 983   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
 984   if( rc!=SQLITE_OK ){
 985     *pRC = rc;
 986     return;
 987   }
 988   offset = PTRMAP_PTROFFSET(iPtrmap, key);
 989   if( offset<0 ){
 990     *pRC = SQLITE_CORRUPT_BKPT;
 991     goto ptrmap_exit;
 992   }
 993   assert( offset <= (int)pBt->usableSize-5 );
 994   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
 995
 996   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
 997     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
 998     *pRC= rc = sqlite3PagerWrite(pDbPage);
 999     if( rc==SQLITE_OK ){
1000       pPtrmap[offset] = eType;
1001       put4byte(&pPtrmap[offset+1], parent);
1002     }
1003   }
1004
1005 ptrmap_exit:
1006   sqlite3PagerUnref(pDbPage);
1007 }
1008
1009 /*
1010 ** Read an entry from the pointer map.
1011 **
1012 ** This routine retrieves the pointer map entry for page 'key', writing
1013 ** the type and parent page number to *pEType and *pPgno respectively.
1014 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1015 */
1016 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1017   DbPage *pDbPage;   /* The pointer map page */
1018   int iPtrmap;       /* Pointer map page index */
1019   u8 *pPtrmap;       /* Pointer map page data */
1020   int offset;        /* Offset of entry in pointer map */
1021   int rc;
1022
1023   assert( sqlite3_mutex_held(pBt->mutex) );
1024
1025   iPtrmap = PTRMAP_PAGENO(pBt, key);
1026   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1027   if( rc!=0 ){
1028     return rc;
1029   }
1030   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1031
1032   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1033   if( offset<0 ){
1034     sqlite3PagerUnref(pDbPage);
1035     return SQLITE_CORRUPT_BKPT;
1036   }
1037   assert( offset <= (int)pBt->usableSize-5 );
1038   assert( pEType!=0 );
1039   *pEType = pPtrmap[offset];
1040   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1041
1042   sqlite3PagerUnref(pDbPage);
1043   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1044   return SQLITE_OK;
1045 }
1046
1047 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
1048   #define ptrmapPut(w,x,y,z,rc)
1049   #define ptrmapGet(w,x,y,z) SQLITE_OK
1050   #define ptrmapPutOvflPtr(x, y, rc)
1051 #endif
1052
1053 /*
1054 ** Given a btree page and a cell index (0 means the first cell on
1055 ** the page, 1 means the second cell, and so forth) return a pointer
1056 ** to the cell content.
1057 **
1058 ** findCellPastPtr() does the same except it skips past the initial
1059 ** 4-byte child pointer found on interior pages, if there is one.
1060 **
1061 ** This routine works only for pages that do not contain overflow cells.
1062 */
1063 #define findCell(P,I) \
1064   ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1065 #define findCellPastPtr(P,I) \
1066   ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1067
1068
1069 /*
1070 ** This is common tail processing for btreeParseCellPtr() and
1071 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1072 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1073 ** structure.
1074 */
1075 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1076   MemPage *pPage,         /* Page containing the cell */
1077   u8 *pCell,              /* Pointer to the cell text. */
1078   CellInfo *pInfo         /* Fill in this structure */
1079 ){
1080   /* If the payload will not fit completely on the local page, we have
1081   ** to decide how much to store locally and how much to spill onto
1082   ** overflow pages.  The strategy is to minimize the amount of unused
1083   ** space on overflow pages while keeping the amount of local storage
1084   ** in between minLocal and maxLocal.
1085   **
1086   ** Warning:  changing the way overflow payload is distributed in any
1087   ** way will result in an incompatible file format.
1088   */
1089   int minLocal;  /* Minimum amount of payload held locally */
1090   int maxLocal;  /* Maximum amount of payload held locally */
1091   int surplus;   /* Overflow payload available for local storage */
1092
1093   minLocal = pPage->minLocal;
1094   maxLocal = pPage->maxLocal;
1095   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1096   testcase( surplus==maxLocal );
1097   testcase( surplus==maxLocal+1 );
1098   if( surplus <= maxLocal ){
1099     pInfo->nLocal = (u16)surplus;
1100   }else{
1101     pInfo->nLocal = (u16)minLocal;
1102   }
1103   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1104 }
1105
1106 /*
1107 ** The following routines are implementations of the MemPage.xParseCell()
1108 ** method.
1109 **
1110 ** Parse a cell content block and fill in the CellInfo structure.
1111 **
1112 ** btreeParseCellPtr()        =>   table btree leaf nodes
1113 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1114 ** btreeParseCellPtrIndex()   =>   index btree nodes
1115 **
1116 ** There is also a wrapper function btreeParseCell() that works for
1117 ** all MemPage types and that references the cell by index rather than
1118 ** by pointer.
1119 */
1120 static void btreeParseCellPtrNoPayload(
1121   MemPage *pPage,         /* Page containing the cell */
1122   u8 *pCell,              /* Pointer to the cell text. */
1123   CellInfo *pInfo         /* Fill in this structure */
1124 ){
1125   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1126   assert( pPage->leaf==0 );
1127   assert( pPage->childPtrSize==4 );
1128 #ifndef SQLITE_DEBUG
1129   UNUSED_PARAMETER(pPage);
1130 #endif
1131   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1132   pInfo->nPayload = 0;
1133   pInfo->nLocal = 0;
1134   pInfo->pPayload = 0;
1135   return;
1136 }
1137 static void btreeParseCellPtr(
1138   MemPage *pPage,         /* Page containing the cell */
1139   u8 *pCell,              /* Pointer to the cell text. */
1140   CellInfo *pInfo         /* Fill in this structure */
1141 ){
1142   u8 *pIter;              /* For scanning through pCell */
1143   u32 nPayload;           /* Number of bytes of cell payload */
1144   u64 iKey;               /* Extracted Key value */
1145
1146   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1147   assert( pPage->leaf==0 || pPage->leaf==1 );
1148   assert( pPage->intKeyLeaf );
1149   assert( pPage->childPtrSize==0 );
1150   pIter = pCell;
1151
1152   /* The next block of code is equivalent to:
1153   **
1154   **     pIter += getVarint32(pIter, nPayload);
1155   **
1156   ** The code is inlined to avoid a function call.
1157   */
1158   nPayload = *pIter;
1159   if( nPayload>=0x80 ){
1160     u8 *pEnd = &pIter[8];
1161     nPayload &= 0x7f;
1162     do{
1163       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1164     }while( (*pIter)>=0x80 && pIter<pEnd );
1165   }
1166   pIter++;
1167
1168   /* The next block of code is equivalent to:
1169   **
1170   **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1171   **
1172   ** The code is inlined to avoid a function call.
1173   */
1174   iKey = *pIter;
1175   if( iKey>=0x80 ){
1176     u8 *pEnd = &pIter[7];
1177     iKey &= 0x7f;
1178     while(1){
1179       iKey = (iKey<<7) | (*++pIter & 0x7f);
1180       if( (*pIter)<0x80 ) break;
1181       if( pIter>=pEnd ){
1182         iKey = (iKey<<8) | *++pIter;
1183         break;
1184       }
1185     }
1186   }
1187   pIter++;
1188
1189   pInfo->nKey = *(i64*)&iKey;
1190   pInfo->nPayload = nPayload;
1191   pInfo->pPayload = pIter;
1192   testcase( nPayload==pPage->maxLocal );
1193   testcase( nPayload==pPage->maxLocal+1 );
1194   if( nPayload<=pPage->maxLocal ){
1195     /* This is the (easy) common case where the entire payload fits
1196     ** on the local page.  No overflow is required.
1197     */
1198     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1199     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1200     pInfo->nLocal = (u16)nPayload;
1201   }else{
1202     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1203   }
1204 }
1205 static void btreeParseCellPtrIndex(
1206   MemPage *pPage,         /* Page containing the cell */
1207   u8 *pCell,              /* Pointer to the cell text. */
1208   CellInfo *pInfo         /* Fill in this structure */
1209 ){
1210   u8 *pIter;              /* For scanning through pCell */
1211   u32 nPayload;           /* Number of bytes of cell payload */
1212
1213   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1214   assert( pPage->leaf==0 || pPage->leaf==1 );
1215   assert( pPage->intKeyLeaf==0 );
1216   pIter = pCell + pPage->childPtrSize;
1217   nPayload = *pIter;
1218   if( nPayload>=0x80 ){
1219     u8 *pEnd = &pIter[8];
1220     nPayload &= 0x7f;
1221     do{
1222       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1223     }while( *(pIter)>=0x80 && pIter<pEnd );
1224   }
1225   pIter++;
1226   pInfo->nKey = nPayload;
1227   pInfo->nPayload = nPayload;
1228   pInfo->pPayload = pIter;
1229   testcase( nPayload==pPage->maxLocal );
1230   testcase( nPayload==pPage->maxLocal+1 );
1231   if( nPayload<=pPage->maxLocal ){
1232     /* This is the (easy) common case where the entire payload fits
1233     ** on the local page.  No overflow is required.
1234     */
1235     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1236     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1237     pInfo->nLocal = (u16)nPayload;
1238   }else{
1239     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1240   }
1241 }
1242 static void btreeParseCell(
1243   MemPage *pPage,         /* Page containing the cell */
1244   int iCell,              /* The cell index.  First cell is 0 */
1245   CellInfo *pInfo         /* Fill in this structure */
1246 ){
1247   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1248 }
1249
1250 /*
1251 ** The following routines are implementations of the MemPage.xCellSize
1252 ** method.
1253 **
1254 ** Compute the total number of bytes that a Cell needs in the cell
1255 ** data area of the btree-page.  The return number includes the cell
1256 ** data header and the local payload, but not any overflow page or
1257 ** the space used by the cell pointer.
1258 **
1259 ** cellSizePtrNoPayload()    =>   table internal nodes
1260 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1261 */
1262 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1263   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1264   u8 *pEnd;                                /* End mark for a varint */
1265   u32 nSize;                               /* Size value to return */
1266
1267 #ifdef SQLITE_DEBUG
1268   /* The value returned by this function should always be the same as
1269   ** the (CellInfo.nSize) value found by doing a full parse of the
1270   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1271   ** this function verifies that this invariant is not violated. */
1272   CellInfo debuginfo;
1273   pPage->xParseCell(pPage, pCell, &debuginfo);
1274 #endif
1275
1276   nSize = *pIter;
1277   if( nSize>=0x80 ){
1278     pEnd = &pIter[8];
1279     nSize &= 0x7f;
1280     do{
1281       nSize = (nSize<<7) | (*++pIter & 0x7f);
1282     }while( *(pIter)>=0x80 && pIter<pEnd );
1283   }
1284   pIter++;
1285   if( pPage->intKey ){
1286     /* pIter now points at the 64-bit integer key value, a variable length
1287     ** integer. The following block moves pIter to point at the first byte
1288     ** past the end of the key value. */
1289     pEnd = &pIter[9];
1290     while( (*pIter++)&0x80 && pIter<pEnd );
1291   }
1292   testcase( nSize==pPage->maxLocal );
1293   testcase( nSize==pPage->maxLocal+1 );
1294   if( nSize<=pPage->maxLocal ){
1295     nSize += (u32)(pIter - pCell);
1296     if( nSize<4 ) nSize = 4;
1297   }else{
1298     int minLocal = pPage->minLocal;
1299     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1300     testcase( nSize==pPage->maxLocal );
1301     testcase( nSize==pPage->maxLocal+1 );
1302     if( nSize>pPage->maxLocal ){
1303       nSize = minLocal;
1304     }
1305     nSize += 4 + (u16)(pIter - pCell);
1306   }
1307   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1308   return (u16)nSize;
1309 }
1310 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1311   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1312   u8 *pEnd;              /* End mark for a varint */
1313
1314 #ifdef SQLITE_DEBUG
1315   /* The value returned by this function should always be the same as
1316   ** the (CellInfo.nSize) value found by doing a full parse of the
1317   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1318   ** this function verifies that this invariant is not violated. */
1319   CellInfo debuginfo;
1320   pPage->xParseCell(pPage, pCell, &debuginfo);
1321 #else
1322   UNUSED_PARAMETER(pPage);
1323 #endif
1324
1325   assert( pPage->childPtrSize==4 );
1326   pEnd = pIter + 9;
1327   while( (*pIter++)&0x80 && pIter<pEnd );
1328   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1329   return (u16)(pIter - pCell);
1330 }
1331
1332
1333 #ifdef SQLITE_DEBUG
1334 /* This variation on cellSizePtr() is used inside of assert() statements
1335 ** only. */
1336 static u16 cellSize(MemPage *pPage, int iCell){
1337   return pPage->xCellSize(pPage, findCell(pPage, iCell));
1338 }
1339 #endif
1340
1341 #ifndef SQLITE_OMIT_AUTOVACUUM
1342 /*
1343 ** If the cell pCell, part of page pPage contains a pointer
1344 ** to an overflow page, insert an entry into the pointer-map
1345 ** for the overflow page.
1346 */
1347 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1348   CellInfo info;
1349   if( *pRC ) return;
1350   assert( pCell!=0 );
1351   pPage->xParseCell(pPage, pCell, &info);
1352   if( info.nLocal<info.nPayload ){
1353     Pgno ovfl = get4byte(&pCell[info.nSize-4]);
1354     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1355   }
1356 }
1357 #endif
1358
1359
1360 /*
1361 ** Defragment the page given. This routine reorganizes cells within the
1362 ** page so that there are no free-blocks on the free-block list.
1363 **
1364 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
1365 ** present in the page after this routine returns.
1366 **
1367 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
1368 ** b-tree page so that there are no freeblocks or fragment bytes, all
1369 ** unused bytes are contained in the unallocated space region, and all
1370 ** cells are packed tightly at the end of the page.
1371 */
1372 static int defragmentPage(MemPage *pPage, int nMaxFrag){
1373   int i;                     /* Loop counter */
1374   int pc;                    /* Address of the i-th cell */
1375   int hdr;                   /* Offset to the page header */
1376   int size;                  /* Size of a cell */
1377   int usableSize;            /* Number of usable bytes on a page */
1378   int cellOffset;            /* Offset to the cell pointer array */
1379   int cbrk;                  /* Offset to the cell content area */
1380   int nCell;                 /* Number of cells on the page */
1381   unsigned char *data;       /* The page data */
1382   unsigned char *temp;       /* Temp area for cell content */
1383   unsigned char *src;        /* Source of content */
1384   int iCellFirst;            /* First allowable cell index */
1385   int iCellLast;             /* Last possible cell index */
1386
1387   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1388   assert( pPage->pBt!=0 );
1389   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1390   assert( pPage->nOverflow==0 );
1391   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1392   temp = 0;
1393   src = data = pPage->aData;
1394   hdr = pPage->hdrOffset;
1395   cellOffset = pPage->cellOffset;
1396   nCell = pPage->nCell;
1397   assert( nCell==get2byte(&data[hdr+3]) );
1398   iCellFirst = cellOffset + 2*nCell;
1399   usableSize = pPage->pBt->usableSize;
1400
1401   /* This block handles pages with two or fewer free blocks and nMaxFrag
1402   ** or fewer fragmented bytes. In this case it is faster to move the
1403   ** two (or one) blocks of cells using memmove() and add the required
1404   ** offsets to each pointer in the cell-pointer array than it is to
1405   ** reconstruct the entire page.  */
1406   if( (int)data[hdr+7]<=nMaxFrag ){
1407     int iFree = get2byte(&data[hdr+1]);
1408     if( iFree ){
1409       int iFree2 = get2byte(&data[iFree]);
1410
1411       /* pageFindSlot() has already verified that free blocks are sorted
1412       ** in order of offset within the page, and that no block extends
1413       ** past the end of the page. Provided the two free slots do not
1414       ** overlap, this guarantees that the memmove() calls below will not
1415       ** overwrite the usableSize byte buffer, even if the database page
1416       ** is corrupt.  */
1417       assert( iFree2==0 || iFree2>iFree );
1418       assert( iFree+get2byte(&data[iFree+2]) <= usableSize );
1419       assert( iFree2==0 || iFree2+get2byte(&data[iFree2+2]) <= usableSize );
1420
1421       if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
1422         u8 *pEnd = &data[cellOffset + nCell*2];
1423         u8 *pAddr;
1424         int sz2 = 0;
1425         int sz = get2byte(&data[iFree+2]);
1426         int top = get2byte(&data[hdr+5]);
1427         if( top>=iFree ){
1428           return SQLITE_CORRUPT_PAGE(pPage);
1429         }
1430         if( iFree2 ){
1431           assert( iFree+sz<=iFree2 ); /* Verified by pageFindSlot() */
1432           sz2 = get2byte(&data[iFree2+2]);
1433           assert( iFree+sz+sz2+iFree2-(iFree+sz) <= usableSize );
1434           memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
1435           sz += sz2;
1436         }
1437         cbrk = top+sz;
1438         assert( cbrk+(iFree-top) <= usableSize );
1439         memmove(&data[cbrk], &data[top], iFree-top);
1440         for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
1441           pc = get2byte(pAddr);
1442           if( pc<iFree ){ put2byte(pAddr, pc+sz); }
1443           else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
1444         }
1445         goto defragment_out;
1446       }
1447     }
1448   }
1449
1450   cbrk = usableSize;
1451   iCellLast = usableSize - 4;
1452   for(i=0; i<nCell; i++){
1453     u8 *pAddr;     /* The i-th cell pointer */
1454     pAddr = &data[cellOffset + i*2];
1455     pc = get2byte(pAddr);
1456     testcase( pc==iCellFirst );
1457     testcase( pc==iCellLast );
1458     /* These conditions have already been verified in btreeInitPage()
1459     ** if PRAGMA cell_size_check=ON.
1460     */
1461     if( pc<iCellFirst || pc>iCellLast ){
1462       return SQLITE_CORRUPT_PAGE(pPage);
1463     }
1464     assert( pc>=iCellFirst && pc<=iCellLast );
1465     size = pPage->xCellSize(pPage, &src[pc]);
1466     cbrk -= size;
1467     if( cbrk<iCellFirst || pc+size>usableSize ){
1468       return SQLITE_CORRUPT_PAGE(pPage);
1469     }
1470     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1471     testcase( cbrk+size==usableSize );
1472     testcase( pc+size==usableSize );
1473     put2byte(pAddr, cbrk);
1474     if( temp==0 ){
1475       int x;
1476       if( cbrk==pc ) continue;
1477       temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1478       x = get2byte(&data[hdr+5]);
1479       memcpy(&temp[x], &data[x], (cbrk+size) - x);
1480       src = temp;
1481     }
1482     memcpy(&data[cbrk], &src[pc], size);
1483   }
1484   data[hdr+7] = 0;
1485
1486  defragment_out:
1487   if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
1488     return SQLITE_CORRUPT_PAGE(pPage);
1489   }
1490   assert( cbrk>=iCellFirst );
1491   put2byte(&data[hdr+5], cbrk);
1492   data[hdr+1] = 0;
1493   data[hdr+2] = 0;
1494   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1495   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1496   return SQLITE_OK;
1497 }
1498
1499 /*
1500 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1501 ** size. If one can be found, return a pointer to the space and remove it
1502 ** from the free-list.
1503 **
1504 ** If no suitable space can be found on the free-list, return NULL.
1505 **
1506 ** This function may detect corruption within pPg.  If corruption is
1507 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1508 **
1509 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1510 ** will be ignored if adding the extra space to the fragmentation count
1511 ** causes the fragmentation count to exceed 60.
1512 */
1513 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1514   const int hdr = pPg->hdrOffset;
1515   u8 * const aData = pPg->aData;
1516   int iAddr = hdr + 1;
1517   int pc = get2byte(&aData[iAddr]);
1518   int x;
1519   int usableSize = pPg->pBt->usableSize;
1520   int size;            /* Size of the free slot */
1521
1522   assert( pc>0 );
1523   while( pc<=usableSize-4 ){
1524     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1525     ** freeblock form a big-endian integer which is the size of the freeblock
1526     ** in bytes, including the 4-byte header. */
1527     size = get2byte(&aData[pc+2]);
1528     if( (x = size - nByte)>=0 ){
1529       testcase( x==4 );
1530       testcase( x==3 );
1531       if( size+pc > usableSize ){
1532         *pRc = SQLITE_CORRUPT_PAGE(pPg);
1533         return 0;
1534       }else if( x<4 ){
1535         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1536         ** number of bytes in fragments may not exceed 60. */
1537         if( aData[hdr+7]>57 ) return 0;
1538
1539         /* Remove the slot from the free-list. Update the number of
1540         ** fragmented bytes within the page. */
1541         memcpy(&aData[iAddr], &aData[pc], 2);
1542         aData[hdr+7] += (u8)x;
1543       }else{
1544         /* The slot remains on the free-list. Reduce its size to account
1545          ** for the portion used by the new allocation. */
1546         put2byte(&aData[pc+2], x);
1547       }
1548       return &aData[pc + x];
1549     }
1550     iAddr = pc;
1551     pc = get2byte(&aData[pc]);
1552     if( pc<iAddr+size ) break;
1553   }
1554   if( pc ){
1555     *pRc = SQLITE_CORRUPT_PAGE(pPg);
1556   }
1557
1558   return 0;
1559 }
1560
1561 /*
1562 ** Allocate nByte bytes of space from within the B-Tree page passed
1563 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1564 ** of the first byte of allocated space. Return either SQLITE_OK or
1565 ** an error code (usually SQLITE_CORRUPT).
1566 **
1567 ** The caller guarantees that there is sufficient space to make the
1568 ** allocation.  This routine might need to defragment in order to bring
1569 ** all the space together, however.  This routine will avoid using
1570 ** the first two bytes past the cell pointer area since presumably this
1571 ** allocation is being made in order to insert a new cell, so we will
1572 ** also end up needing a new cell pointer.
1573 */
1574 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1575   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1576   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1577   int top;                             /* First byte of cell content area */
1578   int rc = SQLITE_OK;                  /* Integer return code */
1579   int gap;        /* First byte of gap between cell pointers and cell content */
1580
1581   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1582   assert( pPage->pBt );
1583   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1584   assert( nByte>=0 );  /* Minimum cell size is 4 */
1585   assert( pPage->nFree>=nByte );
1586   assert( pPage->nOverflow==0 );
1587   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1588
1589   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1590   gap = pPage->cellOffset + 2*pPage->nCell;
1591   assert( gap<=65536 );
1592   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1593   ** and the reserved space is zero (the usual value for reserved space)
1594   ** then the cell content offset of an empty page wants to be 65536.
1595   ** However, that integer is too large to be stored in a 2-byte unsigned
1596   ** integer, so a value of 0 is used in its place. */
1597   top = get2byte(&data[hdr+5]);
1598   assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
1599   if( gap>top ){
1600     if( top==0 && pPage->pBt->usableSize==65536 ){
1601       top = 65536;
1602     }else{
1603       return SQLITE_CORRUPT_PAGE(pPage);
1604     }
1605   }
1606
1607   /* If there is enough space between gap and top for one more cell pointer
1608   ** array entry offset, and if the freelist is not empty, then search the
1609   ** freelist looking for a free slot big enough to satisfy the request.
1610   */
1611   testcase( gap+2==top );
1612   testcase( gap+1==top );
1613   testcase( gap==top );
1614   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1615     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1616     if( pSpace ){
1617       assert( pSpace>=data && (pSpace - data)<65536 );
1618       *pIdx = (int)(pSpace - data);
1619       return SQLITE_OK;
1620     }else if( rc ){
1621       return rc;
1622     }
1623   }
1624
1625   /* The request could not be fulfilled using a freelist slot.  Check
1626   ** to see if defragmentation is necessary.
1627   */
1628   testcase( gap+2+nByte==top );
1629   if( gap+2+nByte>top ){
1630     assert( pPage->nCell>0 || CORRUPT_DB );
1631     rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
1632     if( rc ) return rc;
1633     top = get2byteNotZero(&data[hdr+5]);
1634     assert( gap+2+nByte<=top );
1635   }
1636
1637
1638   /* Allocate memory from the gap in between the cell pointer array
1639   ** and the cell content area.  The btreeInitPage() call has already
1640   ** validated the freelist.  Given that the freelist is valid, there
1641   ** is no way that the allocation can extend off the end of the page.
1642   ** The assert() below verifies the previous sentence.
1643   */
1644   top -= nByte;
1645   put2byte(&data[hdr+5], top);
1646   assert( top+nByte <= (int)pPage->pBt->usableSize );
1647   *pIdx = top;
1648   return SQLITE_OK;
1649 }
1650
1651 /*
1652 ** Return a section of the pPage->aData to the freelist.
1653 ** The first byte of the new free block is pPage->aData[iStart]
1654 ** and the size of the block is iSize bytes.
1655 **
1656 ** Adjacent freeblocks are coalesced.
1657 **
1658 ** Note that even though the freeblock list was checked by btreeInitPage(),
1659 ** that routine will not detect overlap between cells or freeblocks.  Nor
1660 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1661 ** at the end of the page.  So do additional corruption checks inside this
1662 ** routine and return SQLITE_CORRUPT if any problems are found.
1663 */
1664 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1665   u16 iPtr;                             /* Address of ptr to next freeblock */
1666   u16 iFreeBlk;                         /* Address of the next freeblock */
1667   u8 hdr;                               /* Page header size.  0 or 100 */
1668   u8 nFrag = 0;                         /* Reduction in fragmentation */
1669   u16 iOrigSize = iSize;                /* Original value of iSize */
1670   u16 x;                                /* Offset to cell content area */
1671   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1672   unsigned char *data = pPage->aData;   /* Page content */
1673
1674   assert( pPage->pBt!=0 );
1675   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1676   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1677   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1678   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1679   assert( iSize>=4 );   /* Minimum cell size is 4 */
1680   assert( iStart<=pPage->pBt->usableSize-4 );
1681
1682   /* The list of freeblocks must be in ascending order.  Find the
1683   ** spot on the list where iStart should be inserted.
1684   */
1685   hdr = pPage->hdrOffset;
1686   iPtr = hdr + 1;
1687   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1688     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1689   }else{
1690     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1691       if( iFreeBlk<iPtr+4 ){
1692         if( iFreeBlk==0 ) break;
1693         return SQLITE_CORRUPT_PAGE(pPage);
1694       }
1695       iPtr = iFreeBlk;
1696     }
1697     if( iFreeBlk>pPage->pBt->usableSize-4 ){
1698       return SQLITE_CORRUPT_PAGE(pPage);
1699     }
1700     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1701
1702     /* At this point:
1703     **    iFreeBlk:   First freeblock after iStart, or zero if none
1704     **    iPtr:       The address of a pointer to iFreeBlk
1705     **
1706     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1707     */
1708     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1709       nFrag = iFreeBlk - iEnd;
1710       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1711       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1712       if( iEnd > pPage->pBt->usableSize ){
1713         return SQLITE_CORRUPT_PAGE(pPage);
1714       }
1715       iSize = iEnd - iStart;
1716       iFreeBlk = get2byte(&data[iFreeBlk]);
1717     }
1718
1719     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1720     ** pointer in the page header) then check to see if iStart should be
1721     ** coalesced onto the end of iPtr.
1722     */
1723     if( iPtr>hdr+1 ){
1724       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1725       if( iPtrEnd+3>=iStart ){
1726         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1727         nFrag += iStart - iPtrEnd;
1728         iSize = iEnd - iPtr;
1729         iStart = iPtr;
1730       }
1731     }
1732     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1733     data[hdr+7] -= nFrag;
1734   }
1735   x = get2byte(&data[hdr+5]);
1736   if( iStart<=x ){
1737     /* The new freeblock is at the beginning of the cell content area,
1738     ** so just extend the cell content area rather than create another
1739     ** freelist entry */
1740     if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1741     put2byte(&data[hdr+1], iFreeBlk);
1742     put2byte(&data[hdr+5], iEnd);
1743   }else{
1744     /* Insert the new freeblock into the freelist */
1745     put2byte(&data[iPtr], iStart);
1746   }
1747   if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1748     /* Overwrite deleted information with zeros when the secure_delete
1749     ** option is enabled */
1750     memset(&data[iStart], 0, iSize);
1751   }
1752   put2byte(&data[iStart], iFreeBlk);
1753   put2byte(&data[iStart+2], iSize);
1754   pPage->nFree += iOrigSize;
1755   return SQLITE_OK;
1756 }
1757
1758 /*
1759 ** Decode the flags byte (the first byte of the header) for a page
1760 ** and initialize fields of the MemPage structure accordingly.
1761 **
1762 ** Only the following combinations are supported.  Anything different
1763 ** indicates a corrupt database files:
1764 **
1765 **         PTF_ZERODATA
1766 **         PTF_ZERODATA | PTF_LEAF
1767 **         PTF_LEAFDATA | PTF_INTKEY
1768 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1769 */
1770 static int decodeFlags(MemPage *pPage, int flagByte){
1771   BtShared *pBt;     /* A copy of pPage->pBt */
1772
1773   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1774   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1775   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1776   flagByte &= ~PTF_LEAF;
1777   pPage->childPtrSize = 4-4*pPage->leaf;
1778   pPage->xCellSize = cellSizePtr;
1779   pBt = pPage->pBt;
1780   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1781     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1782     ** interior table b-tree page. */
1783     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1784     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1785     ** leaf table b-tree page. */
1786     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1787     pPage->intKey = 1;
1788     if( pPage->leaf ){
1789       pPage->intKeyLeaf = 1;
1790       pPage->xParseCell = btreeParseCellPtr;
1791     }else{
1792       pPage->intKeyLeaf = 0;
1793       pPage->xCellSize = cellSizePtrNoPayload;
1794       pPage->xParseCell = btreeParseCellPtrNoPayload;
1795     }
1796     pPage->maxLocal = pBt->maxLeaf;
1797     pPage->minLocal = pBt->minLeaf;
1798   }else if( flagByte==PTF_ZERODATA ){
1799     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1800     ** interior index b-tree page. */
1801     assert( (PTF_ZERODATA)==2 );
1802     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1803     ** leaf index b-tree page. */
1804     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1805     pPage->intKey = 0;
1806     pPage->intKeyLeaf = 0;
1807     pPage->xParseCell = btreeParseCellPtrIndex;
1808     pPage->maxLocal = pBt->maxLocal;
1809     pPage->minLocal = pBt->minLocal;
1810   }else{
1811     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1812     ** an error. */
1813     return SQLITE_CORRUPT_PAGE(pPage);
1814   }
1815   pPage->max1bytePayload = pBt->max1bytePayload;
1816   return SQLITE_OK;
1817 }
1818
1819 /*
1820 ** Initialize the auxiliary information for a disk block.
1821 **
1822 ** Return SQLITE_OK on success.  If we see that the page does
1823 ** not contain a well-formed database page, then return
1824 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1825 ** guarantee that the page is well-formed.  It only shows that
1826 ** we failed to detect any corruption.
1827 */
1828 static int btreeInitPage(MemPage *pPage){
1829   int pc;            /* Address of a freeblock within pPage->aData[] */
1830   u8 hdr;            /* Offset to beginning of page header */
1831   u8 *data;          /* Equal to pPage->aData */
1832   BtShared *pBt;        /* The main btree structure */
1833   int usableSize;    /* Amount of usable space on each page */
1834   u16 cellOffset;    /* Offset from start of page to first cell pointer */
1835   int nFree;         /* Number of unused bytes on the page */
1836   int top;           /* First byte of the cell content area */
1837   int iCellFirst;    /* First allowable cell or freeblock offset */
1838   int iCellLast;     /* Last possible cell or freeblock offset */
1839
1840   assert( pPage->pBt!=0 );
1841   assert( pPage->pBt->db!=0 );
1842   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1843   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1844   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1845   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1846   assert( pPage->isInit==0 );
1847
1848   pBt = pPage->pBt;
1849   hdr = pPage->hdrOffset;
1850   data = pPage->aData;
1851   /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
1852   ** the b-tree page type. */
1853   if( decodeFlags(pPage, data[hdr]) ){
1854     return SQLITE_CORRUPT_PAGE(pPage);
1855   }
1856   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1857   pPage->maskPage = (u16)(pBt->pageSize - 1);
1858   pPage->nOverflow = 0;
1859   usableSize = pBt->usableSize;
1860   pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
1861   pPage->aDataEnd = &data[usableSize];
1862   pPage->aCellIdx = &data[cellOffset];
1863   pPage->aDataOfst = &data[pPage->childPtrSize];
1864   /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1865   ** the start of the cell content area. A zero value for this integer is
1866   ** interpreted as 65536. */
1867   top = get2byteNotZero(&data[hdr+5]);
1868   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
1869   ** number of cells on the page. */
1870   pPage->nCell = get2byte(&data[hdr+3]);
1871   if( pPage->nCell>MX_CELL(pBt) ){
1872     /* To many cells for a single page.  The page must be corrupt */
1873     return SQLITE_CORRUPT_PAGE(pPage);
1874   }
1875   testcase( pPage->nCell==MX_CELL(pBt) );
1876   /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
1877   ** possible for a root page of a table that contains no rows) then the
1878   ** offset to the cell content area will equal the page size minus the
1879   ** bytes of reserved space. */
1880   assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
1881
1882   /* A malformed database page might cause us to read past the end
1883   ** of page when parsing a cell.
1884   **
1885   ** The following block of code checks early to see if a cell extends
1886   ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1887   ** returned if it does.
1888   */
1889   iCellFirst = cellOffset + 2*pPage->nCell;
1890   iCellLast = usableSize - 4;
1891   if( pBt->db->flags & SQLITE_CellSizeCk ){
1892     int i;            /* Index into the cell pointer array */
1893     int sz;           /* Size of a cell */
1894
1895     if( !pPage->leaf ) iCellLast--;
1896     for(i=0; i<pPage->nCell; i++){
1897       pc = get2byteAligned(&data[cellOffset+i*2]);
1898       testcase( pc==iCellFirst );
1899       testcase( pc==iCellLast );
1900       if( pc<iCellFirst || pc>iCellLast ){
1901         return SQLITE_CORRUPT_PAGE(pPage);
1902       }
1903       sz = pPage->xCellSize(pPage, &data[pc]);
1904       testcase( pc+sz==usableSize );
1905       if( pc+sz>usableSize ){
1906         return SQLITE_CORRUPT_PAGE(pPage);
1907       }
1908     }
1909     if( !pPage->leaf ) iCellLast++;
1910   }
1911
1912   /* Compute the total free space on the page
1913   ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1914   ** start of the first freeblock on the page, or is zero if there are no
1915   ** freeblocks. */
1916   pc = get2byte(&data[hdr+1]);
1917   nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1918   if( pc>0 ){
1919     u32 next, size;
1920     if( pc<iCellFirst ){
1921       /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1922       ** always be at least one cell before the first freeblock.
1923       */
1924       return SQLITE_CORRUPT_PAGE(pPage);
1925     }
1926     while( 1 ){
1927       if( pc>iCellLast ){
1928         /* Freeblock off the end of the page */
1929         return SQLITE_CORRUPT_PAGE(pPage);
1930       }
1931       next = get2byte(&data[pc]);
1932       size = get2byte(&data[pc+2]);
1933       nFree = nFree + size;
1934       if( next<=pc+size+3 ) break;
1935       pc = next;
1936     }
1937     if( next>0 ){
1938       /* Freeblock not in ascending order */
1939       return SQLITE_CORRUPT_PAGE(pPage);
1940     }
1941     if( pc+size>(unsigned int)usableSize ){
1942       /* Last freeblock extends past page end */
1943       return SQLITE_CORRUPT_PAGE(pPage);
1944     }
1945   }
1946
1947   /* At this point, nFree contains the sum of the offset to the start
1948   ** of the cell-content area plus the number of free bytes within
1949   ** the cell-content area. If this is greater than the usable-size
1950   ** of the page, then the page must be corrupted. This check also
1951   ** serves to verify that the offset to the start of the cell-content
1952   ** area, according to the page header, lies within the page.
1953   */
1954   if( nFree>usableSize ){
1955     return SQLITE_CORRUPT_PAGE(pPage);
1956   }
1957   pPage->nFree = (u16)(nFree - iCellFirst);
1958   pPage->isInit = 1;
1959   return SQLITE_OK;
1960 }
1961
1962 /*
1963 ** Set up a raw page so that it looks like a database page holding
1964 ** no entries.
1965 */
1966 static void zeroPage(MemPage *pPage, int flags){
1967   unsigned char *data = pPage->aData;
1968   BtShared *pBt = pPage->pBt;
1969   u8 hdr = pPage->hdrOffset;
1970   u16 first;
1971
1972   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1973   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1974   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1975   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1976   assert( sqlite3_mutex_held(pBt->mutex) );
1977   if( pBt->btsFlags & BTS_FAST_SECURE ){
1978     memset(&data[hdr], 0, pBt->usableSize - hdr);
1979   }
1980   data[hdr] = (char)flags;
1981   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1982   memset(&data[hdr+1], 0, 4);
1983   data[hdr+7] = 0;
1984   put2byte(&data[hdr+5], pBt->usableSize);
1985   pPage->nFree = (u16)(pBt->usableSize - first);
1986   decodeFlags(pPage, flags);
1987   pPage->cellOffset = first;
1988   pPage->aDataEnd = &data[pBt->usableSize];
1989   pPage->aCellIdx = &data[first];
1990   pPage->aDataOfst = &data[pPage->childPtrSize];
1991   pPage->nOverflow = 0;
1992   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1993   pPage->maskPage = (u16)(pBt->pageSize - 1);
1994   pPage->nCell = 0;
1995   pPage->isInit = 1;
1996 }
1997
1998
1999 /*
2000 ** Convert a DbPage obtained from the pager into a MemPage used by
2001 ** the btree layer.
2002 */
2003 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2004   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2005   if( pgno!=pPage->pgno ){
2006     pPage->aData = sqlite3PagerGetData(pDbPage);
2007     pPage->pDbPage = pDbPage;
2008     pPage->pBt = pBt;
2009     pPage->pgno = pgno;
2010     pPage->hdrOffset = pgno==1 ? 100 : 0;
2011   }
2012   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2013   return pPage;
2014 }
2015
2016 /*
2017 ** Get a page from the pager.  Initialize the MemPage.pBt and
2018 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
2019 **
2020 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2021 ** about the content of the page at this time.  So do not go to the disk
2022 ** to fetch the content.  Just fill in the content with zeros for now.
2023 ** If in the future we call sqlite3PagerWrite() on this page, that
2024 ** means we have started to be concerned about content and the disk
2025 ** read should occur at that point.
2026 */
2027 static int btreeGetPage(
2028   BtShared *pBt,       /* The btree */
2029   Pgno pgno,           /* Number of the page to fetch */
2030   MemPage **ppPage,    /* Return the page in this parameter */
2031   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2032 ){
2033   int rc;
2034   DbPage *pDbPage;
2035
2036   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2037   assert( sqlite3_mutex_held(pBt->mutex) );
2038   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2039   if( rc ) return rc;
2040   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2041   return SQLITE_OK;
2042 }
2043
2044 /*
2045 ** Retrieve a page from the pager cache. If the requested page is not
2046 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2047 ** MemPage.aData elements if needed.
2048 */
2049 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2050   DbPage *pDbPage;
2051   assert( sqlite3_mutex_held(pBt->mutex) );
2052   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2053   if( pDbPage ){
2054     return btreePageFromDbPage(pDbPage, pgno, pBt);
2055   }
2056   return 0;
2057 }
2058
2059 /*
2060 ** Return the size of the database file in pages. If there is any kind of
2061 ** error, return ((unsigned int)-1).
2062 */
2063 static Pgno btreePagecount(BtShared *pBt){
2064   return pBt->nPage;
2065 }
2066 u32 sqlite3BtreeLastPage(Btree *p){
2067   assert( sqlite3BtreeHoldsMutex(p) );
2068   assert( ((p->pBt->nPage)&0x80000000)==0 );
2069   return btreePagecount(p->pBt);
2070 }
2071
2072 /*
2073 ** Get a page from the pager and initialize it.
2074 **
2075 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
2076 ** call.  Do additional sanity checking on the page in this case.
2077 ** And if the fetch fails, this routine must decrement pCur->iPage.
2078 **
2079 ** The page is fetched as read-write unless pCur is not NULL and is
2080 ** a read-only cursor.
2081 **
2082 ** If an error occurs, then *ppPage is undefined. It
2083 ** may remain unchanged, or it may be set to an invalid value.
2084 */
2085 static int getAndInitPage(
2086   BtShared *pBt,                  /* The database file */
2087   Pgno pgno,                      /* Number of the page to get */
2088   MemPage **ppPage,               /* Write the page pointer here */
2089   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
2090   int bReadOnly                   /* True for a read-only page */
2091 ){
2092   int rc;
2093   DbPage *pDbPage;
2094   assert( sqlite3_mutex_held(pBt->mutex) );
2095   assert( pCur==0 || ppPage==&pCur->pPage );
2096   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2097   assert( pCur==0 || pCur->iPage>0 );
2098
2099   if( pgno>btreePagecount(pBt) ){
2100     rc = SQLITE_CORRUPT_BKPT;
2101     goto getAndInitPage_error;
2102   }
2103   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2104   if( rc ){
2105     goto getAndInitPage_error;
2106   }
2107   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2108   if( (*ppPage)->isInit==0 ){
2109     btreePageFromDbPage(pDbPage, pgno, pBt);
2110     rc = btreeInitPage(*ppPage);
2111     if( rc!=SQLITE_OK ){
2112       releasePage(*ppPage);
2113       goto getAndInitPage_error;
2114     }
2115   }
2116   assert( (*ppPage)->pgno==pgno );
2117   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2118
2119   /* If obtaining a child page for a cursor, we must verify that the page is
2120   ** compatible with the root page. */
2121   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2122     rc = SQLITE_CORRUPT_PGNO(pgno);
2123     releasePage(*ppPage);
2124     goto getAndInitPage_error;
2125   }
2126   return SQLITE_OK;
2127
2128 getAndInitPage_error:
2129   if( pCur ){
2130     pCur->iPage--;
2131     pCur->pPage = pCur->apPage[pCur->iPage];
2132   }
2133   testcase( pgno==0 );
2134   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2135   return rc;
2136 }
2137
2138 /*
2139 ** Release a MemPage.  This should be called once for each prior
2140 ** call to btreeGetPage.
2141 **
2142 ** Page1 is a special case and must be released using releasePageOne().
2143 */
2144 static void releasePageNotNull(MemPage *pPage){
2145   assert( pPage->aData );
2146   assert( pPage->pBt );
2147   assert( pPage->pDbPage!=0 );
2148   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2149   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2150   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2151   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2152 }
2153 static void releasePage(MemPage *pPage){
2154   if( pPage ) releasePageNotNull(pPage);
2155 }
2156 static void releasePageOne(MemPage *pPage){
2157   assert( pPage!=0 );
2158   assert( pPage->aData );
2159   assert( pPage->pBt );
2160   assert( pPage->pDbPage!=0 );
2161   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2162   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2163   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2164   sqlite3PagerUnrefPageOne(pPage->pDbPage);
2165 }
2166
2167 /*
2168 ** Get an unused page.
2169 **
2170 ** This works just like btreeGetPage() with the addition:
2171 **
2172 **   *  If the page is already in use for some other purpose, immediately
2173 **      release it and return an SQLITE_CURRUPT error.
2174 **   *  Make sure the isInit flag is clear
2175 */
2176 static int btreeGetUnusedPage(
2177   BtShared *pBt,       /* The btree */
2178   Pgno pgno,           /* Number of the page to fetch */
2179   MemPage **ppPage,    /* Return the page in this parameter */
2180   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2181 ){
2182   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2183   if( rc==SQLITE_OK ){
2184     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2185       releasePage(*ppPage);
2186       *ppPage = 0;
2187       return SQLITE_CORRUPT_BKPT;
2188     }
2189     (*ppPage)->isInit = 0;
2190   }else{
2191     *ppPage = 0;
2192   }
2193   return rc;
2194 }
2195
2196
2197 /*
2198 ** During a rollback, when the pager reloads information into the cache
2199 ** so that the cache is restored to its original state at the start of
2200 ** the transaction, for each page restored this routine is called.
2201 **
2202 ** This routine needs to reset the extra data section at the end of the
2203 ** page to agree with the restored data.
2204 */
2205 static void pageReinit(DbPage *pData){
2206   MemPage *pPage;
2207   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2208   assert( sqlite3PagerPageRefcount(pData)>0 );
2209   if( pPage->isInit ){
2210     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2211     pPage->isInit = 0;
2212     if( sqlite3PagerPageRefcount(pData)>1 ){
2213       /* pPage might not be a btree page;  it might be an overflow page
2214       ** or ptrmap page or a free page.  In those cases, the following
2215       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2216       ** But no harm is done by this.  And it is very important that
2217       ** btreeInitPage() be called on every btree page so we make
2218       ** the call for every page that comes in for re-initing. */
2219       btreeInitPage(pPage);
2220     }
2221   }
2222 }
2223
2224 /*
2225 ** Invoke the busy handler for a btree.
2226 */
2227 static int btreeInvokeBusyHandler(void *pArg){
2228   BtShared *pBt = (BtShared*)pArg;
2229   assert( pBt->db );
2230   assert( sqlite3_mutex_held(pBt->db->mutex) );
2231   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2232 }
2233
2234 /*
2235 ** Open a database file.
2236 **
2237 ** zFilename is the name of the database file.  If zFilename is NULL
2238 ** then an ephemeral database is created.  The ephemeral database might
2239 ** be exclusively in memory, or it might use a disk-based memory cache.
2240 ** Either way, the ephemeral database will be automatically deleted
2241 ** when sqlite3BtreeClose() is called.
2242 **
2243 ** If zFilename is ":memory:" then an in-memory database is created
2244 ** that is automatically destroyed when it is closed.
2245 **
2246 ** The "flags" parameter is a bitmask that might contain bits like
2247 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2248 **
2249 ** If the database is already opened in the same database connection
2250 ** and we are in shared cache mode, then the open will fail with an
2251 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2252 ** objects in the same database connection since doing so will lead
2253 ** to problems with locking.
2254 */
2255 int sqlite3BtreeOpen(
2256   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2257   const char *zFilename,  /* Name of the file containing the BTree database */
2258   sqlite3 *db,            /* Associated database handle */
2259   Btree **ppBtree,        /* Pointer to new Btree object written here */
2260   int flags,              /* Options */
2261   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2262 ){
2263   BtShared *pBt = 0;             /* Shared part of btree structure */
2264   Btree *p;                      /* Handle to return */
2265   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2266   int rc = SQLITE_OK;            /* Result code from this function */
2267   u8 nReserve;                   /* Byte of unused space on each page */
2268   unsigned char zDbHeader[100];  /* Database header content */
2269
2270   /* True if opening an ephemeral, temporary database */
2271   const int isTempDb = zFilename==0 || zFilename[0]==0;
2272
2273   /* Set the variable isMemdb to true for an in-memory database, or
2274   ** false for a file-based database.
2275   */
2276 #ifdef SQLITE_OMIT_MEMORYDB
2277   const int isMemdb = 0;
2278 #else
2279   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2280                        || (isTempDb && sqlite3TempInMemory(db))
2281                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2282 #endif
2283
2284   assert( db!=0 );
2285   assert( pVfs!=0 );
2286   assert( sqlite3_mutex_held(db->mutex) );
2287   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2288
2289   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2290   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2291
2292   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2293   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2294
2295   if( isMemdb ){
2296     flags |= BTREE_MEMORY;
2297   }
2298   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2299     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2300   }
2301   p = sqlite3MallocZero(sizeof(Btree));
2302   if( !p ){
2303     return SQLITE_NOMEM_BKPT;
2304   }
2305   p->inTrans = TRANS_NONE;
2306   p->db = db;
2307 #ifndef SQLITE_OMIT_SHARED_CACHE
2308   p->lock.pBtree = p;
2309   p->lock.iTable = 1;
2310 #endif
2311
2312 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2313   /*
2314   ** If this Btree is a candidate for shared cache, try to find an
2315   ** existing BtShared object that we can share with
2316   */
2317   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2318     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2319       int nFilename = sqlite3Strlen30(zFilename)+1;
2320       int nFullPathname = pVfs->mxPathname+1;
2321       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2322       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2323
2324       p->sharable = 1;
2325       if( !zFullPathname ){
2326         sqlite3_free(p);
2327         return SQLITE_NOMEM_BKPT;
2328       }
2329       if( isMemdb ){
2330         memcpy(zFullPathname, zFilename, nFilename);
2331       }else{
2332         rc = sqlite3OsFullPathname(pVfs, zFilename,
2333                                    nFullPathname, zFullPathname);
2334         if( rc ){
2335           sqlite3_free(zFullPathname);
2336           sqlite3_free(p);
2337           return rc;
2338         }
2339       }
2340 #if SQLITE_THREADSAFE
2341       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2342       sqlite3_mutex_enter(mutexOpen);
2343       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2344       sqlite3_mutex_enter(mutexShared);
2345 #endif
2346       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2347         assert( pBt->nRef>0 );
2348         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2349                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2350           int iDb;
2351           for(iDb=db->nDb-1; iDb>=0; iDb--){
2352             Btree *pExisting = db->aDb[iDb].pBt;
2353             if( pExisting && pExisting->pBt==pBt ){
2354               sqlite3_mutex_leave(mutexShared);
2355               sqlite3_mutex_leave(mutexOpen);
2356               sqlite3_free(zFullPathname);
2357               sqlite3_free(p);
2358               return SQLITE_CONSTRAINT;
2359             }
2360           }
2361           p->pBt = pBt;
2362           pBt->nRef++;
2363           break;
2364         }
2365       }
2366       sqlite3_mutex_leave(mutexShared);
2367       sqlite3_free(zFullPathname);
2368     }
2369 #ifdef SQLITE_DEBUG
2370     else{
2371       /* In debug mode, we mark all persistent databases as sharable
2372       ** even when they are not.  This exercises the locking code and
2373       ** gives more opportunity for asserts(sqlite3_mutex_held())
2374       ** statements to find locking problems.
2375       */
2376       p->sharable = 1;
2377     }
2378 #endif
2379   }
2380 #endif
2381   if( pBt==0 ){
2382     /*
2383     ** The following asserts make sure that structures used by the btree are
2384     ** the right size.  This is to guard against size changes that result
2385     ** when compiling on a different architecture.
2386     */
2387     assert( sizeof(i64)==8 );
2388     assert( sizeof(u64)==8 );
2389     assert( sizeof(u32)==4 );
2390     assert( sizeof(u16)==2 );
2391     assert( sizeof(Pgno)==4 );
2392
2393     pBt = sqlite3MallocZero( sizeof(*pBt) );
2394     if( pBt==0 ){
2395       rc = SQLITE_NOMEM_BKPT;
2396       goto btree_open_out;
2397     }
2398     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2399                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2400     if( rc==SQLITE_OK ){
2401       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2402       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2403     }
2404     if( rc!=SQLITE_OK ){
2405       goto btree_open_out;
2406     }
2407     pBt->openFlags = (u8)flags;
2408     pBt->db = db;
2409     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2410     p->pBt = pBt;
2411
2412     pBt->pCursor = 0;
2413     pBt->pPage1 = 0;
2414     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2415 #if defined(SQLITE_SECURE_DELETE)
2416     pBt->btsFlags |= BTS_SECURE_DELETE;
2417 #elif defined(SQLITE_FAST_SECURE_DELETE)
2418     pBt->btsFlags |= BTS_OVERWRITE;
2419 #endif
2420     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2421     ** determined by the 2-byte integer located at an offset of 16 bytes from
2422     ** the beginning of the database file. */
2423     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2424     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2425          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2426       pBt->pageSize = 0;
2427 #ifndef SQLITE_OMIT_AUTOVACUUM
2428       /* If the magic name ":memory:" will create an in-memory database, then
2429       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2430       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2431       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2432       ** regular file-name. In this case the auto-vacuum applies as per normal.
2433       */
2434       if( zFilename && !isMemdb ){
2435         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2436         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2437       }
2438 #endif
2439       nReserve = 0;
2440     }else{
2441       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2442       ** determined by the one-byte unsigned integer found at an offset of 20
2443       ** into the database file header. */
2444       nReserve = zDbHeader[20];
2445       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2446 #ifndef SQLITE_OMIT_AUTOVACUUM
2447       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2448       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2449 #endif
2450     }
2451     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2452     if( rc ) goto btree_open_out;
2453     pBt->usableSize = pBt->pageSize - nReserve;
2454     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2455
2456 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2457     /* Add the new BtShared object to the linked list sharable BtShareds.
2458     */
2459     pBt->nRef = 1;
2460     if( p->sharable ){
2461       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2462       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2463       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2464         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2465         if( pBt->mutex==0 ){
2466           rc = SQLITE_NOMEM_BKPT;
2467           goto btree_open_out;
2468         }
2469       }
2470       sqlite3_mutex_enter(mutexShared);
2471       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2472       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2473       sqlite3_mutex_leave(mutexShared);
2474     }
2475 #endif
2476   }
2477
2478 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2479   /* If the new Btree uses a sharable pBtShared, then link the new
2480   ** Btree into the list of all sharable Btrees for the same connection.
2481   ** The list is kept in ascending order by pBt address.
2482   */
2483   if( p->sharable ){
2484     int i;
2485     Btree *pSib;
2486     for(i=0; i<db->nDb; i++){
2487       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2488         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2489         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2490           p->pNext = pSib;
2491           p->pPrev = 0;
2492           pSib->pPrev = p;
2493         }else{
2494           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2495             pSib = pSib->pNext;
2496           }
2497           p->pNext = pSib->pNext;
2498           p->pPrev = pSib;
2499           if( p->pNext ){
2500             p->pNext->pPrev = p;
2501           }
2502           pSib->pNext = p;
2503         }
2504         break;
2505       }
2506     }
2507   }
2508 #endif
2509   *ppBtree = p;
2510
2511 btree_open_out:
2512   if( rc!=SQLITE_OK ){
2513     if( pBt && pBt->pPager ){
2514       sqlite3PagerClose(pBt->pPager, 0);
2515     }
2516     sqlite3_free(pBt);
2517     sqlite3_free(p);
2518     *ppBtree = 0;
2519   }else{
2520     sqlite3_file *pFile;
2521
2522     /* If the B-Tree was successfully opened, set the pager-cache size to the
2523     ** default value. Except, when opening on an existing shared pager-cache,
2524     ** do not change the pager-cache size.
2525     */
2526     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2527       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2528     }
2529
2530     pFile = sqlite3PagerFile(pBt->pPager);
2531     if( pFile->pMethods ){
2532       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2533     }
2534   }
2535   if( mutexOpen ){
2536     assert( sqlite3_mutex_held(mutexOpen) );
2537     sqlite3_mutex_leave(mutexOpen);
2538   }
2539   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2540   return rc;
2541 }
2542
2543 /*
2544 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2545 ** remove the BtShared structure from the sharing list.  Return
2546 ** true if the BtShared.nRef counter reaches zero and return
2547 ** false if it is still positive.
2548 */
2549 static int removeFromSharingList(BtShared *pBt){
2550 #ifndef SQLITE_OMIT_SHARED_CACHE
2551   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2552   BtShared *pList;
2553   int removed = 0;
2554
2555   assert( sqlite3_mutex_notheld(pBt->mutex) );
2556   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2557   sqlite3_mutex_enter(pMaster);
2558   pBt->nRef--;
2559   if( pBt->nRef<=0 ){
2560     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2561       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2562     }else{
2563       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2564       while( ALWAYS(pList) && pList->pNext!=pBt ){
2565         pList=pList->pNext;
2566       }
2567       if( ALWAYS(pList) ){
2568         pList->pNext = pBt->pNext;
2569       }
2570     }
2571     if( SQLITE_THREADSAFE ){
2572       sqlite3_mutex_free(pBt->mutex);
2573     }
2574     removed = 1;
2575   }
2576   sqlite3_mutex_leave(pMaster);
2577   return removed;
2578 #else
2579   return 1;
2580 #endif
2581 }
2582
2583 /*
2584 ** Make sure pBt->pTmpSpace points to an allocation of
2585 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2586 ** pointer.
2587 */
2588 static void allocateTempSpace(BtShared *pBt){
2589   if( !pBt->pTmpSpace ){
2590     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2591
2592     /* One of the uses of pBt->pTmpSpace is to format cells before
2593     ** inserting them into a leaf page (function fillInCell()). If
2594     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2595     ** by the various routines that manipulate binary cells. Which
2596     ** can mean that fillInCell() only initializes the first 2 or 3
2597     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2598     ** it into a database page. This is not actually a problem, but it
2599     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2600     ** data is passed to system call write(). So to avoid this error,
2601     ** zero the first 4 bytes of temp space here.
2602     **
2603     ** Also:  Provide four bytes of initialized space before the
2604     ** beginning of pTmpSpace as an area available to prepend the
2605     ** left-child pointer to the beginning of a cell.
2606     */
2607     if( pBt->pTmpSpace ){
2608       memset(pBt->pTmpSpace, 0, 8);
2609       pBt->pTmpSpace += 4;
2610     }
2611   }
2612 }
2613
2614 /*
2615 ** Free the pBt->pTmpSpace allocation
2616 */
2617 static void freeTempSpace(BtShared *pBt){
2618   if( pBt->pTmpSpace ){
2619     pBt->pTmpSpace -= 4;
2620     sqlite3PageFree(pBt->pTmpSpace);
2621     pBt->pTmpSpace = 0;
2622   }
2623 }
2624
2625 /*
2626 ** Close an open database and invalidate all cursors.
2627 */
2628 int sqlite3BtreeClose(Btree *p){
2629   BtShared *pBt = p->pBt;
2630   BtCursor *pCur;
2631
2632   /* Close all cursors opened via this handle.  */
2633   assert( sqlite3_mutex_held(p->db->mutex) );
2634   sqlite3BtreeEnter(p);
2635   pCur = pBt->pCursor;
2636   while( pCur ){
2637     BtCursor *pTmp = pCur;
2638     pCur = pCur->pNext;
2639     if( pTmp->pBtree==p ){
2640       sqlite3BtreeCloseCursor(pTmp);
2641     }
2642   }
2643
2644   /* Rollback any active transaction and free the handle structure.
2645   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2646   ** this handle.
2647   */
2648   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2649   sqlite3BtreeLeave(p);
2650
2651   /* If there are still other outstanding references to the shared-btree
2652   ** structure, return now. The remainder of this procedure cleans
2653   ** up the shared-btree.
2654   */
2655   assert( p->wantToLock==0 && p->locked==0 );
2656   if( !p->sharable || removeFromSharingList(pBt) ){
2657     /* The pBt is no longer on the sharing list, so we can access
2658     ** it without having to hold the mutex.
2659     **
2660     ** Clean out and delete the BtShared object.
2661     */
2662     assert( !pBt->pCursor );
2663     sqlite3PagerClose(pBt->pPager, p->db);
2664     if( pBt->xFreeSchema && pBt->pSchema ){
2665       pBt->xFreeSchema(pBt->pSchema);
2666     }
2667     sqlite3DbFree(0, pBt->pSchema);
2668     freeTempSpace(pBt);
2669     sqlite3_free(pBt);
2670   }
2671
2672 #ifndef SQLITE_OMIT_SHARED_CACHE
2673   assert( p->wantToLock==0 );
2674   assert( p->locked==0 );
2675   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2676   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2677 #endif
2678
2679   sqlite3_free(p);
2680   return SQLITE_OK;
2681 }
2682
2683 /*
2684 ** Change the "soft" limit on the number of pages in the cache.
2685 ** Unused and unmodified pages will be recycled when the number of
2686 ** pages in the cache exceeds this soft limit.  But the size of the
2687 ** cache is allowed to grow larger than this limit if it contains
2688 ** dirty pages or pages still in active use.
2689 */
2690 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2691   BtShared *pBt = p->pBt;
2692   assert( sqlite3_mutex_held(p->db->mutex) );
2693   sqlite3BtreeEnter(p);
2694   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2695   sqlite3BtreeLeave(p);
2696   return SQLITE_OK;
2697 }
2698
2699 /*
2700 ** Change the "spill" limit on the number of pages in the cache.
2701 ** If the number of pages exceeds this limit during a write transaction,
2702 ** the pager might attempt to "spill" pages to the journal early in
2703 ** order to free up memory.
2704 **
2705 ** The value returned is the current spill size.  If zero is passed
2706 ** as an argument, no changes are made to the spill size setting, so
2707 ** using mxPage of 0 is a way to query the current spill size.
2708 */
2709 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2710   BtShared *pBt = p->pBt;
2711   int res;
2712   assert( sqlite3_mutex_held(p->db->mutex) );
2713   sqlite3BtreeEnter(p);
2714   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2715   sqlite3BtreeLeave(p);
2716   return res;
2717 }
2718
2719 #if SQLITE_MAX_MMAP_SIZE>0
2720 /*
2721 ** Change the limit on the amount of the database file that may be
2722 ** memory mapped.
2723 */
2724 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
2725   BtShared *pBt = p->pBt;
2726   assert( sqlite3_mutex_held(p->db->mutex) );
2727   sqlite3BtreeEnter(p);
2728   sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
2729   sqlite3BtreeLeave(p);
2730   return SQLITE_OK;
2731 }
2732 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
2733
2734 /*
2735 ** Change the way data is synced to disk in order to increase or decrease
2736 ** how well the database resists damage due to OS crashes and power
2737 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2738 ** there is a high probability of damage)  Level 2 is the default.  There
2739 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2740 ** probability of damage to near zero but with a write performance reduction.
2741 */
2742 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2743 int sqlite3BtreeSetPagerFlags(
2744   Btree *p,              /* The btree to set the safety level on */
2745   unsigned pgFlags       /* Various PAGER_* flags */
2746 ){
2747   BtShared *pBt = p->pBt;
2748   assert( sqlite3_mutex_held(p->db->mutex) );
2749   sqlite3BtreeEnter(p);
2750   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2751   sqlite3BtreeLeave(p);
2752   return SQLITE_OK;
2753 }
2754 #endif
2755
2756 /*
2757 ** Change the default pages size and the number of reserved bytes per page.
2758 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2759 ** without changing anything.
2760 **
2761 ** The page size must be a power of 2 between 512 and 65536.  If the page
2762 ** size supplied does not meet this constraint then the page size is not
2763 ** changed.
2764 **
2765 ** Page sizes are constrained to be a power of two so that the region
2766 ** of the database file used for locking (beginning at PENDING_BYTE,
2767 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2768 ** at the beginning of a page.
2769 **
2770 ** If parameter nReserve is less than zero, then the number of reserved
2771 ** bytes per page is left unchanged.
2772 **
2773 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2774 ** and autovacuum mode can no longer be changed.
2775 */
2776 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2777   int rc = SQLITE_OK;
2778   BtShared *pBt = p->pBt;
2779   assert( nReserve>=-1 && nReserve<=255 );
2780   sqlite3BtreeEnter(p);
2781 #if SQLITE_HAS_CODEC
2782   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2783 #endif
2784   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2785     sqlite3BtreeLeave(p);
2786     return SQLITE_READONLY;
2787   }
2788   if( nReserve<0 ){
2789     nReserve = pBt->pageSize - pBt->usableSize;
2790   }
2791   assert( nReserve>=0 && nReserve<=255 );
2792   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2793         ((pageSize-1)&pageSize)==0 ){
2794     assert( (pageSize & 7)==0 );
2795     assert( !pBt->pCursor );
2796     pBt->pageSize = (u32)pageSize;
2797     freeTempSpace(pBt);
2798   }
2799   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2800   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2801   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2802   sqlite3BtreeLeave(p);
2803   return rc;
2804 }
2805
2806 /*
2807 ** Return the currently defined page size
2808 */
2809 int sqlite3BtreeGetPageSize(Btree *p){
2810   return p->pBt->pageSize;
2811 }
2812
2813 /*
2814 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2815 ** may only be called if it is guaranteed that the b-tree mutex is already
2816 ** held.
2817 **
2818 ** This is useful in one special case in the backup API code where it is
2819 ** known that the shared b-tree mutex is held, but the mutex on the
2820 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2821 ** were to be called, it might collide with some other operation on the
2822 ** database handle that owns *p, causing undefined behavior.
2823 */
2824 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2825   int n;
2826   assert( sqlite3_mutex_held(p->pBt->mutex) );
2827   n = p->pBt->pageSize - p->pBt->usableSize;
2828   return n;
2829 }
2830
2831 /*
2832 ** Return the number of bytes of space at the end of every page that
2833 ** are intentually left unused.  This is the "reserved" space that is
2834 ** sometimes used by extensions.
2835 **
2836 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2837 ** greater of the current reserved space and the maximum requested
2838 ** reserve space.
2839 */
2840 int sqlite3BtreeGetOptimalReserve(Btree *p){
2841   int n;
2842   sqlite3BtreeEnter(p);
2843   n = sqlite3BtreeGetReserveNoMutex(p);
2844 #ifdef SQLITE_HAS_CODEC
2845   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2846 #endif
2847   sqlite3BtreeLeave(p);
2848   return n;
2849 }
2850
2851
2852 /*
2853 ** Set the maximum page count for a database if mxPage is positive.
2854 ** No changes are made if mxPage is 0 or negative.
2855 ** Regardless of the value of mxPage, return the maximum page count.
2856 */
2857 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2858   int n;
2859   sqlite3BtreeEnter(p);
2860   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2861   sqlite3BtreeLeave(p);
2862   return n;
2863 }
2864
2865 /*
2866 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
2867 **
2868 **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
2869 **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
2870 **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
2871 **    newFlag==(-1)    No changes
2872 **
2873 ** This routine acts as a query if newFlag is less than zero
2874 **
2875 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
2876 ** freelist leaf pages are not written back to the database.  Thus in-page
2877 ** deleted content is cleared, but freelist deleted content is not.
2878 **
2879 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
2880 ** that freelist leaf pages are written back into the database, increasing
2881 ** the amount of disk I/O.
2882 */
2883 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2884   int b;
2885   if( p==0 ) return 0;
2886   sqlite3BtreeEnter(p);
2887   assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
2888   assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
2889   if( newFlag>=0 ){
2890     p->pBt->btsFlags &= ~BTS_FAST_SECURE;
2891     p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
2892   }
2893   b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
2894   sqlite3BtreeLeave(p);
2895   return b;
2896 }
2897
2898 /*
2899 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2900 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2901 ** is disabled. The default value for the auto-vacuum property is
2902 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2903 */
2904 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2905 #ifdef SQLITE_OMIT_AUTOVACUUM
2906   return SQLITE_READONLY;
2907 #else
2908   BtShared *pBt = p->pBt;
2909   int rc = SQLITE_OK;
2910   u8 av = (u8)autoVacuum;
2911
2912   sqlite3BtreeEnter(p);
2913   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2914     rc = SQLITE_READONLY;
2915   }else{
2916     pBt->autoVacuum = av ?1:0;
2917     pBt->incrVacuum = av==2 ?1:0;
2918   }
2919   sqlite3BtreeLeave(p);
2920   return rc;
2921 #endif
2922 }
2923
2924 /*
2925 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2926 ** enabled 1 is returned. Otherwise 0.
2927 */
2928 int sqlite3BtreeGetAutoVacuum(Btree *p){
2929 #ifdef SQLITE_OMIT_AUTOVACUUM
2930   return BTREE_AUTOVACUUM_NONE;
2931 #else
2932   int rc;
2933   sqlite3BtreeEnter(p);
2934   rc = (
2935     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2936     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2937     BTREE_AUTOVACUUM_INCR
2938   );
2939   sqlite3BtreeLeave(p);
2940   return rc;
2941 #endif
2942 }
2943
2944 /*
2945 ** If the user has not set the safety-level for this database connection
2946 ** using "PRAGMA synchronous", and if the safety-level is not already
2947 ** set to the value passed to this function as the second parameter,
2948 ** set it so.
2949 */
2950 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
2951     && !defined(SQLITE_OMIT_WAL)
2952 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
2953   sqlite3 *db;
2954   Db *pDb;
2955   if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
2956     while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
2957     if( pDb->bSyncSet==0
2958      && pDb->safety_level!=safety_level
2959      && pDb!=&db->aDb[1]
2960     ){
2961       pDb->safety_level = safety_level;
2962       sqlite3PagerSetFlags(pBt->pPager,
2963           pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
2964     }
2965   }
2966 }
2967 #else
2968 # define setDefaultSyncFlag(pBt,safety_level)
2969 #endif
2970
2971 /*
2972 ** Get a reference to pPage1 of the database file.  This will
2973 ** also acquire a readlock on that file.
2974 **
2975 ** SQLITE_OK is returned on success.  If the file is not a
2976 ** well-formed database file, then SQLITE_CORRUPT is returned.
2977 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2978 ** is returned if we run out of memory.
2979 */
2980 static int lockBtree(BtShared *pBt){
2981   int rc;              /* Result code from subfunctions */
2982   MemPage *pPage1;     /* Page 1 of the database file */
2983   int nPage;           /* Number of pages in the database */
2984   int nPageFile = 0;   /* Number of pages in the database file */
2985   int nPageHeader;     /* Number of pages in the database according to hdr */
2986
2987   assert( sqlite3_mutex_held(pBt->mutex) );
2988   assert( pBt->pPage1==0 );
2989   rc = sqlite3PagerSharedLock(pBt->pPager);
2990   if( rc!=SQLITE_OK ) return rc;
2991   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2992   if( rc!=SQLITE_OK ) return rc;
2993
2994   /* Do some checking to help insure the file we opened really is
2995   ** a valid database file.
2996   */
2997   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2998   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2999   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
3000     nPage = nPageFile;
3001   }
3002   if( nPage>0 ){
3003     u32 pageSize;
3004     u32 usableSize;
3005     u8 *page1 = pPage1->aData;
3006     rc = SQLITE_NOTADB;
3007     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
3008     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
3009     ** 61 74 20 33 00. */
3010     if( memcmp(page1, zMagicHeader, 16)!=0 ){
3011       goto page1_init_failed;
3012     }
3013
3014 #ifdef SQLITE_OMIT_WAL
3015     if( page1[18]>1 ){
3016       pBt->btsFlags |= BTS_READ_ONLY;
3017     }
3018     if( page1[19]>1 ){
3019       goto page1_init_failed;
3020     }
3021 #else
3022     if( page1[18]>2 ){
3023       pBt->btsFlags |= BTS_READ_ONLY;
3024     }
3025     if( page1[19]>2 ){
3026       goto page1_init_failed;
3027     }
3028
3029     /* If the write version is set to 2, this database should be accessed
3030     ** in WAL mode. If the log is not already open, open it now. Then
3031     ** return SQLITE_OK and return without populating BtShared.pPage1.
3032     ** The caller detects this and calls this function again. This is
3033     ** required as the version of page 1 currently in the page1 buffer
3034     ** may not be the latest version - there may be a newer one in the log
3035     ** file.
3036     */
3037     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
3038       int isOpen = 0;
3039       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
3040       if( rc!=SQLITE_OK ){
3041         goto page1_init_failed;
3042       }else{
3043         setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
3044         if( isOpen==0 ){
3045           releasePageOne(pPage1);
3046           return SQLITE_OK;
3047         }
3048       }
3049       rc = SQLITE_NOTADB;
3050     }else{
3051       setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
3052     }
3053 #endif
3054
3055     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
3056     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
3057     **
3058     ** The original design allowed these amounts to vary, but as of
3059     ** version 3.6.0, we require them to be fixed.
3060     */
3061     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
3062       goto page1_init_failed;
3063     }
3064     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
3065     ** determined by the 2-byte integer located at an offset of 16 bytes from
3066     ** the beginning of the database file. */
3067     pageSize = (page1[16]<<8) | (page1[17]<<16);
3068     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
3069     ** between 512 and 65536 inclusive. */
3070     if( ((pageSize-1)&pageSize)!=0
3071      || pageSize>SQLITE_MAX_PAGE_SIZE
3072      || pageSize<=256
3073     ){
3074       goto page1_init_failed;
3075     }
3076     assert( (pageSize & 7)==0 );
3077     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
3078     ** integer at offset 20 is the number of bytes of space at the end of
3079     ** each page to reserve for extensions.
3080     **
3081     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
3082     ** determined by the one-byte unsigned integer found at an offset of 20
3083     ** into the database file header. */
3084     usableSize = pageSize - page1[20];
3085     if( (u32)pageSize!=pBt->pageSize ){
3086       /* After reading the first page of the database assuming a page size
3087       ** of BtShared.pageSize, we have discovered that the page-size is
3088       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
3089       ** zero and return SQLITE_OK. The caller will call this function
3090       ** again with the correct page-size.
3091       */
3092       releasePageOne(pPage1);
3093       pBt->usableSize = usableSize;
3094       pBt->pageSize = pageSize;
3095       freeTempSpace(pBt);
3096       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
3097                                    pageSize-usableSize);
3098       return rc;
3099     }
3100     if( (pBt->db->flags & SQLITE_WriteSchema)==0 && nPage>nPageFile ){
3101       rc = SQLITE_CORRUPT_BKPT;
3102       goto page1_init_failed;
3103     }
3104     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
3105     ** be less than 480. In other words, if the page size is 512, then the
3106     ** reserved space size cannot exceed 32. */
3107     if( usableSize<480 ){
3108       goto page1_init_failed;
3109     }
3110     pBt->pageSize = pageSize;
3111     pBt->usableSize = usableSize;
3112 #ifndef SQLITE_OMIT_AUTOVACUUM
3113     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
3114     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
3115 #endif
3116   }
3117
3118   /* maxLocal is the maximum amount of payload to store locally for
3119   ** a cell.  Make sure it is small enough so that at least minFanout
3120   ** cells can will fit on one page.  We assume a 10-byte page header.
3121   ** Besides the payload, the cell must store:
3122   **     2-byte pointer to the cell
3123   **     4-byte child pointer
3124   **     9-byte nKey value
3125   **     4-byte nData value
3126   **     4-byte overflow page pointer
3127   ** So a cell consists of a 2-byte pointer, a header which is as much as
3128   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
3129   ** page pointer.
3130   */
3131   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
3132   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
3133   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
3134   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
3135   if( pBt->maxLocal>127 ){
3136     pBt->max1bytePayload = 127;
3137   }else{
3138     pBt->max1bytePayload = (u8)pBt->maxLocal;
3139   }
3140   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
3141   pBt->pPage1 = pPage1;
3142   pBt->nPage = nPage;
3143   return SQLITE_OK;
3144
3145 page1_init_failed:
3146   releasePageOne(pPage1);
3147   pBt->pPage1 = 0;
3148   return rc;
3149 }
3150
3151 #ifndef NDEBUG
3152 /*
3153 ** Return the number of cursors open on pBt. This is for use
3154 ** in assert() expressions, so it is only compiled if NDEBUG is not
3155 ** defined.
3156 **
3157 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3158 ** false then all cursors are counted.
3159 **
3160 ** For the purposes of this routine, a cursor is any cursor that
3161 ** is capable of reading or writing to the database.  Cursors that
3162 ** have been tripped into the CURSOR_FAULT state are not counted.
3163 */
3164 static int countValidCursors(BtShared *pBt, int wrOnly){
3165   BtCursor *pCur;
3166   int r = 0;
3167   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3168     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3169      && pCur->eState!=CURSOR_FAULT ) r++;
3170   }
3171   return r;
3172 }
3173 #endif
3174
3175 /*
3176 ** If there are no outstanding cursors and we are not in the middle
3177 ** of a transaction but there is a read lock on the database, then
3178 ** this routine unrefs the first page of the database file which
3179 ** has the effect of releasing the read lock.
3180 **
3181 ** If there is a transaction in progress, this routine is a no-op.
3182 */
3183 static void unlockBtreeIfUnused(BtShared *pBt){
3184   assert( sqlite3_mutex_held(pBt->mutex) );
3185   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3186   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3187     MemPage *pPage1 = pBt->pPage1;
3188     assert( pPage1->aData );
3189     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3190     pBt->pPage1 = 0;
3191     releasePageOne(pPage1);
3192   }
3193 }
3194
3195 /*
3196 ** If pBt points to an empty file then convert that empty file
3197 ** into a new empty database by initializing the first page of
3198 ** the database.
3199 */
3200 static int newDatabase(BtShared *pBt){
3201   MemPage *pP1;
3202   unsigned char *data;
3203   int rc;
3204
3205   assert( sqlite3_mutex_held(pBt->mutex) );
3206   if( pBt->nPage>0 ){
3207     return SQLITE_OK;
3208   }
3209   pP1 = pBt->pPage1;
3210   assert( pP1!=0 );
3211   data = pP1->aData;
3212   rc = sqlite3PagerWrite(pP1->pDbPage);
3213   if( rc ) return rc;
3214   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3215   assert( sizeof(zMagicHeader)==16 );
3216   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3217   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3218   data[18] = 1;
3219   data[19] = 1;
3220   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3221   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3222   data[21] = 64;
3223   data[22] = 32;
3224   data[23] = 32;
3225   memset(&data[24], 0, 100-24);
3226   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3227   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3228 #ifndef SQLITE_OMIT_AUTOVACUUM
3229   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3230   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3231   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3232   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3233 #endif
3234   pBt->nPage = 1;
3235   data[31] = 1;
3236   return SQLITE_OK;
3237 }
3238
3239 /*
3240 ** Initialize the first page of the database file (creating a database
3241 ** consisting of a single page and no schema objects). Return SQLITE_OK
3242 ** if successful, or an SQLite error code otherwise.
3243 */
3244 int sqlite3BtreeNewDb(Btree *p){
3245   int rc;
3246   sqlite3BtreeEnter(p);
3247   p->pBt->nPage = 0;
3248   rc = newDatabase(p->pBt);
3249   sqlite3BtreeLeave(p);
3250   return rc;
3251 }
3252
3253 /*
3254 ** Attempt to start a new transaction. A write-transaction
3255 ** is started if the second argument is nonzero, otherwise a read-
3256 ** transaction.  If the second argument is 2 or more and exclusive
3257 ** transaction is started, meaning that no other process is allowed
3258 ** to access the database.  A preexisting transaction may not be
3259 ** upgraded to exclusive by calling this routine a second time - the
3260 ** exclusivity flag only works for a new transaction.
3261 **
3262 ** A write-transaction must be started before attempting any
3263 ** changes to the database.  None of the following routines
3264 ** will work unless a transaction is started first:
3265 **
3266 **      sqlite3BtreeCreateTable()
3267 **      sqlite3BtreeCreateIndex()
3268 **      sqlite3BtreeClearTable()
3269 **      sqlite3BtreeDropTable()
3270 **      sqlite3BtreeInsert()
3271 **      sqlite3BtreeDelete()
3272 **      sqlite3BtreeUpdateMeta()
3273 **
3274 ** If an initial attempt to acquire the lock fails because of lock contention
3275 ** and the database was previously unlocked, then invoke the busy handler
3276 ** if there is one.  But if there was previously a read-lock, do not
3277 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3278 ** returned when there is already a read-lock in order to avoid a deadlock.
3279 **
3280 ** Suppose there are two processes A and B.  A has a read lock and B has
3281 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3282 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3283 ** One or the other of the two processes must give way or there can be
3284 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3285 ** when A already has a read lock, we encourage A to give up and let B
3286 ** proceed.
3287 */
3288 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
3289   BtShared *pBt = p->pBt;
3290   int rc = SQLITE_OK;
3291
3292   sqlite3BtreeEnter(p);
3293   btreeIntegrity(p);
3294
3295   /* If the btree is already in a write-transaction, or it
3296   ** is already in a read-transaction and a read-transaction
3297   ** is requested, this is a no-op.
3298   */
3299   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3300     goto trans_begun;
3301   }
3302   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3303
3304   /* Write transactions are not possible on a read-only database */
3305   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3306     rc = SQLITE_READONLY;
3307     goto trans_begun;
3308   }
3309
3310 #ifndef SQLITE_OMIT_SHARED_CACHE
3311   {
3312     sqlite3 *pBlock = 0;
3313     /* If another database handle has already opened a write transaction
3314     ** on this shared-btree structure and a second write transaction is
3315     ** requested, return SQLITE_LOCKED.
3316     */
3317     if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3318      || (pBt->btsFlags & BTS_PENDING)!=0
3319     ){
3320       pBlock = pBt->pWriter->db;
3321     }else if( wrflag>1 ){
3322       BtLock *pIter;
3323       for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3324         if( pIter->pBtree!=p ){
3325           pBlock = pIter->pBtree->db;
3326           break;
3327         }
3328       }
3329     }
3330     if( pBlock ){
3331       sqlite3ConnectionBlocked(p->db, pBlock);
3332       rc = SQLITE_LOCKED_SHAREDCACHE;
3333       goto trans_begun;
3334     }
3335   }
3336 #endif
3337
3338   /* Any read-only or read-write transaction implies a read-lock on
3339   ** page 1. So if some other shared-cache client already has a write-lock
3340   ** on page 1, the transaction cannot be opened. */
3341   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3342   if( SQLITE_OK!=rc ) goto trans_begun;
3343
3344   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3345   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3346   do {
3347     /* Call lockBtree() until either pBt->pPage1 is populated or
3348     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3349     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3350     ** reading page 1 it discovers that the page-size of the database
3351     ** file is not pBt->pageSize. In this case lockBtree() will update
3352     ** pBt->pageSize to the page-size of the file on disk.
3353     */
3354     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3355
3356     if( rc==SQLITE_OK && wrflag ){
3357       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3358         rc = SQLITE_READONLY;
3359       }else{
3360         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3361         if( rc==SQLITE_OK ){
3362           rc = newDatabase(pBt);
3363         }
3364       }
3365     }
3366
3367     if( rc!=SQLITE_OK ){
3368       unlockBtreeIfUnused(pBt);
3369     }
3370   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3371           btreeInvokeBusyHandler(pBt) );
3372
3373   if( rc==SQLITE_OK ){
3374     if( p->inTrans==TRANS_NONE ){
3375       pBt->nTransaction++;
3376 #ifndef SQLITE_OMIT_SHARED_CACHE
3377       if( p->sharable ){
3378         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3379         p->lock.eLock = READ_LOCK;
3380         p->lock.pNext = pBt->pLock;
3381         pBt->pLock = &p->lock;
3382       }
3383 #endif
3384     }
3385     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3386     if( p->inTrans>pBt->inTransaction ){
3387       pBt->inTransaction = p->inTrans;
3388     }
3389     if( wrflag ){
3390       MemPage *pPage1 = pBt->pPage1;
3391 #ifndef SQLITE_OMIT_SHARED_CACHE
3392       assert( !pBt->pWriter );
3393       pBt->pWriter = p;
3394       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3395       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3396 #endif
3397
3398       /* If the db-size header field is incorrect (as it may be if an old
3399       ** client has been writing the database file), update it now. Doing
3400       ** this sooner rather than later means the database size can safely
3401       ** re-read the database size from page 1 if a savepoint or transaction
3402       ** rollback occurs within the transaction.
3403       */
3404       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3405         rc = sqlite3PagerWrite(pPage1->pDbPage);
3406         if( rc==SQLITE_OK ){
3407           put4byte(&pPage1->aData[28], pBt->nPage);
3408         }
3409       }
3410     }
3411   }
3412
3413
3414 trans_begun:
3415   if( rc==SQLITE_OK && wrflag ){
3416     /* This call makes sure that the pager has the correct number of
3417     ** open savepoints. If the second parameter is greater than 0 and
3418     ** the sub-journal is not already open, then it will be opened here.
3419     */
3420     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3421   }
3422
3423   btreeIntegrity(p);
3424   sqlite3BtreeLeave(p);
3425   return rc;
3426 }
3427
3428 #ifndef SQLITE_OMIT_AUTOVACUUM
3429
3430 /*
3431 ** Set the pointer-map entries for all children of page pPage. Also, if
3432 ** pPage contains cells that point to overflow pages, set the pointer
3433 ** map entries for the overflow pages as well.
3434 */
3435 static int setChildPtrmaps(MemPage *pPage){
3436   int i;                             /* Counter variable */
3437   int nCell;                         /* Number of cells in page pPage */
3438   int rc;                            /* Return code */
3439   BtShared *pBt = pPage->pBt;
3440   Pgno pgno = pPage->pgno;
3441
3442   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3443   rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3444   if( rc!=SQLITE_OK ) return rc;
3445   nCell = pPage->nCell;
3446
3447   for(i=0; i<nCell; i++){
3448     u8 *pCell = findCell(pPage, i);
3449
3450     ptrmapPutOvflPtr(pPage, pCell, &rc);
3451
3452     if( !pPage->leaf ){
3453       Pgno childPgno = get4byte(pCell);
3454       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3455     }
3456   }
3457
3458   if( !pPage->leaf ){
3459     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3460     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3461   }
3462
3463   return rc;
3464 }
3465
3466 /*
3467 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3468 ** that it points to iTo. Parameter eType describes the type of pointer to
3469 ** be modified, as  follows:
3470 **
3471 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3472 **                   page of pPage.
3473 **
3474 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3475 **                   page pointed to by one of the cells on pPage.
3476 **
3477 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3478 **                   overflow page in the list.
3479 */
3480 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3481   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3482   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3483   if( eType==PTRMAP_OVERFLOW2 ){
3484     /* The pointer is always the first 4 bytes of the page in this case.  */
3485     if( get4byte(pPage->aData)!=iFrom ){
3486       return SQLITE_CORRUPT_PAGE(pPage);
3487     }
3488     put4byte(pPage->aData, iTo);
3489   }else{
3490     int i;
3491     int nCell;
3492     int rc;
3493
3494     rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3495     if( rc ) return rc;
3496     nCell = pPage->nCell;
3497
3498     for(i=0; i<nCell; i++){
3499       u8 *pCell = findCell(pPage, i);
3500       if( eType==PTRMAP_OVERFLOW1 ){
3501         CellInfo info;
3502         pPage->xParseCell(pPage, pCell, &info);
3503         if( info.nLocal<info.nPayload ){
3504           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3505             return SQLITE_CORRUPT_PAGE(pPage);
3506           }
3507           if( iFrom==get4byte(pCell+info.nSize-4) ){
3508             put4byte(pCell+info.nSize-4, iTo);
3509             break;
3510           }
3511         }
3512       }else{
3513         if( get4byte(pCell)==iFrom ){
3514           put4byte(pCell, iTo);
3515           break;
3516         }
3517       }
3518     }
3519
3520     if( i==nCell ){
3521       if( eType!=PTRMAP_BTREE ||
3522           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3523         return SQLITE_CORRUPT_PAGE(pPage);
3524       }
3525       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3526     }
3527   }
3528   return SQLITE_OK;
3529 }
3530
3531
3532 /*
3533 ** Move the open database page pDbPage to location iFreePage in the
3534 ** database. The pDbPage reference remains valid.
3535 **
3536 ** The isCommit flag indicates that there is no need to remember that
3537 ** the journal needs to be sync()ed before database page pDbPage->pgno
3538 ** can be written to. The caller has already promised not to write to that
3539 ** page.
3540 */
3541 static int relocatePage(
3542   BtShared *pBt,           /* Btree */
3543   MemPage *pDbPage,        /* Open page to move */
3544   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3545   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3546   Pgno iFreePage,          /* The location to move pDbPage to */
3547   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3548 ){
3549   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3550   Pgno iDbPage = pDbPage->pgno;
3551   Pager *pPager = pBt->pPager;
3552   int rc;
3553
3554   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3555       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3556   assert( sqlite3_mutex_held(pBt->mutex) );
3557   assert( pDbPage->pBt==pBt );
3558
3559   /* Move page iDbPage from its current location to page number iFreePage */
3560   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3561       iDbPage, iFreePage, iPtrPage, eType));
3562   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3563   if( rc!=SQLITE_OK ){
3564     return rc;
3565   }
3566   pDbPage->pgno = iFreePage;
3567
3568   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3569   ** that point to overflow pages. The pointer map entries for all these
3570   ** pages need to be changed.
3571   **
3572   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3573   ** pointer to a subsequent overflow page. If this is the case, then
3574   ** the pointer map needs to be updated for the subsequent overflow page.
3575   */
3576   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3577     rc = setChildPtrmaps(pDbPage);
3578     if( rc!=SQLITE_OK ){
3579       return rc;
3580     }
3581   }else{
3582     Pgno nextOvfl = get4byte(pDbPage->aData);
3583     if( nextOvfl!=0 ){
3584       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3585       if( rc!=SQLITE_OK ){
3586         return rc;
3587       }
3588     }
3589   }
3590
3591   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3592   ** that it points at iFreePage. Also fix the pointer map entry for
3593   ** iPtrPage.
3594   */
3595   if( eType!=PTRMAP_ROOTPAGE ){
3596     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3597     if( rc!=SQLITE_OK ){
3598       return rc;
3599     }
3600     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3601     if( rc!=SQLITE_OK ){
3602       releasePage(pPtrPage);
3603       return rc;
3604     }
3605     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3606     releasePage(pPtrPage);
3607     if( rc==SQLITE_OK ){
3608       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3609     }
3610   }
3611   return rc;
3612 }
3613
3614 /* Forward declaration required by incrVacuumStep(). */
3615 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3616
3617 /*
3618 ** Perform a single step of an incremental-vacuum. If successful, return
3619 ** SQLITE_OK. If there is no work to do (and therefore no point in
3620 ** calling this function again), return SQLITE_DONE. Or, if an error
3621 ** occurs, return some other error code.
3622 **
3623 ** More specifically, this function attempts to re-organize the database so
3624 ** that the last page of the file currently in use is no longer in use.
3625 **
3626 ** Parameter nFin is the number of pages that this database would contain
3627 ** were this function called until it returns SQLITE_DONE.
3628 **
3629 ** If the bCommit parameter is non-zero, this function assumes that the
3630 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3631 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3632 ** operation, or false for an incremental vacuum.
3633 */
3634 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3635   Pgno nFreeList;           /* Number of pages still on the free-list */
3636   int rc;
3637
3638   assert( sqlite3_mutex_held(pBt->mutex) );
3639   assert( iLastPg>nFin );
3640
3641   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3642     u8 eType;
3643     Pgno iPtrPage;
3644
3645     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3646     if( nFreeList==0 ){
3647       return SQLITE_DONE;
3648     }
3649
3650     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3651     if( rc!=SQLITE_OK ){
3652       return rc;
3653     }
3654     if( eType==PTRMAP_ROOTPAGE ){
3655       return SQLITE_CORRUPT_BKPT;
3656     }
3657
3658     if( eType==PTRMAP_FREEPAGE ){
3659       if( bCommit==0 ){
3660         /* Remove the page from the files free-list. This is not required
3661         ** if bCommit is non-zero. In that case, the free-list will be
3662         ** truncated to zero after this function returns, so it doesn't
3663         ** matter if it still contains some garbage entries.
3664         */
3665         Pgno iFreePg;
3666         MemPage *pFreePg;
3667         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3668         if( rc!=SQLITE_OK ){
3669           return rc;
3670         }
3671         assert( iFreePg==iLastPg );
3672         releasePage(pFreePg);
3673       }
3674     } else {
3675       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3676       MemPage *pLastPg;
3677       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3678       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3679
3680       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3681       if( rc!=SQLITE_OK ){
3682         return rc;
3683       }
3684
3685       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3686       ** is swapped with the first free page pulled off the free list.
3687       **
3688       ** On the other hand, if bCommit is greater than zero, then keep
3689       ** looping until a free-page located within the first nFin pages
3690       ** of the file is found.
3691       */
3692       if( bCommit==0 ){
3693         eMode = BTALLOC_LE;
3694         iNear = nFin;
3695       }
3696       do {
3697         MemPage *pFreePg;
3698         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3699         if( rc!=SQLITE_OK ){
3700           releasePage(pLastPg);
3701           return rc;
3702         }
3703         releasePage(pFreePg);
3704       }while( bCommit && iFreePg>nFin );
3705       assert( iFreePg<iLastPg );
3706
3707       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3708       releasePage(pLastPg);
3709       if( rc!=SQLITE_OK ){
3710         return rc;
3711       }
3712     }
3713   }
3714
3715   if( bCommit==0 ){
3716     do {
3717       iLastPg--;
3718     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3719     pBt->bDoTruncate = 1;
3720     pBt->nPage = iLastPg;
3721   }
3722   return SQLITE_OK;
3723 }
3724
3725 /*
3726 ** The database opened by the first argument is an auto-vacuum database
3727 ** nOrig pages in size containing nFree free pages. Return the expected
3728 ** size of the database in pages following an auto-vacuum operation.
3729 */
3730 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3731   int nEntry;                     /* Number of entries on one ptrmap page */
3732   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3733   Pgno nFin;                      /* Return value */
3734
3735   nEntry = pBt->usableSize/5;
3736   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3737   nFin = nOrig - nFree - nPtrmap;
3738   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3739     nFin--;
3740   }
3741   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3742     nFin--;
3743   }
3744
3745   return nFin;
3746 }
3747
3748 /*
3749 ** A write-transaction must be opened before calling this function.
3750 ** It performs a single unit of work towards an incremental vacuum.
3751 **
3752 ** If the incremental vacuum is finished after this function has run,
3753 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3754 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3755 */
3756 int sqlite3BtreeIncrVacuum(Btree *p){
3757   int rc;
3758   BtShared *pBt = p->pBt;
3759
3760   sqlite3BtreeEnter(p);
3761   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3762   if( !pBt->autoVacuum ){
3763     rc = SQLITE_DONE;
3764   }else{
3765     Pgno nOrig = btreePagecount(pBt);
3766     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3767     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3768
3769     if( nOrig<nFin ){
3770       rc = SQLITE_CORRUPT_BKPT;
3771     }else if( nFree>0 ){
3772       rc = saveAllCursors(pBt, 0, 0);
3773       if( rc==SQLITE_OK ){
3774         invalidateAllOverflowCache(pBt);
3775         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3776       }
3777       if( rc==SQLITE_OK ){
3778         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3779         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3780       }
3781     }else{
3782       rc = SQLITE_DONE;
3783     }
3784   }
3785   sqlite3BtreeLeave(p);
3786   return rc;
3787 }
3788
3789 /*
3790 ** This routine is called prior to sqlite3PagerCommit when a transaction
3791 ** is committed for an auto-vacuum database.
3792 **
3793 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3794 ** the database file should be truncated to during the commit process.
3795 ** i.e. the database has been reorganized so that only the first *pnTrunc
3796 ** pages are in use.
3797 */
3798 static int autoVacuumCommit(BtShared *pBt){
3799   int rc = SQLITE_OK;
3800   Pager *pPager = pBt->pPager;
3801   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3802
3803   assert( sqlite3_mutex_held(pBt->mutex) );
3804   invalidateAllOverflowCache(pBt);
3805   assert(pBt->autoVacuum);
3806   if( !pBt->incrVacuum ){
3807     Pgno nFin;         /* Number of pages in database after autovacuuming */
3808     Pgno nFree;        /* Number of pages on the freelist initially */
3809     Pgno iFree;        /* The next page to be freed */
3810     Pgno nOrig;        /* Database size before freeing */
3811
3812     nOrig = btreePagecount(pBt);
3813     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3814       /* It is not possible to create a database for which the final page
3815       ** is either a pointer-map page or the pending-byte page. If one
3816       ** is encountered, this indicates corruption.
3817       */
3818       return SQLITE_CORRUPT_BKPT;
3819     }
3820
3821     nFree = get4byte(&pBt->pPage1->aData[36]);
3822     nFin = finalDbSize(pBt, nOrig, nFree);
3823     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3824     if( nFin<nOrig ){
3825       rc = saveAllCursors(pBt, 0, 0);
3826     }
3827     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3828       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3829     }
3830     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3831       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3832       put4byte(&pBt->pPage1->aData[32], 0);
3833       put4byte(&pBt->pPage1->aData[36], 0);
3834       put4byte(&pBt->pPage1->aData[28], nFin);
3835       pBt->bDoTruncate = 1;
3836       pBt->nPage = nFin;
3837     }
3838     if( rc!=SQLITE_OK ){
3839       sqlite3PagerRollback(pPager);
3840     }
3841   }
3842
3843   assert( nRef>=sqlite3PagerRefcount(pPager) );
3844   return rc;
3845 }
3846
3847 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3848 # define setChildPtrmaps(x) SQLITE_OK
3849 #endif
3850
3851 /*
3852 ** This routine does the first phase of a two-phase commit.  This routine
3853 ** causes a rollback journal to be created (if it does not already exist)
3854 ** and populated with enough information so that if a power loss occurs
3855 ** the database can be restored to its original state by playing back
3856 ** the journal.  Then the contents of the journal are flushed out to
3857 ** the disk.  After the journal is safely on oxide, the changes to the
3858 ** database are written into the database file and flushed to oxide.
3859 ** At the end of this call, the rollback journal still exists on the
3860 ** disk and we are still holding all locks, so the transaction has not
3861 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3862 ** commit process.
3863 **
3864 ** This call is a no-op if no write-transaction is currently active on pBt.
3865 **
3866 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3867 ** the name of a master journal file that should be written into the
3868 ** individual journal file, or is NULL, indicating no master journal file
3869 ** (single database transaction).
3870 **
3871 ** When this is called, the master journal should already have been
3872 ** created, populated with this journal pointer and synced to disk.
3873 **
3874 ** Once this is routine has returned, the only thing required to commit
3875 ** the write-transaction for this database file is to delete the journal.
3876 */
3877 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3878   int rc = SQLITE_OK;
3879   if( p->inTrans==TRANS_WRITE ){
3880     BtShared *pBt = p->pBt;
3881     sqlite3BtreeEnter(p);
3882 #ifndef SQLITE_OMIT_AUTOVACUUM
3883     if( pBt->autoVacuum ){
3884       rc = autoVacuumCommit(pBt);
3885       if( rc!=SQLITE_OK ){
3886         sqlite3BtreeLeave(p);
3887         return rc;
3888       }
3889     }
3890     if( pBt->bDoTruncate ){
3891       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3892     }
3893 #endif
3894     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3895     sqlite3BtreeLeave(p);
3896   }
3897   return rc;
3898 }
3899
3900 /*
3901 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3902 ** at the conclusion of a transaction.
3903 */
3904 static void btreeEndTransaction(Btree *p){
3905   BtShared *pBt = p->pBt;
3906   sqlite3 *db = p->db;
3907   assert( sqlite3BtreeHoldsMutex(p) );
3908
3909 #ifndef SQLITE_OMIT_AUTOVACUUM
3910   pBt->bDoTruncate = 0;
3911 #endif
3912   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3913     /* If there are other active statements that belong to this database
3914     ** handle, downgrade to a read-only transaction. The other statements
3915     ** may still be reading from the database.  */
3916     downgradeAllSharedCacheTableLocks(p);
3917     p->inTrans = TRANS_READ;
3918   }else{
3919     /* If the handle had any kind of transaction open, decrement the
3920     ** transaction count of the shared btree. If the transaction count
3921     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3922     ** call below will unlock the pager.  */
3923     if( p->inTrans!=TRANS_NONE ){
3924       clearAllSharedCacheTableLocks(p);
3925       pBt->nTransaction--;
3926       if( 0==pBt->nTransaction ){
3927         pBt->inTransaction = TRANS_NONE;
3928       }
3929     }
3930
3931     /* Set the current transaction state to TRANS_NONE and unlock the
3932     ** pager if this call closed the only read or write transaction.  */
3933     p->inTrans = TRANS_NONE;
3934     unlockBtreeIfUnused(pBt);
3935   }
3936
3937   btreeIntegrity(p);
3938 }
3939
3940 /*
3941 ** Commit the transaction currently in progress.
3942 **
3943 ** This routine implements the second phase of a 2-phase commit.  The
3944 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3945 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3946 ** routine did all the work of writing information out to disk and flushing the
3947 ** contents so that they are written onto the disk platter.  All this
3948 ** routine has to do is delete or truncate or zero the header in the
3949 ** the rollback journal (which causes the transaction to commit) and
3950 ** drop locks.
3951 **
3952 ** Normally, if an error occurs while the pager layer is attempting to
3953 ** finalize the underlying journal file, this function returns an error and
3954 ** the upper layer will attempt a rollback. However, if the second argument
3955 ** is non-zero then this b-tree transaction is part of a multi-file
3956 ** transaction. In this case, the transaction has already been committed
3957 ** (by deleting a master journal file) and the caller will ignore this
3958 ** functions return code. So, even if an error occurs in the pager layer,
3959 ** reset the b-tree objects internal state to indicate that the write
3960 ** transaction has been closed. This is quite safe, as the pager will have
3961 ** transitioned to the error state.
3962 **
3963 ** This will release the write lock on the database file.  If there
3964 ** are no active cursors, it also releases the read lock.
3965 */
3966 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3967
3968   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3969   sqlite3BtreeEnter(p);
3970   btreeIntegrity(p);
3971
3972   /* If the handle has a write-transaction open, commit the shared-btrees
3973   ** transaction and set the shared state to TRANS_READ.
3974   */
3975   if( p->inTrans==TRANS_WRITE ){
3976     int rc;
3977     BtShared *pBt = p->pBt;
3978     assert( pBt->inTransaction==TRANS_WRITE );
3979     assert( pBt->nTransaction>0 );
3980     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3981     if( rc!=SQLITE_OK && bCleanup==0 ){
3982       sqlite3BtreeLeave(p);
3983       return rc;
3984     }
3985     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
3986     pBt->inTransaction = TRANS_READ;
3987     btreeClearHasContent(pBt);
3988   }
3989
3990   btreeEndTransaction(p);
3991   sqlite3BtreeLeave(p);
3992   return SQLITE_OK;
3993 }
3994
3995 /*
3996 ** Do both phases of a commit.
3997 */
3998 int sqlite3BtreeCommit(Btree *p){
3999   int rc;
4000   sqlite3BtreeEnter(p);
4001   rc = sqlite3BtreeCommitPhaseOne(p, 0);
4002   if( rc==SQLITE_OK ){
4003     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
4004   }
4005   sqlite3BtreeLeave(p);
4006   return rc;
4007 }
4008
4009 /*
4010 ** This routine sets the state to CURSOR_FAULT and the error
4011 ** code to errCode for every cursor on any BtShared that pBtree
4012 ** references.  Or if the writeOnly flag is set to 1, then only
4013 ** trip write cursors and leave read cursors unchanged.
4014 **
4015 ** Every cursor is a candidate to be tripped, including cursors
4016 ** that belong to other database connections that happen to be
4017 ** sharing the cache with pBtree.
4018 **
4019 ** This routine gets called when a rollback occurs. If the writeOnly
4020 ** flag is true, then only write-cursors need be tripped - read-only
4021 ** cursors save their current positions so that they may continue
4022 ** following the rollback. Or, if writeOnly is false, all cursors are
4023 ** tripped. In general, writeOnly is false if the transaction being
4024 ** rolled back modified the database schema. In this case b-tree root
4025 ** pages may be moved or deleted from the database altogether, making
4026 ** it unsafe for read cursors to continue.
4027 **
4028 ** If the writeOnly flag is true and an error is encountered while
4029 ** saving the current position of a read-only cursor, all cursors,
4030 ** including all read-cursors are tripped.
4031 **
4032 ** SQLITE_OK is returned if successful, or if an error occurs while
4033 ** saving a cursor position, an SQLite error code.
4034 */
4035 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
4036   BtCursor *p;
4037   int rc = SQLITE_OK;
4038
4039   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
4040   if( pBtree ){
4041     sqlite3BtreeEnter(pBtree);
4042     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
4043       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
4044         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
4045           rc = saveCursorPosition(p);
4046           if( rc!=SQLITE_OK ){
4047             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
4048             break;
4049           }
4050         }
4051       }else{
4052         sqlite3BtreeClearCursor(p);
4053         p->eState = CURSOR_FAULT;
4054         p->skipNext = errCode;
4055       }
4056       btreeReleaseAllCursorPages(p);
4057     }
4058     sqlite3BtreeLeave(pBtree);
4059   }
4060   return rc;
4061 }
4062
4063 /*
4064 ** Rollback the transaction in progress.
4065 **
4066 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4067 ** Only write cursors are tripped if writeOnly is true but all cursors are
4068 ** tripped if writeOnly is false.  Any attempt to use
4069 ** a tripped cursor will result in an error.
4070 **
4071 ** This will release the write lock on the database file.  If there
4072 ** are no active cursors, it also releases the read lock.
4073 */
4074 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
4075   int rc;
4076   BtShared *pBt = p->pBt;
4077   MemPage *pPage1;
4078
4079   assert( writeOnly==1 || writeOnly==0 );
4080   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
4081   sqlite3BtreeEnter(p);
4082   if( tripCode==SQLITE_OK ){
4083     rc = tripCode = saveAllCursors(pBt, 0, 0);
4084     if( rc ) writeOnly = 0;
4085   }else{
4086     rc = SQLITE_OK;
4087   }
4088   if( tripCode ){
4089     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
4090     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
4091     if( rc2!=SQLITE_OK ) rc = rc2;
4092   }
4093   btreeIntegrity(p);
4094
4095   if( p->inTrans==TRANS_WRITE ){
4096     int rc2;
4097
4098     assert( TRANS_WRITE==pBt->inTransaction );
4099     rc2 = sqlite3PagerRollback(pBt->pPager);
4100     if( rc2!=SQLITE_OK ){
4101       rc = rc2;
4102     }
4103
4104     /* The rollback may have destroyed the pPage1->aData value.  So
4105     ** call btreeGetPage() on page 1 again to make
4106     ** sure pPage1->aData is set correctly. */
4107     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
4108       int nPage = get4byte(28+(u8*)pPage1->aData);
4109       testcase( nPage==0 );
4110       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
4111       testcase( pBt->nPage!=nPage );
4112       pBt->nPage = nPage;
4113       releasePageOne(pPage1);
4114     }
4115     assert( countValidCursors(pBt, 1)==0 );
4116     pBt->inTransaction = TRANS_READ;
4117     btreeClearHasContent(pBt);
4118   }
4119
4120   btreeEndTransaction(p);
4121   sqlite3BtreeLeave(p);
4122   return rc;
4123 }
4124
4125 /*
4126 ** Start a statement subtransaction. The subtransaction can be rolled
4127 ** back independently of the main transaction. You must start a transaction
4128 ** before starting a subtransaction. The subtransaction is ended automatically
4129 ** if the main transaction commits or rolls back.
4130 **
4131 ** Statement subtransactions are used around individual SQL statements
4132 ** that are contained within a BEGIN...COMMIT block.  If a constraint
4133 ** error occurs within the statement, the effect of that one statement
4134 ** can be rolled back without having to rollback the entire transaction.
4135 **
4136 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4137 ** value passed as the second parameter is the total number of savepoints,
4138 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4139 ** are no active savepoints and no other statement-transactions open,
4140 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4141 ** using the sqlite3BtreeSavepoint() function.
4142 */
4143 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4144   int rc;
4145   BtShared *pBt = p->pBt;
4146   sqlite3BtreeEnter(p);
4147   assert( p->inTrans==TRANS_WRITE );
4148   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4149   assert( iStatement>0 );
4150   assert( iStatement>p->db->nSavepoint );
4151   assert( pBt->inTransaction==TRANS_WRITE );
4152   /* At the pager level, a statement transaction is a savepoint with
4153   ** an index greater than all savepoints created explicitly using
4154   ** SQL statements. It is illegal to open, release or rollback any
4155   ** such savepoints while the statement transaction savepoint is active.
4156   */
4157   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4158   sqlite3BtreeLeave(p);
4159   return rc;
4160 }
4161
4162 /*
4163 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4164 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4165 ** savepoint identified by parameter iSavepoint, depending on the value
4166 ** of op.
4167 **
4168 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4169 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4170 ** contents of the entire transaction are rolled back. This is different
4171 ** from a normal transaction rollback, as no locks are released and the
4172 ** transaction remains open.
4173 */
4174 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4175   int rc = SQLITE_OK;
4176   if( p && p->inTrans==TRANS_WRITE ){
4177     BtShared *pBt = p->pBt;
4178     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4179     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4180     sqlite3BtreeEnter(p);
4181     if( op==SAVEPOINT_ROLLBACK ){
4182       rc = saveAllCursors(pBt, 0, 0);
4183     }
4184     if( rc==SQLITE_OK ){
4185       rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4186     }
4187     if( rc==SQLITE_OK ){
4188       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4189         pBt->nPage = 0;
4190       }
4191       rc = newDatabase(pBt);
4192       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
4193
4194       /* The database size was written into the offset 28 of the header
4195       ** when the transaction started, so we know that the value at offset
4196       ** 28 is nonzero. */
4197       assert( pBt->nPage>0 );
4198     }
4199     sqlite3BtreeLeave(p);
4200   }
4201   return rc;
4202 }
4203
4204 /*
4205 ** Create a new cursor for the BTree whose root is on the page
4206 ** iTable. If a read-only cursor is requested, it is assumed that
4207 ** the caller already has at least a read-only transaction open
4208 ** on the database already. If a write-cursor is requested, then
4209 ** the caller is assumed to have an open write transaction.
4210 **
4211 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4212 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4213 ** can be used for reading or for writing if other conditions for writing
4214 ** are also met.  These are the conditions that must be met in order
4215 ** for writing to be allowed:
4216 **
4217 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4218 **
4219 ** 2:  Other database connections that share the same pager cache
4220 **     but which are not in the READ_UNCOMMITTED state may not have
4221 **     cursors open with wrFlag==0 on the same table.  Otherwise
4222 **     the changes made by this write cursor would be visible to
4223 **     the read cursors in the other database connection.
4224 **
4225 ** 3:  The database must be writable (not on read-only media)
4226 **
4227 ** 4:  There must be an active transaction.
4228 **
4229 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4230 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4231 ** this cursor will only be used to seek to and delete entries of an index
4232 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4233 ** this implementation.  But in a hypothetical alternative storage engine
4234 ** in which index entries are automatically deleted when corresponding table
4235 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4236 ** operations on this cursor can be no-ops and all READ operations can
4237 ** return a null row (2-bytes: 0x01 0x00).
4238 **
4239 ** No checking is done to make sure that page iTable really is the
4240 ** root page of a b-tree.  If it is not, then the cursor acquired
4241 ** will not work correctly.
4242 **
4243 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4244 ** on pCur to initialize the memory space prior to invoking this routine.
4245 */
4246 static int btreeCursor(
4247   Btree *p,                              /* The btree */
4248   int iTable,                            /* Root page of table to open */
4249   int wrFlag,                            /* 1 to write. 0 read-only */
4250   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4251   BtCursor *pCur                         /* Space for new cursor */
4252 ){
4253   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4254   BtCursor *pX;                          /* Looping over other all cursors */
4255
4256   assert( sqlite3BtreeHoldsMutex(p) );
4257   assert( wrFlag==0
4258        || wrFlag==BTREE_WRCSR
4259        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4260   );
4261
4262   /* The following assert statements verify that if this is a sharable
4263   ** b-tree database, the connection is holding the required table locks,
4264   ** and that no other connection has any open cursor that conflicts with
4265   ** this lock.  */
4266   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
4267   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4268
4269   /* Assert that the caller has opened the required transaction. */
4270   assert( p->inTrans>TRANS_NONE );
4271   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4272   assert( pBt->pPage1 && pBt->pPage1->aData );
4273   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4274
4275   if( wrFlag ){
4276     allocateTempSpace(pBt);
4277     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4278   }
4279   if( iTable==1 && btreePagecount(pBt)==0 ){
4280     assert( wrFlag==0 );
4281     iTable = 0;
4282   }
4283
4284   /* Now that no other errors can occur, finish filling in the BtCursor
4285   ** variables and link the cursor into the BtShared list.  */
4286   pCur->pgnoRoot = (Pgno)iTable;
4287   pCur->iPage = -1;
4288   pCur->pKeyInfo = pKeyInfo;
4289   pCur->pBtree = p;
4290   pCur->pBt = pBt;
4291   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4292   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4293   /* If there are two or more cursors on the same btree, then all such
4294   ** cursors *must* have the BTCF_Multiple flag set. */
4295   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4296     if( pX->pgnoRoot==(Pgno)iTable ){
4297       pX->curFlags |= BTCF_Multiple;
4298       pCur->curFlags |= BTCF_Multiple;
4299     }
4300   }
4301   pCur->pNext = pBt->pCursor;
4302   pBt->pCursor = pCur;
4303   pCur->eState = CURSOR_INVALID;
4304   return SQLITE_OK;
4305 }
4306 int sqlite3BtreeCursor(
4307   Btree *p,                                   /* The btree */
4308   int iTable,                                 /* Root page of table to open */
4309   int wrFlag,                                 /* 1 to write. 0 read-only */
4310   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4311   BtCursor *pCur                              /* Write new cursor here */
4312 ){
4313   int rc;
4314   if( iTable<1 ){
4315     rc = SQLITE_CORRUPT_BKPT;
4316   }else{
4317     sqlite3BtreeEnter(p);
4318     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4319     sqlite3BtreeLeave(p);
4320   }
4321   return rc;
4322 }
4323
4324 /*
4325 ** Return the size of a BtCursor object in bytes.
4326 **
4327 ** This interfaces is needed so that users of cursors can preallocate
4328 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4329 ** to users so they cannot do the sizeof() themselves - they must call
4330 ** this routine.
4331 */
4332 int sqlite3BtreeCursorSize(void){
4333   return ROUND8(sizeof(BtCursor));
4334 }
4335
4336 /*
4337 ** Initialize memory that will be converted into a BtCursor object.
4338 **
4339 ** The simple approach here would be to memset() the entire object
4340 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4341 ** do not need to be zeroed and they are large, so we can save a lot
4342 ** of run-time by skipping the initialization of those elements.
4343 */
4344 void sqlite3BtreeCursorZero(BtCursor *p){
4345   memset(p, 0, offsetof(BtCursor, iPage));
4346 }
4347
4348 /*
4349 ** Close a cursor.  The read lock on the database file is released
4350 ** when the last cursor is closed.
4351 */
4352 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4353   Btree *pBtree = pCur->pBtree;
4354   if( pBtree ){
4355     BtShared *pBt = pCur->pBt;
4356     sqlite3BtreeEnter(pBtree);
4357     assert( pBt->pCursor!=0 );
4358     if( pBt->pCursor==pCur ){
4359       pBt->pCursor = pCur->pNext;
4360     }else{
4361       BtCursor *pPrev = pBt->pCursor;
4362       do{
4363         if( pPrev->pNext==pCur ){
4364           pPrev->pNext = pCur->pNext;
4365           break;
4366         }
4367         pPrev = pPrev->pNext;
4368       }while( ALWAYS(pPrev) );
4369     }
4370     btreeReleaseAllCursorPages(pCur);
4371     unlockBtreeIfUnused(pBt);
4372     sqlite3_free(pCur->aOverflow);
4373     sqlite3_free(pCur->pKey);
4374     sqlite3BtreeLeave(pBtree);
4375   }
4376   return SQLITE_OK;
4377 }
4378
4379 /*
4380 ** Make sure the BtCursor* given in the argument has a valid
4381 ** BtCursor.info structure.  If it is not already valid, call
4382 ** btreeParseCell() to fill it in.
4383 **
4384 ** BtCursor.info is a cache of the information in the current cell.
4385 ** Using this cache reduces the number of calls to btreeParseCell().
4386 */
4387 #ifndef NDEBUG
4388   static void assertCellInfo(BtCursor *pCur){
4389     CellInfo info;
4390     memset(&info, 0, sizeof(info));
4391     btreeParseCell(pCur->pPage, pCur->ix, &info);
4392     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
4393   }
4394 #else
4395   #define assertCellInfo(x)
4396 #endif
4397 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4398   if( pCur->info.nSize==0 ){
4399     pCur->curFlags |= BTCF_ValidNKey;
4400     btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4401   }else{
4402     assertCellInfo(pCur);
4403   }
4404 }
4405
4406 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4407 /*
4408 ** Return true if the given BtCursor is valid.  A valid cursor is one
4409 ** that is currently pointing to a row in a (non-empty) table.
4410 ** This is a verification routine is used only within assert() statements.
4411 */
4412 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4413   return pCur && pCur->eState==CURSOR_VALID;
4414 }
4415 #endif /* NDEBUG */
4416 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4417   assert( pCur!=0 );
4418   return pCur->eState==CURSOR_VALID;
4419 }
4420
4421 /*
4422 ** Return the value of the integer key or "rowid" for a table btree.
4423 ** This routine is only valid for a cursor that is pointing into a
4424 ** ordinary table btree.  If the cursor points to an index btree or
4425 ** is invalid, the result of this routine is undefined.
4426 */
4427 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4428   assert( cursorHoldsMutex(pCur) );
4429   assert( pCur->eState==CURSOR_VALID );
4430   assert( pCur->curIntKey );
4431   getCellInfo(pCur);
4432   return pCur->info.nKey;
4433 }
4434
4435 #ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
4436 /*
4437 ** Return the offset into the database file for the start of the
4438 ** payload to which the cursor is pointing.
4439 */
4440 i64 sqlite3BtreeOffset(BtCursor *pCur){
4441   assert( cursorHoldsMutex(pCur) );
4442   assert( pCur->eState==CURSOR_VALID );
4443   getCellInfo(pCur);
4444   return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) +
4445          (i64)(pCur->info.pPayload - pCur->pPage->aData);
4446 }
4447 #endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
4448
4449 /*
4450 ** Return the number of bytes of payload for the entry that pCur is
4451 ** currently pointing to.  For table btrees, this will be the amount
4452 ** of data.  For index btrees, this will be the size of the key.
4453 **
4454 ** The caller must guarantee that the cursor is pointing to a non-NULL
4455 ** valid entry.  In other words, the calling procedure must guarantee
4456 ** that the cursor has Cursor.eState==CURSOR_VALID.
4457 */
4458 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4459   assert( cursorHoldsMutex(pCur) );
4460   assert( pCur->eState==CURSOR_VALID );
4461   getCellInfo(pCur);
4462   return pCur->info.nPayload;
4463 }
4464
4465 /*
4466 ** Given the page number of an overflow page in the database (parameter
4467 ** ovfl), this function finds the page number of the next page in the
4468 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4469 ** pointer-map data instead of reading the content of page ovfl to do so.
4470 **
4471 ** If an error occurs an SQLite error code is returned. Otherwise:
4472 **
4473 ** The page number of the next overflow page in the linked list is
4474 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4475 ** list, *pPgnoNext is set to zero.
4476 **
4477 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4478 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4479 ** reference. It is the responsibility of the caller to call releasePage()
4480 ** on *ppPage to free the reference. In no reference was obtained (because
4481 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4482 ** *ppPage is set to zero.
4483 */
4484 static int getOverflowPage(
4485   BtShared *pBt,               /* The database file */
4486   Pgno ovfl,                   /* Current overflow page number */
4487   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4488   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4489 ){
4490   Pgno next = 0;
4491   MemPage *pPage = 0;
4492   int rc = SQLITE_OK;
4493
4494   assert( sqlite3_mutex_held(pBt->mutex) );
4495   assert(pPgnoNext);
4496
4497 #ifndef SQLITE_OMIT_AUTOVACUUM
4498   /* Try to find the next page in the overflow list using the
4499   ** autovacuum pointer-map pages. Guess that the next page in
4500   ** the overflow list is page number (ovfl+1). If that guess turns
4501   ** out to be wrong, fall back to loading the data of page
4502   ** number ovfl to determine the next page number.
4503   */
4504   if( pBt->autoVacuum ){
4505     Pgno pgno;
4506     Pgno iGuess = ovfl+1;
4507     u8 eType;
4508
4509     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4510       iGuess++;
4511     }
4512
4513     if( iGuess<=btreePagecount(pBt) ){
4514       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4515       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4516         next = iGuess;
4517         rc = SQLITE_DONE;
4518       }
4519     }
4520   }
4521 #endif
4522
4523   assert( next==0 || rc==SQLITE_DONE );
4524   if( rc==SQLITE_OK ){
4525     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4526     assert( rc==SQLITE_OK || pPage==0 );
4527     if( rc==SQLITE_OK ){
4528       next = get4byte(pPage->aData);
4529     }
4530   }
4531
4532   *pPgnoNext = next;
4533   if( ppPage ){
4534     *ppPage = pPage;
4535   }else{
4536     releasePage(pPage);
4537   }
4538   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4539 }
4540
4541 /*
4542 ** Copy data from a buffer to a page, or from a page to a buffer.
4543 **
4544 ** pPayload is a pointer to data stored on database page pDbPage.
4545 ** If argument eOp is false, then nByte bytes of data are copied
4546 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4547 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4548 ** of data are copied from the buffer pBuf to pPayload.
4549 **
4550 ** SQLITE_OK is returned on success, otherwise an error code.
4551 */
4552 static int copyPayload(
4553   void *pPayload,           /* Pointer to page data */
4554   void *pBuf,               /* Pointer to buffer */
4555   int nByte,                /* Number of bytes to copy */
4556   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4557   DbPage *pDbPage           /* Page containing pPayload */
4558 ){
4559   if( eOp ){
4560     /* Copy data from buffer to page (a write operation) */
4561     int rc = sqlite3PagerWrite(pDbPage);
4562     if( rc!=SQLITE_OK ){
4563       return rc;
4564     }
4565     memcpy(pPayload, pBuf, nByte);
4566   }else{
4567     /* Copy data from page to buffer (a read operation) */
4568     memcpy(pBuf, pPayload, nByte);
4569   }
4570   return SQLITE_OK;
4571 }
4572
4573 /*
4574 ** This function is used to read or overwrite payload information
4575 ** for the entry that the pCur cursor is pointing to. The eOp
4576 ** argument is interpreted as follows:
4577 **
4578 **   0: The operation is a read. Populate the overflow cache.
4579 **   1: The operation is a write. Populate the overflow cache.
4580 **
4581 ** A total of "amt" bytes are read or written beginning at "offset".
4582 ** Data is read to or from the buffer pBuf.
4583 **
4584 ** The content being read or written might appear on the main page
4585 ** or be scattered out on multiple overflow pages.
4586 **
4587 ** If the current cursor entry uses one or more overflow pages
4588 ** this function may allocate space for and lazily populate
4589 ** the overflow page-list cache array (BtCursor.aOverflow).
4590 ** Subsequent calls use this cache to make seeking to the supplied offset
4591 ** more efficient.
4592 **
4593 ** Once an overflow page-list cache has been allocated, it must be
4594 ** invalidated if some other cursor writes to the same table, or if
4595 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4596 ** mode, the following events may invalidate an overflow page-list cache.
4597 **
4598 **   * An incremental vacuum,
4599 **   * A commit in auto_vacuum="full" mode,
4600 **   * Creating a table (may require moving an overflow page).
4601 */
4602 static int accessPayload(
4603   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4604   u32 offset,          /* Begin reading this far into payload */
4605   u32 amt,             /* Read this many bytes */
4606   unsigned char *pBuf, /* Write the bytes into this buffer */
4607   int eOp              /* zero to read. non-zero to write. */
4608 ){
4609   unsigned char *aPayload;
4610   int rc = SQLITE_OK;
4611   int iIdx = 0;
4612   MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
4613   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4614 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4615   unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
4616 #endif
4617
4618   assert( pPage );
4619   assert( eOp==0 || eOp==1 );
4620   assert( pCur->eState==CURSOR_VALID );
4621   assert( pCur->ix<pPage->nCell );
4622   assert( cursorHoldsMutex(pCur) );
4623
4624   getCellInfo(pCur);
4625   aPayload = pCur->info.pPayload;
4626   assert( offset+amt <= pCur->info.nPayload );
4627
4628   assert( aPayload > pPage->aData );
4629   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4630     /* Trying to read or write past the end of the data is an error.  The
4631     ** conditional above is really:
4632     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4633     ** but is recast into its current form to avoid integer overflow problems
4634     */
4635     return SQLITE_CORRUPT_PAGE(pPage);
4636   }
4637
4638   /* Check if data must be read/written to/from the btree page itself. */
4639   if( offset<pCur->info.nLocal ){
4640     int a = amt;
4641     if( a+offset>pCur->info.nLocal ){
4642       a = pCur->info.nLocal - offset;
4643     }
4644     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
4645     offset = 0;
4646     pBuf += a;
4647     amt -= a;
4648   }else{
4649     offset -= pCur->info.nLocal;
4650   }
4651
4652
4653   if( rc==SQLITE_OK && amt>0 ){
4654     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4655     Pgno nextPage;
4656
4657     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4658
4659     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4660     **
4661     ** The aOverflow[] array is sized at one entry for each overflow page
4662     ** in the overflow chain. The page number of the first overflow page is
4663     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4664     ** means "not yet known" (the cache is lazily populated).
4665     */
4666     if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4667       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4668       if( nOvfl>pCur->nOvflAlloc ){
4669         Pgno *aNew = (Pgno*)sqlite3Realloc(
4670             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4671         );
4672         if( aNew==0 ){
4673           return SQLITE_NOMEM_BKPT;
4674         }else{
4675           pCur->nOvflAlloc = nOvfl*2;
4676           pCur->aOverflow = aNew;
4677         }
4678       }
4679       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4680       pCur->curFlags |= BTCF_ValidOvfl;
4681     }else{
4682       /* If the overflow page-list cache has been allocated and the
4683       ** entry for the first required overflow page is valid, skip
4684       ** directly to it.
4685       */
4686       if( pCur->aOverflow[offset/ovflSize] ){
4687         iIdx = (offset/ovflSize);
4688         nextPage = pCur->aOverflow[iIdx];
4689         offset = (offset%ovflSize);
4690       }
4691     }
4692
4693     assert( rc==SQLITE_OK && amt>0 );
4694     while( nextPage ){
4695       /* If required, populate the overflow page-list cache. */
4696       assert( pCur->aOverflow[iIdx]==0
4697               || pCur->aOverflow[iIdx]==nextPage
4698               || CORRUPT_DB );
4699       pCur->aOverflow[iIdx] = nextPage;
4700
4701       if( offset>=ovflSize ){
4702         /* The only reason to read this page is to obtain the page
4703         ** number for the next page in the overflow chain. The page
4704         ** data is not required. So first try to lookup the overflow
4705         ** page-list cache, if any, then fall back to the getOverflowPage()
4706         ** function.
4707         */
4708         assert( pCur->curFlags & BTCF_ValidOvfl );
4709         assert( pCur->pBtree->db==pBt->db );
4710         if( pCur->aOverflow[iIdx+1] ){
4711           nextPage = pCur->aOverflow[iIdx+1];
4712         }else{
4713           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4714         }
4715         offset -= ovflSize;
4716       }else{
4717         /* Need to read this page properly. It contains some of the
4718         ** range of data that is being read (eOp==0) or written (eOp!=0).
4719         */
4720 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4721         sqlite3_file *fd;      /* File from which to do direct overflow read */
4722 #endif
4723         int a = amt;
4724         if( a + offset > ovflSize ){
4725           a = ovflSize - offset;
4726         }
4727
4728 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4729         /* If all the following are true:
4730         **
4731         **   1) this is a read operation, and
4732         **   2) data is required from the start of this overflow page, and
4733         **   3) there is no open write-transaction, and
4734         **   4) the database is file-backed, and
4735         **   5) the page is not in the WAL file
4736         **   6) at least 4 bytes have already been read into the output buffer
4737         **
4738         ** then data can be read directly from the database file into the
4739         ** output buffer, bypassing the page-cache altogether. This speeds
4740         ** up loading large records that span many overflow pages.
4741         */
4742         if( eOp==0                                             /* (1) */
4743          && offset==0                                          /* (2) */
4744          && pBt->inTransaction==TRANS_READ                     /* (3) */
4745          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
4746          && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
4747          && &pBuf[-4]>=pBufStart                               /* (6) */
4748         ){
4749           u8 aSave[4];
4750           u8 *aWrite = &pBuf[-4];
4751           assert( aWrite>=pBufStart );                         /* due to (6) */
4752           memcpy(aSave, aWrite, 4);
4753           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4754           nextPage = get4byte(aWrite);
4755           memcpy(aWrite, aSave, 4);
4756         }else
4757 #endif
4758
4759         {
4760           DbPage *pDbPage;
4761           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4762               (eOp==0 ? PAGER_GET_READONLY : 0)
4763           );
4764           if( rc==SQLITE_OK ){
4765             aPayload = sqlite3PagerGetData(pDbPage);
4766             nextPage = get4byte(aPayload);
4767             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4768             sqlite3PagerUnref(pDbPage);
4769             offset = 0;
4770           }
4771         }
4772         amt -= a;
4773         if( amt==0 ) return rc;
4774         pBuf += a;
4775       }
4776       if( rc ) break;
4777       iIdx++;
4778     }
4779   }
4780
4781   if( rc==SQLITE_OK && amt>0 ){
4782     /* Overflow chain ends prematurely */
4783     return SQLITE_CORRUPT_PAGE(pPage);
4784   }
4785   return rc;
4786 }
4787
4788 /*
4789 ** Read part of the payload for the row at which that cursor pCur is currently
4790 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
4791 ** begins at "offset".
4792 **
4793 ** pCur can be pointing to either a table or an index b-tree.
4794 ** If pointing to a table btree, then the content section is read.  If
4795 ** pCur is pointing to an index b-tree then the key section is read.
4796 **
4797 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
4798 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
4799 ** cursor might be invalid or might need to be restored before being read.
4800 **
4801 ** Return SQLITE_OK on success or an error code if anything goes
4802 ** wrong.  An error is returned if "offset+amt" is larger than
4803 ** the available payload.
4804 */
4805 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4806   assert( cursorHoldsMutex(pCur) );
4807   assert( pCur->eState==CURSOR_VALID );
4808   assert( pCur->iPage>=0 && pCur->pPage );
4809   assert( pCur->ix<pCur->pPage->nCell );
4810   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4811 }
4812
4813 /*
4814 ** This variant of sqlite3BtreePayload() works even if the cursor has not
4815 ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
4816 ** interface.
4817 */
4818 #ifndef SQLITE_OMIT_INCRBLOB
4819 static SQLITE_NOINLINE int accessPayloadChecked(
4820   BtCursor *pCur,
4821   u32 offset,
4822   u32 amt,
4823   void *pBuf
4824 ){
4825   int rc;
4826   if ( pCur->eState==CURSOR_INVALID ){
4827     return SQLITE_ABORT;
4828   }
4829   assert( cursorOwnsBtShared(pCur) );
4830   rc = btreeRestoreCursorPosition(pCur);
4831   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
4832 }
4833 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4834   if( pCur->eState==CURSOR_VALID ){
4835     assert( cursorOwnsBtShared(pCur) );
4836     return accessPayload(pCur, offset, amt, pBuf, 0);
4837   }else{
4838     return accessPayloadChecked(pCur, offset, amt, pBuf);
4839   }
4840 }
4841 #endif /* SQLITE_OMIT_INCRBLOB */
4842
4843 /*
4844 ** Return a pointer to payload information from the entry that the
4845 ** pCur cursor is pointing to.  The pointer is to the beginning of
4846 ** the key if index btrees (pPage->intKey==0) and is the data for
4847 ** table btrees (pPage->intKey==1). The number of bytes of available
4848 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4849 ** returned will not be a valid pointer.
4850 **
4851 ** This routine is an optimization.  It is common for the entire key
4852 ** and data to fit on the local page and for there to be no overflow
4853 ** pages.  When that is so, this routine can be used to access the
4854 ** key and data without making a copy.  If the key and/or data spills
4855 ** onto overflow pages, then accessPayload() must be used to reassemble
4856 ** the key/data and copy it into a preallocated buffer.
4857 **
4858 ** The pointer returned by this routine looks directly into the cached
4859 ** page of the database.  The data might change or move the next time
4860 ** any btree routine is called.
4861 */
4862 static const void *fetchPayload(
4863   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4864   u32 *pAmt            /* Write the number of available bytes here */
4865 ){
4866   int amt;
4867   assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
4868   assert( pCur->eState==CURSOR_VALID );
4869   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4870   assert( cursorOwnsBtShared(pCur) );
4871   assert( pCur->ix<pCur->pPage->nCell );
4872   assert( pCur->info.nSize>0 );
4873   assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
4874   assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
4875   amt = pCur->info.nLocal;
4876   if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
4877     /* There is too little space on the page for the expected amount
4878     ** of local content. Database must be corrupt. */
4879     assert( CORRUPT_DB );
4880     amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
4881   }
4882   *pAmt = (u32)amt;
4883   return (void*)pCur->info.pPayload;
4884 }
4885
4886
4887 /*
4888 ** For the entry that cursor pCur is point to, return as
4889 ** many bytes of the key or data as are available on the local
4890 ** b-tree page.  Write the number of available bytes into *pAmt.
4891 **
4892 ** The pointer returned is ephemeral.  The key/data may move
4893 ** or be destroyed on the next call to any Btree routine,
4894 ** including calls from other threads against the same cache.
4895 ** Hence, a mutex on the BtShared should be held prior to calling
4896 ** this routine.
4897 **
4898 ** These routines is used to get quick access to key and data
4899 ** in the common case where no overflow pages are used.
4900 */
4901 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
4902   return fetchPayload(pCur, pAmt);
4903 }
4904
4905
4906 /*
4907 ** Move the cursor down to a new child page.  The newPgno argument is the
4908 ** page number of the child page to move to.
4909 **
4910 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4911 ** the new child page does not match the flags field of the parent (i.e.
4912 ** if an intkey page appears to be the parent of a non-intkey page, or
4913 ** vice-versa).
4914 */
4915 static int moveToChild(BtCursor *pCur, u32 newPgno){
4916   BtShared *pBt = pCur->pBt;
4917
4918   assert( cursorOwnsBtShared(pCur) );
4919   assert( pCur->eState==CURSOR_VALID );
4920   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4921   assert( pCur->iPage>=0 );
4922   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4923     return SQLITE_CORRUPT_BKPT;
4924   }
4925   pCur->info.nSize = 0;
4926   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4927   pCur->aiIdx[pCur->iPage] = pCur->ix;
4928   pCur->apPage[pCur->iPage] = pCur->pPage;
4929   pCur->ix = 0;
4930   pCur->iPage++;
4931   return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
4932 }
4933
4934 #ifdef SQLITE_DEBUG
4935 /*
4936 ** Page pParent is an internal (non-leaf) tree page. This function
4937 ** asserts that page number iChild is the left-child if the iIdx'th
4938 ** cell in page pParent. Or, if iIdx is equal to the total number of
4939 ** cells in pParent, that page number iChild is the right-child of
4940 ** the page.
4941 */
4942 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
4943   if( CORRUPT_DB ) return;  /* The conditions tested below might not be true
4944                             ** in a corrupt database */
4945   assert( iIdx<=pParent->nCell );
4946   if( iIdx==pParent->nCell ){
4947     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
4948   }else{
4949     assert( get4byte(findCell(pParent, iIdx))==iChild );
4950   }
4951 }
4952 #else
4953 #  define assertParentIndex(x,y,z)
4954 #endif
4955
4956 /*
4957 ** Move the cursor up to the parent page.
4958 **
4959 ** pCur->idx is set to the cell index that contains the pointer
4960 ** to the page we are coming from.  If we are coming from the
4961 ** right-most child page then pCur->idx is set to one more than
4962 ** the largest cell index.
4963 */
4964 static void moveToParent(BtCursor *pCur){
4965   MemPage *pLeaf;
4966   assert( cursorOwnsBtShared(pCur) );
4967   assert( pCur->eState==CURSOR_VALID );
4968   assert( pCur->iPage>0 );
4969   assert( pCur->pPage );
4970   assertParentIndex(
4971     pCur->apPage[pCur->iPage-1],
4972     pCur->aiIdx[pCur->iPage-1],
4973     pCur->pPage->pgno
4974   );
4975   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4976   pCur->info.nSize = 0;
4977   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4978   pCur->ix = pCur->aiIdx[pCur->iPage-1];
4979   pLeaf = pCur->pPage;
4980   pCur->pPage = pCur->apPage[--pCur->iPage];
4981   releasePageNotNull(pLeaf);
4982 }
4983
4984 /*
4985 ** Move the cursor to point to the root page of its b-tree structure.
4986 **
4987 ** If the table has a virtual root page, then the cursor is moved to point
4988 ** to the virtual root page instead of the actual root page. A table has a
4989 ** virtual root page when the actual root page contains no cells and a
4990 ** single child page. This can only happen with the table rooted at page 1.
4991 **
4992 ** If the b-tree structure is empty, the cursor state is set to
4993 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
4994 ** the cursor is set to point to the first cell located on the root
4995 ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
4996 **
4997 ** If this function returns successfully, it may be assumed that the
4998 ** page-header flags indicate that the [virtual] root-page is the expected
4999 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
5000 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
5001 ** indicating a table b-tree, or if the caller did specify a KeyInfo
5002 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
5003 ** b-tree).
5004 */
5005 static int moveToRoot(BtCursor *pCur){
5006   MemPage *pRoot;
5007   int rc = SQLITE_OK;
5008
5009   assert( cursorOwnsBtShared(pCur) );
5010   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
5011   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
5012   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
5013   assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
5014   assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
5015
5016   if( pCur->iPage>=0 ){
5017     if( pCur->iPage ){
5018       releasePageNotNull(pCur->pPage);
5019       while( --pCur->iPage ){
5020         releasePageNotNull(pCur->apPage[pCur->iPage]);
5021       }
5022       pCur->pPage = pCur->apPage[0];
5023       goto skip_init;
5024     }
5025   }else if( pCur->pgnoRoot==0 ){
5026     pCur->eState = CURSOR_INVALID;
5027     return SQLITE_EMPTY;
5028   }else{
5029     assert( pCur->iPage==(-1) );
5030     if( pCur->eState>=CURSOR_REQUIRESEEK ){
5031       if( pCur->eState==CURSOR_FAULT ){
5032         assert( pCur->skipNext!=SQLITE_OK );
5033         return pCur->skipNext;
5034       }
5035       sqlite3BtreeClearCursor(pCur);
5036     }
5037     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
5038                         0, pCur->curPagerFlags);
5039     if( rc!=SQLITE_OK ){
5040       pCur->eState = CURSOR_INVALID;
5041       return rc;
5042     }
5043     pCur->iPage = 0;
5044     pCur->curIntKey = pCur->pPage->intKey;
5045   }
5046   pRoot = pCur->pPage;
5047   assert( pRoot->pgno==pCur->pgnoRoot );
5048
5049   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
5050   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
5051   ** NULL, the caller expects a table b-tree. If this is not the case,
5052   ** return an SQLITE_CORRUPT error.
5053   **
5054   ** Earlier versions of SQLite assumed that this test could not fail
5055   ** if the root page was already loaded when this function was called (i.e.
5056   ** if pCur->iPage>=0). But this is not so if the database is corrupted
5057   ** in such a way that page pRoot is linked into a second b-tree table
5058   ** (or the freelist).  */
5059   assert( pRoot->intKey==1 || pRoot->intKey==0 );
5060   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
5061     return SQLITE_CORRUPT_PAGE(pCur->pPage);
5062   }
5063
5064 skip_init:
5065   pCur->ix = 0;
5066   pCur->info.nSize = 0;
5067   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
5068
5069   pRoot = pCur->pPage;
5070   if( pRoot->nCell>0 ){
5071     pCur->eState = CURSOR_VALID;
5072   }else if( !pRoot->leaf ){
5073     Pgno subpage;
5074     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
5075     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
5076     pCur->eState = CURSOR_VALID;
5077     rc = moveToChild(pCur, subpage);
5078   }else{
5079     pCur->eState = CURSOR_INVALID;
5080     rc = SQLITE_EMPTY;
5081   }
5082   return rc;
5083 }
5084
5085 /*
5086 ** Move the cursor down to the left-most leaf entry beneath the
5087 ** entry to which it is currently pointing.
5088 **
5089 ** The left-most leaf is the one with the smallest key - the first
5090 ** in ascending order.
5091 */
5092 static int moveToLeftmost(BtCursor *pCur){
5093   Pgno pgno;
5094   int rc = SQLITE_OK;
5095   MemPage *pPage;
5096
5097   assert( cursorOwnsBtShared(pCur) );
5098   assert( pCur->eState==CURSOR_VALID );
5099   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5100     assert( pCur->ix<pPage->nCell );
5101     pgno = get4byte(findCell(pPage, pCur->ix));
5102     rc = moveToChild(pCur, pgno);
5103   }
5104   return rc;
5105 }
5106
5107 /*
5108 ** Move the cursor down to the right-most leaf entry beneath the
5109 ** page to which it is currently pointing.  Notice the difference
5110 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
5111 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5112 ** finds the right-most entry beneath the *page*.
5113 **
5114 ** The right-most entry is the one with the largest key - the last
5115 ** key in ascending order.
5116 */
5117 static int moveToRightmost(BtCursor *pCur){
5118   Pgno pgno;
5119   int rc = SQLITE_OK;
5120   MemPage *pPage = 0;
5121
5122   assert( cursorOwnsBtShared(pCur) );
5123   assert( pCur->eState==CURSOR_VALID );
5124   while( !(pPage = pCur->pPage)->leaf ){
5125     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5126     pCur->ix = pPage->nCell;
5127     rc = moveToChild(pCur, pgno);
5128     if( rc ) return rc;
5129   }
5130   pCur->ix = pPage->nCell-1;
5131   assert( pCur->info.nSize==0 );
5132   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5133   return SQLITE_OK;
5134 }
5135
5136 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
5137 ** on success.  Set *pRes to 0 if the cursor actually points to something
5138 ** or set *pRes to 1 if the table is empty.
5139 */
5140 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5141   int rc;
5142
5143   assert( cursorOwnsBtShared(pCur) );
5144   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5145   rc = moveToRoot(pCur);
5146   if( rc==SQLITE_OK ){
5147     assert( pCur->pPage->nCell>0 );
5148     *pRes = 0;
5149     rc = moveToLeftmost(pCur);
5150   }else if( rc==SQLITE_EMPTY ){
5151     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5152     *pRes = 1;
5153     rc = SQLITE_OK;
5154   }
5155   return rc;
5156 }
5157
5158 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
5159 ** on success.  Set *pRes to 0 if the cursor actually points to something
5160 ** or set *pRes to 1 if the table is empty.
5161 */
5162 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5163   int rc;
5164
5165   assert( cursorOwnsBtShared(pCur) );
5166   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5167
5168   /* If the cursor already points to the last entry, this is a no-op. */
5169   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5170 #ifdef SQLITE_DEBUG
5171     /* This block serves to assert() that the cursor really does point
5172     ** to the last entry in the b-tree. */
5173     int ii;
5174     for(ii=0; ii<pCur->iPage; ii++){
5175       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5176     }
5177     assert( pCur->ix==pCur->pPage->nCell-1 );
5178     assert( pCur->pPage->leaf );
5179 #endif
5180     return SQLITE_OK;
5181   }
5182
5183   rc = moveToRoot(pCur);
5184   if( rc==SQLITE_OK ){
5185     assert( pCur->eState==CURSOR_VALID );
5186     *pRes = 0;
5187     rc = moveToRightmost(pCur);
5188     if( rc==SQLITE_OK ){
5189       pCur->curFlags |= BTCF_AtLast;
5190     }else{
5191       pCur->curFlags &= ~BTCF_AtLast;
5192     }
5193   }else if( rc==SQLITE_EMPTY ){
5194     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5195     *pRes = 1;
5196     rc = SQLITE_OK;
5197   }
5198   return rc;
5199 }
5200
5201 /* Move the cursor so that it points to an entry near the key
5202 ** specified by pIdxKey or intKey.   Return a success code.
5203 **
5204 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5205 ** must be NULL.  For index tables, pIdxKey is used and intKey
5206 ** is ignored.
5207 **
5208 ** If an exact match is not found, then the cursor is always
5209 ** left pointing at a leaf page which would hold the entry if it
5210 ** were present.  The cursor might point to an entry that comes
5211 ** before or after the key.
5212 **
5213 ** An integer is written into *pRes which is the result of
5214 ** comparing the key with the entry to which the cursor is
5215 ** pointing.  The meaning of the integer written into
5216 ** *pRes is as follows:
5217 **
5218 **     *pRes<0      The cursor is left pointing at an entry that
5219 **                  is smaller than intKey/pIdxKey or if the table is empty
5220 **                  and the cursor is therefore left point to nothing.
5221 **
5222 **     *pRes==0     The cursor is left pointing at an entry that
5223 **                  exactly matches intKey/pIdxKey.
5224 **
5225 **     *pRes>0      The cursor is left pointing at an entry that
5226 **                  is larger than intKey/pIdxKey.
5227 **
5228 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5229 ** exists an entry in the table that exactly matches pIdxKey.
5230 */
5231 int sqlite3BtreeMovetoUnpacked(
5232   BtCursor *pCur,          /* The cursor to be moved */
5233   UnpackedRecord *pIdxKey, /* Unpacked index key */
5234   i64 intKey,              /* The table key */
5235   int biasRight,           /* If true, bias the search to the high end */
5236   int *pRes                /* Write search results here */
5237 ){
5238   int rc;
5239   RecordCompare xRecordCompare;
5240
5241   assert( cursorOwnsBtShared(pCur) );
5242   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5243   assert( pRes );
5244   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5245   assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
5246
5247   /* If the cursor is already positioned at the point we are trying
5248   ** to move to, then just return without doing any work */
5249   if( pIdxKey==0
5250    && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5251   ){
5252     if( pCur->info.nKey==intKey ){
5253       *pRes = 0;
5254       return SQLITE_OK;
5255     }
5256     if( pCur->info.nKey<intKey ){
5257       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5258         *pRes = -1;
5259         return SQLITE_OK;
5260       }
5261       /* If the requested key is one more than the previous key, then
5262       ** try to get there using sqlite3BtreeNext() rather than a full
5263       ** binary search.  This is an optimization only.  The correct answer
5264       ** is still obtained without this case, only a little more slowely */
5265       if( pCur->info.nKey+1==intKey && !pCur->skipNext ){
5266         *pRes = 0;
5267         rc = sqlite3BtreeNext(pCur, 0);
5268         if( rc==SQLITE_OK ){
5269           getCellInfo(pCur);
5270           if( pCur->info.nKey==intKey ){
5271             return SQLITE_OK;
5272           }
5273         }else if( rc==SQLITE_DONE ){
5274           rc = SQLITE_OK;
5275         }else{
5276           return rc;
5277         }
5278       }
5279     }
5280   }
5281
5282   if( pIdxKey ){
5283     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5284     pIdxKey->errCode = 0;
5285     assert( pIdxKey->default_rc==1
5286          || pIdxKey->default_rc==0
5287          || pIdxKey->default_rc==-1
5288     );
5289   }else{
5290     xRecordCompare = 0; /* All keys are integers */
5291   }
5292
5293   rc = moveToRoot(pCur);
5294   if( rc ){
5295     if( rc==SQLITE_EMPTY ){
5296       assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5297       *pRes = -1;
5298       return SQLITE_OK;
5299     }
5300     return rc;
5301   }
5302   assert( pCur->pPage );
5303   assert( pCur->pPage->isInit );
5304   assert( pCur->eState==CURSOR_VALID );
5305   assert( pCur->pPage->nCell > 0 );
5306   assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5307   assert( pCur->curIntKey || pIdxKey );
5308   for(;;){
5309     int lwr, upr, idx, c;
5310     Pgno chldPg;
5311     MemPage *pPage = pCur->pPage;
5312     u8 *pCell;                          /* Pointer to current cell in pPage */
5313
5314     /* pPage->nCell must be greater than zero. If this is the root-page
5315     ** the cursor would have been INVALID above and this for(;;) loop
5316     ** not run. If this is not the root-page, then the moveToChild() routine
5317     ** would have already detected db corruption. Similarly, pPage must
5318     ** be the right kind (index or table) of b-tree page. Otherwise
5319     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5320     assert( pPage->nCell>0 );
5321     assert( pPage->intKey==(pIdxKey==0) );
5322     lwr = 0;
5323     upr = pPage->nCell-1;
5324     assert( biasRight==0 || biasRight==1 );
5325     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5326     pCur->ix = (u16)idx;
5327     if( xRecordCompare==0 ){
5328       for(;;){
5329         i64 nCellKey;
5330         pCell = findCellPastPtr(pPage, idx);
5331         if( pPage->intKeyLeaf ){
5332           while( 0x80 <= *(pCell++) ){
5333             if( pCell>=pPage->aDataEnd ){
5334               return SQLITE_CORRUPT_PAGE(pPage);
5335             }
5336           }
5337         }
5338         getVarint(pCell, (u64*)&nCellKey);
5339         if( nCellKey<intKey ){
5340           lwr = idx+1;
5341           if( lwr>upr ){ c = -1; break; }
5342         }else if( nCellKey>intKey ){
5343           upr = idx-1;
5344           if( lwr>upr ){ c = +1; break; }
5345         }else{
5346           assert( nCellKey==intKey );
5347           pCur->ix = (u16)idx;
5348           if( !pPage->leaf ){
5349             lwr = idx;
5350             goto moveto_next_layer;
5351           }else{
5352             pCur->curFlags |= BTCF_ValidNKey;
5353             pCur->info.nKey = nCellKey;
5354             pCur->info.nSize = 0;
5355             *pRes = 0;
5356             return SQLITE_OK;
5357           }
5358         }
5359         assert( lwr+upr>=0 );
5360         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5361       }
5362     }else{
5363       for(;;){
5364         int nCell;  /* Size of the pCell cell in bytes */
5365         pCell = findCellPastPtr(pPage, idx);
5366
5367         /* The maximum supported page-size is 65536 bytes. This means that
5368         ** the maximum number of record bytes stored on an index B-Tree
5369         ** page is less than 16384 bytes and may be stored as a 2-byte
5370         ** varint. This information is used to attempt to avoid parsing
5371         ** the entire cell by checking for the cases where the record is
5372         ** stored entirely within the b-tree page by inspecting the first
5373         ** 2 bytes of the cell.
5374         */
5375         nCell = pCell[0];
5376         if( nCell<=pPage->max1bytePayload ){
5377           /* This branch runs if the record-size field of the cell is a
5378           ** single byte varint and the record fits entirely on the main
5379           ** b-tree page.  */
5380           testcase( pCell+nCell+1==pPage->aDataEnd );
5381           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5382         }else if( !(pCell[1] & 0x80)
5383           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5384         ){
5385           /* The record-size field is a 2 byte varint and the record
5386           ** fits entirely on the main b-tree page.  */
5387           testcase( pCell+nCell+2==pPage->aDataEnd );
5388           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5389         }else{
5390           /* The record flows over onto one or more overflow pages. In
5391           ** this case the whole cell needs to be parsed, a buffer allocated
5392           ** and accessPayload() used to retrieve the record into the
5393           ** buffer before VdbeRecordCompare() can be called.
5394           **
5395           ** If the record is corrupt, the xRecordCompare routine may read
5396           ** up to two varints past the end of the buffer. An extra 18
5397           ** bytes of padding is allocated at the end of the buffer in
5398           ** case this happens.  */
5399           void *pCellKey;
5400           u8 * const pCellBody = pCell - pPage->childPtrSize;
5401           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5402           nCell = (int)pCur->info.nKey;
5403           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5404           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5405           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5406           testcase( nCell==2 );  /* Minimum legal index key size */
5407           if( nCell<2 ){
5408             rc = SQLITE_CORRUPT_PAGE(pPage);
5409             goto moveto_finish;
5410           }
5411           pCellKey = sqlite3Malloc( nCell+18 );
5412           if( pCellKey==0 ){
5413             rc = SQLITE_NOMEM_BKPT;
5414             goto moveto_finish;
5415           }
5416           pCur->ix = (u16)idx;
5417           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
5418           pCur->curFlags &= ~BTCF_ValidOvfl;
5419           if( rc ){
5420             sqlite3_free(pCellKey);
5421             goto moveto_finish;
5422           }
5423           c = xRecordCompare(nCell, pCellKey, pIdxKey);
5424           sqlite3_free(pCellKey);
5425         }
5426         assert(
5427             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5428          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5429         );
5430         if( c<0 ){
5431           lwr = idx+1;
5432         }else if( c>0 ){
5433           upr = idx-1;
5434         }else{
5435           assert( c==0 );
5436           *pRes = 0;
5437           rc = SQLITE_OK;
5438           pCur->ix = (u16)idx;
5439           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
5440           goto moveto_finish;
5441         }
5442         if( lwr>upr ) break;
5443         assert( lwr+upr>=0 );
5444         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5445       }
5446     }
5447     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5448     assert( pPage->isInit );
5449     if( pPage->leaf ){
5450       assert( pCur->ix<pCur->pPage->nCell );
5451       pCur->ix = (u16)idx;
5452       *pRes = c;
5453       rc = SQLITE_OK;
5454       goto moveto_finish;
5455     }
5456 moveto_next_layer:
5457     if( lwr>=pPage->nCell ){
5458       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5459     }else{
5460       chldPg = get4byte(findCell(pPage, lwr));
5461     }
5462     pCur->ix = (u16)lwr;
5463     rc = moveToChild(pCur, chldPg);
5464     if( rc ) break;
5465   }
5466 moveto_finish:
5467   pCur->info.nSize = 0;
5468   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5469   return rc;
5470 }
5471
5472
5473 /*
5474 ** Return TRUE if the cursor is not pointing at an entry of the table.
5475 **
5476 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5477 ** past the last entry in the table or sqlite3BtreePrev() moves past
5478 ** the first entry.  TRUE is also returned if the table is empty.
5479 */
5480 int sqlite3BtreeEof(BtCursor *pCur){
5481   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5482   ** have been deleted? This API will need to change to return an error code
5483   ** as well as the boolean result value.
5484   */
5485   return (CURSOR_VALID!=pCur->eState);
5486 }
5487
5488 /*
5489 ** Return an estimate for the number of rows in the table that pCur is
5490 ** pointing to.  Return a negative number if no estimate is currently
5491 ** available.
5492 */
5493 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
5494   i64 n;
5495   u8 i;
5496
5497   assert( cursorOwnsBtShared(pCur) );
5498   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5499
5500   /* Currently this interface is only called by the OP_IfSmaller
5501   ** opcode, and it that case the cursor will always be valid and
5502   ** will always point to a leaf node. */
5503   if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
5504   if( NEVER(pCur->pPage->leaf==0) ) return -1;
5505
5506   n = pCur->pPage->nCell;
5507   for(i=0; i<pCur->iPage; i++){
5508     n *= pCur->apPage[i]->nCell;
5509   }
5510   return n;
5511 }
5512
5513 /*
5514 ** Advance the cursor to the next entry in the database.
5515 ** Return value:
5516 **
5517 **    SQLITE_OK        success
5518 **    SQLITE_DONE      cursor is already pointing at the last element
5519 **    otherwise        some kind of error occurred
5520 **
5521 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5522 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5523 ** to the next cell on the current page.  The (slower) btreeNext() helper
5524 ** routine is called when it is necessary to move to a different page or
5525 ** to restore the cursor.
5526 **
5527 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
5528 ** cursor corresponds to an SQL index and this routine could have been
5529 ** skipped if the SQL index had been a unique index.  The F argument
5530 ** is a hint to the implement.  SQLite btree implementation does not use
5531 ** this hint, but COMDB2 does.
5532 */
5533 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
5534   int rc;
5535   int idx;
5536   MemPage *pPage;
5537
5538   assert( cursorOwnsBtShared(pCur) );
5539   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5540   if( pCur->eState!=CURSOR_VALID ){
5541     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5542     rc = restoreCursorPosition(pCur);
5543     if( rc!=SQLITE_OK ){
5544       return rc;
5545     }
5546     if( CURSOR_INVALID==pCur->eState ){
5547       return SQLITE_DONE;
5548     }
5549     if( pCur->skipNext ){
5550       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5551       pCur->eState = CURSOR_VALID;
5552       if( pCur->skipNext>0 ){
5553         pCur->skipNext = 0;
5554         return SQLITE_OK;
5555       }
5556       pCur->skipNext = 0;
5557     }
5558   }
5559
5560   pPage = pCur->pPage;
5561   idx = ++pCur->ix;
5562   assert( pPage->isInit );
5563
5564   /* If the database file is corrupt, it is possible for the value of idx
5565   ** to be invalid here. This can only occur if a second cursor modifies
5566   ** the page while cursor pCur is holding a reference to it. Which can
5567   ** only happen if the database is corrupt in such a way as to link the
5568   ** page into more than one b-tree structure. */
5569   testcase( idx>pPage->nCell );
5570
5571   if( idx>=pPage->nCell ){
5572     if( !pPage->leaf ){
5573       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5574       if( rc ) return rc;
5575       return moveToLeftmost(pCur);
5576     }
5577     do{
5578       if( pCur->iPage==0 ){
5579         pCur->eState = CURSOR_INVALID;
5580         return SQLITE_DONE;
5581       }
5582       moveToParent(pCur);
5583       pPage = pCur->pPage;
5584     }while( pCur->ix>=pPage->nCell );
5585     if( pPage->intKey ){
5586       return sqlite3BtreeNext(pCur, 0);
5587     }else{
5588       return SQLITE_OK;
5589     }
5590   }
5591   if( pPage->leaf ){
5592     return SQLITE_OK;
5593   }else{
5594     return moveToLeftmost(pCur);
5595   }
5596 }
5597 int sqlite3BtreeNext(BtCursor *pCur, int flags){
5598   MemPage *pPage;
5599   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5600   assert( cursorOwnsBtShared(pCur) );
5601   assert( flags==0 || flags==1 );
5602   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5603   pCur->info.nSize = 0;
5604   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5605   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
5606   pPage = pCur->pPage;
5607   if( (++pCur->ix)>=pPage->nCell ){
5608     pCur->ix--;
5609     return btreeNext(pCur);
5610   }
5611   if( pPage->leaf ){
5612     return SQLITE_OK;
5613   }else{
5614     return moveToLeftmost(pCur);
5615   }
5616 }
5617
5618 /*
5619 ** Step the cursor to the back to the previous entry in the database.
5620 ** Return values:
5621 **
5622 **     SQLITE_OK     success
5623 **     SQLITE_DONE   the cursor is already on the first element of the table
5624 **     otherwise     some kind of error occurred
5625 **
5626 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5627 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5628 ** to the previous cell on the current page.  The (slower) btreePrevious()
5629 ** helper routine is called when it is necessary to move to a different page
5630 ** or to restore the cursor.
5631 **
5632 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
5633 ** the cursor corresponds to an SQL index and this routine could have been
5634 ** skipped if the SQL index had been a unique index.  The F argument is a
5635 ** hint to the implement.  The native SQLite btree implementation does not
5636 ** use this hint, but COMDB2 does.
5637 */
5638 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
5639   int rc;
5640   MemPage *pPage;
5641
5642   assert( cursorOwnsBtShared(pCur) );
5643   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5644   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5645   assert( pCur->info.nSize==0 );
5646   if( pCur->eState!=CURSOR_VALID ){
5647     rc = restoreCursorPosition(pCur);
5648     if( rc!=SQLITE_OK ){
5649       return rc;
5650     }
5651     if( CURSOR_INVALID==pCur->eState ){
5652       return SQLITE_DONE;
5653     }
5654     if( pCur->skipNext ){
5655       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5656       pCur->eState = CURSOR_VALID;
5657       if( pCur->skipNext<0 ){
5658         pCur->skipNext = 0;
5659         return SQLITE_OK;
5660       }
5661       pCur->skipNext = 0;
5662     }
5663   }
5664
5665   pPage = pCur->pPage;
5666   assert( pPage->isInit );
5667   if( !pPage->leaf ){
5668     int idx = pCur->ix;
5669     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5670     if( rc ) return rc;
5671     rc = moveToRightmost(pCur);
5672   }else{
5673     while( pCur->ix==0 ){
5674       if( pCur->iPage==0 ){
5675         pCur->eState = CURSOR_INVALID;
5676         return SQLITE_DONE;
5677       }
5678       moveToParent(pCur);
5679     }
5680     assert( pCur->info.nSize==0 );
5681     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5682
5683     pCur->ix--;
5684     pPage = pCur->pPage;
5685     if( pPage->intKey && !pPage->leaf ){
5686       rc = sqlite3BtreePrevious(pCur, 0);
5687     }else{
5688       rc = SQLITE_OK;
5689     }
5690   }
5691   return rc;
5692 }
5693 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
5694   assert( cursorOwnsBtShared(pCur) );
5695   assert( flags==0 || flags==1 );
5696   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5697   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5698   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5699   pCur->info.nSize = 0;
5700   if( pCur->eState!=CURSOR_VALID
5701    || pCur->ix==0
5702    || pCur->pPage->leaf==0
5703   ){
5704     return btreePrevious(pCur);
5705   }
5706   pCur->ix--;
5707   return SQLITE_OK;
5708 }
5709
5710 /*
5711 ** Allocate a new page from the database file.
5712 **
5713 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5714 ** has already been called on the new page.)  The new page has also
5715 ** been referenced and the calling routine is responsible for calling
5716 ** sqlite3PagerUnref() on the new page when it is done.
5717 **
5718 ** SQLITE_OK is returned on success.  Any other return value indicates
5719 ** an error.  *ppPage is set to NULL in the event of an error.
5720 **
5721 ** If the "nearby" parameter is not 0, then an effort is made to
5722 ** locate a page close to the page number "nearby".  This can be used in an
5723 ** attempt to keep related pages close to each other in the database file,
5724 ** which in turn can make database access faster.
5725 **
5726 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5727 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5728 ** eMode is BTALLOC_LT then the page returned will be less than or equal
5729 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5730 ** are no restrictions on which page is returned.
5731 */
5732 static int allocateBtreePage(
5733   BtShared *pBt,         /* The btree */
5734   MemPage **ppPage,      /* Store pointer to the allocated page here */
5735   Pgno *pPgno,           /* Store the page number here */
5736   Pgno nearby,           /* Search for a page near this one */
5737   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5738 ){
5739   MemPage *pPage1;
5740   int rc;
5741   u32 n;     /* Number of pages on the freelist */
5742   u32 k;     /* Number of leaves on the trunk of the freelist */
5743   MemPage *pTrunk = 0;
5744   MemPage *pPrevTrunk = 0;
5745   Pgno mxPage;     /* Total size of the database file */
5746
5747   assert( sqlite3_mutex_held(pBt->mutex) );
5748   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5749   pPage1 = pBt->pPage1;
5750   mxPage = btreePagecount(pBt);
5751   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5752   ** stores stores the total number of pages on the freelist. */
5753   n = get4byte(&pPage1->aData[36]);
5754   testcase( n==mxPage-1 );
5755   if( n>=mxPage ){
5756     return SQLITE_CORRUPT_BKPT;
5757   }
5758   if( n>0 ){
5759     /* There are pages on the freelist.  Reuse one of those pages. */
5760     Pgno iTrunk;
5761     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5762     u32 nSearch = 0;   /* Count of the number of search attempts */
5763
5764     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5765     ** shows that the page 'nearby' is somewhere on the free-list, then
5766     ** the entire-list will be searched for that page.
5767     */
5768 #ifndef SQLITE_OMIT_AUTOVACUUM
5769     if( eMode==BTALLOC_EXACT ){
5770       if( nearby<=mxPage ){
5771         u8 eType;
5772         assert( nearby>0 );
5773         assert( pBt->autoVacuum );
5774         rc = ptrmapGet(pBt, nearby, &eType, 0);
5775         if( rc ) return rc;
5776         if( eType==PTRMAP_FREEPAGE ){
5777           searchList = 1;
5778         }
5779       }
5780     }else if( eMode==BTALLOC_LE ){
5781       searchList = 1;
5782     }
5783 #endif
5784
5785     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5786     ** first free-list trunk page. iPrevTrunk is initially 1.
5787     */
5788     rc = sqlite3PagerWrite(pPage1->pDbPage);
5789     if( rc ) return rc;
5790     put4byte(&pPage1->aData[36], n-1);
5791
5792     /* The code within this loop is run only once if the 'searchList' variable
5793     ** is not true. Otherwise, it runs once for each trunk-page on the
5794     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5795     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5796     */
5797     do {
5798       pPrevTrunk = pTrunk;
5799       if( pPrevTrunk ){
5800         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5801         ** is the page number of the next freelist trunk page in the list or
5802         ** zero if this is the last freelist trunk page. */
5803         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5804       }else{
5805         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5806         ** stores the page number of the first page of the freelist, or zero if
5807         ** the freelist is empty. */
5808         iTrunk = get4byte(&pPage1->aData[32]);
5809       }
5810       testcase( iTrunk==mxPage );
5811       if( iTrunk>mxPage || nSearch++ > n ){
5812         rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
5813       }else{
5814         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5815       }
5816       if( rc ){
5817         pTrunk = 0;
5818         goto end_allocate_page;
5819       }
5820       assert( pTrunk!=0 );
5821       assert( pTrunk->aData!=0 );
5822       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5823       ** is the number of leaf page pointers to follow. */
5824       k = get4byte(&pTrunk->aData[4]);
5825       if( k==0 && !searchList ){
5826         /* The trunk has no leaves and the list is not being searched.
5827         ** So extract the trunk page itself and use it as the newly
5828         ** allocated page */
5829         assert( pPrevTrunk==0 );
5830         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5831         if( rc ){
5832           goto end_allocate_page;
5833         }
5834         *pPgno = iTrunk;
5835         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5836         *ppPage = pTrunk;
5837         pTrunk = 0;
5838         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5839       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5840         /* Value of k is out of range.  Database corruption */
5841         rc = SQLITE_CORRUPT_PGNO(iTrunk);
5842         goto end_allocate_page;
5843 #ifndef SQLITE_OMIT_AUTOVACUUM
5844       }else if( searchList
5845             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5846       ){
5847         /* The list is being searched and this trunk page is the page
5848         ** to allocate, regardless of whether it has leaves.
5849         */
5850         *pPgno = iTrunk;
5851         *ppPage = pTrunk;
5852         searchList = 0;
5853         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5854         if( rc ){
5855           goto end_allocate_page;
5856         }
5857         if( k==0 ){
5858           if( !pPrevTrunk ){
5859             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5860           }else{
5861             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5862             if( rc!=SQLITE_OK ){
5863               goto end_allocate_page;
5864             }
5865             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5866           }
5867         }else{
5868           /* The trunk page is required by the caller but it contains
5869           ** pointers to free-list leaves. The first leaf becomes a trunk
5870           ** page in this case.
5871           */
5872           MemPage *pNewTrunk;
5873           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5874           if( iNewTrunk>mxPage ){
5875             rc = SQLITE_CORRUPT_PGNO(iTrunk);
5876             goto end_allocate_page;
5877           }
5878           testcase( iNewTrunk==mxPage );
5879           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
5880           if( rc!=SQLITE_OK ){
5881             goto end_allocate_page;
5882           }
5883           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5884           if( rc!=SQLITE_OK ){
5885             releasePage(pNewTrunk);
5886             goto end_allocate_page;
5887           }
5888           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5889           put4byte(&pNewTrunk->aData[4], k-1);
5890           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5891           releasePage(pNewTrunk);
5892           if( !pPrevTrunk ){
5893             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5894             put4byte(&pPage1->aData[32], iNewTrunk);
5895           }else{
5896             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5897             if( rc ){
5898               goto end_allocate_page;
5899             }
5900             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5901           }
5902         }
5903         pTrunk = 0;
5904         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5905 #endif
5906       }else if( k>0 ){
5907         /* Extract a leaf from the trunk */
5908         u32 closest;
5909         Pgno iPage;
5910         unsigned char *aData = pTrunk->aData;
5911         if( nearby>0 ){
5912           u32 i;
5913           closest = 0;
5914           if( eMode==BTALLOC_LE ){
5915             for(i=0; i<k; i++){
5916               iPage = get4byte(&aData[8+i*4]);
5917               if( iPage<=nearby ){
5918                 closest = i;
5919                 break;
5920               }
5921             }
5922           }else{
5923             int dist;
5924             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5925             for(i=1; i<k; i++){
5926               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5927               if( d2<dist ){
5928                 closest = i;
5929                 dist = d2;
5930               }
5931             }
5932           }
5933         }else{
5934           closest = 0;
5935         }
5936
5937         iPage = get4byte(&aData[8+closest*4]);
5938         testcase( iPage==mxPage );
5939         if( iPage>mxPage ){
5940           rc = SQLITE_CORRUPT_PGNO(iTrunk);
5941           goto end_allocate_page;
5942         }
5943         testcase( iPage==mxPage );
5944         if( !searchList
5945          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5946         ){
5947           int noContent;
5948           *pPgno = iPage;
5949           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5950                  ": %d more free pages\n",
5951                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5952           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5953           if( rc ) goto end_allocate_page;
5954           if( closest<k-1 ){
5955             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5956           }
5957           put4byte(&aData[4], k-1);
5958           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
5959           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
5960           if( rc==SQLITE_OK ){
5961             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5962             if( rc!=SQLITE_OK ){
5963               releasePage(*ppPage);
5964               *ppPage = 0;
5965             }
5966           }
5967           searchList = 0;
5968         }
5969       }
5970       releasePage(pPrevTrunk);
5971       pPrevTrunk = 0;
5972     }while( searchList );
5973   }else{
5974     /* There are no pages on the freelist, so append a new page to the
5975     ** database image.
5976     **
5977     ** Normally, new pages allocated by this block can be requested from the
5978     ** pager layer with the 'no-content' flag set. This prevents the pager
5979     ** from trying to read the pages content from disk. However, if the
5980     ** current transaction has already run one or more incremental-vacuum
5981     ** steps, then the page we are about to allocate may contain content
5982     ** that is required in the event of a rollback. In this case, do
5983     ** not set the no-content flag. This causes the pager to load and journal
5984     ** the current page content before overwriting it.
5985     **
5986     ** Note that the pager will not actually attempt to load or journal
5987     ** content for any page that really does lie past the end of the database
5988     ** file on disk. So the effects of disabling the no-content optimization
5989     ** here are confined to those pages that lie between the end of the
5990     ** database image and the end of the database file.
5991     */
5992     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
5993
5994     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5995     if( rc ) return rc;
5996     pBt->nPage++;
5997     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5998
5999 #ifndef SQLITE_OMIT_AUTOVACUUM
6000     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6001       /* If *pPgno refers to a pointer-map page, allocate two new pages
6002       ** at the end of the file instead of one. The first allocated page
6003       ** becomes a new pointer-map page, the second is used by the caller.
6004       */
6005       MemPage *pPg = 0;
6006       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
6007       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6008       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6009       if( rc==SQLITE_OK ){
6010         rc = sqlite3PagerWrite(pPg->pDbPage);
6011         releasePage(pPg);
6012       }
6013       if( rc ) return rc;
6014       pBt->nPage++;
6015       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6016     }
6017 #endif
6018     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6019     *pPgno = pBt->nPage;
6020
6021     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6022     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6023     if( rc ) return rc;
6024     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6025     if( rc!=SQLITE_OK ){
6026       releasePage(*ppPage);
6027       *ppPage = 0;
6028     }
6029     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
6030   }
6031
6032   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6033
6034 end_allocate_page:
6035   releasePage(pTrunk);
6036   releasePage(pPrevTrunk);
6037   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6038   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6039   return rc;
6040 }
6041
6042 /*
6043 ** This function is used to add page iPage to the database file free-list.
6044 ** It is assumed that the page is not already a part of the free-list.
6045 **
6046 ** The value passed as the second argument to this function is optional.
6047 ** If the caller happens to have a pointer to the MemPage object
6048 ** corresponding to page iPage handy, it may pass it as the second value.
6049 ** Otherwise, it may pass NULL.
6050 **
6051 ** If a pointer to a MemPage object is passed as the second argument,
6052 ** its reference count is not altered by this function.
6053 */
6054 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6055   MemPage *pTrunk = 0;                /* Free-list trunk page */
6056   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
6057   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
6058   MemPage *pPage;                     /* Page being freed. May be NULL. */
6059   int rc;                             /* Return Code */
6060   int nFree;                          /* Initial number of pages on free-list */
6061
6062   assert( sqlite3_mutex_held(pBt->mutex) );
6063   assert( CORRUPT_DB || iPage>1 );
6064   assert( !pMemPage || pMemPage->pgno==iPage );
6065
6066   if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
6067   if( pMemPage ){
6068     pPage = pMemPage;
6069     sqlite3PagerRef(pPage->pDbPage);
6070   }else{
6071     pPage = btreePageLookup(pBt, iPage);
6072   }
6073
6074   /* Increment the free page count on pPage1 */
6075   rc = sqlite3PagerWrite(pPage1->pDbPage);
6076   if( rc ) goto freepage_out;
6077   nFree = get4byte(&pPage1->aData[36]);
6078   put4byte(&pPage1->aData[36], nFree+1);
6079
6080   if( pBt->btsFlags & BTS_SECURE_DELETE ){
6081     /* If the secure_delete option is enabled, then
6082     ** always fully overwrite deleted information with zeros.
6083     */
6084     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6085      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6086     ){
6087       goto freepage_out;
6088     }
6089     memset(pPage->aData, 0, pPage->pBt->pageSize);
6090   }
6091
6092   /* If the database supports auto-vacuum, write an entry in the pointer-map
6093   ** to indicate that the page is free.
6094   */
6095   if( ISAUTOVACUUM ){
6096     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6097     if( rc ) goto freepage_out;
6098   }
6099
6100   /* Now manipulate the actual database free-list structure. There are two
6101   ** possibilities. If the free-list is currently empty, or if the first
6102   ** trunk page in the free-list is full, then this page will become a
6103   ** new free-list trunk page. Otherwise, it will become a leaf of the
6104   ** first trunk page in the current free-list. This block tests if it
6105   ** is possible to add the page as a new free-list leaf.
6106   */
6107   if( nFree!=0 ){
6108     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
6109
6110     iTrunk = get4byte(&pPage1->aData[32]);
6111     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6112     if( rc!=SQLITE_OK ){
6113       goto freepage_out;
6114     }
6115
6116     nLeaf = get4byte(&pTrunk->aData[4]);
6117     assert( pBt->usableSize>32 );
6118     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6119       rc = SQLITE_CORRUPT_BKPT;
6120       goto freepage_out;
6121     }
6122     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6123       /* In this case there is room on the trunk page to insert the page
6124       ** being freed as a new leaf.
6125       **
6126       ** Note that the trunk page is not really full until it contains
6127       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6128       ** coded.  But due to a coding error in versions of SQLite prior to
6129       ** 3.6.0, databases with freelist trunk pages holding more than
6130       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
6131       ** to maintain backwards compatibility with older versions of SQLite,
6132       ** we will continue to restrict the number of entries to usableSize/4 - 8
6133       ** for now.  At some point in the future (once everyone has upgraded
6134       ** to 3.6.0 or later) we should consider fixing the conditional above
6135       ** to read "usableSize/4-2" instead of "usableSize/4-8".
6136       **
6137       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6138       ** avoid using the last six entries in the freelist trunk page array in
6139       ** order that database files created by newer versions of SQLite can be
6140       ** read by older versions of SQLite.
6141       */
6142       rc = sqlite3PagerWrite(pTrunk->pDbPage);
6143       if( rc==SQLITE_OK ){
6144         put4byte(&pTrunk->aData[4], nLeaf+1);
6145         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6146         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6147           sqlite3PagerDontWrite(pPage->pDbPage);
6148         }
6149         rc = btreeSetHasContent(pBt, iPage);
6150       }
6151       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
6152       goto freepage_out;
6153     }
6154   }
6155
6156   /* If control flows to this point, then it was not possible to add the
6157   ** the page being freed as a leaf page of the first trunk in the free-list.
6158   ** Possibly because the free-list is empty, or possibly because the
6159   ** first trunk in the free-list is full. Either way, the page being freed
6160   ** will become the new first trunk page in the free-list.
6161   */
6162   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6163     goto freepage_out;
6164   }
6165   rc = sqlite3PagerWrite(pPage->pDbPage);
6166   if( rc!=SQLITE_OK ){
6167     goto freepage_out;
6168   }
6169   put4byte(pPage->aData, iTrunk);
6170   put4byte(&pPage->aData[4], 0);
6171   put4byte(&pPage1->aData[32], iPage);
6172   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
6173
6174 freepage_out:
6175   if( pPage ){
6176     pPage->isInit = 0;
6177   }
6178   releasePage(pPage);
6179   releasePage(pTrunk);
6180   return rc;
6181 }
6182 static void freePage(MemPage *pPage, int *pRC){
6183   if( (*pRC)==SQLITE_OK ){
6184     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6185   }
6186 }
6187
6188 /*
6189 ** Free any overflow pages associated with the given Cell.  Write the
6190 ** local Cell size (the number of bytes on the original page, omitting
6191 ** overflow) into *pnSize.
6192 */
6193 static int clearCell(
6194   MemPage *pPage,          /* The page that contains the Cell */
6195   unsigned char *pCell,    /* First byte of the Cell */
6196   CellInfo *pInfo          /* Size information about the cell */
6197 ){
6198   BtShared *pBt;
6199   Pgno ovflPgno;
6200   int rc;
6201   int nOvfl;
6202   u32 ovflPageSize;
6203
6204   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6205   pPage->xParseCell(pPage, pCell, pInfo);
6206   if( pInfo->nLocal==pInfo->nPayload ){
6207     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6208   }
6209   if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
6210     /* Cell extends past end of page */
6211     return SQLITE_CORRUPT_PAGE(pPage);
6212   }
6213   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6214   pBt = pPage->pBt;
6215   assert( pBt->usableSize > 4 );
6216   ovflPageSize = pBt->usableSize - 4;
6217   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6218   assert( nOvfl>0 ||
6219     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6220   );
6221   while( nOvfl-- ){
6222     Pgno iNext = 0;
6223     MemPage *pOvfl = 0;
6224     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6225       /* 0 is not a legal page number and page 1 cannot be an
6226       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6227       ** file the database must be corrupt. */
6228       return SQLITE_CORRUPT_BKPT;
6229     }
6230     if( nOvfl ){
6231       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6232       if( rc ) return rc;
6233     }
6234
6235     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6236      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6237     ){
6238       /* There is no reason any cursor should have an outstanding reference
6239       ** to an overflow page belonging to a cell that is being deleted/updated.
6240       ** So if there exists more than one reference to this page, then it
6241       ** must not really be an overflow page and the database must be corrupt.
6242       ** It is helpful to detect this before calling freePage2(), as
6243       ** freePage2() may zero the page contents if secure-delete mode is
6244       ** enabled. If this 'overflow' page happens to be a page that the
6245       ** caller is iterating through or using in some other way, this
6246       ** can be problematic.
6247       */
6248       rc = SQLITE_CORRUPT_BKPT;
6249     }else{
6250       rc = freePage2(pBt, pOvfl, ovflPgno);
6251     }
6252
6253     if( pOvfl ){
6254       sqlite3PagerUnref(pOvfl->pDbPage);
6255     }
6256     if( rc ) return rc;
6257     ovflPgno = iNext;
6258   }
6259   return SQLITE_OK;
6260 }
6261
6262 /*
6263 ** Create the byte sequence used to represent a cell on page pPage
6264 ** and write that byte sequence into pCell[].  Overflow pages are
6265 ** allocated and filled in as necessary.  The calling procedure
6266 ** is responsible for making sure sufficient space has been allocated
6267 ** for pCell[].
6268 **
6269 ** Note that pCell does not necessary need to point to the pPage->aData
6270 ** area.  pCell might point to some temporary storage.  The cell will
6271 ** be constructed in this temporary area then copied into pPage->aData
6272 ** later.
6273 */
6274 static int fillInCell(
6275   MemPage *pPage,                /* The page that contains the cell */
6276   unsigned char *pCell,          /* Complete text of the cell */
6277   const BtreePayload *pX,        /* Payload with which to construct the cell */
6278   int *pnSize                    /* Write cell size here */
6279 ){
6280   int nPayload;
6281   const u8 *pSrc;
6282   int nSrc, n, rc, mn;
6283   int spaceLeft;
6284   MemPage *pToRelease;
6285   unsigned char *pPrior;
6286   unsigned char *pPayload;
6287   BtShared *pBt;
6288   Pgno pgnoOvfl;
6289   int nHeader;
6290
6291   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6292
6293   /* pPage is not necessarily writeable since pCell might be auxiliary
6294   ** buffer space that is separate from the pPage buffer area */
6295   assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
6296             || sqlite3PagerIswriteable(pPage->pDbPage) );
6297
6298   /* Fill in the header. */
6299   nHeader = pPage->childPtrSize;
6300   if( pPage->intKey ){
6301     nPayload = pX->nData + pX->nZero;
6302     pSrc = pX->pData;
6303     nSrc = pX->nData;
6304     assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
6305     nHeader += putVarint32(&pCell[nHeader], nPayload);
6306     nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
6307   }else{
6308     assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
6309     nSrc = nPayload = (int)pX->nKey;
6310     pSrc = pX->pKey;
6311     nHeader += putVarint32(&pCell[nHeader], nPayload);
6312   }
6313
6314   /* Fill in the payload */
6315   pPayload = &pCell[nHeader];
6316   if( nPayload<=pPage->maxLocal ){
6317     /* This is the common case where everything fits on the btree page
6318     ** and no overflow pages are required. */
6319     n = nHeader + nPayload;
6320     testcase( n==3 );
6321     testcase( n==4 );
6322     if( n<4 ) n = 4;
6323     *pnSize = n;
6324     assert( nSrc<=nPayload );
6325     testcase( nSrc<nPayload );
6326     memcpy(pPayload, pSrc, nSrc);
6327     memset(pPayload+nSrc, 0, nPayload-nSrc);
6328     return SQLITE_OK;
6329   }
6330
6331   /* If we reach this point, it means that some of the content will need
6332   ** to spill onto overflow pages.
6333   */
6334   mn = pPage->minLocal;
6335   n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6336   testcase( n==pPage->maxLocal );
6337   testcase( n==pPage->maxLocal+1 );
6338   if( n > pPage->maxLocal ) n = mn;
6339   spaceLeft = n;
6340   *pnSize = n + nHeader + 4;
6341   pPrior = &pCell[nHeader+n];
6342   pToRelease = 0;
6343   pgnoOvfl = 0;
6344   pBt = pPage->pBt;
6345
6346   /* At this point variables should be set as follows:
6347   **
6348   **   nPayload           Total payload size in bytes
6349   **   pPayload           Begin writing payload here
6350   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6351   **                      that means content must spill into overflow pages.
6352   **   *pnSize            Size of the local cell (not counting overflow pages)
6353   **   pPrior             Where to write the pgno of the first overflow page
6354   **
6355   ** Use a call to btreeParseCellPtr() to verify that the values above
6356   ** were computed correctly.
6357   */
6358 #ifdef SQLITE_DEBUG
6359   {
6360     CellInfo info;
6361     pPage->xParseCell(pPage, pCell, &info);
6362     assert( nHeader==(int)(info.pPayload - pCell) );
6363     assert( info.nKey==pX->nKey );
6364     assert( *pnSize == info.nSize );
6365     assert( spaceLeft == info.nLocal );
6366   }
6367 #endif
6368
6369   /* Write the payload into the local Cell and any extra into overflow pages */
6370   while( 1 ){
6371     n = nPayload;
6372     if( n>spaceLeft ) n = spaceLeft;
6373
6374     /* If pToRelease is not zero than pPayload points into the data area
6375     ** of pToRelease.  Make sure pToRelease is still writeable. */
6376     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6377
6378     /* If pPayload is part of the data area of pPage, then make sure pPage
6379     ** is still writeable */
6380     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6381             || sqlite3PagerIswriteable(pPage->pDbPage) );
6382
6383     if( nSrc>=n ){
6384       memcpy(pPayload, pSrc, n);
6385     }else if( nSrc>0 ){
6386       n = nSrc;
6387       memcpy(pPayload, pSrc, n);
6388     }else{
6389       memset(pPayload, 0, n);
6390     }
6391     nPayload -= n;
6392     if( nPayload<=0 ) break;
6393     pPayload += n;
6394     pSrc += n;
6395     nSrc -= n;
6396     spaceLeft -= n;
6397     if( spaceLeft==0 ){
6398       MemPage *pOvfl = 0;
6399 #ifndef SQLITE_OMIT_AUTOVACUUM
6400       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6401       if( pBt->autoVacuum ){
6402         do{
6403           pgnoOvfl++;
6404         } while(
6405           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6406         );
6407       }
6408 #endif
6409       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6410 #ifndef SQLITE_OMIT_AUTOVACUUM
6411       /* If the database supports auto-vacuum, and the second or subsequent
6412       ** overflow page is being allocated, add an entry to the pointer-map
6413       ** for that page now.
6414       **
6415       ** If this is the first overflow page, then write a partial entry
6416       ** to the pointer-map. If we write nothing to this pointer-map slot,
6417       ** then the optimistic overflow chain processing in clearCell()
6418       ** may misinterpret the uninitialized values and delete the
6419       ** wrong pages from the database.
6420       */
6421       if( pBt->autoVacuum && rc==SQLITE_OK ){
6422         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6423         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6424         if( rc ){
6425           releasePage(pOvfl);
6426         }
6427       }
6428 #endif
6429       if( rc ){
6430         releasePage(pToRelease);
6431         return rc;
6432       }
6433
6434       /* If pToRelease is not zero than pPrior points into the data area
6435       ** of pToRelease.  Make sure pToRelease is still writeable. */
6436       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6437
6438       /* If pPrior is part of the data area of pPage, then make sure pPage
6439       ** is still writeable */
6440       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6441             || sqlite3PagerIswriteable(pPage->pDbPage) );
6442
6443       put4byte(pPrior, pgnoOvfl);
6444       releasePage(pToRelease);
6445       pToRelease = pOvfl;
6446       pPrior = pOvfl->aData;
6447       put4byte(pPrior, 0);
6448       pPayload = &pOvfl->aData[4];
6449       spaceLeft = pBt->usableSize - 4;
6450     }
6451   }
6452   releasePage(pToRelease);
6453   return SQLITE_OK;
6454 }
6455
6456 /*
6457 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6458 ** The cell content is not freed or deallocated.  It is assumed that
6459 ** the cell content has been copied someplace else.  This routine just
6460 ** removes the reference to the cell from pPage.
6461 **
6462 ** "sz" must be the number of bytes in the cell.
6463 */
6464 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6465   u32 pc;         /* Offset to cell content of cell being deleted */
6466   u8 *data;       /* pPage->aData */
6467   u8 *ptr;        /* Used to move bytes around within data[] */
6468   int rc;         /* The return code */
6469   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6470
6471   if( *pRC ) return;
6472   assert( idx>=0 && idx<pPage->nCell );
6473   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6474   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6475   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6476   data = pPage->aData;
6477   ptr = &pPage->aCellIdx[2*idx];
6478   pc = get2byte(ptr);
6479   hdr = pPage->hdrOffset;
6480   testcase( pc==get2byte(&data[hdr+5]) );
6481   testcase( pc+sz==pPage->pBt->usableSize );
6482   if( pc+sz > pPage->pBt->usableSize ){
6483     *pRC = SQLITE_CORRUPT_BKPT;
6484     return;
6485   }
6486   rc = freeSpace(pPage, pc, sz);
6487   if( rc ){
6488     *pRC = rc;
6489     return;
6490   }
6491   pPage->nCell--;
6492   if( pPage->nCell==0 ){
6493     memset(&data[hdr+1], 0, 4);
6494     data[hdr+7] = 0;
6495     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6496     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6497                        - pPage->childPtrSize - 8;
6498   }else{
6499     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6500     put2byte(&data[hdr+3], pPage->nCell);
6501     pPage->nFree += 2;
6502   }
6503 }
6504
6505 /*
6506 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6507 ** content of the cell.
6508 **
6509 ** If the cell content will fit on the page, then put it there.  If it
6510 ** will not fit, then make a copy of the cell content into pTemp if
6511 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6512 ** in pPage->apOvfl[] and make it point to the cell content (either
6513 ** in pTemp or the original pCell) and also record its index.
6514 ** Allocating a new entry in pPage->aCell[] implies that
6515 ** pPage->nOverflow is incremented.
6516 **
6517 ** *pRC must be SQLITE_OK when this routine is called.
6518 */
6519 static void insertCell(
6520   MemPage *pPage,   /* Page into which we are copying */
6521   int i,            /* New cell becomes the i-th cell of the page */
6522   u8 *pCell,        /* Content of the new cell */
6523   int sz,           /* Bytes of content in pCell */
6524   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6525   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6526   int *pRC          /* Read and write return code from here */
6527 ){
6528   int idx = 0;      /* Where to write new cell content in data[] */
6529   int j;            /* Loop counter */
6530   u8 *data;         /* The content of the whole page */
6531   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6532
6533   assert( *pRC==SQLITE_OK );
6534   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6535   assert( MX_CELL(pPage->pBt)<=10921 );
6536   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6537   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6538   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6539   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6540   /* The cell should normally be sized correctly.  However, when moving a
6541   ** malformed cell from a leaf page to an interior page, if the cell size
6542   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
6543   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
6544   ** the term after the || in the following assert(). */
6545   assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
6546   if( pPage->nOverflow || sz+2>pPage->nFree ){
6547     if( pTemp ){
6548       memcpy(pTemp, pCell, sz);
6549       pCell = pTemp;
6550     }
6551     if( iChild ){
6552       put4byte(pCell, iChild);
6553     }
6554     j = pPage->nOverflow++;
6555     /* Comparison against ArraySize-1 since we hold back one extra slot
6556     ** as a contingency.  In other words, never need more than 3 overflow
6557     ** slots but 4 are allocated, just to be safe. */
6558     assert( j < ArraySize(pPage->apOvfl)-1 );
6559     pPage->apOvfl[j] = pCell;
6560     pPage->aiOvfl[j] = (u16)i;
6561
6562     /* When multiple overflows occur, they are always sequential and in
6563     ** sorted order.  This invariants arise because multiple overflows can
6564     ** only occur when inserting divider cells into the parent page during
6565     ** balancing, and the dividers are adjacent and sorted.
6566     */
6567     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6568     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6569   }else{
6570     int rc = sqlite3PagerWrite(pPage->pDbPage);
6571     if( rc!=SQLITE_OK ){
6572       *pRC = rc;
6573       return;
6574     }
6575     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6576     data = pPage->aData;
6577     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6578     rc = allocateSpace(pPage, sz, &idx);
6579     if( rc ){ *pRC = rc; return; }
6580     /* The allocateSpace() routine guarantees the following properties
6581     ** if it returns successfully */
6582     assert( idx >= 0 );
6583     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6584     assert( idx+sz <= (int)pPage->pBt->usableSize );
6585     pPage->nFree -= (u16)(2 + sz);
6586     memcpy(&data[idx], pCell, sz);
6587     if( iChild ){
6588       put4byte(&data[idx], iChild);
6589     }
6590     pIns = pPage->aCellIdx + i*2;
6591     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6592     put2byte(pIns, idx);
6593     pPage->nCell++;
6594     /* increment the cell count */
6595     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6596     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
6597 #ifndef SQLITE_OMIT_AUTOVACUUM
6598     if( pPage->pBt->autoVacuum ){
6599       /* The cell may contain a pointer to an overflow page. If so, write
6600       ** the entry for the overflow page into the pointer map.
6601       */
6602       ptrmapPutOvflPtr(pPage, pCell, pRC);
6603     }
6604 #endif
6605   }
6606 }
6607
6608 /*
6609 ** A CellArray object contains a cache of pointers and sizes for a
6610 ** consecutive sequence of cells that might be held on multiple pages.
6611 */
6612 typedef struct CellArray CellArray;
6613 struct CellArray {
6614   int nCell;              /* Number of cells in apCell[] */
6615   MemPage *pRef;          /* Reference page */
6616   u8 **apCell;            /* All cells begin balanced */
6617   u16 *szCell;            /* Local size of all cells in apCell[] */
6618 };
6619
6620 /*
6621 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6622 ** computed.
6623 */
6624 static void populateCellCache(CellArray *p, int idx, int N){
6625   assert( idx>=0 && idx+N<=p->nCell );
6626   while( N>0 ){
6627     assert( p->apCell[idx]!=0 );
6628     if( p->szCell[idx]==0 ){
6629       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6630     }else{
6631       assert( CORRUPT_DB ||
6632               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6633     }
6634     idx++;
6635     N--;
6636   }
6637 }
6638
6639 /*
6640 ** Return the size of the Nth element of the cell array
6641 */
6642 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6643   assert( N>=0 && N<p->nCell );
6644   assert( p->szCell[N]==0 );
6645   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6646   return p->szCell[N];
6647 }
6648 static u16 cachedCellSize(CellArray *p, int N){
6649   assert( N>=0 && N<p->nCell );
6650   if( p->szCell[N] ) return p->szCell[N];
6651   return computeCellSize(p, N);
6652 }
6653
6654 /*
6655 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6656 ** szCell[] array contains the size in bytes of each cell. This function
6657 ** replaces the current contents of page pPg with the contents of the cell
6658 ** array.
6659 **
6660 ** Some of the cells in apCell[] may currently be stored in pPg. This
6661 ** function works around problems caused by this by making a copy of any
6662 ** such cells before overwriting the page data.
6663 **
6664 ** The MemPage.nFree field is invalidated by this function. It is the
6665 ** responsibility of the caller to set it correctly.
6666 */
6667 static int rebuildPage(
6668   MemPage *pPg,                   /* Edit this page */
6669   int nCell,                      /* Final number of cells on page */
6670   u8 **apCell,                    /* Array of cells */
6671   u16 *szCell                     /* Array of cell sizes */
6672 ){
6673   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6674   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6675   const int usableSize = pPg->pBt->usableSize;
6676   u8 * const pEnd = &aData[usableSize];
6677   int i;
6678   u8 *pCellptr = pPg->aCellIdx;
6679   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6680   u8 *pData;
6681
6682   i = get2byte(&aData[hdr+5]);
6683   memcpy(&pTmp[i], &aData[i], usableSize - i);
6684
6685   pData = pEnd;
6686   for(i=0; i<nCell; i++){
6687     u8 *pCell = apCell[i];
6688     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6689       pCell = &pTmp[pCell - aData];
6690     }
6691     pData -= szCell[i];
6692     put2byte(pCellptr, (pData - aData));
6693     pCellptr += 2;
6694     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6695     memcpy(pData, pCell, szCell[i]);
6696     assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6697     testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
6698   }
6699
6700   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6701   pPg->nCell = nCell;
6702   pPg->nOverflow = 0;
6703
6704   put2byte(&aData[hdr+1], 0);
6705   put2byte(&aData[hdr+3], pPg->nCell);
6706   put2byte(&aData[hdr+5], pData - aData);
6707   aData[hdr+7] = 0x00;
6708   return SQLITE_OK;
6709 }
6710
6711 /*
6712 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6713 ** contains the size in bytes of each such cell. This function attempts to
6714 ** add the cells stored in the array to page pPg. If it cannot (because
6715 ** the page needs to be defragmented before the cells will fit), non-zero
6716 ** is returned. Otherwise, if the cells are added successfully, zero is
6717 ** returned.
6718 **
6719 ** Argument pCellptr points to the first entry in the cell-pointer array
6720 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6721 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6722 ** cell in the array. It is the responsibility of the caller to ensure
6723 ** that it is safe to overwrite this part of the cell-pointer array.
6724 **
6725 ** When this function is called, *ppData points to the start of the
6726 ** content area on page pPg. If the size of the content area is extended,
6727 ** *ppData is updated to point to the new start of the content area
6728 ** before returning.
6729 **
6730 ** Finally, argument pBegin points to the byte immediately following the
6731 ** end of the space required by this page for the cell-pointer area (for
6732 ** all cells - not just those inserted by the current call). If the content
6733 ** area must be extended to before this point in order to accomodate all
6734 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6735 */
6736 static int pageInsertArray(
6737   MemPage *pPg,                   /* Page to add cells to */
6738   u8 *pBegin,                     /* End of cell-pointer array */
6739   u8 **ppData,                    /* IN/OUT: Page content -area pointer */
6740   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6741   int iFirst,                     /* Index of first cell to add */
6742   int nCell,                      /* Number of cells to add to pPg */
6743   CellArray *pCArray              /* Array of cells */
6744 ){
6745   int i;
6746   u8 *aData = pPg->aData;
6747   u8 *pData = *ppData;
6748   int iEnd = iFirst + nCell;
6749   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6750   for(i=iFirst; i<iEnd; i++){
6751     int sz, rc;
6752     u8 *pSlot;
6753     sz = cachedCellSize(pCArray, i);
6754     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6755       if( (pData - pBegin)<sz ) return 1;
6756       pData -= sz;
6757       pSlot = pData;
6758     }
6759     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
6760     ** database.  But they might for a corrupt database.  Hence use memmove()
6761     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
6762     assert( (pSlot+sz)<=pCArray->apCell[i]
6763          || pSlot>=(pCArray->apCell[i]+sz)
6764          || CORRUPT_DB );
6765     memmove(pSlot, pCArray->apCell[i], sz);
6766     put2byte(pCellptr, (pSlot - aData));
6767     pCellptr += 2;
6768   }
6769   *ppData = pData;
6770   return 0;
6771 }
6772
6773 /*
6774 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6775 ** contains the size in bytes of each such cell. This function adds the
6776 ** space associated with each cell in the array that is currently stored
6777 ** within the body of pPg to the pPg free-list. The cell-pointers and other
6778 ** fields of the page are not updated.
6779 **
6780 ** This function returns the total number of cells added to the free-list.
6781 */
6782 static int pageFreeArray(
6783   MemPage *pPg,                   /* Page to edit */
6784   int iFirst,                     /* First cell to delete */
6785   int nCell,                      /* Cells to delete */
6786   CellArray *pCArray              /* Array of cells */
6787 ){
6788   u8 * const aData = pPg->aData;
6789   u8 * const pEnd = &aData[pPg->pBt->usableSize];
6790   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
6791   int nRet = 0;
6792   int i;
6793   int iEnd = iFirst + nCell;
6794   u8 *pFree = 0;
6795   int szFree = 0;
6796
6797   for(i=iFirst; i<iEnd; i++){
6798     u8 *pCell = pCArray->apCell[i];
6799     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
6800       int sz;
6801       /* No need to use cachedCellSize() here.  The sizes of all cells that
6802       ** are to be freed have already been computing while deciding which
6803       ** cells need freeing */
6804       sz = pCArray->szCell[i];  assert( sz>0 );
6805       if( pFree!=(pCell + sz) ){
6806         if( pFree ){
6807           assert( pFree>aData && (pFree - aData)<65536 );
6808           freeSpace(pPg, (u16)(pFree - aData), szFree);
6809         }
6810         pFree = pCell;
6811         szFree = sz;
6812         if( pFree+sz>pEnd ) return 0;
6813       }else{
6814         pFree = pCell;
6815         szFree += sz;
6816       }
6817       nRet++;
6818     }
6819   }
6820   if( pFree ){
6821     assert( pFree>aData && (pFree - aData)<65536 );
6822     freeSpace(pPg, (u16)(pFree - aData), szFree);
6823   }
6824   return nRet;
6825 }
6826
6827 /*
6828 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
6829 ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
6830 ** with apCell[iOld].  After balancing, this page should hold nNew cells
6831 ** starting at apCell[iNew].
6832 **
6833 ** This routine makes the necessary adjustments to pPg so that it contains
6834 ** the correct cells after being balanced.
6835 **
6836 ** The pPg->nFree field is invalid when this function returns. It is the
6837 ** responsibility of the caller to set it correctly.
6838 */
6839 static int editPage(
6840   MemPage *pPg,                   /* Edit this page */
6841   int iOld,                       /* Index of first cell currently on page */
6842   int iNew,                       /* Index of new first cell on page */
6843   int nNew,                       /* Final number of cells on page */
6844   CellArray *pCArray              /* Array of cells and sizes */
6845 ){
6846   u8 * const aData = pPg->aData;
6847   const int hdr = pPg->hdrOffset;
6848   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
6849   int nCell = pPg->nCell;       /* Cells stored on pPg */
6850   u8 *pData;
6851   u8 *pCellptr;
6852   int i;
6853   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
6854   int iNewEnd = iNew + nNew;
6855
6856 #ifdef SQLITE_DEBUG
6857   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6858   memcpy(pTmp, aData, pPg->pBt->usableSize);
6859 #endif
6860
6861   /* Remove cells from the start and end of the page */
6862   if( iOld<iNew ){
6863     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
6864     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
6865     nCell -= nShift;
6866   }
6867   if( iNewEnd < iOldEnd ){
6868     nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
6869   }
6870
6871   pData = &aData[get2byteNotZero(&aData[hdr+5])];
6872   if( pData<pBegin ) goto editpage_fail;
6873
6874   /* Add cells to the start of the page */
6875   if( iNew<iOld ){
6876     int nAdd = MIN(nNew,iOld-iNew);
6877     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
6878     pCellptr = pPg->aCellIdx;
6879     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
6880     if( pageInsertArray(
6881           pPg, pBegin, &pData, pCellptr,
6882           iNew, nAdd, pCArray
6883     ) ) goto editpage_fail;
6884     nCell += nAdd;
6885   }
6886
6887   /* Add any overflow cells */
6888   for(i=0; i<pPg->nOverflow; i++){
6889     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
6890     if( iCell>=0 && iCell<nNew ){
6891       pCellptr = &pPg->aCellIdx[iCell * 2];
6892       memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
6893       nCell++;
6894       if( pageInsertArray(
6895             pPg, pBegin, &pData, pCellptr,
6896             iCell+iNew, 1, pCArray
6897       ) ) goto editpage_fail;
6898     }
6899   }
6900
6901   /* Append cells to the end of the page */
6902   pCellptr = &pPg->aCellIdx[nCell*2];
6903   if( pageInsertArray(
6904         pPg, pBegin, &pData, pCellptr,
6905         iNew+nCell, nNew-nCell, pCArray
6906   ) ) goto editpage_fail;
6907
6908   pPg->nCell = nNew;
6909   pPg->nOverflow = 0;
6910
6911   put2byte(&aData[hdr+3], pPg->nCell);
6912   put2byte(&aData[hdr+5], pData - aData);
6913
6914 #ifdef SQLITE_DEBUG
6915   for(i=0; i<nNew && !CORRUPT_DB; i++){
6916     u8 *pCell = pCArray->apCell[i+iNew];
6917     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
6918     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
6919       pCell = &pTmp[pCell - aData];
6920     }
6921     assert( 0==memcmp(pCell, &aData[iOff],
6922             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
6923   }
6924 #endif
6925
6926   return SQLITE_OK;
6927  editpage_fail:
6928   /* Unable to edit this page. Rebuild it from scratch instead. */
6929   populateCellCache(pCArray, iNew, nNew);
6930   return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
6931 }
6932
6933 /*
6934 ** The following parameters determine how many adjacent pages get involved
6935 ** in a balancing operation.  NN is the number of neighbors on either side
6936 ** of the page that participate in the balancing operation.  NB is the
6937 ** total number of pages that participate, including the target page and
6938 ** NN neighbors on either side.
6939 **
6940 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
6941 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
6942 ** in exchange for a larger degradation in INSERT and UPDATE performance.
6943 ** The value of NN appears to give the best results overall.
6944 */
6945 #define NN 1             /* Number of neighbors on either side of pPage */
6946 #define NB (NN*2+1)      /* Total pages involved in the balance */
6947
6948
6949 #ifndef SQLITE_OMIT_QUICKBALANCE
6950 /*
6951 ** This version of balance() handles the common special case where
6952 ** a new entry is being inserted on the extreme right-end of the
6953 ** tree, in other words, when the new entry will become the largest
6954 ** entry in the tree.
6955 **
6956 ** Instead of trying to balance the 3 right-most leaf pages, just add
6957 ** a new page to the right-hand side and put the one new entry in
6958 ** that page.  This leaves the right side of the tree somewhat
6959 ** unbalanced.  But odds are that we will be inserting new entries
6960 ** at the end soon afterwards so the nearly empty page will quickly
6961 ** fill up.  On average.
6962 **
6963 ** pPage is the leaf page which is the right-most page in the tree.
6964 ** pParent is its parent.  pPage must have a single overflow entry
6965 ** which is also the right-most entry on the page.
6966 **
6967 ** The pSpace buffer is used to store a temporary copy of the divider
6968 ** cell that will be inserted into pParent. Such a cell consists of a 4
6969 ** byte page number followed by a variable length integer. In other
6970 ** words, at most 13 bytes. Hence the pSpace buffer must be at
6971 ** least 13 bytes in size.
6972 */
6973 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
6974   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
6975   MemPage *pNew;                       /* Newly allocated page */
6976   int rc;                              /* Return Code */
6977   Pgno pgnoNew;                        /* Page number of pNew */
6978
6979   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6980   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6981   assert( pPage->nOverflow==1 );
6982
6983   /* This error condition is now caught prior to reaching this function */
6984   if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
6985
6986   /* Allocate a new page. This page will become the right-sibling of
6987   ** pPage. Make the parent page writable, so that the new divider cell
6988   ** may be inserted. If both these operations are successful, proceed.
6989   */
6990   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
6991
6992   if( rc==SQLITE_OK ){
6993
6994     u8 *pOut = &pSpace[4];
6995     u8 *pCell = pPage->apOvfl[0];
6996     u16 szCell = pPage->xCellSize(pPage, pCell);
6997     u8 *pStop;
6998
6999     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7000     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
7001     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
7002     rc = rebuildPage(pNew, 1, &pCell, &szCell);
7003     if( NEVER(rc) ) return rc;
7004     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
7005
7006     /* If this is an auto-vacuum database, update the pointer map
7007     ** with entries for the new page, and any pointer from the
7008     ** cell on the page to an overflow page. If either of these
7009     ** operations fails, the return code is set, but the contents
7010     ** of the parent page are still manipulated by thh code below.
7011     ** That is Ok, at this point the parent page is guaranteed to
7012     ** be marked as dirty. Returning an error code will cause a
7013     ** rollback, undoing any changes made to the parent page.
7014     */
7015     if( ISAUTOVACUUM ){
7016       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
7017       if( szCell>pNew->minLocal ){
7018         ptrmapPutOvflPtr(pNew, pCell, &rc);
7019       }
7020     }
7021
7022     /* Create a divider cell to insert into pParent. The divider cell
7023     ** consists of a 4-byte page number (the page number of pPage) and
7024     ** a variable length key value (which must be the same value as the
7025     ** largest key on pPage).
7026     **
7027     ** To find the largest key value on pPage, first find the right-most
7028     ** cell on pPage. The first two fields of this cell are the
7029     ** record-length (a variable length integer at most 32-bits in size)
7030     ** and the key value (a variable length integer, may have any value).
7031     ** The first of the while(...) loops below skips over the record-length
7032     ** field. The second while(...) loop copies the key value from the
7033     ** cell on pPage into the pSpace buffer.
7034     */
7035     pCell = findCell(pPage, pPage->nCell-1);
7036     pStop = &pCell[9];
7037     while( (*(pCell++)&0x80) && pCell<pStop );
7038     pStop = &pCell[9];
7039     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
7040
7041     /* Insert the new divider cell into pParent. */
7042     if( rc==SQLITE_OK ){
7043       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
7044                    0, pPage->pgno, &rc);
7045     }
7046
7047     /* Set the right-child pointer of pParent to point to the new page. */
7048     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
7049
7050     /* Release the reference to the new page. */
7051     releasePage(pNew);
7052   }
7053
7054   return rc;
7055 }
7056 #endif /* SQLITE_OMIT_QUICKBALANCE */
7057
7058 #if 0
7059 /*
7060 ** This function does not contribute anything to the operation of SQLite.
7061 ** it is sometimes activated temporarily while debugging code responsible
7062 ** for setting pointer-map entries.
7063 */
7064 static int ptrmapCheckPages(MemPage **apPage, int nPage){
7065   int i, j;
7066   for(i=0; i<nPage; i++){
7067     Pgno n;
7068     u8 e;
7069     MemPage *pPage = apPage[i];
7070     BtShared *pBt = pPage->pBt;
7071     assert( pPage->isInit );
7072
7073     for(j=0; j<pPage->nCell; j++){
7074       CellInfo info;
7075       u8 *z;
7076
7077       z = findCell(pPage, j);
7078       pPage->xParseCell(pPage, z, &info);
7079       if( info.nLocal<info.nPayload ){
7080         Pgno ovfl = get4byte(&z[info.nSize-4]);
7081         ptrmapGet(pBt, ovfl, &e, &n);
7082         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
7083       }
7084       if( !pPage->leaf ){
7085         Pgno child = get4byte(z);
7086         ptrmapGet(pBt, child, &e, &n);
7087         assert( n==pPage->pgno && e==PTRMAP_BTREE );
7088       }
7089     }
7090     if( !pPage->leaf ){
7091       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7092       ptrmapGet(pBt, child, &e, &n);
7093       assert( n==pPage->pgno && e==PTRMAP_BTREE );
7094     }
7095   }
7096   return 1;
7097 }
7098 #endif
7099
7100 /*
7101 ** This function is used to copy the contents of the b-tree node stored
7102 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
7103 ** the pointer-map entries for each child page are updated so that the
7104 ** parent page stored in the pointer map is page pTo. If pFrom contained
7105 ** any cells with overflow page pointers, then the corresponding pointer
7106 ** map entries are also updated so that the parent page is page pTo.
7107 **
7108 ** If pFrom is currently carrying any overflow cells (entries in the
7109 ** MemPage.apOvfl[] array), they are not copied to pTo.
7110 **
7111 ** Before returning, page pTo is reinitialized using btreeInitPage().
7112 **
7113 ** The performance of this function is not critical. It is only used by
7114 ** the balance_shallower() and balance_deeper() procedures, neither of
7115 ** which are called often under normal circumstances.
7116 */
7117 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
7118   if( (*pRC)==SQLITE_OK ){
7119     BtShared * const pBt = pFrom->pBt;
7120     u8 * const aFrom = pFrom->aData;
7121     u8 * const aTo = pTo->aData;
7122     int const iFromHdr = pFrom->hdrOffset;
7123     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
7124     int rc;
7125     int iData;
7126
7127
7128     assert( pFrom->isInit );
7129     assert( pFrom->nFree>=iToHdr );
7130     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
7131
7132     /* Copy the b-tree node content from page pFrom to page pTo. */
7133     iData = get2byte(&aFrom[iFromHdr+5]);
7134     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
7135     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
7136
7137     /* Reinitialize page pTo so that the contents of the MemPage structure
7138     ** match the new data. The initialization of pTo can actually fail under
7139     ** fairly obscure circumstances, even though it is a copy of initialized
7140     ** page pFrom.
7141     */
7142     pTo->isInit = 0;
7143     rc = btreeInitPage(pTo);
7144     if( rc!=SQLITE_OK ){
7145       *pRC = rc;
7146       return;
7147     }
7148
7149     /* If this is an auto-vacuum database, update the pointer-map entries
7150     ** for any b-tree or overflow pages that pTo now contains the pointers to.
7151     */
7152     if( ISAUTOVACUUM ){
7153       *pRC = setChildPtrmaps(pTo);
7154     }
7155   }
7156 }
7157
7158 /*
7159 ** This routine redistributes cells on the iParentIdx'th child of pParent
7160 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
7161 ** same amount of free space. Usually a single sibling on either side of the
7162 ** page are used in the balancing, though both siblings might come from one
7163 ** side if the page is the first or last child of its parent. If the page
7164 ** has fewer than 2 siblings (something which can only happen if the page
7165 ** is a root page or a child of a root page) then all available siblings
7166 ** participate in the balancing.
7167 **
7168 ** The number of siblings of the page might be increased or decreased by
7169 ** one or two in an effort to keep pages nearly full but not over full.
7170 **
7171 ** Note that when this routine is called, some of the cells on the page
7172 ** might not actually be stored in MemPage.aData[]. This can happen
7173 ** if the page is overfull. This routine ensures that all cells allocated
7174 ** to the page and its siblings fit into MemPage.aData[] before returning.
7175 **
7176 ** In the course of balancing the page and its siblings, cells may be
7177 ** inserted into or removed from the parent page (pParent). Doing so
7178 ** may cause the parent page to become overfull or underfull. If this
7179 ** happens, it is the responsibility of the caller to invoke the correct
7180 ** balancing routine to fix this problem (see the balance() routine).
7181 **
7182 ** If this routine fails for any reason, it might leave the database
7183 ** in a corrupted state. So if this routine fails, the database should
7184 ** be rolled back.
7185 **
7186 ** The third argument to this function, aOvflSpace, is a pointer to a
7187 ** buffer big enough to hold one page. If while inserting cells into the parent
7188 ** page (pParent) the parent page becomes overfull, this buffer is
7189 ** used to store the parent's overflow cells. Because this function inserts
7190 ** a maximum of four divider cells into the parent page, and the maximum
7191 ** size of a cell stored within an internal node is always less than 1/4
7192 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
7193 ** enough for all overflow cells.
7194 **
7195 ** If aOvflSpace is set to a null pointer, this function returns
7196 ** SQLITE_NOMEM.
7197 */
7198 static int balance_nonroot(
7199   MemPage *pParent,               /* Parent page of siblings being balanced */
7200   int iParentIdx,                 /* Index of "the page" in pParent */
7201   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7202   int isRoot,                     /* True if pParent is a root-page */
7203   int bBulk                       /* True if this call is part of a bulk load */
7204 ){
7205   BtShared *pBt;               /* The whole database */
7206   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7207   int nNew = 0;                /* Number of pages in apNew[] */
7208   int nOld;                    /* Number of pages in apOld[] */
7209   int i, j, k;                 /* Loop counters */
7210   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7211   int rc = SQLITE_OK;          /* The return code */
7212   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7213   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7214   int usableSpace;             /* Bytes in pPage beyond the header */
7215   int pageFlags;               /* Value of pPage->aData[0] */
7216   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7217   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7218   int szScratch;               /* Size of scratch memory requested */
7219   MemPage *apOld[NB];          /* pPage and up to two siblings */
7220   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7221   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7222   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7223   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7224   int cntOld[NB+2];            /* Old index in b.apCell[] */
7225   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7226   u8 *aSpace1;                 /* Space for copies of dividers cells */
7227   Pgno pgno;                   /* Temp var to store a page number in */
7228   u8 abDone[NB+2];             /* True after i'th new page is populated */
7229   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7230   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7231   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7232   CellArray b;                  /* Parsed information on cells being balanced */
7233
7234   memset(abDone, 0, sizeof(abDone));
7235   b.nCell = 0;
7236   b.apCell = 0;
7237   pBt = pParent->pBt;
7238   assert( sqlite3_mutex_held(pBt->mutex) );
7239   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7240
7241 #if 0
7242   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
7243 #endif
7244
7245   /* At this point pParent may have at most one overflow cell. And if
7246   ** this overflow cell is present, it must be the cell with
7247   ** index iParentIdx. This scenario comes about when this function
7248   ** is called (indirectly) from sqlite3BtreeDelete().
7249   */
7250   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7251   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7252
7253   if( !aOvflSpace ){
7254     return SQLITE_NOMEM_BKPT;
7255   }
7256
7257   /* Find the sibling pages to balance. Also locate the cells in pParent
7258   ** that divide the siblings. An attempt is made to find NN siblings on
7259   ** either side of pPage. More siblings are taken from one side, however,
7260   ** if there are fewer than NN siblings on the other side. If pParent
7261   ** has NB or fewer children then all children of pParent are taken.
7262   **
7263   ** This loop also drops the divider cells from the parent page. This
7264   ** way, the remainder of the function does not have to deal with any
7265   ** overflow cells in the parent page, since if any existed they will
7266   ** have already been removed.
7267   */
7268   i = pParent->nOverflow + pParent->nCell;
7269   if( i<2 ){
7270     nxDiv = 0;
7271   }else{
7272     assert( bBulk==0 || bBulk==1 );
7273     if( iParentIdx==0 ){
7274       nxDiv = 0;
7275     }else if( iParentIdx==i ){
7276       nxDiv = i-2+bBulk;
7277     }else{
7278       nxDiv = iParentIdx-1;
7279     }
7280     i = 2-bBulk;
7281   }
7282   nOld = i+1;
7283   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7284     pRight = &pParent->aData[pParent->hdrOffset+8];
7285   }else{
7286     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7287   }
7288   pgno = get4byte(pRight);
7289   while( 1 ){
7290     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7291     if( rc ){
7292       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7293       goto balance_cleanup;
7294     }
7295     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
7296     if( (i--)==0 ) break;
7297
7298     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7299       apDiv[i] = pParent->apOvfl[0];
7300       pgno = get4byte(apDiv[i]);
7301       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7302       pParent->nOverflow = 0;
7303     }else{
7304       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7305       pgno = get4byte(apDiv[i]);
7306       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7307
7308       /* Drop the cell from the parent page. apDiv[i] still points to
7309       ** the cell within the parent, even though it has been dropped.
7310       ** This is safe because dropping a cell only overwrites the first
7311       ** four bytes of it, and this function does not need the first
7312       ** four bytes of the divider cell. So the pointer is safe to use
7313       ** later on.
7314       **
7315       ** But not if we are in secure-delete mode. In secure-delete mode,
7316       ** the dropCell() routine will overwrite the entire cell with zeroes.
7317       ** In this case, temporarily copy the cell into the aOvflSpace[]
7318       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7319       ** is allocated.  */
7320       if( pBt->btsFlags & BTS_FAST_SECURE ){
7321         int iOff;
7322
7323         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7324         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7325           rc = SQLITE_CORRUPT_BKPT;
7326           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7327           goto balance_cleanup;
7328         }else{
7329           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7330           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7331         }
7332       }
7333       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7334     }
7335   }
7336
7337   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7338   ** alignment */
7339   nMaxCells = (nMaxCells + 3)&~3;
7340
7341   /*
7342   ** Allocate space for memory structures
7343   */
7344   szScratch =
7345        nMaxCells*sizeof(u8*)                       /* b.apCell */
7346      + nMaxCells*sizeof(u16)                       /* b.szCell */
7347      + pBt->pageSize;                              /* aSpace1 */
7348
7349   assert( szScratch<=6*(int)pBt->pageSize );
7350   b.apCell = sqlite3StackAllocRaw(0, szScratch );
7351   if( b.apCell==0 ){
7352     rc = SQLITE_NOMEM_BKPT;
7353     goto balance_cleanup;
7354   }
7355   b.szCell = (u16*)&b.apCell[nMaxCells];
7356   aSpace1 = (u8*)&b.szCell[nMaxCells];
7357   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7358
7359   /*
7360   ** Load pointers to all cells on sibling pages and the divider cells
7361   ** into the local b.apCell[] array.  Make copies of the divider cells
7362   ** into space obtained from aSpace1[]. The divider cells have already
7363   ** been removed from pParent.
7364   **
7365   ** If the siblings are on leaf pages, then the child pointers of the
7366   ** divider cells are stripped from the cells before they are copied
7367   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7368   ** child pointers.  If siblings are not leaves, then all cell in
7369   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7370   ** are alike.
7371   **
7372   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7373   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7374   */
7375   b.pRef = apOld[0];
7376   leafCorrection = b.pRef->leaf*4;
7377   leafData = b.pRef->intKeyLeaf;
7378   for(i=0; i<nOld; i++){
7379     MemPage *pOld = apOld[i];
7380     int limit = pOld->nCell;
7381     u8 *aData = pOld->aData;
7382     u16 maskPage = pOld->maskPage;
7383     u8 *piCell = aData + pOld->cellOffset;
7384     u8 *piEnd;
7385
7386     /* Verify that all sibling pages are of the same "type" (table-leaf,
7387     ** table-interior, index-leaf, or index-interior).
7388     */
7389     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7390       rc = SQLITE_CORRUPT_BKPT;
7391       goto balance_cleanup;
7392     }
7393
7394     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7395     ** constains overflow cells, include them in the b.apCell[] array
7396     ** in the correct spot.
7397     **
7398     ** Note that when there are multiple overflow cells, it is always the
7399     ** case that they are sequential and adjacent.  This invariant arises
7400     ** because multiple overflows can only occurs when inserting divider
7401     ** cells into a parent on a prior balance, and divider cells are always
7402     ** adjacent and are inserted in order.  There is an assert() tagged
7403     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7404     ** invariant.
7405     **
7406     ** This must be done in advance.  Once the balance starts, the cell
7407     ** offset section of the btree page will be overwritten and we will no
7408     ** long be able to find the cells if a pointer to each cell is not saved
7409     ** first.
7410     */
7411     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7412     if( pOld->nOverflow>0 ){
7413       limit = pOld->aiOvfl[0];
7414       for(j=0; j<limit; j++){
7415         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7416         piCell += 2;
7417         b.nCell++;
7418       }
7419       for(k=0; k<pOld->nOverflow; k++){
7420         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7421         b.apCell[b.nCell] = pOld->apOvfl[k];
7422         b.nCell++;
7423       }
7424     }
7425     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7426     while( piCell<piEnd ){
7427       assert( b.nCell<nMaxCells );
7428       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7429       piCell += 2;
7430       b.nCell++;
7431     }
7432
7433     cntOld[i] = b.nCell;
7434     if( i<nOld-1 && !leafData){
7435       u16 sz = (u16)szNew[i];
7436       u8 *pTemp;
7437       assert( b.nCell<nMaxCells );
7438       b.szCell[b.nCell] = sz;
7439       pTemp = &aSpace1[iSpace1];
7440       iSpace1 += sz;
7441       assert( sz<=pBt->maxLocal+23 );
7442       assert( iSpace1 <= (int)pBt->pageSize );
7443       memcpy(pTemp, apDiv[i], sz);
7444       b.apCell[b.nCell] = pTemp+leafCorrection;
7445       assert( leafCorrection==0 || leafCorrection==4 );
7446       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7447       if( !pOld->leaf ){
7448         assert( leafCorrection==0 );
7449         assert( pOld->hdrOffset==0 );
7450         /* The right pointer of the child page pOld becomes the left
7451         ** pointer of the divider cell */
7452         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7453       }else{
7454         assert( leafCorrection==4 );
7455         while( b.szCell[b.nCell]<4 ){
7456           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7457           ** does exist, pad it with 0x00 bytes. */
7458           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7459           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7460           aSpace1[iSpace1++] = 0x00;
7461           b.szCell[b.nCell]++;
7462         }
7463       }
7464       b.nCell++;
7465     }
7466   }
7467
7468   /*
7469   ** Figure out the number of pages needed to hold all b.nCell cells.
7470   ** Store this number in "k".  Also compute szNew[] which is the total
7471   ** size of all cells on the i-th page and cntNew[] which is the index
7472   ** in b.apCell[] of the cell that divides page i from page i+1.
7473   ** cntNew[k] should equal b.nCell.
7474   **
7475   ** Values computed by this block:
7476   **
7477   **           k: The total number of sibling pages
7478   **    szNew[i]: Spaced used on the i-th sibling page.
7479   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7480   **              the right of the i-th sibling page.
7481   ** usableSpace: Number of bytes of space available on each sibling.
7482   **
7483   */
7484   usableSpace = pBt->usableSize - 12 + leafCorrection;
7485   for(i=0; i<nOld; i++){
7486     MemPage *p = apOld[i];
7487     szNew[i] = usableSpace - p->nFree;
7488     for(j=0; j<p->nOverflow; j++){
7489       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7490     }
7491     cntNew[i] = cntOld[i];
7492   }
7493   k = nOld;
7494   for(i=0; i<k; i++){
7495     int sz;
7496     while( szNew[i]>usableSpace ){
7497       if( i+1>=k ){
7498         k = i+2;
7499         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7500         szNew[k-1] = 0;
7501         cntNew[k-1] = b.nCell;
7502       }
7503       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7504       szNew[i] -= sz;
7505       if( !leafData ){
7506         if( cntNew[i]<b.nCell ){
7507           sz = 2 + cachedCellSize(&b, cntNew[i]);
7508         }else{
7509           sz = 0;
7510         }
7511       }
7512       szNew[i+1] += sz;
7513       cntNew[i]--;
7514     }
7515     while( cntNew[i]<b.nCell ){
7516       sz = 2 + cachedCellSize(&b, cntNew[i]);
7517       if( szNew[i]+sz>usableSpace ) break;
7518       szNew[i] += sz;
7519       cntNew[i]++;
7520       if( !leafData ){
7521         if( cntNew[i]<b.nCell ){
7522           sz = 2 + cachedCellSize(&b, cntNew[i]);
7523         }else{
7524           sz = 0;
7525         }
7526       }
7527       szNew[i+1] -= sz;
7528     }
7529     if( cntNew[i]>=b.nCell ){
7530       k = i+1;
7531     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7532       rc = SQLITE_CORRUPT_BKPT;
7533       goto balance_cleanup;
7534     }
7535   }
7536
7537   /*
7538   ** The packing computed by the previous block is biased toward the siblings
7539   ** on the left side (siblings with smaller keys). The left siblings are
7540   ** always nearly full, while the right-most sibling might be nearly empty.
7541   ** The next block of code attempts to adjust the packing of siblings to
7542   ** get a better balance.
7543   **
7544   ** This adjustment is more than an optimization.  The packing above might
7545   ** be so out of balance as to be illegal.  For example, the right-most
7546   ** sibling might be completely empty.  This adjustment is not optional.
7547   */
7548   for(i=k-1; i>0; i--){
7549     int szRight = szNew[i];  /* Size of sibling on the right */
7550     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7551     int r;              /* Index of right-most cell in left sibling */
7552     int d;              /* Index of first cell to the left of right sibling */
7553
7554     r = cntNew[i-1] - 1;
7555     d = r + 1 - leafData;
7556     (void)cachedCellSize(&b, d);
7557     do{
7558       assert( d<nMaxCells );
7559       assert( r<nMaxCells );
7560       (void)cachedCellSize(&b, r);
7561       if( szRight!=0
7562        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7563         break;
7564       }
7565       szRight += b.szCell[d] + 2;
7566       szLeft -= b.szCell[r] + 2;
7567       cntNew[i-1] = r;
7568       r--;
7569       d--;
7570     }while( r>=0 );
7571     szNew[i] = szRight;
7572     szNew[i-1] = szLeft;
7573     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7574       rc = SQLITE_CORRUPT_BKPT;
7575       goto balance_cleanup;
7576     }
7577   }
7578
7579   /* Sanity check:  For a non-corrupt database file one of the follwing
7580   ** must be true:
7581   **    (1) We found one or more cells (cntNew[0])>0), or
7582   **    (2) pPage is a virtual root page.  A virtual root page is when
7583   **        the real root page is page 1 and we are the only child of
7584   **        that page.
7585   */
7586   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7587   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7588     apOld[0]->pgno, apOld[0]->nCell,
7589     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7590     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7591   ));
7592
7593   /*
7594   ** Allocate k new pages.  Reuse old pages where possible.
7595   */
7596   pageFlags = apOld[0]->aData[0];
7597   for(i=0; i<k; i++){
7598     MemPage *pNew;
7599     if( i<nOld ){
7600       pNew = apNew[i] = apOld[i];
7601       apOld[i] = 0;
7602       rc = sqlite3PagerWrite(pNew->pDbPage);
7603       nNew++;
7604       if( rc ) goto balance_cleanup;
7605     }else{
7606       assert( i>0 );
7607       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7608       if( rc ) goto balance_cleanup;
7609       zeroPage(pNew, pageFlags);
7610       apNew[i] = pNew;
7611       nNew++;
7612       cntOld[i] = b.nCell;
7613
7614       /* Set the pointer-map entry for the new sibling page. */
7615       if( ISAUTOVACUUM ){
7616         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7617         if( rc!=SQLITE_OK ){
7618           goto balance_cleanup;
7619         }
7620       }
7621     }
7622   }
7623
7624   /*
7625   ** Reassign page numbers so that the new pages are in ascending order.
7626   ** This helps to keep entries in the disk file in order so that a scan
7627   ** of the table is closer to a linear scan through the file. That in turn
7628   ** helps the operating system to deliver pages from the disk more rapidly.
7629   **
7630   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7631   ** than (NB+2) (a small constant), that should not be a problem.
7632   **
7633   ** When NB==3, this one optimization makes the database about 25% faster
7634   ** for large insertions and deletions.
7635   */
7636   for(i=0; i<nNew; i++){
7637     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7638     aPgFlags[i] = apNew[i]->pDbPage->flags;
7639     for(j=0; j<i; j++){
7640       if( aPgno[j]==aPgno[i] ){
7641         /* This branch is taken if the set of sibling pages somehow contains
7642         ** duplicate entries. This can happen if the database is corrupt.
7643         ** It would be simpler to detect this as part of the loop below, but
7644         ** we do the detection here in order to avoid populating the pager
7645         ** cache with two separate objects associated with the same
7646         ** page number.  */
7647         assert( CORRUPT_DB );
7648         rc = SQLITE_CORRUPT_BKPT;
7649         goto balance_cleanup;
7650       }
7651     }
7652   }
7653   for(i=0; i<nNew; i++){
7654     int iBest = 0;                /* aPgno[] index of page number to use */
7655     for(j=1; j<nNew; j++){
7656       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7657     }
7658     pgno = aPgOrder[iBest];
7659     aPgOrder[iBest] = 0xffffffff;
7660     if( iBest!=i ){
7661       if( iBest>i ){
7662         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7663       }
7664       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7665       apNew[i]->pgno = pgno;
7666     }
7667   }
7668
7669   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7670          "%d(%d nc=%d) %d(%d nc=%d)\n",
7671     apNew[0]->pgno, szNew[0], cntNew[0],
7672     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7673     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7674     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7675     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7676     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7677     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7678     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7679     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7680   ));
7681
7682   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7683   put4byte(pRight, apNew[nNew-1]->pgno);
7684
7685   /* If the sibling pages are not leaves, ensure that the right-child pointer
7686   ** of the right-most new sibling page is set to the value that was
7687   ** originally in the same field of the right-most old sibling page. */
7688   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7689     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7690     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7691   }
7692
7693   /* Make any required updates to pointer map entries associated with
7694   ** cells stored on sibling pages following the balance operation. Pointer
7695   ** map entries associated with divider cells are set by the insertCell()
7696   ** routine. The associated pointer map entries are:
7697   **
7698   **   a) if the cell contains a reference to an overflow chain, the
7699   **      entry associated with the first page in the overflow chain, and
7700   **
7701   **   b) if the sibling pages are not leaves, the child page associated
7702   **      with the cell.
7703   **
7704   ** If the sibling pages are not leaves, then the pointer map entry
7705   ** associated with the right-child of each sibling may also need to be
7706   ** updated. This happens below, after the sibling pages have been
7707   ** populated, not here.
7708   */
7709   if( ISAUTOVACUUM ){
7710     MemPage *pNew = apNew[0];
7711     u8 *aOld = pNew->aData;
7712     int cntOldNext = pNew->nCell + pNew->nOverflow;
7713     int usableSize = pBt->usableSize;
7714     int iNew = 0;
7715     int iOld = 0;
7716
7717     for(i=0; i<b.nCell; i++){
7718       u8 *pCell = b.apCell[i];
7719       if( i==cntOldNext ){
7720         MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
7721         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
7722         aOld = pOld->aData;
7723       }
7724       if( i==cntNew[iNew] ){
7725         pNew = apNew[++iNew];
7726         if( !leafData ) continue;
7727       }
7728
7729       /* Cell pCell is destined for new sibling page pNew. Originally, it
7730       ** was either part of sibling page iOld (possibly an overflow cell),
7731       ** or else the divider cell to the left of sibling page iOld. So,
7732       ** if sibling page iOld had the same page number as pNew, and if
7733       ** pCell really was a part of sibling page iOld (not a divider or
7734       ** overflow cell), we can skip updating the pointer map entries.  */
7735       if( iOld>=nNew
7736        || pNew->pgno!=aPgno[iOld]
7737        || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
7738       ){
7739         if( !leafCorrection ){
7740           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
7741         }
7742         if( cachedCellSize(&b,i)>pNew->minLocal ){
7743           ptrmapPutOvflPtr(pNew, pCell, &rc);
7744         }
7745         if( rc ) goto balance_cleanup;
7746       }
7747     }
7748   }
7749
7750   /* Insert new divider cells into pParent. */
7751   for(i=0; i<nNew-1; i++){
7752     u8 *pCell;
7753     u8 *pTemp;
7754     int sz;
7755     MemPage *pNew = apNew[i];
7756     j = cntNew[i];
7757
7758     assert( j<nMaxCells );
7759     assert( b.apCell[j]!=0 );
7760     pCell = b.apCell[j];
7761     sz = b.szCell[j] + leafCorrection;
7762     pTemp = &aOvflSpace[iOvflSpace];
7763     if( !pNew->leaf ){
7764       memcpy(&pNew->aData[8], pCell, 4);
7765     }else if( leafData ){
7766       /* If the tree is a leaf-data tree, and the siblings are leaves,
7767       ** then there is no divider cell in b.apCell[]. Instead, the divider
7768       ** cell consists of the integer key for the right-most cell of
7769       ** the sibling-page assembled above only.
7770       */
7771       CellInfo info;
7772       j--;
7773       pNew->xParseCell(pNew, b.apCell[j], &info);
7774       pCell = pTemp;
7775       sz = 4 + putVarint(&pCell[4], info.nKey);
7776       pTemp = 0;
7777     }else{
7778       pCell -= 4;
7779       /* Obscure case for non-leaf-data trees: If the cell at pCell was
7780       ** previously stored on a leaf node, and its reported size was 4
7781       ** bytes, then it may actually be smaller than this
7782       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
7783       ** any cell). But it is important to pass the correct size to
7784       ** insertCell(), so reparse the cell now.
7785       **
7786       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
7787       ** and WITHOUT ROWID tables with exactly one column which is the
7788       ** primary key.
7789       */
7790       if( b.szCell[j]==4 ){
7791         assert(leafCorrection==4);
7792         sz = pParent->xCellSize(pParent, pCell);
7793       }
7794     }
7795     iOvflSpace += sz;
7796     assert( sz<=pBt->maxLocal+23 );
7797     assert( iOvflSpace <= (int)pBt->pageSize );
7798     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
7799     if( rc!=SQLITE_OK ) goto balance_cleanup;
7800     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7801   }
7802
7803   /* Now update the actual sibling pages. The order in which they are updated
7804   ** is important, as this code needs to avoid disrupting any page from which
7805   ** cells may still to be read. In practice, this means:
7806   **
7807   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
7808   **      then it is not safe to update page apNew[iPg] until after
7809   **      the left-hand sibling apNew[iPg-1] has been updated.
7810   **
7811   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
7812   **      then it is not safe to update page apNew[iPg] until after
7813   **      the right-hand sibling apNew[iPg+1] has been updated.
7814   **
7815   ** If neither of the above apply, the page is safe to update.
7816   **
7817   ** The iPg value in the following loop starts at nNew-1 goes down
7818   ** to 0, then back up to nNew-1 again, thus making two passes over
7819   ** the pages.  On the initial downward pass, only condition (1) above
7820   ** needs to be tested because (2) will always be true from the previous
7821   ** step.  On the upward pass, both conditions are always true, so the
7822   ** upwards pass simply processes pages that were missed on the downward
7823   ** pass.
7824   */
7825   for(i=1-nNew; i<nNew; i++){
7826     int iPg = i<0 ? -i : i;
7827     assert( iPg>=0 && iPg<nNew );
7828     if( abDone[iPg] ) continue;         /* Skip pages already processed */
7829     if( i>=0                            /* On the upwards pass, or... */
7830      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
7831     ){
7832       int iNew;
7833       int iOld;
7834       int nNewCell;
7835
7836       /* Verify condition (1):  If cells are moving left, update iPg
7837       ** only after iPg-1 has already been updated. */
7838       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
7839
7840       /* Verify condition (2):  If cells are moving right, update iPg
7841       ** only after iPg+1 has already been updated. */
7842       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
7843
7844       if( iPg==0 ){
7845         iNew = iOld = 0;
7846         nNewCell = cntNew[0];
7847       }else{
7848         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
7849         iNew = cntNew[iPg-1] + !leafData;
7850         nNewCell = cntNew[iPg] - iNew;
7851       }
7852
7853       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
7854       if( rc ) goto balance_cleanup;
7855       abDone[iPg]++;
7856       apNew[iPg]->nFree = usableSpace-szNew[iPg];
7857       assert( apNew[iPg]->nOverflow==0 );
7858       assert( apNew[iPg]->nCell==nNewCell );
7859     }
7860   }
7861
7862   /* All pages have been processed exactly once */
7863   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
7864
7865   assert( nOld>0 );
7866   assert( nNew>0 );
7867
7868   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
7869     /* The root page of the b-tree now contains no cells. The only sibling
7870     ** page is the right-child of the parent. Copy the contents of the
7871     ** child page into the parent, decreasing the overall height of the
7872     ** b-tree structure by one. This is described as the "balance-shallower"
7873     ** sub-algorithm in some documentation.
7874     **
7875     ** If this is an auto-vacuum database, the call to copyNodeContent()
7876     ** sets all pointer-map entries corresponding to database image pages
7877     ** for which the pointer is stored within the content being copied.
7878     **
7879     ** It is critical that the child page be defragmented before being
7880     ** copied into the parent, because if the parent is page 1 then it will
7881     ** by smaller than the child due to the database header, and so all the
7882     ** free space needs to be up front.
7883     */
7884     assert( nNew==1 || CORRUPT_DB );
7885     rc = defragmentPage(apNew[0], -1);
7886     testcase( rc!=SQLITE_OK );
7887     assert( apNew[0]->nFree ==
7888         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
7889       || rc!=SQLITE_OK
7890     );
7891     copyNodeContent(apNew[0], pParent, &rc);
7892     freePage(apNew[0], &rc);
7893   }else if( ISAUTOVACUUM && !leafCorrection ){
7894     /* Fix the pointer map entries associated with the right-child of each
7895     ** sibling page. All other pointer map entries have already been taken
7896     ** care of.  */
7897     for(i=0; i<nNew; i++){
7898       u32 key = get4byte(&apNew[i]->aData[8]);
7899       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
7900     }
7901   }
7902
7903   assert( pParent->isInit );
7904   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
7905           nOld, nNew, b.nCell));
7906
7907   /* Free any old pages that were not reused as new pages.
7908   */
7909   for(i=nNew; i<nOld; i++){
7910     freePage(apOld[i], &rc);
7911   }
7912
7913 #if 0
7914   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
7915     /* The ptrmapCheckPages() contains assert() statements that verify that
7916     ** all pointer map pages are set correctly. This is helpful while
7917     ** debugging. This is usually disabled because a corrupt database may
7918     ** cause an assert() statement to fail.  */
7919     ptrmapCheckPages(apNew, nNew);
7920     ptrmapCheckPages(&pParent, 1);
7921   }
7922 #endif
7923
7924   /*
7925   ** Cleanup before returning.
7926   */
7927 balance_cleanup:
7928   sqlite3StackFree(0, b.apCell);
7929   for(i=0; i<nOld; i++){
7930     releasePage(apOld[i]);
7931   }
7932   for(i=0; i<nNew; i++){
7933     releasePage(apNew[i]);
7934   }
7935
7936   return rc;
7937 }
7938
7939
7940 /*
7941 ** This function is called when the root page of a b-tree structure is
7942 ** overfull (has one or more overflow pages).
7943 **
7944 ** A new child page is allocated and the contents of the current root
7945 ** page, including overflow cells, are copied into the child. The root
7946 ** page is then overwritten to make it an empty page with the right-child
7947 ** pointer pointing to the new page.
7948 **
7949 ** Before returning, all pointer-map entries corresponding to pages
7950 ** that the new child-page now contains pointers to are updated. The
7951 ** entry corresponding to the new right-child pointer of the root
7952 ** page is also updated.
7953 **
7954 ** If successful, *ppChild is set to contain a reference to the child
7955 ** page and SQLITE_OK is returned. In this case the caller is required
7956 ** to call releasePage() on *ppChild exactly once. If an error occurs,
7957 ** an error code is returned and *ppChild is set to 0.
7958 */
7959 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
7960   int rc;                        /* Return value from subprocedures */
7961   MemPage *pChild = 0;           /* Pointer to a new child page */
7962   Pgno pgnoChild = 0;            /* Page number of the new child page */
7963   BtShared *pBt = pRoot->pBt;    /* The BTree */
7964
7965   assert( pRoot->nOverflow>0 );
7966   assert( sqlite3_mutex_held(pBt->mutex) );
7967
7968   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
7969   ** page that will become the new right-child of pPage. Copy the contents
7970   ** of the node stored on pRoot into the new child page.
7971   */
7972   rc = sqlite3PagerWrite(pRoot->pDbPage);
7973   if( rc==SQLITE_OK ){
7974     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
7975     copyNodeContent(pRoot, pChild, &rc);
7976     if( ISAUTOVACUUM ){
7977       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
7978     }
7979   }
7980   if( rc ){
7981     *ppChild = 0;
7982     releasePage(pChild);
7983     return rc;
7984   }
7985   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
7986   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7987   assert( pChild->nCell==pRoot->nCell );
7988
7989   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
7990
7991   /* Copy the overflow cells from pRoot to pChild */
7992   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
7993          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
7994   memcpy(pChild->apOvfl, pRoot->apOvfl,
7995          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
7996   pChild->nOverflow = pRoot->nOverflow;
7997
7998   /* Zero the contents of pRoot. Then install pChild as the right-child. */
7999   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
8000   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
8001
8002   *ppChild = pChild;
8003   return SQLITE_OK;
8004 }
8005
8006 /*
8007 ** The page that pCur currently points to has just been modified in
8008 ** some way. This function figures out if this modification means the
8009 ** tree needs to be balanced, and if so calls the appropriate balancing
8010 ** routine. Balancing routines are:
8011 **
8012 **   balance_quick()
8013 **   balance_deeper()
8014 **   balance_nonroot()
8015 */
8016 static int balance(BtCursor *pCur){
8017   int rc = SQLITE_OK;
8018   const int nMin = pCur->pBt->usableSize * 2 / 3;
8019   u8 aBalanceQuickSpace[13];
8020   u8 *pFree = 0;
8021
8022   VVA_ONLY( int balance_quick_called = 0 );
8023   VVA_ONLY( int balance_deeper_called = 0 );
8024
8025   do {
8026     int iPage = pCur->iPage;
8027     MemPage *pPage = pCur->pPage;
8028
8029     if( iPage==0 ){
8030       if( pPage->nOverflow ){
8031         /* The root page of the b-tree is overfull. In this case call the
8032         ** balance_deeper() function to create a new child for the root-page
8033         ** and copy the current contents of the root-page to it. The
8034         ** next iteration of the do-loop will balance the child page.
8035         */
8036         assert( balance_deeper_called==0 );
8037         VVA_ONLY( balance_deeper_called++ );
8038         rc = balance_deeper(pPage, &pCur->apPage[1]);
8039         if( rc==SQLITE_OK ){
8040           pCur->iPage = 1;
8041           pCur->ix = 0;
8042           pCur->aiIdx[0] = 0;
8043           pCur->apPage[0] = pPage;
8044           pCur->pPage = pCur->apPage[1];
8045           assert( pCur->pPage->nOverflow );
8046         }
8047       }else{
8048         break;
8049       }
8050     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
8051       break;
8052     }else{
8053       MemPage * const pParent = pCur->apPage[iPage-1];
8054       int const iIdx = pCur->aiIdx[iPage-1];
8055
8056       rc = sqlite3PagerWrite(pParent->pDbPage);
8057       if( rc==SQLITE_OK ){
8058 #ifndef SQLITE_OMIT_QUICKBALANCE
8059         if( pPage->intKeyLeaf
8060          && pPage->nOverflow==1
8061          && pPage->aiOvfl[0]==pPage->nCell
8062          && pParent->pgno!=1
8063          && pParent->nCell==iIdx
8064         ){
8065           /* Call balance_quick() to create a new sibling of pPage on which
8066           ** to store the overflow cell. balance_quick() inserts a new cell
8067           ** into pParent, which may cause pParent overflow. If this
8068           ** happens, the next iteration of the do-loop will balance pParent
8069           ** use either balance_nonroot() or balance_deeper(). Until this
8070           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
8071           ** buffer.
8072           **
8073           ** The purpose of the following assert() is to check that only a
8074           ** single call to balance_quick() is made for each call to this
8075           ** function. If this were not verified, a subtle bug involving reuse
8076           ** of the aBalanceQuickSpace[] might sneak in.
8077           */
8078           assert( balance_quick_called==0 );
8079           VVA_ONLY( balance_quick_called++ );
8080           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
8081         }else
8082 #endif
8083         {
8084           /* In this case, call balance_nonroot() to redistribute cells
8085           ** between pPage and up to 2 of its sibling pages. This involves
8086           ** modifying the contents of pParent, which may cause pParent to
8087           ** become overfull or underfull. The next iteration of the do-loop
8088           ** will balance the parent page to correct this.
8089           **
8090           ** If the parent page becomes overfull, the overflow cell or cells
8091           ** are stored in the pSpace buffer allocated immediately below.
8092           ** A subsequent iteration of the do-loop will deal with this by
8093           ** calling balance_nonroot() (balance_deeper() may be called first,
8094           ** but it doesn't deal with overflow cells - just moves them to a
8095           ** different page). Once this subsequent call to balance_nonroot()
8096           ** has completed, it is safe to release the pSpace buffer used by
8097           ** the previous call, as the overflow cell data will have been
8098           ** copied either into the body of a database page or into the new
8099           ** pSpace buffer passed to the latter call to balance_nonroot().
8100           */
8101           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
8102           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
8103                                pCur->hints&BTREE_BULKLOAD);
8104           if( pFree ){
8105             /* If pFree is not NULL, it points to the pSpace buffer used
8106             ** by a previous call to balance_nonroot(). Its contents are
8107             ** now stored either on real database pages or within the
8108             ** new pSpace buffer, so it may be safely freed here. */
8109             sqlite3PageFree(pFree);
8110           }
8111
8112           /* The pSpace buffer will be freed after the next call to
8113           ** balance_nonroot(), or just before this function returns, whichever
8114           ** comes first. */
8115           pFree = pSpace;
8116         }
8117       }
8118
8119       pPage->nOverflow = 0;
8120
8121       /* The next iteration of the do-loop balances the parent page. */
8122       releasePage(pPage);
8123       pCur->iPage--;
8124       assert( pCur->iPage>=0 );
8125       pCur->pPage = pCur->apPage[pCur->iPage];
8126     }
8127   }while( rc==SQLITE_OK );
8128
8129   if( pFree ){
8130     sqlite3PageFree(pFree);
8131   }
8132   return rc;
8133 }
8134
8135
8136 /*
8137 ** Insert a new record into the BTree.  The content of the new record
8138 ** is described by the pX object.  The pCur cursor is used only to
8139 ** define what table the record should be inserted into, and is left
8140 ** pointing at a random location.
8141 **
8142 ** For a table btree (used for rowid tables), only the pX.nKey value of
8143 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
8144 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
8145 ** hold the content of the row.
8146 **
8147 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
8148 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
8149 ** pX.pData,nData,nZero fields must be zero.
8150 **
8151 ** If the seekResult parameter is non-zero, then a successful call to
8152 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
8153 ** been performed.  In other words, if seekResult!=0 then the cursor
8154 ** is currently pointing to a cell that will be adjacent to the cell
8155 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
8156 ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
8157 ** that is larger than (pKey,nKey).
8158 **
8159 ** If seekResult==0, that means pCur is pointing at some unknown location.
8160 ** In that case, this routine must seek the cursor to the correct insertion
8161 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
8162 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
8163 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
8164 ** to decode the key.
8165 */
8166 int sqlite3BtreeInsert(
8167   BtCursor *pCur,                /* Insert data into the table of this cursor */
8168   const BtreePayload *pX,        /* Content of the row to be inserted */
8169   int flags,                     /* True if this is likely an append */
8170   int seekResult                 /* Result of prior MovetoUnpacked() call */
8171 ){
8172   int rc;
8173   int loc = seekResult;          /* -1: before desired location  +1: after */
8174   int szNew = 0;
8175   int idx;
8176   MemPage *pPage;
8177   Btree *p = pCur->pBtree;
8178   BtShared *pBt = p->pBt;
8179   unsigned char *oldCell;
8180   unsigned char *newCell = 0;
8181
8182   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
8183
8184   if( pCur->eState==CURSOR_FAULT ){
8185     assert( pCur->skipNext!=SQLITE_OK );
8186     return pCur->skipNext;
8187   }
8188
8189   assert( cursorOwnsBtShared(pCur) );
8190   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
8191               && pBt->inTransaction==TRANS_WRITE
8192               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
8193   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8194
8195   /* Assert that the caller has been consistent. If this cursor was opened
8196   ** expecting an index b-tree, then the caller should be inserting blob
8197   ** keys with no associated data. If the cursor was opened expecting an
8198   ** intkey table, the caller should be inserting integer keys with a
8199   ** blob of associated data.  */
8200   assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
8201
8202   /* Save the positions of any other cursors open on this table.
8203   **
8204   ** In some cases, the call to btreeMoveto() below is a no-op. For
8205   ** example, when inserting data into a table with auto-generated integer
8206   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8207   ** integer key to use. It then calls this function to actually insert the
8208   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8209   ** that the cursor is already where it needs to be and returns without
8210   ** doing any work. To avoid thwarting these optimizations, it is important
8211   ** not to clear the cursor here.
8212   */
8213   if( pCur->curFlags & BTCF_Multiple ){
8214     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8215     if( rc ) return rc;
8216   }
8217
8218   if( pCur->pKeyInfo==0 ){
8219     assert( pX->pKey==0 );
8220     /* If this is an insert into a table b-tree, invalidate any incrblob
8221     ** cursors open on the row being replaced */
8222     invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
8223
8224     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8225     ** to a row with the same key as the new entry being inserted.  */
8226     assert( (flags & BTREE_SAVEPOSITION)==0 ||
8227             ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );
8228
8229     /* If the cursor is currently on the last row and we are appending a
8230     ** new row onto the end, set the "loc" to avoid an unnecessary
8231     ** btreeMoveto() call */
8232     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8233       loc = 0;
8234     }else if( loc==0 ){
8235       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
8236       if( rc ) return rc;
8237     }
8238   }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8239     if( pX->nMem ){
8240       UnpackedRecord r;
8241       r.pKeyInfo = pCur->pKeyInfo;
8242       r.aMem = pX->aMem;
8243       r.nField = pX->nMem;
8244       r.default_rc = 0;
8245       r.errCode = 0;
8246       r.r1 = 0;
8247       r.r2 = 0;
8248       r.eqSeen = 0;
8249       rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
8250     }else{
8251       rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
8252     }
8253     if( rc ) return rc;
8254   }
8255   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
8256
8257   pPage = pCur->pPage;
8258   assert( pPage->intKey || pX->nKey>=0 );
8259   assert( pPage->leaf || !pPage->intKey );
8260
8261   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8262           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8263           loc==0 ? "overwrite" : "new entry"));
8264   assert( pPage->isInit );
8265   newCell = pBt->pTmpSpace;
8266   assert( newCell!=0 );
8267   rc = fillInCell(pPage, newCell, pX, &szNew);
8268   if( rc ) goto end_insert;
8269   assert( szNew==pPage->xCellSize(pPage, newCell) );
8270   assert( szNew <= MX_CELL_SIZE(pBt) );
8271   idx = pCur->ix;
8272   if( loc==0 ){
8273     CellInfo info;
8274     assert( idx<pPage->nCell );
8275     rc = sqlite3PagerWrite(pPage->pDbPage);
8276     if( rc ){
8277       goto end_insert;
8278     }
8279     oldCell = findCell(pPage, idx);
8280     if( !pPage->leaf ){
8281       memcpy(newCell, oldCell, 4);
8282     }
8283     rc = clearCell(pPage, oldCell, &info);
8284     if( info.nSize==szNew && info.nLocal==info.nPayload
8285      && (!ISAUTOVACUUM || szNew<pPage->minLocal)
8286     ){
8287       /* Overwrite the old cell with the new if they are the same size.
8288       ** We could also try to do this if the old cell is smaller, then add
8289       ** the leftover space to the free list.  But experiments show that
8290       ** doing that is no faster then skipping this optimization and just
8291       ** calling dropCell() and insertCell().
8292       **
8293       ** This optimization cannot be used on an autovacuum database if the
8294       ** new entry uses overflow pages, as the insertCell() call below is
8295       ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
8296       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
8297       if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
8298       memcpy(oldCell, newCell, szNew);
8299       return SQLITE_OK;
8300     }
8301     dropCell(pPage, idx, info.nSize, &rc);
8302     if( rc ) goto end_insert;
8303   }else if( loc<0 && pPage->nCell>0 ){
8304     assert( pPage->leaf );
8305     idx = ++pCur->ix;
8306     pCur->curFlags &= ~BTCF_ValidNKey;
8307   }else{
8308     assert( pPage->leaf );
8309   }
8310   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
8311   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
8312   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
8313
8314   /* If no error has occurred and pPage has an overflow cell, call balance()
8315   ** to redistribute the cells within the tree. Since balance() may move
8316   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
8317   ** variables.
8318   **
8319   ** Previous versions of SQLite called moveToRoot() to move the cursor
8320   ** back to the root page as balance() used to invalidate the contents
8321   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
8322   ** set the cursor state to "invalid". This makes common insert operations
8323   ** slightly faster.
8324   **
8325   ** There is a subtle but important optimization here too. When inserting
8326   ** multiple records into an intkey b-tree using a single cursor (as can
8327   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8328   ** is advantageous to leave the cursor pointing to the last entry in
8329   ** the b-tree if possible. If the cursor is left pointing to the last
8330   ** entry in the table, and the next row inserted has an integer key
8331   ** larger than the largest existing key, it is possible to insert the
8332   ** row without seeking the cursor. This can be a big performance boost.
8333   */
8334   pCur->info.nSize = 0;
8335   if( pPage->nOverflow ){
8336     assert( rc==SQLITE_OK );
8337     pCur->curFlags &= ~(BTCF_ValidNKey);
8338     rc = balance(pCur);
8339
8340     /* Must make sure nOverflow is reset to zero even if the balance()
8341     ** fails. Internal data structure corruption will result otherwise.
8342     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8343     ** from trying to save the current position of the cursor.  */
8344     pCur->pPage->nOverflow = 0;
8345     pCur->eState = CURSOR_INVALID;
8346     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
8347       btreeReleaseAllCursorPages(pCur);
8348       if( pCur->pKeyInfo ){
8349         assert( pCur->pKey==0 );
8350         pCur->pKey = sqlite3Malloc( pX->nKey );
8351         if( pCur->pKey==0 ){
8352           rc = SQLITE_NOMEM;
8353         }else{
8354           memcpy(pCur->pKey, pX->pKey, pX->nKey);
8355         }
8356       }
8357       pCur->eState = CURSOR_REQUIRESEEK;
8358       pCur->nKey = pX->nKey;
8359     }
8360   }
8361   assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
8362
8363 end_insert:
8364   return rc;
8365 }
8366
8367 /*
8368 ** Delete the entry that the cursor is pointing to.
8369 **
8370 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
8371 ** the cursor is left pointing at an arbitrary location after the delete.
8372 ** But if that bit is set, then the cursor is left in a state such that
8373 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
8374 ** as it would have been on if the call to BtreeDelete() had been omitted.
8375 **
8376 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
8377 ** associated with a single table entry and its indexes.  Only one of those
8378 ** deletes is considered the "primary" delete.  The primary delete occurs
8379 ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
8380 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
8381 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
8382 ** but which might be used by alternative storage engines.
8383 */
8384 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
8385   Btree *p = pCur->pBtree;
8386   BtShared *pBt = p->pBt;
8387   int rc;                              /* Return code */
8388   MemPage *pPage;                      /* Page to delete cell from */
8389   unsigned char *pCell;                /* Pointer to cell to delete */
8390   int iCellIdx;                        /* Index of cell to delete */
8391   int iCellDepth;                      /* Depth of node containing pCell */
8392   CellInfo info;                       /* Size of the cell being deleted */
8393   int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
8394   u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
8395
8396   assert( cursorOwnsBtShared(pCur) );
8397   assert( pBt->inTransaction==TRANS_WRITE );
8398   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8399   assert( pCur->curFlags & BTCF_WriteFlag );
8400   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8401   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
8402   assert( pCur->ix<pCur->pPage->nCell );
8403   assert( pCur->eState==CURSOR_VALID );
8404   assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
8405
8406   iCellDepth = pCur->iPage;
8407   iCellIdx = pCur->ix;
8408   pPage = pCur->pPage;
8409   pCell = findCell(pPage, iCellIdx);
8410
8411   /* If the bPreserve flag is set to true, then the cursor position must
8412   ** be preserved following this delete operation. If the current delete
8413   ** will cause a b-tree rebalance, then this is done by saving the cursor
8414   ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
8415   ** returning.
8416   **
8417   ** Or, if the current delete will not cause a rebalance, then the cursor
8418   ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
8419   ** before or after the deleted entry. In this case set bSkipnext to true.  */
8420   if( bPreserve ){
8421     if( !pPage->leaf
8422      || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
8423     ){
8424       /* A b-tree rebalance will be required after deleting this entry.
8425       ** Save the cursor key.  */
8426       rc = saveCursorKey(pCur);
8427       if( rc ) return rc;
8428     }else{
8429       bSkipnext = 1;
8430     }
8431   }
8432
8433   /* If the page containing the entry to delete is not a leaf page, move
8434   ** the cursor to the largest entry in the tree that is smaller than
8435   ** the entry being deleted. This cell will replace the cell being deleted
8436   ** from the internal node. The 'previous' entry is used for this instead
8437   ** of the 'next' entry, as the previous entry is always a part of the
8438   ** sub-tree headed by the child page of the cell being deleted. This makes
8439   ** balancing the tree following the delete operation easier.  */
8440   if( !pPage->leaf ){
8441     rc = sqlite3BtreePrevious(pCur, 0);
8442     assert( rc!=SQLITE_DONE );
8443     if( rc ) return rc;
8444   }
8445
8446   /* Save the positions of any other cursors open on this table before
8447   ** making any modifications.  */
8448   if( pCur->curFlags & BTCF_Multiple ){
8449     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8450     if( rc ) return rc;
8451   }
8452
8453   /* If this is a delete operation to remove a row from a table b-tree,
8454   ** invalidate any incrblob cursors open on the row being deleted.  */
8455   if( pCur->pKeyInfo==0 ){
8456     invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
8457   }
8458
8459   /* Make the page containing the entry to be deleted writable. Then free any
8460   ** overflow pages associated with the entry and finally remove the cell
8461   ** itself from within the page.  */
8462   rc = sqlite3PagerWrite(pPage->pDbPage);
8463   if( rc ) return rc;
8464   rc = clearCell(pPage, pCell, &info);
8465   dropCell(pPage, iCellIdx, info.nSize, &rc);
8466   if( rc ) return rc;
8467
8468   /* If the cell deleted was not located on a leaf page, then the cursor
8469   ** is currently pointing to the largest entry in the sub-tree headed
8470   ** by the child-page of the cell that was just deleted from an internal
8471   ** node. The cell from the leaf node needs to be moved to the internal
8472   ** node to replace the deleted cell.  */
8473   if( !pPage->leaf ){
8474     MemPage *pLeaf = pCur->pPage;
8475     int nCell;
8476     Pgno n;
8477     unsigned char *pTmp;
8478
8479     if( iCellDepth<pCur->iPage-1 ){
8480       n = pCur->apPage[iCellDepth+1]->pgno;
8481     }else{
8482       n = pCur->pPage->pgno;
8483     }
8484     pCell = findCell(pLeaf, pLeaf->nCell-1);
8485     if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
8486     nCell = pLeaf->xCellSize(pLeaf, pCell);
8487     assert( MX_CELL_SIZE(pBt) >= nCell );
8488     pTmp = pBt->pTmpSpace;
8489     assert( pTmp!=0 );
8490     rc = sqlite3PagerWrite(pLeaf->pDbPage);
8491     if( rc==SQLITE_OK ){
8492       insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
8493     }
8494     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
8495     if( rc ) return rc;
8496   }
8497
8498   /* Balance the tree. If the entry deleted was located on a leaf page,
8499   ** then the cursor still points to that page. In this case the first
8500   ** call to balance() repairs the tree, and the if(...) condition is
8501   ** never true.
8502   **
8503   ** Otherwise, if the entry deleted was on an internal node page, then
8504   ** pCur is pointing to the leaf page from which a cell was removed to
8505   ** replace the cell deleted from the internal node. This is slightly
8506   ** tricky as the leaf node may be underfull, and the internal node may
8507   ** be either under or overfull. In this case run the balancing algorithm
8508   ** on the leaf node first. If the balance proceeds far enough up the
8509   ** tree that we can be sure that any problem in the internal node has
8510   ** been corrected, so be it. Otherwise, after balancing the leaf node,
8511   ** walk the cursor up the tree to the internal node and balance it as
8512   ** well.  */
8513   rc = balance(pCur);
8514   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
8515     releasePageNotNull(pCur->pPage);
8516     pCur->iPage--;
8517     while( pCur->iPage>iCellDepth ){
8518       releasePage(pCur->apPage[pCur->iPage--]);
8519     }
8520     pCur->pPage = pCur->apPage[pCur->iPage];
8521     rc = balance(pCur);
8522   }
8523
8524   if( rc==SQLITE_OK ){
8525     if( bSkipnext ){
8526       assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
8527       assert( pPage==pCur->pPage || CORRUPT_DB );
8528       assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
8529       pCur->eState = CURSOR_SKIPNEXT;
8530       if( iCellIdx>=pPage->nCell ){
8531         pCur->skipNext = -1;
8532         pCur->ix = pPage->nCell-1;
8533       }else{
8534         pCur->skipNext = 1;
8535       }
8536     }else{
8537       rc = moveToRoot(pCur);
8538       if( bPreserve ){
8539         btreeReleaseAllCursorPages(pCur);
8540         pCur->eState = CURSOR_REQUIRESEEK;
8541       }
8542       if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
8543     }
8544   }
8545   return rc;
8546 }
8547
8548 /*
8549 ** Create a new BTree table.  Write into *piTable the page
8550 ** number for the root page of the new table.
8551 **
8552 ** The type of type is determined by the flags parameter.  Only the
8553 ** following values of flags are currently in use.  Other values for
8554 ** flags might not work:
8555 **
8556 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
8557 **     BTREE_ZERODATA                  Used for SQL indices
8558 */
8559 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
8560   BtShared *pBt = p->pBt;
8561   MemPage *pRoot;
8562   Pgno pgnoRoot;
8563   int rc;
8564   int ptfFlags;          /* Page-type flage for the root page of new table */
8565
8566   assert( sqlite3BtreeHoldsMutex(p) );
8567   assert( pBt->inTransaction==TRANS_WRITE );
8568   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8569
8570 #ifdef SQLITE_OMIT_AUTOVACUUM
8571   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8572   if( rc ){
8573     return rc;
8574   }
8575 #else
8576   if( pBt->autoVacuum ){
8577     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
8578     MemPage *pPageMove; /* The page to move to. */
8579
8580     /* Creating a new table may probably require moving an existing database
8581     ** to make room for the new tables root page. In case this page turns
8582     ** out to be an overflow page, delete all overflow page-map caches
8583     ** held by open cursors.
8584     */
8585     invalidateAllOverflowCache(pBt);
8586
8587     /* Read the value of meta[3] from the database to determine where the
8588     ** root page of the new table should go. meta[3] is the largest root-page
8589     ** created so far, so the new root-page is (meta[3]+1).
8590     */
8591     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
8592     pgnoRoot++;
8593
8594     /* The new root-page may not be allocated on a pointer-map page, or the
8595     ** PENDING_BYTE page.
8596     */
8597     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
8598         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
8599       pgnoRoot++;
8600     }
8601     assert( pgnoRoot>=3 || CORRUPT_DB );
8602     testcase( pgnoRoot<3 );
8603
8604     /* Allocate a page. The page that currently resides at pgnoRoot will
8605     ** be moved to the allocated page (unless the allocated page happens
8606     ** to reside at pgnoRoot).
8607     */
8608     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
8609     if( rc!=SQLITE_OK ){
8610       return rc;
8611     }
8612
8613     if( pgnoMove!=pgnoRoot ){
8614       /* pgnoRoot is the page that will be used for the root-page of
8615       ** the new table (assuming an error did not occur). But we were
8616       ** allocated pgnoMove. If required (i.e. if it was not allocated
8617       ** by extending the file), the current page at position pgnoMove
8618       ** is already journaled.
8619       */
8620       u8 eType = 0;
8621       Pgno iPtrPage = 0;
8622
8623       /* Save the positions of any open cursors. This is required in
8624       ** case they are holding a reference to an xFetch reference
8625       ** corresponding to page pgnoRoot.  */
8626       rc = saveAllCursors(pBt, 0, 0);
8627       releasePage(pPageMove);
8628       if( rc!=SQLITE_OK ){
8629         return rc;
8630       }
8631
8632       /* Move the page currently at pgnoRoot to pgnoMove. */
8633       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8634       if( rc!=SQLITE_OK ){
8635         return rc;
8636       }
8637       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
8638       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
8639         rc = SQLITE_CORRUPT_BKPT;
8640       }
8641       if( rc!=SQLITE_OK ){
8642         releasePage(pRoot);
8643         return rc;
8644       }
8645       assert( eType!=PTRMAP_ROOTPAGE );
8646       assert( eType!=PTRMAP_FREEPAGE );
8647       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
8648       releasePage(pRoot);
8649
8650       /* Obtain the page at pgnoRoot */
8651       if( rc!=SQLITE_OK ){
8652         return rc;
8653       }
8654       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8655       if( rc!=SQLITE_OK ){
8656         return rc;
8657       }
8658       rc = sqlite3PagerWrite(pRoot->pDbPage);
8659       if( rc!=SQLITE_OK ){
8660         releasePage(pRoot);
8661         return rc;
8662       }
8663     }else{
8664       pRoot = pPageMove;
8665     }
8666
8667     /* Update the pointer-map and meta-data with the new root-page number. */
8668     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
8669     if( rc ){
8670       releasePage(pRoot);
8671       return rc;
8672     }
8673
8674     /* When the new root page was allocated, page 1 was made writable in
8675     ** order either to increase the database filesize, or to decrement the
8676     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
8677     */
8678     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
8679     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
8680     if( NEVER(rc) ){
8681       releasePage(pRoot);
8682       return rc;
8683     }
8684
8685   }else{
8686     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8687     if( rc ) return rc;
8688   }
8689 #endif
8690   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8691   if( createTabFlags & BTREE_INTKEY ){
8692     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
8693   }else{
8694     ptfFlags = PTF_ZERODATA | PTF_LEAF;
8695   }
8696   zeroPage(pRoot, ptfFlags);
8697   sqlite3PagerUnref(pRoot->pDbPage);
8698   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
8699   *piTable = (int)pgnoRoot;
8700   return SQLITE_OK;
8701 }
8702 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
8703   int rc;
8704   sqlite3BtreeEnter(p);
8705   rc = btreeCreateTable(p, piTable, flags);
8706   sqlite3BtreeLeave(p);
8707   return rc;
8708 }
8709
8710 /*
8711 ** Erase the given database page and all its children.  Return
8712 ** the page to the freelist.
8713 */
8714 static int clearDatabasePage(
8715   BtShared *pBt,           /* The BTree that contains the table */
8716   Pgno pgno,               /* Page number to clear */
8717   int freePageFlag,        /* Deallocate page if true */
8718   int *pnChange            /* Add number of Cells freed to this counter */
8719 ){
8720   MemPage *pPage;
8721   int rc;
8722   unsigned char *pCell;
8723   int i;
8724   int hdr;
8725   CellInfo info;
8726
8727   assert( sqlite3_mutex_held(pBt->mutex) );
8728   if( pgno>btreePagecount(pBt) ){
8729     return SQLITE_CORRUPT_BKPT;
8730   }
8731   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
8732   if( rc ) return rc;
8733   if( pPage->bBusy ){
8734     rc = SQLITE_CORRUPT_BKPT;
8735     goto cleardatabasepage_out;
8736   }
8737   pPage->bBusy = 1;
8738   hdr = pPage->hdrOffset;
8739   for(i=0; i<pPage->nCell; i++){
8740     pCell = findCell(pPage, i);
8741     if( !pPage->leaf ){
8742       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
8743       if( rc ) goto cleardatabasepage_out;
8744     }
8745     rc = clearCell(pPage, pCell, &info);
8746     if( rc ) goto cleardatabasepage_out;
8747   }
8748   if( !pPage->leaf ){
8749     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
8750     if( rc ) goto cleardatabasepage_out;
8751   }else if( pnChange ){
8752     assert( pPage->intKey || CORRUPT_DB );
8753     testcase( !pPage->intKey );
8754     *pnChange += pPage->nCell;
8755   }
8756   if( freePageFlag ){
8757     freePage(pPage, &rc);
8758   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
8759     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
8760   }
8761
8762 cleardatabasepage_out:
8763   pPage->bBusy = 0;
8764   releasePage(pPage);
8765   return rc;
8766 }
8767
8768 /*
8769 ** Delete all information from a single table in the database.  iTable is
8770 ** the page number of the root of the table.  After this routine returns,
8771 ** the root page is empty, but still exists.
8772 **
8773 ** This routine will fail with SQLITE_LOCKED if there are any open
8774 ** read cursors on the table.  Open write cursors are moved to the
8775 ** root of the table.
8776 **
8777 ** If pnChange is not NULL, then table iTable must be an intkey table. The
8778 ** integer value pointed to by pnChange is incremented by the number of
8779 ** entries in the table.
8780 */
8781 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
8782   int rc;
8783   BtShared *pBt = p->pBt;
8784   sqlite3BtreeEnter(p);
8785   assert( p->inTrans==TRANS_WRITE );
8786
8787   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
8788
8789   if( SQLITE_OK==rc ){
8790     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
8791     ** is the root of a table b-tree - if it is not, the following call is
8792     ** a no-op).  */
8793     invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
8794     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
8795   }
8796   sqlite3BtreeLeave(p);
8797   return rc;
8798 }
8799
8800 /*
8801 ** Delete all information from the single table that pCur is open on.
8802 **
8803 ** This routine only work for pCur on an ephemeral table.
8804 */
8805 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
8806   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
8807 }
8808
8809 /*
8810 ** Erase all information in a table and add the root of the table to
8811 ** the freelist.  Except, the root of the principle table (the one on
8812 ** page 1) is never added to the freelist.
8813 **
8814 ** This routine will fail with SQLITE_LOCKED if there are any open
8815 ** cursors on the table.
8816 **
8817 ** If AUTOVACUUM is enabled and the page at iTable is not the last
8818 ** root page in the database file, then the last root page
8819 ** in the database file is moved into the slot formerly occupied by
8820 ** iTable and that last slot formerly occupied by the last root page
8821 ** is added to the freelist instead of iTable.  In this say, all
8822 ** root pages are kept at the beginning of the database file, which
8823 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
8824 ** page number that used to be the last root page in the file before
8825 ** the move.  If no page gets moved, *piMoved is set to 0.
8826 ** The last root page is recorded in meta[3] and the value of
8827 ** meta[3] is updated by this procedure.
8828 */
8829 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
8830   int rc;
8831   MemPage *pPage = 0;
8832   BtShared *pBt = p->pBt;
8833
8834   assert( sqlite3BtreeHoldsMutex(p) );
8835   assert( p->inTrans==TRANS_WRITE );
8836   assert( iTable>=2 );
8837
8838   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
8839   if( rc ) return rc;
8840   rc = sqlite3BtreeClearTable(p, iTable, 0);
8841   if( rc ){
8842     releasePage(pPage);
8843     return rc;
8844   }
8845
8846   *piMoved = 0;
8847
8848 #ifdef SQLITE_OMIT_AUTOVACUUM
8849   freePage(pPage, &rc);
8850   releasePage(pPage);
8851 #else
8852   if( pBt->autoVacuum ){
8853     Pgno maxRootPgno;
8854     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
8855
8856     if( iTable==maxRootPgno ){
8857       /* If the table being dropped is the table with the largest root-page
8858       ** number in the database, put the root page on the free list.
8859       */
8860       freePage(pPage, &rc);
8861       releasePage(pPage);
8862       if( rc!=SQLITE_OK ){
8863         return rc;
8864       }
8865     }else{
8866       /* The table being dropped does not have the largest root-page
8867       ** number in the database. So move the page that does into the
8868       ** gap left by the deleted root-page.
8869       */
8870       MemPage *pMove;
8871       releasePage(pPage);
8872       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8873       if( rc!=SQLITE_OK ){
8874         return rc;
8875       }
8876       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
8877       releasePage(pMove);
8878       if( rc!=SQLITE_OK ){
8879         return rc;
8880       }
8881       pMove = 0;
8882       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8883       freePage(pMove, &rc);
8884       releasePage(pMove);
8885       if( rc!=SQLITE_OK ){
8886         return rc;
8887       }
8888       *piMoved = maxRootPgno;
8889     }
8890
8891     /* Set the new 'max-root-page' value in the database header. This
8892     ** is the old value less one, less one more if that happens to
8893     ** be a root-page number, less one again if that is the
8894     ** PENDING_BYTE_PAGE.
8895     */
8896     maxRootPgno--;
8897     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
8898            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
8899       maxRootPgno--;
8900     }
8901     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
8902
8903     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
8904   }else{
8905     freePage(pPage, &rc);
8906     releasePage(pPage);
8907   }
8908 #endif
8909   return rc;
8910 }
8911 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
8912   int rc;
8913   sqlite3BtreeEnter(p);
8914   rc = btreeDropTable(p, iTable, piMoved);
8915   sqlite3BtreeLeave(p);
8916   return rc;
8917 }
8918
8919
8920 /*
8921 ** This function may only be called if the b-tree connection already
8922 ** has a read or write transaction open on the database.
8923 **
8924 ** Read the meta-information out of a database file.  Meta[0]
8925 ** is the number of free pages currently in the database.  Meta[1]
8926 ** through meta[15] are available for use by higher layers.  Meta[0]
8927 ** is read-only, the others are read/write.
8928 **
8929 ** The schema layer numbers meta values differently.  At the schema
8930 ** layer (and the SetCookie and ReadCookie opcodes) the number of
8931 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
8932 **
8933 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
8934 ** of reading the value out of the header, it instead loads the "DataVersion"
8935 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
8936 ** database file.  It is a number computed by the pager.  But its access
8937 ** pattern is the same as header meta values, and so it is convenient to
8938 ** read it from this routine.
8939 */
8940 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
8941   BtShared *pBt = p->pBt;
8942
8943   sqlite3BtreeEnter(p);
8944   assert( p->inTrans>TRANS_NONE );
8945   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
8946   assert( pBt->pPage1 );
8947   assert( idx>=0 && idx<=15 );
8948
8949   if( idx==BTREE_DATA_VERSION ){
8950     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
8951   }else{
8952     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
8953   }
8954
8955   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
8956   ** database, mark the database as read-only.  */
8957 #ifdef SQLITE_OMIT_AUTOVACUUM
8958   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
8959     pBt->btsFlags |= BTS_READ_ONLY;
8960   }
8961 #endif
8962
8963   sqlite3BtreeLeave(p);
8964 }
8965
8966 /*
8967 ** Write meta-information back into the database.  Meta[0] is
8968 ** read-only and may not be written.
8969 */
8970 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
8971   BtShared *pBt = p->pBt;
8972   unsigned char *pP1;
8973   int rc;
8974   assert( idx>=1 && idx<=15 );
8975   sqlite3BtreeEnter(p);
8976   assert( p->inTrans==TRANS_WRITE );
8977   assert( pBt->pPage1!=0 );
8978   pP1 = pBt->pPage1->aData;
8979   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8980   if( rc==SQLITE_OK ){
8981     put4byte(&pP1[36 + idx*4], iMeta);
8982 #ifndef SQLITE_OMIT_AUTOVACUUM
8983     if( idx==BTREE_INCR_VACUUM ){
8984       assert( pBt->autoVacuum || iMeta==0 );
8985       assert( iMeta==0 || iMeta==1 );
8986       pBt->incrVacuum = (u8)iMeta;
8987     }
8988 #endif
8989   }
8990   sqlite3BtreeLeave(p);
8991   return rc;
8992 }
8993
8994 #ifndef SQLITE_OMIT_BTREECOUNT
8995 /*
8996 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
8997 ** number of entries in the b-tree and write the result to *pnEntry.
8998 **
8999 ** SQLITE_OK is returned if the operation is successfully executed.
9000 ** Otherwise, if an error is encountered (i.e. an IO error or database
9001 ** corruption) an SQLite error code is returned.
9002 */
9003 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
9004   i64 nEntry = 0;                      /* Value to return in *pnEntry */
9005   int rc;                              /* Return code */
9006
9007   rc = moveToRoot(pCur);
9008   if( rc==SQLITE_EMPTY ){
9009     *pnEntry = 0;
9010     return SQLITE_OK;
9011   }
9012
9013   /* Unless an error occurs, the following loop runs one iteration for each
9014   ** page in the B-Tree structure (not including overflow pages).
9015   */
9016   while( rc==SQLITE_OK ){
9017     int iIdx;                          /* Index of child node in parent */
9018     MemPage *pPage;                    /* Current page of the b-tree */
9019
9020     /* If this is a leaf page or the tree is not an int-key tree, then
9021     ** this page contains countable entries. Increment the entry counter
9022     ** accordingly.
9023     */
9024     pPage = pCur->pPage;
9025     if( pPage->leaf || !pPage->intKey ){
9026       nEntry += pPage->nCell;
9027     }
9028
9029     /* pPage is a leaf node. This loop navigates the cursor so that it
9030     ** points to the first interior cell that it points to the parent of
9031     ** the next page in the tree that has not yet been visited. The
9032     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
9033     ** of the page, or to the number of cells in the page if the next page
9034     ** to visit is the right-child of its parent.
9035     **
9036     ** If all pages in the tree have been visited, return SQLITE_OK to the
9037     ** caller.
9038     */
9039     if( pPage->leaf ){
9040       do {
9041         if( pCur->iPage==0 ){
9042           /* All pages of the b-tree have been visited. Return successfully. */
9043           *pnEntry = nEntry;
9044           return moveToRoot(pCur);
9045         }
9046         moveToParent(pCur);
9047       }while ( pCur->ix>=pCur->pPage->nCell );
9048
9049       pCur->ix++;
9050       pPage = pCur->pPage;
9051     }
9052
9053     /* Descend to the child node of the cell that the cursor currently
9054     ** points at. This is the right-child if (iIdx==pPage->nCell).
9055     */
9056     iIdx = pCur->ix;
9057     if( iIdx==pPage->nCell ){
9058       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
9059     }else{
9060       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
9061     }
9062   }
9063
9064   /* An error has occurred. Return an error code. */
9065   return rc;
9066 }
9067 #endif
9068
9069 /*
9070 ** Return the pager associated with a BTree.  This routine is used for
9071 ** testing and debugging only.
9072 */
9073 Pager *sqlite3BtreePager(Btree *p){
9074   return p->pBt->pPager;
9075 }
9076
9077 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9078 /*
9079 ** Append a message to the error message string.
9080 */
9081 static void checkAppendMsg(
9082   IntegrityCk *pCheck,
9083   const char *zFormat,
9084   ...
9085 ){
9086   va_list ap;
9087   if( !pCheck->mxErr ) return;
9088   pCheck->mxErr--;
9089   pCheck->nErr++;
9090   va_start(ap, zFormat);
9091   if( pCheck->errMsg.nChar ){
9092     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
9093   }
9094   if( pCheck->zPfx ){
9095     sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
9096   }
9097   sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
9098   va_end(ap);
9099   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
9100     pCheck->mallocFailed = 1;
9101   }
9102 }
9103 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9104
9105 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9106
9107 /*
9108 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
9109 ** corresponds to page iPg is already set.
9110 */
9111 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9112   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9113   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
9114 }
9115
9116 /*
9117 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
9118 */
9119 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9120   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9121   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
9122 }
9123
9124
9125 /*
9126 ** Add 1 to the reference count for page iPage.  If this is the second
9127 ** reference to the page, add an error message to pCheck->zErrMsg.
9128 ** Return 1 if there are 2 or more references to the page and 0 if
9129 ** if this is the first reference to the page.
9130 **
9131 ** Also check that the page number is in bounds.
9132 */
9133 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
9134   if( iPage==0 ) return 1;
9135   if( iPage>pCheck->nPage ){
9136     checkAppendMsg(pCheck, "invalid page number %d", iPage);
9137     return 1;
9138   }
9139   if( getPageReferenced(pCheck, iPage) ){
9140     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
9141     return 1;
9142   }
9143   setPageReferenced(pCheck, iPage);
9144   return 0;
9145 }
9146
9147 #ifndef SQLITE_OMIT_AUTOVACUUM
9148 /*
9149 ** Check that the entry in the pointer-map for page iChild maps to
9150 ** page iParent, pointer type ptrType. If not, append an error message
9151 ** to pCheck.
9152 */
9153 static void checkPtrmap(
9154   IntegrityCk *pCheck,   /* Integrity check context */
9155   Pgno iChild,           /* Child page number */
9156   u8 eType,              /* Expected pointer map type */
9157   Pgno iParent           /* Expected pointer map parent page number */
9158 ){
9159   int rc;
9160   u8 ePtrmapType;
9161   Pgno iPtrmapParent;
9162
9163   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
9164   if( rc!=SQLITE_OK ){
9165     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
9166     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
9167     return;
9168   }
9169
9170   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
9171     checkAppendMsg(pCheck,
9172       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
9173       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
9174   }
9175 }
9176 #endif
9177
9178 /*
9179 ** Check the integrity of the freelist or of an overflow page list.
9180 ** Verify that the number of pages on the list is N.
9181 */
9182 static void checkList(
9183   IntegrityCk *pCheck,  /* Integrity checking context */
9184   int isFreeList,       /* True for a freelist.  False for overflow page list */
9185   int iPage,            /* Page number for first page in the list */
9186   int N                 /* Expected number of pages in the list */
9187 ){
9188   int i;
9189   int expected = N;
9190   int iFirst = iPage;
9191   while( N-- > 0 && pCheck->mxErr ){
9192     DbPage *pOvflPage;
9193     unsigned char *pOvflData;
9194     if( iPage<1 ){
9195       checkAppendMsg(pCheck,
9196          "%d of %d pages missing from overflow list starting at %d",
9197           N+1, expected, iFirst);
9198       break;
9199     }
9200     if( checkRef(pCheck, iPage) ) break;
9201     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
9202       checkAppendMsg(pCheck, "failed to get page %d", iPage);
9203       break;
9204     }
9205     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
9206     if( isFreeList ){
9207       int n = get4byte(&pOvflData[4]);
9208 #ifndef SQLITE_OMIT_AUTOVACUUM
9209       if( pCheck->pBt->autoVacuum ){
9210         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
9211       }
9212 #endif
9213       if( n>(int)pCheck->pBt->usableSize/4-2 ){
9214         checkAppendMsg(pCheck,
9215            "freelist leaf count too big on page %d", iPage);
9216         N--;
9217       }else{
9218         for(i=0; i<n; i++){
9219           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9220 #ifndef SQLITE_OMIT_AUTOVACUUM
9221           if( pCheck->pBt->autoVacuum ){
9222             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9223           }
9224 #endif
9225           checkRef(pCheck, iFreePage);
9226         }
9227         N -= n;
9228       }
9229     }
9230 #ifndef SQLITE_OMIT_AUTOVACUUM
9231     else{
9232       /* If this database supports auto-vacuum and iPage is not the last
9233       ** page in this overflow list, check that the pointer-map entry for
9234       ** the following page matches iPage.
9235       */
9236       if( pCheck->pBt->autoVacuum && N>0 ){
9237         i = get4byte(pOvflData);
9238         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9239       }
9240     }
9241 #endif
9242     iPage = get4byte(pOvflData);
9243     sqlite3PagerUnref(pOvflPage);
9244
9245     if( isFreeList && N<(iPage!=0) ){
9246       checkAppendMsg(pCheck, "free-page count in header is too small");
9247     }
9248   }
9249 }
9250 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9251
9252 /*
9253 ** An implementation of a min-heap.
9254 **
9255 ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
9256 ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
9257 ** and aHeap[N*2+1].
9258 **
9259 ** The heap property is this:  Every node is less than or equal to both
9260 ** of its daughter nodes.  A consequence of the heap property is that the
9261 ** root node aHeap[1] is always the minimum value currently in the heap.
9262 **
9263 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
9264 ** the heap, preserving the heap property.  The btreeHeapPull() routine
9265 ** removes the root element from the heap (the minimum value in the heap)
9266 ** and then moves other nodes around as necessary to preserve the heap
9267 ** property.
9268 **
9269 ** This heap is used for cell overlap and coverage testing.  Each u32
9270 ** entry represents the span of a cell or freeblock on a btree page.
9271 ** The upper 16 bits are the index of the first byte of a range and the
9272 ** lower 16 bits are the index of the last byte of that range.
9273 */
9274 static void btreeHeapInsert(u32 *aHeap, u32 x){
9275   u32 j, i = ++aHeap[0];
9276   aHeap[i] = x;
9277   while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
9278     x = aHeap[j];
9279     aHeap[j] = aHeap[i];
9280     aHeap[i] = x;
9281     i = j;
9282   }
9283 }
9284 static int btreeHeapPull(u32 *aHeap, u32 *pOut){
9285   u32 j, i, x;
9286   if( (x = aHeap[0])==0 ) return 0;
9287   *pOut = aHeap[1];
9288   aHeap[1] = aHeap[x];
9289   aHeap[x] = 0xffffffff;
9290   aHeap[0]--;
9291   i = 1;
9292   while( (j = i*2)<=aHeap[0] ){
9293     if( aHeap[j]>aHeap[j+1] ) j++;
9294     if( aHeap[i]<aHeap[j] ) break;
9295     x = aHeap[i];
9296     aHeap[i] = aHeap[j];
9297     aHeap[j] = x;
9298     i = j;
9299   }
9300   return 1;
9301 }
9302
9303 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9304 /*
9305 ** Do various sanity checks on a single page of a tree.  Return
9306 ** the tree depth.  Root pages return 0.  Parents of root pages
9307 ** return 1, and so forth.
9308 **
9309 ** These checks are done:
9310 **
9311 **      1.  Make sure that cells and freeblocks do not overlap
9312 **          but combine to completely cover the page.
9313 **      2.  Make sure integer cell keys are in order.
9314 **      3.  Check the integrity of overflow pages.
9315 **      4.  Recursively call checkTreePage on all children.
9316 **      5.  Verify that the depth of all children is the same.
9317 */
9318 static int checkTreePage(
9319   IntegrityCk *pCheck,  /* Context for the sanity check */
9320   int iPage,            /* Page number of the page to check */
9321   i64 *piMinKey,        /* Write minimum integer primary key here */
9322   i64 maxKey            /* Error if integer primary key greater than this */
9323 ){
9324   MemPage *pPage = 0;      /* The page being analyzed */
9325   int i;                   /* Loop counter */
9326   int rc;                  /* Result code from subroutine call */
9327   int depth = -1, d2;      /* Depth of a subtree */
9328   int pgno;                /* Page number */
9329   int nFrag;               /* Number of fragmented bytes on the page */
9330   int hdr;                 /* Offset to the page header */
9331   int cellStart;           /* Offset to the start of the cell pointer array */
9332   int nCell;               /* Number of cells */
9333   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
9334   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
9335                            ** False if IPK must be strictly less than maxKey */
9336   u8 *data;                /* Page content */
9337   u8 *pCell;               /* Cell content */
9338   u8 *pCellIdx;            /* Next element of the cell pointer array */
9339   BtShared *pBt;           /* The BtShared object that owns pPage */
9340   u32 pc;                  /* Address of a cell */
9341   u32 usableSize;          /* Usable size of the page */
9342   u32 contentOffset;       /* Offset to the start of the cell content area */
9343   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
9344   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
9345   const char *saved_zPfx = pCheck->zPfx;
9346   int saved_v1 = pCheck->v1;
9347   int saved_v2 = pCheck->v2;
9348   u8 savedIsInit = 0;
9349
9350   /* Check that the page exists
9351   */
9352   pBt = pCheck->pBt;
9353   usableSize = pBt->usableSize;
9354   if( iPage==0 ) return 0;
9355   if( checkRef(pCheck, iPage) ) return 0;
9356   pCheck->zPfx = "Page %d: ";
9357   pCheck->v1 = iPage;
9358   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
9359     checkAppendMsg(pCheck,
9360        "unable to get the page. error code=%d", rc);
9361     goto end_of_check;
9362   }
9363
9364   /* Clear MemPage.isInit to make sure the corruption detection code in
9365   ** btreeInitPage() is executed.  */
9366   savedIsInit = pPage->isInit;
9367   pPage->isInit = 0;
9368   if( (rc = btreeInitPage(pPage))!=0 ){
9369     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
9370     checkAppendMsg(pCheck,
9371                    "btreeInitPage() returns error code %d", rc);
9372     goto end_of_check;
9373   }
9374   data = pPage->aData;
9375   hdr = pPage->hdrOffset;
9376
9377   /* Set up for cell analysis */
9378   pCheck->zPfx = "On tree page %d cell %d: ";
9379   contentOffset = get2byteNotZero(&data[hdr+5]);
9380   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
9381
9382   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
9383   ** number of cells on the page. */
9384   nCell = get2byte(&data[hdr+3]);
9385   assert( pPage->nCell==nCell );
9386
9387   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9388   ** immediately follows the b-tree page header. */
9389   cellStart = hdr + 12 - 4*pPage->leaf;
9390   assert( pPage->aCellIdx==&data[cellStart] );
9391   pCellIdx = &data[cellStart + 2*(nCell-1)];
9392
9393   if( !pPage->leaf ){
9394     /* Analyze the right-child page of internal pages */
9395     pgno = get4byte(&data[hdr+8]);
9396 #ifndef SQLITE_OMIT_AUTOVACUUM
9397     if( pBt->autoVacuum ){
9398       pCheck->zPfx = "On page %d at right child: ";
9399       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9400     }
9401 #endif
9402     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9403     keyCanBeEqual = 0;
9404   }else{
9405     /* For leaf pages, the coverage check will occur in the same loop
9406     ** as the other cell checks, so initialize the heap.  */
9407     heap = pCheck->heap;
9408     heap[0] = 0;
9409   }
9410
9411   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9412   ** integer offsets to the cell contents. */
9413   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9414     CellInfo info;
9415
9416     /* Check cell size */
9417     pCheck->v2 = i;
9418     assert( pCellIdx==&data[cellStart + i*2] );
9419     pc = get2byteAligned(pCellIdx);
9420     pCellIdx -= 2;
9421     if( pc<contentOffset || pc>usableSize-4 ){
9422       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9423                              pc, contentOffset, usableSize-4);
9424       doCoverageCheck = 0;
9425       continue;
9426     }
9427     pCell = &data[pc];
9428     pPage->xParseCell(pPage, pCell, &info);
9429     if( pc+info.nSize>usableSize ){
9430       checkAppendMsg(pCheck, "Extends off end of page");
9431       doCoverageCheck = 0;
9432       continue;
9433     }
9434
9435     /* Check for integer primary key out of range */
9436     if( pPage->intKey ){
9437       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
9438         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
9439       }
9440       maxKey = info.nKey;
9441       keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
9442     }
9443
9444     /* Check the content overflow list */
9445     if( info.nPayload>info.nLocal ){
9446       int nPage;       /* Number of pages on the overflow chain */
9447       Pgno pgnoOvfl;   /* First page of the overflow chain */
9448       assert( pc + info.nSize - 4 <= usableSize );
9449       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
9450       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
9451 #ifndef SQLITE_OMIT_AUTOVACUUM
9452       if( pBt->autoVacuum ){
9453         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
9454       }
9455 #endif
9456       checkList(pCheck, 0, pgnoOvfl, nPage);
9457     }
9458
9459     if( !pPage->leaf ){
9460       /* Check sanity of left child page for internal pages */
9461       pgno = get4byte(pCell);
9462 #ifndef SQLITE_OMIT_AUTOVACUUM
9463       if( pBt->autoVacuum ){
9464         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9465       }
9466 #endif
9467       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9468       keyCanBeEqual = 0;
9469       if( d2!=depth ){
9470         checkAppendMsg(pCheck, "Child page depth differs");
9471         depth = d2;
9472       }
9473     }else{
9474       /* Populate the coverage-checking heap for leaf pages */
9475       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
9476     }
9477   }
9478   *piMinKey = maxKey;
9479
9480   /* Check for complete coverage of the page
9481   */
9482   pCheck->zPfx = 0;
9483   if( doCoverageCheck && pCheck->mxErr>0 ){
9484     /* For leaf pages, the min-heap has already been initialized and the
9485     ** cells have already been inserted.  But for internal pages, that has
9486     ** not yet been done, so do it now */
9487     if( !pPage->leaf ){
9488       heap = pCheck->heap;
9489       heap[0] = 0;
9490       for(i=nCell-1; i>=0; i--){
9491         u32 size;
9492         pc = get2byteAligned(&data[cellStart+i*2]);
9493         size = pPage->xCellSize(pPage, &data[pc]);
9494         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
9495       }
9496     }
9497     /* Add the freeblocks to the min-heap
9498     **
9499     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
9500     ** is the offset of the first freeblock, or zero if there are no
9501     ** freeblocks on the page.
9502     */
9503     i = get2byte(&data[hdr+1]);
9504     while( i>0 ){
9505       int size, j;
9506       assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
9507       size = get2byte(&data[i+2]);
9508       assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
9509       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
9510       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
9511       ** big-endian integer which is the offset in the b-tree page of the next
9512       ** freeblock in the chain, or zero if the freeblock is the last on the
9513       ** chain. */
9514       j = get2byte(&data[i]);
9515       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
9516       ** increasing offset. */
9517       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
9518       assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
9519       i = j;
9520     }
9521     /* Analyze the min-heap looking for overlap between cells and/or
9522     ** freeblocks, and counting the number of untracked bytes in nFrag.
9523     **
9524     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
9525     ** There is an implied first entry the covers the page header, the cell
9526     ** pointer index, and the gap between the cell pointer index and the start
9527     ** of cell content.
9528     **
9529     ** The loop below pulls entries from the min-heap in order and compares
9530     ** the start_address against the previous end_address.  If there is an
9531     ** overlap, that means bytes are used multiple times.  If there is a gap,
9532     ** that gap is added to the fragmentation count.
9533     */
9534     nFrag = 0;
9535     prev = contentOffset - 1;   /* Implied first min-heap entry */
9536     while( btreeHeapPull(heap,&x) ){
9537       if( (prev&0xffff)>=(x>>16) ){
9538         checkAppendMsg(pCheck,
9539           "Multiple uses for byte %u of page %d", x>>16, iPage);
9540         break;
9541       }else{
9542         nFrag += (x>>16) - (prev&0xffff) - 1;
9543         prev = x;
9544       }
9545     }
9546     nFrag += usableSize - (prev&0xffff) - 1;
9547     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
9548     ** is stored in the fifth field of the b-tree page header.
9549     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
9550     ** number of fragmented free bytes within the cell content area.
9551     */
9552     if( heap[0]==0 && nFrag!=data[hdr+7] ){
9553       checkAppendMsg(pCheck,
9554           "Fragmentation of %d bytes reported as %d on page %d",
9555           nFrag, data[hdr+7], iPage);
9556     }
9557   }
9558
9559 end_of_check:
9560   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
9561   releasePage(pPage);
9562   pCheck->zPfx = saved_zPfx;
9563   pCheck->v1 = saved_v1;
9564   pCheck->v2 = saved_v2;
9565   return depth+1;
9566 }
9567 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9568
9569 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9570 /*
9571 ** This routine does a complete check of the given BTree file.  aRoot[] is
9572 ** an array of pages numbers were each page number is the root page of
9573 ** a table.  nRoot is the number of entries in aRoot.
9574 **
9575 ** A read-only or read-write transaction must be opened before calling
9576 ** this function.
9577 **
9578 ** Write the number of error seen in *pnErr.  Except for some memory
9579 ** allocation errors,  an error message held in memory obtained from
9580 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
9581 ** returned.  If a memory allocation error occurs, NULL is returned.
9582 */
9583 char *sqlite3BtreeIntegrityCheck(
9584   Btree *p,     /* The btree to be checked */
9585   int *aRoot,   /* An array of root pages numbers for individual trees */
9586   int nRoot,    /* Number of entries in aRoot[] */
9587   int mxErr,    /* Stop reporting errors after this many */
9588   int *pnErr    /* Write number of errors seen to this variable */
9589 ){
9590   Pgno i;
9591   IntegrityCk sCheck;
9592   BtShared *pBt = p->pBt;
9593   int savedDbFlags = pBt->db->flags;
9594   char zErr[100];
9595   VVA_ONLY( int nRef );
9596
9597   sqlite3BtreeEnter(p);
9598   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
9599   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
9600   assert( nRef>=0 );
9601   sCheck.pBt = pBt;
9602   sCheck.pPager = pBt->pPager;
9603   sCheck.nPage = btreePagecount(sCheck.pBt);
9604   sCheck.mxErr = mxErr;
9605   sCheck.nErr = 0;
9606   sCheck.mallocFailed = 0;
9607   sCheck.zPfx = 0;
9608   sCheck.v1 = 0;
9609   sCheck.v2 = 0;
9610   sCheck.aPgRef = 0;
9611   sCheck.heap = 0;
9612   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
9613   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
9614   if( sCheck.nPage==0 ){
9615     goto integrity_ck_cleanup;
9616   }
9617
9618   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
9619   if( !sCheck.aPgRef ){
9620     sCheck.mallocFailed = 1;
9621     goto integrity_ck_cleanup;
9622   }
9623   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
9624   if( sCheck.heap==0 ){
9625     sCheck.mallocFailed = 1;
9626     goto integrity_ck_cleanup;
9627   }
9628
9629   i = PENDING_BYTE_PAGE(pBt);
9630   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
9631
9632   /* Check the integrity of the freelist
9633   */
9634   sCheck.zPfx = "Main freelist: ";
9635   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
9636             get4byte(&pBt->pPage1->aData[36]));
9637   sCheck.zPfx = 0;
9638
9639   /* Check all the tables.
9640   */
9641   testcase( pBt->db->flags & SQLITE_CellSizeCk );
9642   pBt->db->flags &= ~SQLITE_CellSizeCk;
9643   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
9644     i64 notUsed;
9645     if( aRoot[i]==0 ) continue;
9646 #ifndef SQLITE_OMIT_AUTOVACUUM
9647     if( pBt->autoVacuum && aRoot[i]>1 ){
9648       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
9649     }
9650 #endif
9651     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
9652   }
9653   pBt->db->flags = savedDbFlags;
9654
9655   /* Make sure every page in the file is referenced
9656   */
9657   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
9658 #ifdef SQLITE_OMIT_AUTOVACUUM
9659     if( getPageReferenced(&sCheck, i)==0 ){
9660       checkAppendMsg(&sCheck, "Page %d is never used", i);
9661     }
9662 #else
9663     /* If the database supports auto-vacuum, make sure no tables contain
9664     ** references to pointer-map pages.
9665     */
9666     if( getPageReferenced(&sCheck, i)==0 &&
9667        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
9668       checkAppendMsg(&sCheck, "Page %d is never used", i);
9669     }
9670     if( getPageReferenced(&sCheck, i)!=0 &&
9671        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
9672       checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
9673     }
9674 #endif
9675   }
9676
9677   /* Clean  up and report errors.
9678   */
9679 integrity_ck_cleanup:
9680   sqlite3PageFree(sCheck.heap);
9681   sqlite3_free(sCheck.aPgRef);
9682   if( sCheck.mallocFailed ){
9683     sqlite3StrAccumReset(&sCheck.errMsg);
9684     sCheck.nErr++;
9685   }
9686   *pnErr = sCheck.nErr;
9687   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
9688   /* Make sure this analysis did not leave any unref() pages. */
9689   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
9690   sqlite3BtreeLeave(p);
9691   return sqlite3StrAccumFinish(&sCheck.errMsg);
9692 }
9693 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9694
9695 /*
9696 ** Return the full pathname of the underlying database file.  Return
9697 ** an empty string if the database is in-memory or a TEMP database.
9698 **
9699 ** The pager filename is invariant as long as the pager is
9700 ** open so it is safe to access without the BtShared mutex.
9701 */
9702 const char *sqlite3BtreeGetFilename(Btree *p){
9703   assert( p->pBt->pPager!=0 );
9704   return sqlite3PagerFilename(p->pBt->pPager, 1);
9705 }
9706
9707 /*
9708 ** Return the pathname of the journal file for this database. The return
9709 ** value of this routine is the same regardless of whether the journal file
9710 ** has been created or not.
9711 **
9712 ** The pager journal filename is invariant as long as the pager is
9713 ** open so it is safe to access without the BtShared mutex.
9714 */
9715 const char *sqlite3BtreeGetJournalname(Btree *p){
9716   assert( p->pBt->pPager!=0 );
9717   return sqlite3PagerJournalname(p->pBt->pPager);
9718 }
9719
9720 /*
9721 ** Return non-zero if a transaction is active.
9722 */
9723 int sqlite3BtreeIsInTrans(Btree *p){
9724   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
9725   return (p && (p->inTrans==TRANS_WRITE));
9726 }
9727
9728 #ifndef SQLITE_OMIT_WAL
9729 /*
9730 ** Run a checkpoint on the Btree passed as the first argument.
9731 **
9732 ** Return SQLITE_LOCKED if this or any other connection has an open
9733 ** transaction on the shared-cache the argument Btree is connected to.
9734 **
9735 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
9736 */
9737 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
9738   int rc = SQLITE_OK;
9739   if( p ){
9740     BtShared *pBt = p->pBt;
9741     sqlite3BtreeEnter(p);
9742     if( pBt->inTransaction!=TRANS_NONE ){
9743       rc = SQLITE_LOCKED;
9744     }else{
9745       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
9746     }
9747     sqlite3BtreeLeave(p);
9748   }
9749   return rc;
9750 }
9751 #endif
9752
9753 /*
9754 ** Return non-zero if a read (or write) transaction is active.
9755 */
9756 int sqlite3BtreeIsInReadTrans(Btree *p){
9757   assert( p );
9758   assert( sqlite3_mutex_held(p->db->mutex) );
9759   return p->inTrans!=TRANS_NONE;
9760 }
9761
9762 int sqlite3BtreeIsInBackup(Btree *p){
9763   assert( p );
9764   assert( sqlite3_mutex_held(p->db->mutex) );
9765   return p->nBackup!=0;
9766 }
9767
9768 /*
9769 ** This function returns a pointer to a blob of memory associated with
9770 ** a single shared-btree. The memory is used by client code for its own
9771 ** purposes (for example, to store a high-level schema associated with
9772 ** the shared-btree). The btree layer manages reference counting issues.
9773 **
9774 ** The first time this is called on a shared-btree, nBytes bytes of memory
9775 ** are allocated, zeroed, and returned to the caller. For each subsequent
9776 ** call the nBytes parameter is ignored and a pointer to the same blob
9777 ** of memory returned.
9778 **
9779 ** If the nBytes parameter is 0 and the blob of memory has not yet been
9780 ** allocated, a null pointer is returned. If the blob has already been
9781 ** allocated, it is returned as normal.
9782 **
9783 ** Just before the shared-btree is closed, the function passed as the
9784 ** xFree argument when the memory allocation was made is invoked on the
9785 ** blob of allocated memory. The xFree function should not call sqlite3_free()
9786 ** on the memory, the btree layer does that.
9787 */
9788 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
9789   BtShared *pBt = p->pBt;
9790   sqlite3BtreeEnter(p);
9791   if( !pBt->pSchema && nBytes ){
9792     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
9793     pBt->xFreeSchema = xFree;
9794   }
9795   sqlite3BtreeLeave(p);
9796   return pBt->pSchema;
9797 }
9798
9799 /*
9800 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
9801 ** btree as the argument handle holds an exclusive lock on the
9802 ** sqlite_master table. Otherwise SQLITE_OK.
9803 */
9804 int sqlite3BtreeSchemaLocked(Btree *p){
9805   int rc;
9806   assert( sqlite3_mutex_held(p->db->mutex) );
9807   sqlite3BtreeEnter(p);
9808   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
9809   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
9810   sqlite3BtreeLeave(p);
9811   return rc;
9812 }
9813
9814
9815 #ifndef SQLITE_OMIT_SHARED_CACHE
9816 /*
9817 ** Obtain a lock on the table whose root page is iTab.  The
9818 ** lock is a write lock if isWritelock is true or a read lock
9819 ** if it is false.
9820 */
9821 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
9822   int rc = SQLITE_OK;
9823   assert( p->inTrans!=TRANS_NONE );
9824   if( p->sharable ){
9825     u8 lockType = READ_LOCK + isWriteLock;
9826     assert( READ_LOCK+1==WRITE_LOCK );
9827     assert( isWriteLock==0 || isWriteLock==1 );
9828
9829     sqlite3BtreeEnter(p);
9830     rc = querySharedCacheTableLock(p, iTab, lockType);
9831     if( rc==SQLITE_OK ){
9832       rc = setSharedCacheTableLock(p, iTab, lockType);
9833     }
9834     sqlite3BtreeLeave(p);
9835   }
9836   return rc;
9837 }
9838 #endif
9839
9840 #ifndef SQLITE_OMIT_INCRBLOB
9841 /*
9842 ** Argument pCsr must be a cursor opened for writing on an
9843 ** INTKEY table currently pointing at a valid table entry.
9844 ** This function modifies the data stored as part of that entry.
9845 **
9846 ** Only the data content may only be modified, it is not possible to
9847 ** change the length of the data stored. If this function is called with
9848 ** parameters that attempt to write past the end of the existing data,
9849 ** no modifications are made and SQLITE_CORRUPT is returned.
9850 */
9851 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
9852   int rc;
9853   assert( cursorOwnsBtShared(pCsr) );
9854   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
9855   assert( pCsr->curFlags & BTCF_Incrblob );
9856
9857   rc = restoreCursorPosition(pCsr);
9858   if( rc!=SQLITE_OK ){
9859     return rc;
9860   }
9861   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
9862   if( pCsr->eState!=CURSOR_VALID ){
9863     return SQLITE_ABORT;
9864   }
9865
9866   /* Save the positions of all other cursors open on this table. This is
9867   ** required in case any of them are holding references to an xFetch
9868   ** version of the b-tree page modified by the accessPayload call below.
9869   **
9870   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
9871   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
9872   ** saveAllCursors can only return SQLITE_OK.
9873   */
9874   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
9875   assert( rc==SQLITE_OK );
9876
9877   /* Check some assumptions:
9878   **   (a) the cursor is open for writing,
9879   **   (b) there is a read/write transaction open,
9880   **   (c) the connection holds a write-lock on the table (if required),
9881   **   (d) there are no conflicting read-locks, and
9882   **   (e) the cursor points at a valid row of an intKey table.
9883   */
9884   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
9885     return SQLITE_READONLY;
9886   }
9887   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
9888               && pCsr->pBt->inTransaction==TRANS_WRITE );
9889   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
9890   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
9891   assert( pCsr->pPage->intKey );
9892
9893   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
9894 }
9895
9896 /*
9897 ** Mark this cursor as an incremental blob cursor.
9898 */
9899 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
9900   pCur->curFlags |= BTCF_Incrblob;
9901   pCur->pBtree->hasIncrblobCur = 1;
9902 }
9903 #endif
9904
9905 /*
9906 ** Set both the "read version" (single byte at byte offset 18) and
9907 ** "write version" (single byte at byte offset 19) fields in the database
9908 ** header to iVersion.
9909 */
9910 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
9911   BtShared *pBt = pBtree->pBt;
9912   int rc;                         /* Return code */
9913
9914   assert( iVersion==1 || iVersion==2 );
9915
9916   /* If setting the version fields to 1, do not automatically open the
9917   ** WAL connection, even if the version fields are currently set to 2.
9918   */
9919   pBt->btsFlags &= ~BTS_NO_WAL;
9920   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
9921
9922   rc = sqlite3BtreeBeginTrans(pBtree, 0);
9923   if( rc==SQLITE_OK ){
9924     u8 *aData = pBt->pPage1->aData;
9925     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
9926       rc = sqlite3BtreeBeginTrans(pBtree, 2);
9927       if( rc==SQLITE_OK ){
9928         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9929         if( rc==SQLITE_OK ){
9930           aData[18] = (u8)iVersion;
9931           aData[19] = (u8)iVersion;
9932         }
9933       }
9934     }
9935   }
9936
9937   pBt->btsFlags &= ~BTS_NO_WAL;
9938   return rc;
9939 }
9940
9941 /*
9942 ** Return true if the cursor has a hint specified.  This routine is
9943 ** only used from within assert() statements
9944 */
9945 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
9946   return (pCsr->hints & mask)!=0;
9947 }
9948
9949 /*
9950 ** Return true if the given Btree is read-only.
9951 */
9952 int sqlite3BtreeIsReadonly(Btree *p){
9953   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
9954 }
9955
9956 /*
9957 ** Return the size of the header added to each page by this module.
9958 */
9959 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
9960
9961 #if !defined(SQLITE_OMIT_SHARED_CACHE)
9962 /*
9963 ** Return true if the Btree passed as the only argument is sharable.
9964 */
9965 int sqlite3BtreeSharable(Btree *p){
9966   return p->sharable;
9967 }
9968
9969 /*
9970 ** Return the number of connections to the BtShared object accessed by
9971 ** the Btree handle passed as the only argument. For private caches
9972 ** this is always 1. For shared caches it may be 1 or greater.
9973 */
9974 int sqlite3BtreeConnectionCount(Btree *p){
9975   testcase( p->sharable );
9976   return p->pBt->nRef;
9977 }
9978 #endif