/* chert_table.cc: Btree implementation
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
 * Copyright 2008 Lemur Consulting Ltd
 * Copyright 2010 Richard Boulton
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "chert_table.h"

#include <xapian/error.h>

#include "safeerrno.h"

#include "errno_to_string.h"
#include "omassert.h"
#include "stringutils.h" // For STRINGIZE().

// Define to use "dangerous" mode - in this mode we write modified btree
// blocks back in place.  This is somewhat faster (especially once we're
// I/O bound) but the database can't be safely searched during indexing
// and if indexing is terminated uncleanly, the database may be corrupt.
//
// Despite the risks, it's useful for speeding up a full rebuild.
//
// FIXME: make this mode run-time selectable, and record that it is currently
// in use somewhere on disk, so readers can check and refuse to open the
// database.
//
// #define DANGEROUS

#include <sys/types.h>

#include <cstring>   /* for memmove */
#include <climits>   /* for CHAR_BIT */

#include "chert_btreebase.h"
#include "chert_cursor.h"

#include "filetests.h"
#include "io_utils.h"
#include "debuglog.h"
#include "pack.h"
#include "str.h"
#include "wordaccess.h"

#include <algorithm>  // for std::min()
#include <string>

using namespace std;
// Only try to compress tags longer than this many bytes.
const size_t COMPRESS_MIN = 4;

//#define BTREE_DEBUG_FULL 1
#undef BTREE_DEBUG_FULL

#ifdef BTREE_DEBUG_FULL
/*------debugging aids from here--------*/

static void print_key(const byte * p, int c, int j);
static void print_tag(const byte * p, int c, int j);

static void report_cursor(int N, Btree * B, Cursor * C)
{
    int i;
    printf("%d)\n", N);
    for (i = 0; i <= B->level; i++)
        printf("p=%d, c=%d, n=[%d], rewrite=%d\n",
               C[i].p, C[i].c, C[i].n, C[i].rewrite);
}

/*------to here--------*/
#endif /* BTREE_DEBUG_FULL */
static inline byte *zeroed_new(size_t size)
{
    byte *temp = new byte[size];
    memset(temp, 0, size);
    return temp;
}
/* A B-tree comprises (a) a base file, containing essential information (Block
   size, number of the B-tree root block etc), (b) a bitmap, the Nth bit of the
   bitmap being set if the Nth block of the B-tree file is in use, and (c) a
   file DB containing the B-tree proper. The DB file is divided into a sequence
   of equal sized blocks, numbered 0, 1, 2 ... some of which are free, some in
   use. Those in use are arranged in a tree.

   Each block, b, has a structure like this:

     R L M T D o1 o2 o3 ... oN <gap> [item] .. [item] .. [item] ...
     <---------- D ----------> <-M->

   And then,

   R = REVISION(b)   is the revision number the B-tree had when the block was
                     written into the DB file.
   L = GET_LEVEL(b)  is the level of the block, which is the number of levels
                     towards the root of the B-tree structure. So leaf blocks
                     have level 0 and the one root block has the highest level
                     equal to the number of levels in the B-tree.
   M = MAX_FREE(b)   is the size of the gap between the end of the directory and
                     the first item of data. (It is not necessarily the maximum
                     size among the bits of space that are free, but I can't
                     think of a better name.)
   T = TOTAL_FREE(b) is the total amount of free space left in b.
   D = DIR_END(b)    gives the offset to the end of the directory.

   o1, o2 ... oN are a directory of offsets to the N items held in the block.
   The items are key-tag pairs, and as they occur in the directory are ordered
   by the keys.

   An item has this form:

     I K key x C tag
       <--K-->
     <------I------>

   A long tag presented through the API is split up into C tags small enough to
   be accommodated in the blocks of the B-tree. The key is extended to include
   a counter, x, which runs from 1 to C. The key is preceded by a length, K,
   and the whole item with a length, I, as depicted above.

   Here are the corresponding definitions:
*/
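/* Illustrative example of the layout above: a leaf block holding three items
   has DIR_END(b) == DIR_START + 3 * D2, and the directory entries o1, o2, o3
   each hold the byte offset of one item within the block.  MAX_FREE(b) is just
   the contiguous gap between the directory and the lowest-placed item, while
   TOTAL_FREE(b) also counts any holes left between items by earlier updates
   (compact() merges those holes back into the gap).
*/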
/** Flip to sequential addition block-splitting after this number of observed
 * sequential additions (in negated form). */
#define SEQ_START_POINT (-10)

/* There are two bit maps in bit_map0 and bit_map. The nth bit of bitmap is 0
   if the nth block is free, otherwise 1. bit_map0 is the initial state of
   bitmap at the start of the current transaction.

   Note use of the limits.h values:
   UCHAR_MAX = 255, an unsigned with all bits set, and
   CHAR_BIT = 8, the number of bits per byte

   BYTE_PAIR_RANGE below is the smallest +ve number that can't be held in two
   bytes -- 64K effectively.
*/

#define BYTE_PAIR_RANGE (1 << 2 * CHAR_BIT)
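// With CHAR_BIT == 8, BYTE_PAIR_RANGE is 1 << 16 == 65536, i.e. the first
// value too big for the two byte fields used for things like directory
// offsets and item component counts.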
/// read_block(n, p) reads block n of the DB file to address p.
void
ChertTable::read_block(uint4 n, byte * p) const
{
    // Log the value of p, not the contents of the block it points to...
    LOGCALL_VOID(DB, "ChertTable::read_block", n | (void*)p);
    if (rare(handle == -2))
        ChertTable::throw_database_closed();

    /* Use the base bit_map_size not the bitmap's size, because
     * the latter is uninitialised in readonly mode.
     */
    Assert(n / CHAR_BIT < base.get_bit_map_size());

    io_read_block(handle, reinterpret_cast<char *>(p), block_size, n);

    int dir_end = DIR_END(p);
    if (rare(dir_end < DIR_START || unsigned(dir_end) > block_size)) {
        string msg("dir_end invalid in block ");
        msg += str(n);
        throw Xapian::DatabaseCorruptError(msg);
    }
}
/** write_block(n, p) writes block n in the DB file from address p.
 *  When writing we check to see if the DB file has already been
 *  modified. If not (so this is the first write) the old base is
 *  deleted. This prevents the possibility of it being opened
 *  subsequently as an invalid base.
 */
void
ChertTable::write_block(uint4 n, const byte * p) const
{
    LOGCALL_VOID(DB, "ChertTable::write_block", n | p);
    Assert(writable);
    /* Check that n is in range. */
    Assert(n / CHAR_BIT < base.get_bit_map_size());

    /* don't write to non-free */
    AssertParanoid(base.block_free_at_start(n));

    /* write revision is okay */
    AssertEqParanoid(REVISION(p), latest_revision_number + 1);

    if (both_bases) {
        // Delete the old base before modifying the database.
        //
        // If the file is on NFS, then io_unlink() may return false even if
        // the file was removed, so on balance throwing an exception in this
        // case is unhelpful, since we wanted the file gone anyway!  The
        // likely explanation is that somebody moved, deleted, or changed a
        // symlink to the database directory.
        (void)io_unlink(name + "base" + other_base_letter());
        both_bases = false;
        latest_revision_number = revision_number;
    }

    io_write_block(handle, reinterpret_cast<const char *>(p), block_size, n);
}
/* A note on cursors:

   Each B-tree level has a corresponding array element C[j] in a
   cursor, C. C[0] is the leaf (or data) level, and C[B->level] is the
   root block level. Within a level j,

     C[j].p  addresses the block
     C[j].c  is the offset into the directory entry in the block
     C[j].n  is the number of the block at C[j].p

   A look up in the B-tree causes navigation of the blocks starting
   from the root. In each block, p, we find an offset, c, to an item
   which gives the number, n, of the block for the next level. This
   leads to an array of values p,c,n which are held inside the cursor.

   Any Btree object B has a built-in cursor, at B->C. But other cursors may
   be created. If BC is a created cursor, BC->C is the cursor in the
   sense given above, and BC->B is the handle for the B-tree again.
*/
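/* For example, after a successful find() the built-in cursor has C[level].p
   pointing at the root block, C[0].p at the leaf block which holds (or would
   hold) the search key, and for each level j > 0, C[j].c gives the directory
   offset of the entry that was followed to reach the block at level j - 1.
*/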
void
ChertTable::set_overwritten() const
{
    LOGCALL_VOID(DB, "ChertTable::set_overwritten", NO_ARGS);
    // If we're writable, there shouldn't be another writer who could cause
    // overwritten to be flagged, so that's a DatabaseCorruptError.
    if (writable)
        throw Xapian::DatabaseCorruptError("Db block overwritten - are there multiple writers?");
    throw Xapian::DatabaseModifiedError("The revision being read has been discarded - you should call Xapian::Database::reopen() and retry the operation");
}
/* block_to_cursor(C, j, n) puts block n into position C[j] of cursor
   C, writing the block currently at C[j] back to disk if necessary.
   Note that

     C[j].rewrite

   is true iff C[j].n is different from block n in file DB. If it is
   false no rewriting is necessary.
*/
271 void
272 ChertTable::block_to_cursor(Cursor * C_, int j, uint4 n) const
274 LOGCALL_VOID(DB, "ChertTable::block_to_cursor", (void*)C_ | j | n);
275 if (n == C_[j].n) return;
276 byte * p = C_[j].p;
277 Assert(p);
279 // FIXME: only needs to be done in write mode
280 if (C_[j].rewrite) {
281 Assert(writable);
282 Assert(C == C_);
283 write_block(C_[j].n, p);
284 C_[j].rewrite = false;
287 // Check if the block is in the built-in cursor (potentially in
288 // modified form).
289 if (n == C[j].n) {
290 if (p != C[j].p)
291 memcpy(p, C[j].p, block_size);
292 } else {
293 read_block(n, p);
296 C_[j].n = n;
297 if (j < level) {
298 /* unsigned comparison */
299 if (rare(REVISION(p) > REVISION(C_[j + 1].p))) {
300 set_overwritten();
301 return;
305 if (rare(j != GET_LEVEL(p))) {
306 string msg = "Expected block ";
307 msg += str(n);
308 msg += " to be level ";
309 msg += str(j);
310 msg += ", not ";
311 msg += str(GET_LEVEL(p));
312 throw Xapian::DatabaseCorruptError(msg);
/** Btree::alter(); is called when the B-tree is to be altered.

    It causes new blocks to be forced for the current set of blocks in
    the cursor.

    The point is that if a block at level 0 is to be altered it may get
    a new number. Then the pointer to this block from level 1 will need
    changing. So the block at level 1 needs altering and may get a new
    block number. Then the pointer to this block from level 2 will need
    changing ... and so on back to the root.

    The clever bit here is spotting the cases when we can make an early
    exit from this process. If C[j].rewrite is true, C[j+k].rewrite
    will be true for k = 1,2 ... We have been through all this before,
    and there is no need to do it again. If C[j].n was free at the
    start of the transaction, we can copy it back to the same place
    without violating the integrity of the B-tree. We don't then need a
    new n and can return. The corresponding C[j].rewrite may be true or
    false in that case.
*/
337 void
338 ChertTable::alter()
340 LOGCALL_VOID(DB, "ChertTable::alter", NO_ARGS);
341 Assert(writable);
342 #ifdef DANGEROUS
343 C[0].rewrite = true;
344 #else
345 int j = 0;
346 byte * p = C[j].p;
347 while (true) {
348 if (C[j].rewrite) return; /* all new, so return */
349 C[j].rewrite = true;
351 uint4 n = C[j].n;
352 if (base.block_free_at_start(n)) {
353 Assert(REVISION(p) == latest_revision_number + 1);
354 return;
356 Assert(REVISION(p) < latest_revision_number + 1);
357 base.free_block(n);
358 n = base.next_free_block();
359 C[j].n = n;
360 SET_REVISION(p, latest_revision_number + 1);
362 if (j == level) return;
363 j++;
364 p = C[j].p;
365 Item_wr(p, C[j].c).set_block_given_by(n);
367 #endif
/** find_in_block(p, key, leaf, c) searches for the key in the block at p.

    leaf is true for a data block, and false for an index block (when the
    first key is dummy and never needs to be tested). What we get is the
    directory entry to the last key <= the key being searched for.

    The lookup is by binary chop, with i and j set to the left and
    right ends of the search area. In sequential addition, c will often
    be the answer, so we test the keys round c and move i and j towards
    c if possible.

    The returned value is < DIR_END(p). If leaf is false, the returned
    value is >= DIR_START; if leaf is true, it can also be == DIR_START - D2.
*/
386 ChertTable::find_in_block(const byte * p, Key key, bool leaf, int c)
388 LOGCALL_STATIC(DB, int, "ChertTable::find_in_block", (const void*)p | (const void *)key.get_address() | leaf | c);
389 // c should be odd (either -1, or an even offset from DIR_START).
390 Assert((c & 1) == 1);
391 int i = DIR_START;
392 if (leaf) i -= D2;
393 if (c != -1) {
394 AssertRel(i,<=,c);
396 int j = DIR_END(p);
398 if (c != -1) {
399 if (c < j && i < c && Item(p, c).key() <= key)
400 i = c;
401 c += D2;
402 if (c < j && i < c && key < Item(p, c).key())
403 j = c;
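    // Binary chop invariant: the key at offset i is <= the search key (with
    // i == DIR_START - D2 acting as a notional -infinity on leaf blocks), and
    // the key at offset j is greater (with j == DIR_END(p) acting as
    // +infinity), so the loop below converges on the last key <= the key
    // being searched for.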
406 while (j - i > D2) {
407 int k = i + ((j - i)/(D2 * 2))*D2; /* mid way */
408 if (key < Item(p, k).key()) j = k; else i = k;
410 if (leaf) {
411 AssertRel(DIR_START - D2,<=,i);
412 } else {
413 AssertRel(DIR_START,<=,i);
415 AssertRel(i,<,DIR_END(p));
416 RETURN(i);
/** find(C_) searches for the key of B->kt in the B-tree.

    Result is true if found, false otherwise. When false, the B-tree
    cursor is positioned at the last key in the B-tree <= the search
    key. Goes to first (null) item in B-tree when key length == 0.

    Note: The cursor can be left with C_[0].c == DIR_START - D2 if the
    requested key doesn't exist and is less than the smallest key in a
    leaf block, but after the dividing key. The caller needs to fix up
    C_[0].c in this case, either explicitly or by performing an
    operation which gives C_[0].c a valid value.
*/
432 bool
433 ChertTable::find(Cursor * C_) const
435 LOGCALL(DB, bool, "ChertTable::find", (void*)C_);
436 // Note: the parameter is needed when we're called by ChertCursor
437 const byte * p;
438 int c;
439 Key key = kt.key();
440 for (int j = level; j > 0; --j) {
441 p = C_[j].p;
442 c = find_in_block(p, key, false, C_[j].c);
443 #ifdef BTREE_DEBUG_FULL
444 printf("Block in ChertTable:find - code position 1");
445 report_block_full(j, C_[j].n, p);
446 #endif /* BTREE_DEBUG_FULL */
447 C_[j].c = c;
448 block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
450 p = C_[0].p;
451 c = find_in_block(p, key, true, C_[0].c);
452 #ifdef BTREE_DEBUG_FULL
453 printf("Block in ChertTable:find - code position 2");
454 report_block_full(0, C_[0].n, p);
455 #endif /* BTREE_DEBUG_FULL */
456 C_[0].c = c;
457 if (c < DIR_START) {
458 RETURN(false);
460 RETURN(Item(p, c).key() == key);
/** compact(p) compacts the block at p by shuffling all the items up to the end.

    MAX_FREE(p) is then maximized, and is equal to TOTAL_FREE(p).
*/
468 void
469 ChertTable::compact(byte * p)
471 LOGCALL_VOID(DB, "ChertTable::compact", (void*)p);
472 Assert(writable);
473 int e = block_size;
474 byte * b = buffer;
475 int dir_end = DIR_END(p);
476 for (int c = DIR_START; c < dir_end; c += D2) {
477 Item item(p, c);
478 int l = item.size();
479 e -= l;
480 memmove(b + e, item.get_address(), l);
481 setD(p, c, e); /* reform in b */
483 memmove(p + e, b + e, block_size - e); /* copy back */
484 e -= dir_end;
485 SET_TOTAL_FREE(p, e);
486 SET_MAX_FREE(p, e);
/** Btree needs to gain a new level to insert more items: so split root block
 *  and construct a new one.
 */
492 void
493 ChertTable::split_root(uint4 split_n)
495 LOGCALL_VOID(DB, "ChertTable::split_root", split_n);
496 /* gain a level */
497 ++level;
499 /* check level overflow - this isn't something that should ever happen
500 * but deserves more than an Assert()... */
501 if (level == BTREE_CURSOR_LEVELS) {
502 throw Xapian::DatabaseCorruptError("Btree has grown impossibly large (" STRINGIZE(BTREE_CURSOR_LEVELS) " levels)");
505 byte * q = zeroed_new(block_size);
506 C[level].p = q;
507 C[level].c = DIR_START;
508 C[level].n = base.next_free_block();
509 C[level].rewrite = true;
510 SET_REVISION(q, latest_revision_number + 1);
511 SET_LEVEL(q, level);
512 SET_DIR_END(q, DIR_START);
513 compact(q); /* to reset TOTAL_FREE, MAX_FREE */
515 /* form a null key in b with a pointer to the old root */
516 byte b[10]; /* 7 is exact */
517 Item_wr item(b);
518 item.form_null_key(split_n);
519 add_item(item, level);
/** enter_key(j, prevkey, newkey) is called after a block split.

    It enters in the block at level C[j] a separating key for the block
    at level C[j - 1]. The key itself is newkey. prevkey is the
    preceding key, and at level 1 newkey can be trimmed down to the
    first point of difference to prevkey for entry in C[j].

    This code looks longer than it really is. If j exceeds the number
    of B-tree levels the root block has split and we have to construct
    a new one, but this is a rare event.

    The key is constructed in b, with block number C[j - 1].n as tag,
    and this is added in with add_item. add_item may itself cause a
    block split, with a further call to enter_key. Hence the recursion.
*/
537 void
538 ChertTable::enter_key(int j, Key prevkey, Key newkey)
540 LOGCALL_VOID(DB, "ChertTable::enter_key", j | Literal("prevkey") | Literal("newkey"));
541 Assert(writable);
542 Assert(prevkey < newkey);
543 AssertRel(j,>=,1);
545 uint4 blocknumber = C[j - 1].n;
547 // FIXME update to use Key
548 // Keys are truncated here: but don't truncate the count at the end away.
549 const int newkey_len = newkey.length();
550 AssertRel(newkey_len,>,0);
551 int i;
553 if (j == 1) {
554 // Truncate the key to the minimal key which differs from prevkey,
555 // the preceding key in the block.
556 i = 0;
557 const int min_len = min(newkey_len, prevkey.length());
558 while (i < min_len && prevkey[i] == newkey[i]) {
559 i++;
562 // Want one byte of difference.
563 if (i < newkey_len) i++;
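        // e.g. if prevkey is "patch" and newkey is "python", the two differ at
        // the second byte, so i becomes 2 and only "py" needs to be stored as
        // the dividing key in the level 1 block.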
564 } else {
565 /* Can't truncate between branch levels, since the separated keys
566 * are in at the leaf level, and truncating again will change the
567 * branch point.
569 i = newkey_len;
572 byte b[UCHAR_MAX + 6];
573 Item_wr item(b);
574 Assert(i <= 256 - I2 - C2);
575 Assert(i <= (int)sizeof(b) - I2 - C2 - 4);
576 item.set_key_and_block(newkey, i, blocknumber);
578 // When j > 1 we can make the first key of block p null. This is probably
579 // worthwhile as it trades a small amount of CPU and RAM use for a small
580 // saving in disk use. Other redundant keys will still creep in though.
581 if (j > 1) {
582 byte * p = C[j - 1].p;
583 uint4 n = getint4(newkey.get_address(), newkey_len + K1 + C2);
584 int new_total_free = TOTAL_FREE(p) + newkey_len + C2;
585 // FIXME: incredibly icky going from key to item like this...
586 Item_wr(const_cast<byte*>(newkey.get_address()) - I2).form_null_key(n);
587 SET_TOTAL_FREE(p, new_total_free);
590 // The split block gets inserted into the parent after the pointer to the
591 // current child.
592 AssertEq(C[j].c, find_in_block(C[j].p, item.key(), false, C[j].c));
593 C[j].c += D2;
594 C[j].rewrite = true; /* a subtle point: this *is* required. */
595 add_item(item, j);
/** mid_point(p) finds the directory entry in c that determines the
    approximate mid point of the data in the block at p.
*/
603 ChertTable::mid_point(byte * p)
605 LOGCALL(DB, int, "ChertTable::mid_point", (void*)p);
606 int n = 0;
607 int dir_end = DIR_END(p);
608 int size = block_size - TOTAL_FREE(p) - dir_end;
609 for (int c = DIR_START; c < dir_end; c += D2) {
610 int l = Item(p, c).size();
611 n += 2 * l;
612 if (n >= size) {
613 if (l < n - size) RETURN(c);
614 RETURN(c + D2);
618 /* This shouldn't happen, as the sum of the item sizes should be the same
619 * as the value calculated in size, so assert but return a sane value just
620 * in case. */
621 Assert(false);
622 RETURN(dir_end);
/** add_item_to_block(p, kt_, c) adds item kt_ to the block at p.

    c is the offset in the directory that needs to be expanded to accommodate
    the new entry for the item. We know before this is called that there is
    enough contiguous room for the item in the block, so it's just a matter of
    shuffling up any directory entries after where we're inserting and copying
    in the item.
*/
634 void
635 ChertTable::add_item_to_block(byte * p, Item_wr kt_, int c)
637 LOGCALL_VOID(DB, "ChertTable::add_item_to_block", (void*)p | Literal("kt_") | c);
638 Assert(writable);
639 int dir_end = DIR_END(p);
640 int kt_len = kt_.size();
641 int needed = kt_len + D2;
642 int new_total = TOTAL_FREE(p) - needed;
643 int new_max = MAX_FREE(p) - needed;
645 Assert(new_total >= 0);
647 AssertRel(MAX_FREE(p),>=,needed);
649 AssertRel(DIR_START,<=,c);
650 AssertRel(c,<=,dir_end);
652 memmove(p + c + D2, p + c, dir_end - c);
653 dir_end += D2;
654 SET_DIR_END(p, dir_end);
656 int o = dir_end + new_max;
657 setD(p, c, o);
658 memmove(p + o, kt_.get_address(), kt_len);
660 SET_MAX_FREE(p, new_max);
662 SET_TOTAL_FREE(p, new_total);
/** ChertTable::add_item(kt_, j) adds item kt_ to the block at cursor level C[j].
 *
 *  If there is not enough room the block splits and the item is then
 *  added to the appropriate half.
 */
670 void
671 ChertTable::add_item(Item_wr kt_, int j)
673 LOGCALL_VOID(DB, "ChertTable::add_item", Literal("kt_") | j);
674 Assert(writable);
675 byte * p = C[j].p;
676 int c = C[j].c;
677 uint4 n;
679 int needed = kt_.size() + D2;
680 if (TOTAL_FREE(p) < needed) {
681 int m;
682 // Prepare to split p. After splitting, the block is in two halves, the
683 // lower half is split_p, the upper half p again. add_to_upper_half
684 // becomes true when the item gets added to p, false when it gets added
685 // to split_p.
687 if (seq_count < 0) {
688 // If we're not in sequential mode, we split at the mid point
689 // of the node.
690 m = mid_point(p);
691 } else {
692 // During sequential addition, split at the insert point
693 AssertRel(c,>=,DIR_START);
694 m = c;
697 uint4 split_n = C[j].n;
698 C[j].n = base.next_free_block();
700 memcpy(split_p, p, block_size); // replicate the whole block in split_p
701 SET_DIR_END(split_p, m);
702 compact(split_p); /* to reset TOTAL_FREE, MAX_FREE */
705 int residue = DIR_END(p) - m;
706 int new_dir_end = DIR_START + residue;
707 memmove(p + DIR_START, p + m, residue);
708 SET_DIR_END(p, new_dir_end);
711 compact(p); /* to reset TOTAL_FREE, MAX_FREE */
713 bool add_to_upper_half;
714 if (seq_count < 0) {
715 add_to_upper_half = (c >= m);
716 } else {
717 // And add item to lower half if split_p has room, otherwise upper
718 // half
719 add_to_upper_half = (TOTAL_FREE(split_p) < needed);
722 if (add_to_upper_half) {
723 c -= (m - DIR_START);
724 Assert(seq_count < 0 || c <= DIR_START + D2);
725 Assert(c >= DIR_START);
726 Assert(c <= DIR_END(p));
727 add_item_to_block(p, kt_, c);
728 n = C[j].n;
729 } else {
730 Assert(c >= DIR_START);
731 Assert(c <= DIR_END(split_p));
732 add_item_to_block(split_p, kt_, c);
733 n = split_n;
735 write_block(split_n, split_p);
737 // Check if we're splitting the root block.
738 if (j == level) split_root(split_n);
740 /* Enter a separating key at level j + 1 between */
741 /* the last key of block split_p, and the first key of block p */
742 enter_key(j + 1,
743 Item(split_p, DIR_END(split_p) - D2).key(),
744 Item(p, DIR_START).key());
745 } else {
746 AssertRel(TOTAL_FREE(p),>=,needed);
748 if (MAX_FREE(p) < needed) {
749 compact(p);
750 AssertRel(MAX_FREE(p),>=,needed);
753 add_item_to_block(p, kt_, c);
754 n = C[j].n;
756 if (j == 0) {
757 changed_n = n;
758 changed_c = c;
/** ChertTable::delete_item(j, repeatedly) is (almost) the converse of add_item.
 *
 *  If repeatedly is true, the process repeats at the next level when a
 *  block has been completely emptied, freeing the block and taking out
 *  the pointer to it. Emptied root blocks are also removed, which
 *  reduces the number of levels in the B-tree.
 */
769 void
770 ChertTable::delete_item(int j, bool repeatedly)
772 LOGCALL_VOID(DB, "ChertTable::delete_item", j | repeatedly);
773 Assert(writable);
774 byte * p = C[j].p;
775 int c = C[j].c;
776 AssertRel(DIR_START,<=,c);
777 AssertRel(c,<,DIR_END(p));
778 int kt_len = Item(p, c).size(); /* size of the item to be deleted */
779 int dir_end = DIR_END(p) - D2; /* directory length will go down by 2 bytes */
781 memmove(p + c, p + c + D2, dir_end - c);
782 SET_DIR_END(p, dir_end);
783 SET_MAX_FREE(p, MAX_FREE(p) + D2);
784 SET_TOTAL_FREE(p, TOTAL_FREE(p) + kt_len + D2);
786 if (!repeatedly) return;
787 if (j < level) {
788 if (dir_end == DIR_START) {
789 base.free_block(C[j].n);
790 C[j].rewrite = false;
791 C[j].n = BLK_UNUSED;
792 C[j + 1].rewrite = true; /* *is* necessary */
793 delete_item(j + 1, true);
795 } else {
796 Assert(j == level);
797 while (dir_end == DIR_START + D2 && level > 0) {
798 /* single item in the root block, so lose a level */
799 uint4 new_root = Item(p, DIR_START).block_given_by();
800 delete [] p;
801 C[level].p = 0;
802 base.free_block(C[level].n);
803 C[level].rewrite = false;
804 C[level].n = BLK_UNUSED;
805 level--;
807 block_to_cursor(C, level, new_root);
809 p = C[level].p;
810 dir_end = DIR_END(p); /* prepare for the loop */
/* debugging aid:
static addcount = 0;
*/
/** add_kt(found) adds the item (key-tag pair) at B->kt into the
    B-tree, using cursor C.

    found == find() is handed over as a parameter from Btree::add.
    Btree::alter() prepares for the alteration to the B-tree. Then
    there are a number of cases to consider:

    If an item with the same key is in the B-tree (found is true),
    the new kt replaces it.

    Then if kt is smaller than, or the same size as, the item it replaces,
    kt is put in the same place as the item it replaces, and the
    TOTAL_FREE measure is reduced.

    If kt is larger than the item it replaces it is put in the
    MAX_FREE space if there is room, and the directory entry and
    space counts are adjusted accordingly.

    - But if there is not room we do it the long way: the old item is
      deleted with delete_item and kt is added in with add_item.

    If the key of kt is not in the B-tree (found is false), the new
    kt is added in with add_item.
*/
845 ChertTable::add_kt(bool found)
847 LOGCALL(DB, int, "ChertTable::add_kt", found);
848 Assert(writable);
849 int components = 0;
853 printf("%d) %s ", addcount++, (found ? "replacing" : "adding"));
854 print_bytes(kt[I2] - K1 - C2, kt + I2 + K1); putchar('\n');
857 alter();
859 if (found) { /* replacement */
860 seq_count = SEQ_START_POINT;
861 sequential = false;
863 byte * p = C[0].p;
864 int c = C[0].c;
865 AssertRel(DIR_START,<=,c);
866 AssertRel(c,<,DIR_END(p));
867 Item item(p, c);
868 int kt_size = kt.size();
869 int needed = kt_size - item.size();
871 components = item.components_of();
873 if (needed <= 0) {
874 /* simple replacement */
875 memmove(const_cast<byte *>(item.get_address()),
876 kt.get_address(), kt_size);
877 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
878 } else {
879 /* new item into the block's freespace */
880 int new_max = MAX_FREE(p) - kt_size;
881 if (new_max >= 0) {
882 int o = DIR_END(p) + new_max;
883 memmove(p + o, kt.get_address(), kt_size);
884 setD(p, c, o);
885 SET_MAX_FREE(p, new_max);
886 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
887 } else {
888 /* do it the long way */
889 delete_item(0, false);
890 add_item(kt, 0);
893 } else {
894 /* addition */
895 if (changed_n == C[0].n && changed_c == C[0].c) {
896 if (seq_count < 0) seq_count++;
897 } else {
898 seq_count = SEQ_START_POINT;
899 sequential = false;
901 C[0].c += D2;
902 add_item(kt, 0);
904 RETURN(components);
/* delete_kt() corresponds to add_kt(found), but there are only
   two cases: if the key is not found nothing is done, and if it is
   found the corresponding item is deleted with delete_item.
*/
913 ChertTable::delete_kt()
915 LOGCALL(DB, int, "ChertTable::delete_kt", NO_ARGS);
916 Assert(writable);
918 bool found = find(C);
920 int components = 0;
921 seq_count = SEQ_START_POINT;
922 sequential = false;
926 printf("%d) %s ", addcount++, (found ? "deleting " : "ignoring "));
927 print_bytes(B->kt[I2] - K1 - C2, B->kt + I2 + K1); putchar('\n');
930 if (found) {
931 components = Item(C[0].p, C[0].c).components_of();
932 alter();
933 delete_item(0, true);
935 RETURN(components);
/* ChertTable::form_key(key) treats address kt as an item holder and fills in
   the key part:

           (I) K key c (C tag)

   The bracketed parts are left blank. The key is filled in with key_len bytes and
   K set accordingly. c is set to 1.
*/
947 void ChertTable::form_key(const string & key) const
949 LOGCALL_VOID(DB, "ChertTable::form_key", key);
950 kt.form_key(key);
/* ChertTable::add(key, tag) adds the key/tag item to the
   B-tree, replacing any existing item with the same key.

   For a long tag, we end up having to add m components, of the form

       key 1 m tag1
       key 2 m tag2
       ...
       key m m tagm

   and tag1+tag2+...+tagm are equal to tag. These in their turn may be replacing
   n components of the form

       key 1 n TAG1
       key 2 n TAG2
       ...
       key n n TAGn

   and n may be greater than, equal to, or less than m. These cases are dealt
   with in the code below. If m < n for example, we end up with a series of
   deletions.
*/
976 void
977 ChertTable::add(const string &key, string tag, bool already_compressed)
979 LOGCALL_VOID(DB, "ChertTable::add", key | tag | already_compressed);
980 Assert(writable);
982 if (handle < 0) create_and_open(block_size);
984 form_key(key);
986 bool compressed = false;
987 if (already_compressed) {
988 compressed = true;
989 } else if (compress_strategy != DONT_COMPRESS && tag.size() > COMPRESS_MIN) {
990 static_assert(DONT_COMPRESS != Z_DEFAULT_STRATEGY,
991 "DONT_COMPRESS clashes with zlib constant");
992 static_assert(DONT_COMPRESS != Z_FILTERED,
993 "DONT_COMPRESS clashes with zlib constant");
994 static_assert(DONT_COMPRESS != Z_HUFFMAN_ONLY,
995 "DONT_COMPRESS clashes with zlib constant");
996 #ifdef Z_RLE
997 static_assert(DONT_COMPRESS != Z_RLE,
998 "DONT_COMPRESS clashes with zlib constant");
999 #endif
1001 lazy_alloc_deflate_zstream();
1003 deflate_zstream->next_in =
1004 reinterpret_cast<Bytef *>(const_cast<char *>(tag.data()));
1005 deflate_zstream->avail_in = static_cast<uInt>(tag.size());
1007 // If compressed size is >= tag.size(), we don't want to compress.
1008 unsigned long blk_len = tag.size() - 1;
1009 unsigned char * blk = new unsigned char[blk_len];
1010 deflate_zstream->next_out = blk;
1011 deflate_zstream->avail_out = static_cast<uInt>(blk_len);
1013 int err = deflate(deflate_zstream, Z_FINISH);
1014 if (err == Z_STREAM_END) {
1015 // If deflate succeeded, then the output was at least one byte
1016 // smaller than the input.
1017 tag.assign(reinterpret_cast<const char *>(blk), deflate_zstream->total_out);
1018 compressed = true;
1019 } else {
1020 // Deflate failed - presumably the data wasn't compressible.
1023 delete [] blk;
1026 // sort of matching kt.append_chunk(), but setting the chunk
1027 const size_t cd = kt.key().length() + K1 + I2 + C2 + C2; // offset to the tag data
1028 const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
1029 size_t first_L = L; // - amount for tag1
1030 bool found = find(C);
1031 if (!found) {
1032 byte * p = C[0].p;
1033 size_t n = TOTAL_FREE(p) % (max_item_size + D2);
1034 if (n > D2 + cd) {
1035 n -= (D2 + cd);
    // if n >= last then fully filling this block won't produce
    // an extra item, so we might as well do this even if
    // full_compaction isn't active.

    // In the full_compaction case, it turns out we shouldn't always
    // try to fill every last byte.  Doing so can actually increase the
    // total space required (I believe this effect is due to longer
    // dividing keys being required in the index blocks).  Empirically,
    // n >= key.size() + K appears a good criterion for K ~= 34.  This
    // seems to save about 0.2% in total database size over always
    // splitting the tag.  It'll also give slightly faster retrieval
    // as we can avoid reading an extra block occasionally.
1048 size_t last = tag.length() % L;
1049 if (n >= last || (full_compaction && n >= key.size() + 34))
1050 first_L = n;
1054 // a null tag must be added in of course
1055 int m = tag.empty() ? 1 : (tag.length() - first_L + L - 1) / L + 1;
1056 // there are m items to add
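    // e.g. a 2500 byte tag with L == 1000 and first_L left at 1000 gives
    // m == (2500 - 1000 + 999) / 1000 + 1 == 3: chunks of 1000, 1000 and 500
    // bytes, each stored under the same key with component numbers 1, 2, 3.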
1057 /* FIXME: sort out this error higher up and turn this into
1058 * an assert.
1060 if (m >= BYTE_PAIR_RANGE)
1061 throw Xapian::UnimplementedError("Can't handle insanely large tags");
1063 int n = 0; // initialise to shut off warning
1064 // - and there will be n to delete
1065 int o = 0; // Offset into the tag
1066 size_t residue = tag.length(); // Bytes of the tag remaining to add in
1067 int replacement = false; // Has there been a replacement ?
1068 int i;
1069 kt.set_components_of(m);
1070 for (i = 1; i <= m; i++) {
1071 size_t l = (i == m ? residue : (i == 1 ? first_L : L));
1072 Assert(cd + l <= block_size);
1073 Assert(string::size_type(o + l) <= tag.length());
1074 kt.set_tag(cd, tag.data() + o, l, compressed);
1075 kt.set_component_of(i);
1077 o += l;
1078 residue -= l;
1080 if (i > 1) found = find(C);
1081 n = add_kt(found);
1082 if (n > 0) replacement = true;
1084 /* o == tag.length() here, and n may be zero */
1085 for (i = m + 1; i <= n; i++) {
1086 kt.set_component_of(i);
1087 delete_kt();
1089 if (!replacement) ++item_count;
1090 Btree_modified = true;
1091 if (cursor_created_since_last_modification) {
1092 cursor_created_since_last_modification = false;
1093 ++cursor_version;
/* ChertTable::del(key) returns false if the key is not in the B-tree,
   otherwise deletes it and returns true.

   Again, this is parallel to ChertTable::add, but simpler in form.
*/
1103 bool
1104 ChertTable::del(const string &key)
1106 LOGCALL(DB, bool, "ChertTable::del", key);
1107 Assert(writable);
1109 if (handle < 0) {
1110 if (handle == -2) {
1111 ChertTable::throw_database_closed();
1113 RETURN(false);
    // We can't delete a key which is too long for us to store.
1117 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1119 if (key.empty()) RETURN(false);
1120 form_key(key);
1122 int n = delete_kt(); /* there are n items to delete */
1123 if (n <= 0) RETURN(false);
1125 for (int i = 2; i <= n; i++) {
1126 kt.set_component_of(i);
1127 delete_kt();
1130 item_count--;
1131 Btree_modified = true;
1132 if (cursor_created_since_last_modification) {
1133 cursor_created_since_last_modification = false;
1134 ++cursor_version;
1136 RETURN(true);
1139 bool
1140 ChertTable::readahead_key(const string &key) const
1142 LOGCALL(DB, bool, "ChertTable::readahead_key", key);
1143 Assert(!key.empty());
1145 // Two cases:
1147 // handle = -1: Lazy table which isn't yet open
1149 // handle = -2: Table has been closed. Since the readahead is just a
1150 // hint, we can safely ignore it for a closed table.
1151 if (handle < 0)
1152 RETURN(false);
1154 // If the table only has one level, there are no branch blocks to preread.
1155 if (level == 0)
1156 RETURN(false);
1158 form_key(key);
1159 Key ktkey = kt.key();
1161 // We'll only readahead the first level, since descending the B-tree would
1162 // require actual reads that would likely hurt performance more than help.
1163 const byte * p = C[level].p;
1164 int c = find_in_block(p, ktkey, false, C[level].c);
1165 uint4 n = Item(p, c).block_given_by();
1166 // Don't preread if it's the block we last preread or already in the
1167 // cursor.
1168 if (n != last_readahead && n != C[level - 1].n) {
1169 /* Use the base bit_map_size not the bitmap's size, because the latter
1170 * is uninitialised in readonly mode.
1172 Assert(n / CHAR_BIT < base.get_bit_map_size());
1174 last_readahead = n;
1175 if (!io_readahead_block(handle, block_size, n))
1176 RETURN(false);
1178 RETURN(true);
1181 bool
1182 ChertTable::get_exact_entry(const string &key, string & tag) const
1184 LOGCALL(DB, bool, "ChertTable::get_exact_entry", key | tag);
1185 Assert(!key.empty());
1187 if (handle < 0) {
1188 if (handle == -2) {
1189 ChertTable::throw_database_closed();
1191 RETURN(false);
1194 // An oversized key can't exist, so attempting to search for it should fail.
1195 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1197 form_key(key);
1198 if (!find(C)) RETURN(false);
1200 (void)read_tag(C, &tag, false);
1201 RETURN(true);
1204 bool
1205 ChertTable::key_exists(const string &key) const
1207 LOGCALL(DB, bool, "ChertTable::key_exists", key);
1208 Assert(!key.empty());
1210 // An oversized key can't exist, so attempting to search for it should fail.
1211 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1213 form_key(key);
1214 RETURN(find(C));
1217 bool
1218 ChertTable::read_tag(Cursor * C_, string *tag, bool keep_compressed) const
1220 LOGCALL(DB, bool, "ChertTable::read_tag", Literal("C_") | tag | keep_compressed);
1221 Item item(C_[0].p, C_[0].c);
1223 /* n components to join */
1224 int n = item.components_of();
1226 tag->resize(0);
1227 // max_item_size also includes K1 + I2 + C2 + C2 bytes overhead and the key
1228 // (which is at least 1 byte long).
1229 if (n > 1) tag->reserve((max_item_size - (1 + K1 + I2 + C2 + C2)) * n);
1231 item.append_chunk(tag);
1232 bool compressed = item.get_compressed();
1234 for (int i = 2; i <= n; i++) {
1235 if (!next(C_, 0)) {
1236 throw Xapian::DatabaseCorruptError("Unexpected end of table when reading continuation of tag");
1238 (void)Item(C_[0].p, C_[0].c).append_chunk(tag);
1240 // At this point the cursor is on the last item - calling next will move
1241 // it to the next key (ChertCursor::get_tag() relies on this).
1242 if (!compressed || keep_compressed) RETURN(compressed);
1244 // FIXME: Perhaps we should decompress each chunk as we read it so we
1245 // don't need both the full compressed and uncompressed tags in memory
1246 // at once.
1248 string utag;
1249 // May not be enough for a compressed tag, but it's a reasonable guess.
1250 utag.reserve(tag->size() + tag->size() / 2);
1252 Bytef buf[8192];
1254 lazy_alloc_inflate_zstream();
1256 inflate_zstream->next_in =
1257 reinterpret_cast<Bytef*>(const_cast<char *>(tag->data()));
1258 inflate_zstream->avail_in = static_cast<uInt>(tag->size());
1260 int err = Z_OK;
1261 while (err != Z_STREAM_END) {
1262 inflate_zstream->next_out = buf;
1263 inflate_zstream->avail_out = static_cast<uInt>(sizeof(buf));
1264 err = inflate(inflate_zstream, Z_SYNC_FLUSH);
1265 if (err == Z_BUF_ERROR && inflate_zstream->avail_in == 0) {
1266 LOGLINE(DB, "Z_BUF_ERROR - faking checksum of " << inflate_zstream->adler);
1267 Bytef header2[4];
1268 aligned_write4(header2, inflate_zstream->adler);
1269 inflate_zstream->next_in = header2;
1270 inflate_zstream->avail_in = 4;
1271 err = inflate(inflate_zstream, Z_SYNC_FLUSH);
1272 if (err == Z_STREAM_END) break;
1275 if (err != Z_OK && err != Z_STREAM_END) {
1276 if (err == Z_MEM_ERROR) throw std::bad_alloc();
1277 string msg = "inflate failed";
1278 if (inflate_zstream->msg) {
1279 msg += " (";
1280 msg += inflate_zstream->msg;
1281 msg += ')';
1283 throw Xapian::DatabaseError(msg);
1286 utag.append(reinterpret_cast<const char *>(buf),
1287 inflate_zstream->next_out - buf);
1289 if (utag.size() != inflate_zstream->total_out) {
1290 string msg = "compressed tag didn't expand to the expected size: ";
1291 msg += str(utag.size());
1292 msg += " != ";
1293 // OpenBSD's zlib.h uses off_t instead of uLong for total_out.
1294 msg += str(size_t(inflate_zstream->total_out));
1295 throw Xapian::DatabaseCorruptError(msg);
1298 swap(*tag, utag);
1300 RETURN(false);
1303 void
1304 ChertTable::set_full_compaction(bool parity)
1306 LOGCALL_VOID(DB, "ChertTable::set_full_compaction", parity);
1307 Assert(writable);
1309 if (parity) seq_count = 0;
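    // Setting seq_count to 0 switches add_item() straight into its
    // sequential-addition splitting behaviour, which otherwise only kicks in
    // after enough consecutive sequential additions (see SEQ_START_POINT).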
1310 full_compaction = parity;
1313 ChertCursor * ChertTable::cursor_get() const {
1314 LOGCALL(DB, ChertCursor *, "ChertTable::cursor_get", NO_ARGS);
1315 if (handle < 0) {
1316 if (handle == -2) {
1317 ChertTable::throw_database_closed();
1319 RETURN(NULL);
1321 // FIXME Ick - casting away const is nasty
1322 RETURN(new ChertCursor(const_cast<ChertTable *>(this)));
/************ B-tree opening and closing ************/
1327 bool
1328 ChertTable::basic_open(bool revision_supplied, chert_revision_number_t revision_)
1330 LOGCALL(DB, bool, "ChertTable::basic_open", revision_supplied | revision_);
1331 int ch = 'X'; /* will be 'A' or 'B' */
1334 const size_t BTREE_BASES = 2;
1335 string err_msg;
1336 static const char basenames[BTREE_BASES] = { 'A', 'B' };
1338 ChertTable_base bases[BTREE_BASES];
1339 bool base_ok[BTREE_BASES];
1341 both_bases = true;
1342 bool valid_base = false;
1344 for (size_t i = 0; i < BTREE_BASES; ++i) {
1345 bool ok = bases[i].read(name, basenames[i], writable, err_msg);
1346 base_ok[i] = ok;
1347 if (ok) {
1348 valid_base = true;
1349 } else {
1350 both_bases = false;
1355 if (!valid_base) {
1356 if (handle >= 0) {
1357 ::close(handle);
1358 handle = -1;
1360 string message = "Error opening table '";
1361 message += name;
1362 message += "':\n";
1363 message += err_msg;
1364 throw Xapian::DatabaseOpeningError(message);
1367 if (revision_supplied) {
1368 bool found_revision = false;
1369 for (size_t i = 0; i < BTREE_BASES; ++i) {
1370 if (base_ok[i] && bases[i].get_revision() == revision_) {
1371 ch = basenames[i];
1372 found_revision = true;
1373 break;
1376 if (!found_revision) {
1377 /* Couldn't open the revision that was asked for.
1378 * This shouldn't throw an exception, but should just return
1379 * false to upper levels.
1381 RETURN(false);
1383 } else {
1384 chert_revision_number_t highest_revision = 0;
1385 for (size_t i = 0; i < BTREE_BASES; ++i) {
1386 if (base_ok[i] && bases[i].get_revision() >= highest_revision) {
1387 ch = basenames[i];
1388 highest_revision = bases[i].get_revision();
1393 ChertTable_base *basep = 0;
1394 ChertTable_base *other_base = 0;
1396 for (size_t i = 0; i < BTREE_BASES; ++i) {
1397 LOGLINE(DB, "Checking (ch == " << ch << ") against "
1398 "basenames[" << i << "] == " << basenames[i]);
1399 LOGLINE(DB, "bases[" << i << "].get_revision() == " <<
1400 bases[i].get_revision());
1401 LOGLINE(DB, "base_ok[" << i << "] == " << base_ok[i]);
1402 if (ch == basenames[i]) {
1403 basep = &bases[i];
1405 // FIXME: assuming only two bases for other_base
1406 size_t otherbase_num = 1 - i;
1407 if (base_ok[otherbase_num]) {
1408 other_base = &bases[otherbase_num];
1410 break;
1413 Assert(basep);
1415 /* basep now points to the most recent base block */
1417 /* Avoid copying the bitmap etc. - swap contents with the base
1418 * object in the vector, since it'll be destroyed anyway soon.
1420 base.swap(*basep);
1422 revision_number = base.get_revision();
1423 block_size = base.get_block_size();
1424 root = base.get_root();
1425 level = base.get_level();
1426 //bit_map_size = basep->get_bit_map_size();
1427 item_count = base.get_item_count();
1428 faked_root_block = base.get_have_fakeroot();
1429 sequential = base.get_sequential();
1431 if (other_base != 0) {
1432 latest_revision_number = other_base->get_revision();
1433 if (revision_number > latest_revision_number)
1434 latest_revision_number = revision_number;
1435 } else {
1436 latest_revision_number = revision_number;
1440 /* kt holds constructed items as well as keys */
1441 kt = Item_wr(zeroed_new(block_size));
1443 set_max_item_size(BLOCK_CAPACITY);
1445 base_letter = ch;
1447 if (cursor_created_since_last_modification) {
1448 cursor_created_since_last_modification = false;
1449 ++cursor_version;
1452 /* ready to open the main file */
1454 RETURN(true);
1457 void
1458 ChertTable::read_root()
1460 LOGCALL_VOID(DB, "ChertTable::read_root", NO_ARGS);
1461 if (faked_root_block) {
1462 /* root block for an unmodified database. */
1463 byte * p = C[0].p;
1464 Assert(p);
1466 /* clear block - shouldn't be necessary, but is a bit nicer,
1467 * and means that the same operations should always produce
1468 * the same database. */
1469 memset(p, 0, block_size);
1471 int o = block_size - I2 - K1 - C2 - C2;
1472 Item_wr(p + o).fake_root_item();
1474 setD(p, DIR_START, o); // its directory entry
1475 SET_DIR_END(p, DIR_START + D2);// the directory size
1477 o -= (DIR_START + D2);
1478 SET_MAX_FREE(p, o);
1479 SET_TOTAL_FREE(p, o);
1480 SET_LEVEL(p, 0);
1482 if (!writable) {
1483 /* reading - revision number doesn't matter as long as
1484 * it's not greater than the current one. */
1485 SET_REVISION(p, 0);
1486 C[0].n = 0;
1487 } else {
1488 /* writing - */
1489 SET_REVISION(p, latest_revision_number + 1);
1490 C[0].n = base.next_free_block();
1492 } else {
1493 /* using a root block stored on disk */
1494 block_to_cursor(C, level, root);
1496 if (REVISION(C[level].p) > revision_number) set_overwritten();
1497 /* although this is unlikely */
1501 bool
1502 ChertTable::do_open_to_write(bool revision_supplied,
1503 chert_revision_number_t revision_,
1504 bool create_db)
1506 LOGCALL(DB, bool, "ChertTable::do_open_to_write", revision_supplied | revision_ | create_db);
1507 if (handle == -2) {
1508 ChertTable::throw_database_closed();
1510 handle = io_open_block_wr(name + "DB", create_db);
1511 if (handle < 0) {
1512 // lazy doesn't make a lot of sense with create_db anyway, but ENOENT
1513 // with O_CREAT means a parent directory doesn't exist.
1514 if (lazy && !create_db && errno == ENOENT) {
1515 revision_number = revision_;
1516 RETURN(true);
1518 string message(create_db ? "Couldn't create " : "Couldn't open ");
1519 message += name;
1520 message += "DB read/write: ";
1521 errno_to_string(errno, message);
1522 throw Xapian::DatabaseOpeningError(message);
1525 if (!basic_open(revision_supplied, revision_)) {
1526 ::close(handle);
1527 handle = -1;
1528 if (!revision_supplied) {
1529 throw Xapian::DatabaseOpeningError("Failed to open for writing");
1531 /* When the revision is supplied, it's not an exceptional
1532 * case when open failed, so we just return false here.
1534 RETURN(false);
1537 writable = true;
1539 for (int j = 0; j <= level; j++) {
1540 C[j].n = BLK_UNUSED;
1541 C[j].p = new byte[block_size];
1543 split_p = new byte[block_size];
1544 read_root();
1546 buffer = zeroed_new(block_size);
1548 changed_n = 0;
1549 changed_c = DIR_START;
1550 seq_count = SEQ_START_POINT;
1552 RETURN(true);
1555 ChertTable::ChertTable(const char * tablename_, const string & path_,
1556 bool readonly_, int compress_strategy_, bool lazy_)
1557 : tablename(tablename_),
1558 revision_number(0),
1559 item_count(0),
1560 block_size(0),
1561 latest_revision_number(0),
1562 both_bases(false),
1563 base_letter('A'),
1564 faked_root_block(true),
1565 sequential(true),
1566 handle(-1),
1567 level(0),
1568 root(0),
1569 kt(0),
1570 buffer(0),
1571 base(),
1572 name(path_),
1573 seq_count(0),
1574 changed_n(0),
1575 changed_c(0),
1576 max_item_size(0),
1577 Btree_modified(false),
1578 full_compaction(false),
1579 writable(!readonly_),
1580 cursor_created_since_last_modification(false),
1581 cursor_version(0),
1582 split_p(0),
1583 compress_strategy(compress_strategy_),
1584 deflate_zstream(NULL),
1585 inflate_zstream(NULL),
1586 lazy(lazy_),
1587 last_readahead(BLK_UNUSED)
1589 LOGCALL_CTOR(DB, "ChertTable", tablename_ | path_ | readonly_ | compress_strategy_ | lazy_);
1592 bool
1593 ChertTable::really_empty() const
1595 if (handle < 0) {
1596 if (handle == -2) {
1597 ChertTable::throw_database_closed();
1599 return true;
1601 ChertCursor cur(const_cast<ChertTable*>(this));
1602 cur.find_entry(string());
1603 return !cur.next();
1606 void
1607 ChertTable::lazy_alloc_deflate_zstream() const {
1608 if (usual(deflate_zstream)) {
1609 if (usual(deflateReset(deflate_zstream) == Z_OK)) return;
1610 // Try to recover by deleting the stream and starting from scratch.
1611 delete deflate_zstream;
1614 deflate_zstream = new z_stream;
1616 deflate_zstream->zalloc = Z_NULL;
1617 deflate_zstream->zfree = Z_NULL;
1618 deflate_zstream->opaque = Z_NULL;
1620 // -15 means raw deflate with 32K LZ77 window (largest)
1621 // memLevel 9 is the highest (8 is default)
1622 int err;
1623 err = deflateInit2(deflate_zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
1624 -15, 9, compress_strategy);
1625 if (rare(err != Z_OK)) {
1626 if (err == Z_MEM_ERROR) {
1627 delete deflate_zstream;
1628 deflate_zstream = 0;
1629 throw std::bad_alloc();
1631 string msg = "deflateInit2 failed (";
1632 if (deflate_zstream->msg) {
1633 msg += deflate_zstream->msg;
1634 } else {
1635 msg += str(err);
1637 msg += ')';
1638 delete deflate_zstream;
1639 deflate_zstream = 0;
1640 throw Xapian::DatabaseError(msg);
1644 void
1645 ChertTable::lazy_alloc_inflate_zstream() const {
1646 if (usual(inflate_zstream)) {
1647 if (usual(inflateReset(inflate_zstream) == Z_OK)) return;
1648 // Try to recover by deleting the stream and starting from scratch.
1649 delete inflate_zstream;
1652 inflate_zstream = new z_stream;
1654 inflate_zstream->zalloc = Z_NULL;
1655 inflate_zstream->zfree = Z_NULL;
1656 inflate_zstream->opaque = Z_NULL;
1658 inflate_zstream->next_in = Z_NULL;
1659 inflate_zstream->avail_in = 0;
1661 int err = inflateInit2(inflate_zstream, -15);
1662 if (rare(err != Z_OK)) {
1663 if (err == Z_MEM_ERROR) {
1664 delete inflate_zstream;
1665 inflate_zstream = 0;
1666 throw std::bad_alloc();
1668 string msg = "inflateInit2 failed (";
1669 if (inflate_zstream->msg) {
1670 msg += inflate_zstream->msg;
1671 } else {
1672 msg += str(err);
1674 msg += ')';
1675 delete inflate_zstream;
1676 inflate_zstream = 0;
1677 throw Xapian::DatabaseError(msg);
1681 bool
1682 ChertTable::exists() const {
1683 LOGCALL(DB, bool, "ChertTable::exists", NO_ARGS);
1684 RETURN(file_exists(name + "DB") &&
1685 (file_exists(name + "baseA") || file_exists(name + "baseB")));
1688 void
1689 ChertTable::erase()
1691 LOGCALL_VOID(DB, "ChertTable::erase", NO_ARGS);
1692 close();
1694 (void)io_unlink(name + "baseA");
1695 (void)io_unlink(name + "baseB");
1696 (void)io_unlink(name + "DB");
1699 void
1700 ChertTable::set_block_size(unsigned int block_size_)
1702 LOGCALL_VOID(DB, "ChertTable::set_block_size", block_size_);
    // Block size must be in the range 2048..BYTE_PAIR_RANGE, and a power of two.
1704 if (block_size_ < 2048 || block_size_ > BYTE_PAIR_RANGE ||
1705 (block_size_ & (block_size_ - 1)) != 0) {
1706 block_size_ = CHERT_DEFAULT_BLOCK_SIZE;
1708 block_size = block_size_;
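    // Note that an out-of-range or non-power-of-two size is silently replaced
    // by the default rather than reported, so callers always end up with a
    // usable block size.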
1711 void
1712 ChertTable::create_and_open(unsigned int block_size_)
1714 LOGCALL_VOID(DB, "ChertTable::create_and_open", block_size_);
1715 if (handle == -2) {
1716 ChertTable::throw_database_closed();
1718 Assert(writable);
1719 close();
1721 set_block_size(block_size_);
1723 // FIXME: it would be good to arrange that this works such that there's
1724 // always a valid table in place if you run create_and_open() on an
1725 // existing table.
1727 /* write initial values to files */
1729 /* create the base file */
1730 ChertTable_base base_;
1731 base_.set_revision(revision_number);
1732 base_.set_block_size(block_size);
1733 base_.set_have_fakeroot(true);
1734 base_.set_sequential(true);
1735 // Doing a full sync here would be overly paranoid, as an empty table
1736 // contains no precious data and xapian-check can recreate lost base
1737 // files.
1738 base_.write_to_file(name + "baseA", 'A', string(), -1, NULL);
1740 /* remove the alternative base file, if any */
1741 (void)io_unlink(name + "baseB");
1743 // Any errors are thrown if revision_supplied is false.
1744 (void)do_open_to_write(false, 0, true);
1747 ChertTable::~ChertTable() {
1748 LOGCALL_DTOR(DB, "ChertTable");
1749 ChertTable::close();
1751 if (deflate_zstream) {
1752 // Errors which we care about have already been handled, so just ignore
1753 // any which get returned here.
1754 (void) deflateEnd(deflate_zstream);
1755 delete deflate_zstream;
1758 if (inflate_zstream) {
1759 // Errors which we care about have already been handled, so just ignore
1760 // any which get returned here.
1761 (void) inflateEnd(inflate_zstream);
1762 delete inflate_zstream;
1766 void ChertTable::close(bool permanent) {
1767 LOGCALL_VOID(DB, "ChertTable::close", permanent);
1769 if (handle >= 0) {
1770 // If an error occurs here, we just ignore it, since we're just
1771 // trying to free everything.
1772 (void)::close(handle);
1773 handle = -1;
1776 if (permanent) {
1777 handle = -2;
1778 // Don't delete the resources in the table, since they may
1779 // still be used to look up cached content.
1780 return;
1782 for (int j = level; j >= 0; j--) {
1783 delete [] C[j].p;
1784 C[j].p = 0;
1786 delete [] split_p;
1787 split_p = 0;
1789 delete [] kt.get_address();
1790 kt = 0;
1791 delete [] buffer;
1792 buffer = 0;
1795 void
1796 ChertTable::flush_db()
1798 LOGCALL_VOID(DB, "ChertTable::flush_db", NO_ARGS);
1799 Assert(writable);
1800 if (handle < 0) {
1801 if (handle == -2) {
1802 ChertTable::throw_database_closed();
1804 return;
1807 for (int j = level; j >= 0; j--) {
1808 if (C[j].rewrite) {
1809 write_block(C[j].n, C[j].p);
1813 if (Btree_modified) {
1814 faked_root_block = false;
1818 void
1819 ChertTable::commit(chert_revision_number_t revision, int changes_fd,
1820 const string * changes_tail)
1822 LOGCALL_VOID(DB, "ChertTable::commit", revision | changes_fd | changes_tail);
1823 Assert(writable);
1825 if (revision <= revision_number) {
1826 throw Xapian::DatabaseError("New revision too low");
1829 if (handle < 0) {
1830 if (handle == -2) {
1831 ChertTable::throw_database_closed();
1833 latest_revision_number = revision_number = revision;
1834 return;
1837 try {
1838 if (faked_root_block) {
1839 /* We will use a dummy bitmap. */
1840 base.clear_bit_map();
1843 base.set_revision(revision);
1844 base.set_root(C[level].n);
1845 base.set_level(level);
1846 base.set_item_count(item_count);
1847 base.set_have_fakeroot(faked_root_block);
1848 base.set_sequential(sequential);
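        // The new revision is written under the other base letter (A <-> B),
        // so the base file for the previous revision survives on disk until
        // write_block() deletes it when the next batch of changes starts.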
1850 base_letter = other_base_letter();
1852 both_bases = true;
1853 latest_revision_number = revision_number = revision;
1854 root = C[level].n;
1856 Btree_modified = false;
1858 for (int i = 0; i < BTREE_CURSOR_LEVELS; ++i) {
1859 C[i].n = BLK_UNUSED;
1860 C[i].c = -1;
1861 C[i].rewrite = false;
1864 // Save to "<table>.tmp" and then rename to "<table>.base<letter>" so
1865 // that a reader can't try to read a partially written base file.
1866 string tmp = name;
1867 tmp += "tmp";
1868 string basefile = name;
1869 basefile += "base";
1870 basefile += char(base_letter);
1871 base.write_to_file(tmp, base_letter, tablename, changes_fd, changes_tail);
1873 // Do this as late as possible to allow maximum time for writes to
1874 // happen, and so the calls to io_sync() are adjacent which may be
1875 // more efficient, at least with some Linux kernel versions.
1876 if (changes_tail ? !io_full_sync(handle) : !io_sync(handle)) {
1877 (void)::close(handle);
1878 handle = -1;
1879 (void)unlink(tmp.c_str());
1880 throw Xapian::DatabaseError("Can't commit new revision - failed to flush DB to disk");
1883 if (!io_tmp_rename(tmp, basefile)) {
1884 string msg("Couldn't update base file ");
1885 msg += basefile;
1886 throw Xapian::DatabaseError(msg, errno);
1888 base.commit();
1890 read_root();
1892 changed_n = 0;
1893 changed_c = DIR_START;
1894 seq_count = SEQ_START_POINT;
1895 } catch (...) {
1896 ChertTable::close();
1897 throw;
1901 void
1902 ChertTable::write_changed_blocks(int changes_fd)
1904 LOGCALL_VOID(DB, "ChertTable::write_changed_blocks", changes_fd);
1905 Assert(changes_fd >= 0);
1906 if (handle < 0) return;
1907 if (faked_root_block) return;
1909 string buf;
1910 pack_uint(buf, 2u); // Indicate the item is a list of blocks
1911 pack_string(buf, tablename);
1912 pack_uint(buf, block_size);
1913 io_write(changes_fd, buf.data(), buf.size());
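    // So the changeset stream for this table is: 2, tablename, block_size,
    // then (block number + 1, raw block contents) for each changed block,
    // terminated by the 0 written at the end of this function.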
1915 // Compare the old and new bitmaps to find blocks which have changed, and
1916 // write them to the file descriptor.
1917 uint4 n = 0;
1918 byte * p = new byte[block_size];
1919 try {
1920 base.calculate_last_block();
1921 while (base.find_changed_block(&n)) {
1922 buf.resize(0);
1923 pack_uint(buf, n + 1);
1924 io_write(changes_fd, buf.data(), buf.size());
1926 // Read block n.
1927 read_block(n, p);
1929 // Write block n to the file.
1930 io_write(changes_fd, reinterpret_cast<const char *>(p),
1931 block_size);
1932 ++n;
1934 delete[] p;
1935 p = 0;
1936 } catch (...) {
1937 delete[] p;
1938 throw;
1940 buf.resize(0);
1941 pack_uint(buf, 0u);
1942 io_write(changes_fd, buf.data(), buf.size());
void
ChertTable::cancel()
{
    LOGCALL_VOID(DB, "ChertTable::cancel", NO_ARGS);
    Assert(writable);

    if (handle < 0) {
        if (handle == -2) {
            ChertTable::throw_database_closed();
        }
        latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...
        return;
    }

    // This causes problems: if (!Btree_modified) return;

    string err_msg;
    if (!base.read(name, base_letter, writable, err_msg)) {
        throw Xapian::DatabaseCorruptError(string("Couldn't reread base ") + base_letter);
    }

    revision_number = base.get_revision();
    block_size = base.get_block_size();
    root = base.get_root();
    level = base.get_level();
    //bit_map_size = basep->get_bit_map_size();
    item_count = base.get_item_count();
    faked_root_block = base.get_have_fakeroot();
    sequential = base.get_sequential();

    latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...

    Btree_modified = false;

    for (int j = 0; j <= level; j++) {
        C[j].n = BLK_UNUSED;
        C[j].rewrite = false;
    }
    read_root();

    changed_n = 0;
    changed_c = DIR_START;
    seq_count = SEQ_START_POINT;

    if (cursor_created_since_last_modification) {
        cursor_created_since_last_modification = false;
        ++cursor_version;
    }
}

/************ B-tree reading ************/

bool
ChertTable::do_open_to_read(bool revision_supplied, chert_revision_number_t revision_)
{
    LOGCALL(DB, bool, "ChertTable::do_open_to_read", revision_supplied | revision_);
    if (handle == -2) {
        ChertTable::throw_database_closed();
    }
    handle = io_open_block_rd(name + "DB");
    if (handle < 0) {
        if (lazy) {
            // This table is optional when reading!
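            // (A lazy table, such as the spelling or synonym table, is only
            // created when it's first used, so its DB file may legitimately
            // not exist yet; treat it as empty at the requested revision.)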
            revision_number = revision_;
            RETURN(true);
        }
        string message("Couldn't open ");
        message += name;
        message += "DB to read: ";
        errno_to_string(errno, message);
        throw Xapian::DatabaseOpeningError(message);
    }

    if (!basic_open(revision_supplied, revision_)) {
        ::close(handle);
        handle = -1;
        if (revision_supplied) {
            // The requested revision was not available.
            // This could be because the database was modified underneath us,
            // or because a base file is missing.  Return false, and work out
            // what the problem was at a higher level.
            RETURN(false);
        }
        throw Xapian::DatabaseOpeningError("Failed to open table for reading");
    }

    for (int j = 0; j <= level; j++) {
        C[j].n = BLK_UNUSED;
        C[j].p = new byte[block_size];
    }

    read_root();
    RETURN(true);
}

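// Open the table at the latest revision.  Unlike the overload below, which
// takes an explicit revision and returns false on failure, any problem here
// is reported by throwing an exception.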
void
ChertTable::open()
{
    LOGCALL_VOID(DB, "ChertTable::open", NO_ARGS);
    LOGLINE(DB, "opening at path " << name);
    close();

    if (!writable) {
        // Any errors are thrown if revision_supplied is false.
        (void)do_open_to_read(false, 0);
        return;
    }

    // Any errors are thrown if revision_supplied is false.
    (void)do_open_to_write(false, 0);
}

bool
ChertTable::open(chert_revision_number_t revision)
{
    LOGCALL(DB, bool, "ChertTable::open", revision);
    LOGLINE(DB, "opening for particular revision at path " << name);
    close();

    if (!writable) {
        if (do_open_to_read(true, revision)) {
            AssertEq(revision_number, revision);
            RETURN(true);
        } else {
            close();
            RETURN(false);
        }
    }

    if (!do_open_to_write(true, revision)) {
        // Can't open at the requested revision.
        close();
        RETURN(false);
    }

    AssertEq(revision_number, revision);
    RETURN(true);
}

bool
ChertTable::prev_for_sequential(Cursor * C_, int /*dummy*/) const
{
    LOGCALL(DB, bool, "ChertTable::prev_for_sequential", Literal("C_") | Literal("/*dummy*/"));
    int c = C_[0].c;
    AssertRel(DIR_START,<=,c);
    AssertRel(c,<,DIR_END(C_[0].p));
    if (c == DIR_START) {
        byte * p = C_[0].p;
        Assert(p);
        uint4 n = C_[0].n;
        while (true) {
            if (n == 0) RETURN(false);
            n--;
            if (writable) {
                if (n == C[0].n) {
                    // Block is a leaf block in the built-in cursor
                    // (potentially in modified form).
                    memcpy(p, C[0].p, block_size);
                } else {
                    // Blocks in the built-in cursor may not have been written
                    // to disk yet, so we have to check that the block number
                    // isn't in the built-in cursor or we'll read an
                    // uninitialised block (for which GET_LEVEL(p) will
                    // probably return 0).
                    int j;
                    for (j = 1; j <= level; ++j) {
                        if (n == C[j].n) break;
                    }
                    if (j <= level) continue;

                    // Block isn't in the built-in cursor, so the form on disk
                    // is valid, so read it to check if it's the next level 0
                    // block.
                    read_block(n, p);
                }
            } else {
                read_block(n, p);
            }
            if (writable) AssertEq(revision_number, latest_revision_number);
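            // REVISION(p) is the revision stamped into the block when it was
            // last written.  A writer may see blocks it has itself written
            // for the next (still uncommitted) revision, hence the
            // "+ writable"; anything newer than that means the database has
            // been changed under us, so report it via set_overwritten().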
            if (REVISION(p) > revision_number + writable) {
                set_overwritten();
                RETURN(false);
            }
            if (GET_LEVEL(p) == 0) break;
        }
        c = DIR_END(p);
        C_[0].n = n;
        AssertRel(DIR_START,<,c);
    }
    c -= D2;
    C_[0].c = c;
    RETURN(true);
}

bool
ChertTable::next_for_sequential(Cursor * C_, int /*dummy*/) const
{
    LOGCALL(DB, bool, "ChertTable::next_for_sequential", Literal("C_") | Literal("/*dummy*/"));
    byte * p = C_[0].p;
    Assert(p);
    int c = C_[0].c;
    AssertRel(c,<,DIR_END(p));
    c += D2;
    Assert((unsigned)c < block_size);
    if (c == DIR_END(p)) {
        uint4 n = C_[0].n;
        while (true) {
            n++;
            if (n > base.get_last_block()) RETURN(false);
            if (writable) {
                if (n == C[0].n) {
                    // Block is a leaf block in the built-in cursor
                    // (potentially in modified form).
                    memcpy(p, C[0].p, block_size);
                } else {
                    // Blocks in the built-in cursor may not have been written
                    // to disk yet, so we have to check that the block number
                    // isn't in the built-in cursor or we'll read an
                    // uninitialised block (for which GET_LEVEL(p) will
                    // probably return 0).
                    int j;
                    for (j = 1; j <= level; ++j) {
                        if (n == C[j].n) break;
                    }
                    if (j <= level) continue;

                    // Block isn't in the built-in cursor, so the form on disk
                    // is valid, so read it to check if it's the next level 0
                    // block.
                    read_block(n, p);
                }
            } else {
                read_block(n, p);
            }
            if (writable) AssertEq(revision_number, latest_revision_number);
            if (REVISION(p) > revision_number + writable) {
                set_overwritten();
                RETURN(false);
            }
            if (GET_LEVEL(p) == 0) break;
        }
        c = DIR_START;
        C_[0].n = n;
    }
    C_[0].c = c;
    RETURN(true);
}

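// The default (non-sequential) movement helpers walk the tree recursively:
// they step the directory position in the block at level j, recurse to level
// j + 1 when that steps off the start (or end) of the block so a sibling
// block gets loaded, and then descend by loading the child block the new
// position points to.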
bool
ChertTable::prev_default(Cursor * C_, int j) const
{
    LOGCALL(DB, bool, "ChertTable::prev_default", Literal("C_") | j);
    byte * p = C_[j].p;
    int c = C_[j].c;
    AssertRel(DIR_START,<=,c);
    AssertRel(c,<,DIR_END(p));
    AssertRel((unsigned)DIR_END(p),<=,block_size);
    if (c == DIR_START) {
        if (j == level) RETURN(false);
        if (!prev_default(C_, j + 1)) RETURN(false);
        c = DIR_END(p);
        AssertRel(DIR_START,<,c);
    }
    c -= D2;
    C_[j].c = c;
    if (j > 0) {
        block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
    }
    RETURN(true);
}

bool
ChertTable::next_default(Cursor * C_, int j) const
{
    LOGCALL(DB, bool, "ChertTable::next_default", Literal("C_") | j);
    byte * p = C_[j].p;
    int c = C_[j].c;
    AssertRel(c,<,DIR_END(p));
    AssertRel((unsigned)DIR_END(p),<=,block_size);
    c += D2;
    if (j > 0) {
        AssertRel(DIR_START,<,c);
    } else {
        AssertRel(DIR_START,<=,c);
    }
    // Sometimes c can be DIR_END(p) + 2 here it appears...
    if (c >= DIR_END(p)) {
        if (j == level) RETURN(false);
        if (!next_default(C_, j + 1)) RETURN(false);
        c = DIR_START;
    }
    C_[j].c = c;
    if (j > 0) {
        block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
#ifdef BTREE_DEBUG_FULL
        printf("Block in ChertTable:next_default");
        report_block_full(j - 1, C_[j - 1].n, C_[j - 1].p);
#endif /* BTREE_DEBUG_FULL */
    }
    RETURN(true);
}

void
ChertTable::throw_database_closed()
{
    throw Xapian::DatabaseError("Database has been closed");
}

/** Compares this key with key2.

    The result is true if this key precedes key2.  The comparison is for byte
    sequence collating order, taking lengths into account.  So if the keys are
    made up of lower case ASCII letters we get alphabetical ordering.

    Now remember that items are added into the B-tree in fastest time
    when they are preordered by their keys.  This is therefore the piece
    of code that needs to be followed to arrange for the preordering.

    This is complicated by the fact that keys have two parts - a value
    and then a count.  We first compare the values, and only if they
    are equal do we compare the counts.
*/

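/* For example (purely illustrative): treating keys as (value, count) pairs,
   ("abc", 0) precedes ("abd", 0) because the values differ at the third byte;
   ("abc", 0) precedes ("abcd", 0) because a shorter value which is a prefix
   of a longer one sorts first; and ("abc", 0) precedes ("abc", 1) because
   equal values fall back to comparing the counts.
*/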
bool Key::operator<(Key key2) const
{
    LOGCALL(DB, bool, "Key::operator<", static_cast<const void*>(key2.p));
    int key1_len = length();
    int key2_len = key2.length();
    if (key1_len == key2_len) {
        // The keys are the same length, so we can compare the counts
        // in the same operation since they're stored as 2 byte
        // bigendian numbers.
        RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) < 0);
    }

    int k_smaller = (key2_len < key1_len ? key2_len : key1_len);

    // Compare the common part of the keys.
    int diff = memcmp(p + K1, key2.p + K1, k_smaller);
    if (diff != 0) RETURN(diff < 0);

    // We dealt with the "same length" case above so we never need to check
    // the count here.
    RETURN(key1_len < key2_len);
}

bool Key::operator==(Key key2) const
{
    LOGCALL(DB, bool, "Key::operator==", static_cast<const void*>(key2.p));
    int key1_len = length();
    if (key1_len != key2.length()) RETURN(false);
    // The keys are the same length, so we can compare the counts
    // in the same operation since they're stored as 2 byte
    // bigendian numbers.
    RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) == 0);
}