/* chert_table.cc: Btree implementation
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
 * Copyright 2008 Lemur Consulting Ltd
 * Copyright 2010 Richard Boulton
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "chert_table.h"

#include <xapian/error.h>

#include "safeerrno.h"

#include "errno_to_string.h"
#include "omassert.h"
#include "stringutils.h" // For STRINGIZE().

// Define to use "dangerous" mode - in this mode we write modified btree
// blocks back in place.  This is somewhat faster (especially once we're
// I/O bound) but the database can't be safely searched during indexing
// and if indexing is terminated uncleanly, the database may be corrupt.
//
// Despite the risks, it's useful for speeding up a full rebuild.
//
// FIXME: make this mode run-time selectable, and record that it is currently
// in use somewhere on disk, so readers can check and refuse to open the
// database.
//
// #define DANGEROUS

#include <sys/types.h>

#include <cstring>   /* for memmove */
#include <climits>   /* for CHAR_BIT */

#include "chert_btreebase.h"
#include "chert_cursor.h"

#include "filetests.h"
#include "io_utils.h"
#include "debuglog.h"
#include "pack.h"
#include "str.h"
#include "wordaccess.h"

#include <algorithm>  // for std::min()
#include <string>

using namespace std;
// Only try to compress tags longer than this many bytes.
const size_t COMPRESS_MIN = 4;

//#define BTREE_DEBUG_FULL 1
#undef BTREE_DEBUG_FULL

#ifdef BTREE_DEBUG_FULL
/*------debugging aids from here--------*/

static void print_key(const byte * p, int c, int j);
static void print_tag(const byte * p, int c, int j);

static void report_cursor(int N, Btree * B, Cursor * C)
{
    int i;
    printf("%d)\n", N);
    for (i = 0; i <= B->level; i++)
        printf("p=%d, c=%d, n=[%d], rewrite=%d\n",
               C[i].p, C[i].c, C[i].n, C[i].rewrite);
}

/*------to here--------*/
#endif /* BTREE_DEBUG_FULL */
static inline byte *zeroed_new(size_t size)
{
    byte *temp = new byte[size];
    memset(temp, 0, size);
    return temp;
}
/* A B-tree comprises (a) a base file, containing essential information (Block
   size, number of the B-tree root block etc), (b) a bitmap, the Nth bit of the
   bitmap being set if the Nth block of the B-tree file is in use, and (c) a
   file DB containing the B-tree proper. The DB file is divided into a sequence
   of equal sized blocks, numbered 0, 1, 2 ... some of which are free, some in
   use. Those in use are arranged in a tree.

   Each block, b, has a structure like this:

     R L M T D o1 o2 o3 ... oN <gap> [item] .. [item] .. [item] ...
     <---------- D ----------> <-M->

   And then,

   R = REVISION(b)   is the revision number the B-tree had when the block was
                     written into the DB file.
   L = GET_LEVEL(b)  is the level of the block, which is the number of levels
                     towards the root of the B-tree structure. So leaf blocks
                     have level 0 and the one root block has the highest level
                     equal to the number of levels in the B-tree.
   M = MAX_FREE(b)   is the size of the gap between the end of the directory and
                     the first item of data. (It is not necessarily the maximum
                     size among the bits of space that are free, but I can't
                     think of a better name.)
   T = TOTAL_FREE(b) is the total amount of free space left in b.
   D = DIR_END(b)    gives the offset to the end of the directory.

   o1, o2 ... oN are a directory of offsets to the N items held in the block.
   The items are key-tag pairs, and as they occur in the directory are ordered
   by the keys.

   An item has this form:

     I K key x C tag
       <--K-->
     <------I------>

   A long tag presented through the API is split up into C tags small enough to
   be accommodated in the blocks of the B-tree. The key is extended to include
   a counter, x, which runs from 1 to C. The key is preceded by a length, K,
   and the whole item with a length, I, as depicted above.

   Here are the corresponding definitions:
*/
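/* Illustrative example of the layout above: a leaf block holding three items
   has DIR_END(b) == DIR_START + 3 * D2, and the directory entries o1, o2, o3
   each hold the byte offset of one item within the block.  MAX_FREE(b) is just
   the contiguous gap between the directory and the lowest-placed item, while
   TOTAL_FREE(b) also counts any holes left between items by earlier updates
   (compact() merges those holes back into the gap).
*/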
/** Flip to sequential addition block-splitting after this number of observed
 * sequential additions (in negated form). */
#define SEQ_START_POINT (-10)

/* There are two bit maps in bit_map0 and bit_map. The nth bit of bitmap is 0
   if the nth block is free, otherwise 1. bit_map0 is the initial state of
   bitmap at the start of the current transaction.

   Note use of the limits.h values:
   UCHAR_MAX = 255, an unsigned with all bits set, and
   CHAR_BIT = 8, the number of bits per byte

   BYTE_PAIR_RANGE below is the smallest +ve number that can't be held in two
   bytes -- 64K effectively.
*/

#define BYTE_PAIR_RANGE (1 << 2 * CHAR_BIT)
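// With CHAR_BIT == 8, BYTE_PAIR_RANGE is 1 << 16 == 65536, i.e. the first
// value too big for the two byte fields used for things like directory
// offsets and item component counts.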
/// read_block(n, p) reads block n of the DB file to address p.
void
ChertTable::read_block(uint4 n, byte * p) const
{
    // Log the value of p, not the contents of the block it points to...
    LOGCALL_VOID(DB, "ChertTable::read_block", n | (void*)p);
    if (rare(handle == -2))
        ChertTable::throw_database_closed();

    /* Use the base bit_map_size not the bitmap's size, because
     * the latter is uninitialised in readonly mode.
     */
    Assert(n / CHAR_BIT < base.get_bit_map_size());

    io_read_block(handle, reinterpret_cast<char *>(p), block_size, n);

    int dir_end = DIR_END(p);
    if (rare(dir_end < DIR_START || unsigned(dir_end) > block_size)) {
        string msg("dir_end invalid in block ");
        msg += str(n);
        throw Xapian::DatabaseCorruptError(msg);
    }
}
/** write_block(n, p) writes block n in the DB file from address p.
 *  When writing we check to see if the DB file has already been
 *  modified. If not (so this is the first write) the old base is
 *  deleted. This prevents the possibility of it being opened
 *  subsequently as an invalid base.
 */
void
ChertTable::write_block(uint4 n, const byte * p) const
{
    LOGCALL_VOID(DB, "ChertTable::write_block", n | p);
    Assert(writable);
    /* Check that n is in range. */
    Assert(n / CHAR_BIT < base.get_bit_map_size());

    /* don't write to non-free */
    AssertParanoid(base.block_free_at_start(n));

    /* write revision is okay */
    AssertEqParanoid(REVISION(p), latest_revision_number + 1);

    if (both_bases) {
        // Delete the old base before modifying the database.
        //
        // If the file is on NFS, then io_unlink() may return false even if
        // the file was removed, so on balance throwing an exception in this
        // case is unhelpful, since we wanted the file gone anyway!  The
        // likely explanation is that somebody moved, deleted, or changed a
        // symlink to the database directory.
        (void)io_unlink(name + "base" + other_base_letter());
        both_bases = false;
        latest_revision_number = revision_number;
    }

    io_write_block(handle, reinterpret_cast<const char *>(p), block_size, n);
}
/* A note on cursors:

   Each B-tree level has a corresponding array element C[j] in a
   cursor, C. C[0] is the leaf (or data) level, and C[B->level] is the
   root block level. Within a level j,

     C[j].p  addresses the block
     C[j].c  is the offset into the directory entry in the block
     C[j].n  is the number of the block at C[j].p

   A look up in the B-tree causes navigation of the blocks starting
   from the root. In each block, p, we find an offset, c, to an item
   which gives the number, n, of the block for the next level. This
   leads to an array of values p,c,n which are held inside the cursor.

   Any Btree object B has a built-in cursor, at B->C. But other cursors may
   be created. If BC is a created cursor, BC->C is the cursor in the
   sense given above, and BC->B is the handle for the B-tree again.
*/
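/* For example, after a successful find() the built-in cursor has C[level].p
   pointing at the root block, C[0].p at the leaf block which holds (or would
   hold) the search key, and for each level j > 0, C[j].c gives the directory
   offset of the entry that was followed to reach the block at level j - 1.
*/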
void
ChertTable::set_overwritten() const
{
    LOGCALL_VOID(DB, "ChertTable::set_overwritten", NO_ARGS);
    // If we're writable, there shouldn't be another writer who could cause
    // overwritten to be flagged, so that's a DatabaseCorruptError.
    if (writable)
        throw Xapian::DatabaseCorruptError("Db block overwritten - are there multiple writers?");
    throw Xapian::DatabaseModifiedError("The revision being read has been discarded - you should call Xapian::Database::reopen() and retry the operation");
}
/* block_to_cursor(C, j, n) puts block n into position C[j] of cursor
   C, writing the block currently at C[j] back to disk if necessary.
   Note that

     C[j].rewrite

   is true iff C[j].n is different from block n in file DB. If it is
   false no rewriting is necessary.
*/
271 void
272 ChertTable::block_to_cursor(Cursor * C_, int j, uint4 n) const
274 LOGCALL_VOID(DB, "ChertTable::block_to_cursor", (void*)C_ | j | n);
275 if (n == C_[j].n) return;
276 byte * p = C_[j].p;
277 Assert(p);
279 // FIXME: only needs to be done in write mode
280 if (C_[j].rewrite) {
281 Assert(writable);
282 Assert(C == C_);
283 write_block(C_[j].n, p);
284 C_[j].rewrite = false;
287 // Check if the block is in the built-in cursor (potentially in
288 // modified form).
289 if (n == C[j].n) {
290 if (p != C[j].p)
291 memcpy(p, C[j].p, block_size);
292 } else {
293 read_block(n, p);
296 C_[j].n = n;
297 if (j < level) {
298 /* unsigned comparison */
299 if (rare(REVISION(p) > REVISION(C_[j + 1].p))) {
300 set_overwritten();
301 return;
305 if (rare(j != GET_LEVEL(p))) {
306 string msg = "Expected block ";
307 msg += str(n);
308 msg += " to be level ";
309 msg += str(j);
310 msg += ", not ";
311 msg += str(GET_LEVEL(p));
312 throw Xapian::DatabaseCorruptError(msg);
/** Btree::alter(); is called when the B-tree is to be altered.

    It causes new blocks to be forced for the current set of blocks in
    the cursor.

    The point is that if a block at level 0 is to be altered it may get
    a new number. Then the pointer to this block from level 1 will need
    changing. So the block at level 1 needs altering and may get a new
    block number. Then the pointer to this block from level 2 will need
    changing ... and so on back to the root.

    The clever bit here is spotting the cases when we can make an early
    exit from this process. If C[j].rewrite is true, C[j+k].rewrite
    will be true for k = 1,2 ... We have been through all this before,
    and there is no need to do it again. If C[j].n was free at the
    start of the transaction, we can copy it back to the same place
    without violating the integrity of the B-tree. We don't then need a
    new n and can return. The corresponding C[j].rewrite may be true or
    false in that case.
*/
337 void
338 ChertTable::alter()
340 LOGCALL_VOID(DB, "ChertTable::alter", NO_ARGS);
341 Assert(writable);
342 #ifdef DANGEROUS
343 C[0].rewrite = true;
344 #else
345 int j = 0;
346 byte * p = C[j].p;
347 while (true) {
348 if (C[j].rewrite) return; /* all new, so return */
349 C[j].rewrite = true;
351 uint4 n = C[j].n;
352 if (base.block_free_at_start(n)) {
353 Assert(REVISION(p) == latest_revision_number + 1);
354 return;
356 Assert(REVISION(p) < latest_revision_number + 1);
357 base.free_block(n);
358 n = base.next_free_block();
359 C[j].n = n;
360 SET_REVISION(p, latest_revision_number + 1);
362 if (j == level) return;
363 j++;
364 p = C[j].p;
365 Item_wr(p, C[j].c).set_block_given_by(n);
367 #endif
/** find_in_block(p, key, leaf, c) searches for the key in the block at p.

    leaf is true for a data block, and false for an index block (when the
    first key is dummy and never needs to be tested). What we get is the
    directory entry to the last key <= the key being searched for.

    The lookup is by binary chop, with i and j set to the left and
    right ends of the search area. In sequential addition, c will often
    be the answer, so we test the keys round c and move i and j towards
    c if possible.

    The returned value is < DIR_END(p). If leaf is false, the returned
    value is >= DIR_START; if leaf is true, it can also be == DIR_START - D2.
*/
386 ChertTable::find_in_block(const byte * p, Key key, bool leaf, int c)
388 LOGCALL_STATIC(DB, int, "ChertTable::find_in_block", (const void*)p | (const void *)key.get_address() | leaf | c);
389 // c should be odd (either -1, or an even offset from DIR_START).
390 Assert((c & 1) == 1);
391 int i = DIR_START;
392 if (leaf) i -= D2;
393 if (c != -1) {
394 AssertRel(i,<=,c);
396 int j = DIR_END(p);
398 if (c != -1) {
399 if (c < j && i < c && Item(p, c).key() <= key)
400 i = c;
401 c += D2;
402 if (c < j && i < c && key < Item(p, c).key())
403 j = c;
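    // Binary chop invariant: the key at offset i is <= the search key (with
    // i == DIR_START - D2 acting as a notional -infinity on leaf blocks), and
    // the key at offset j is greater (with j == DIR_END(p) acting as
    // +infinity), so the loop below converges on the last key <= the key
    // being searched for.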
406 while (j - i > D2) {
407 int k = i + ((j - i)/(D2 * 2))*D2; /* mid way */
408 if (key < Item(p, k).key()) j = k; else i = k;
410 if (leaf) {
411 AssertRel(DIR_START - D2,<=,i);
412 } else {
413 AssertRel(DIR_START,<=,i);
415 AssertRel(i,<,DIR_END(p));
416 RETURN(i);
/** find(C_) searches for the key of B->kt in the B-tree.

    Result is true if found, false otherwise. When false, the B-tree
    cursor is positioned at the last key in the B-tree <= the search
    key. Goes to first (null) item in B-tree when key length == 0.

    Note: The cursor can be left with C_[0].c == DIR_START - D2 if the
    requested key doesn't exist and is less than the smallest key in a
    leaf block, but after the dividing key. The caller needs to fix up
    C_[0].c in this case, either explicitly or by performing an
    operation which gives C_[0].c a valid value.
*/
432 bool
433 ChertTable::find(Cursor * C_) const
435 LOGCALL(DB, bool, "ChertTable::find", (void*)C_);
436 // Note: the parameter is needed when we're called by ChertCursor
437 const byte * p;
438 int c;
439 Key key = kt.key();
440 for (int j = level; j > 0; --j) {
441 p = C_[j].p;
442 c = find_in_block(p, key, false, C_[j].c);
443 #ifdef BTREE_DEBUG_FULL
444 printf("Block in ChertTable:find - code position 1");
445 report_block_full(j, C_[j].n, p);
446 #endif /* BTREE_DEBUG_FULL */
447 C_[j].c = c;
448 block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
450 p = C_[0].p;
451 c = find_in_block(p, key, true, C_[0].c);
452 #ifdef BTREE_DEBUG_FULL
453 printf("Block in ChertTable:find - code position 2");
454 report_block_full(0, C_[0].n, p);
455 #endif /* BTREE_DEBUG_FULL */
456 C_[0].c = c;
457 if (c < DIR_START) {
458 RETURN(false);
460 RETURN(Item(p, c).key() == key);
/** compact(p) compacts the block at p by shuffling all the items up to the end.

    MAX_FREE(p) is then maximized, and is equal to TOTAL_FREE(p).
*/
468 void
469 ChertTable::compact(byte * p)
471 LOGCALL_VOID(DB, "ChertTable::compact", (void*)p);
472 Assert(writable);
473 int e = block_size;
474 byte * b = buffer;
475 int dir_end = DIR_END(p);
476 for (int c = DIR_START; c < dir_end; c += D2) {
477 Item item(p, c);
478 int l = item.size();
479 e -= l;
480 memmove(b + e, item.get_address(), l);
481 setD(p, c, e); /* reform in b */
483 memmove(p + e, b + e, block_size - e); /* copy back */
484 e -= dir_end;
485 SET_TOTAL_FREE(p, e);
486 SET_MAX_FREE(p, e);
/** Btree needs to gain a new level to insert more items: so split root block
 *  and construct a new one.
 */
492 void
493 ChertTable::split_root(uint4 split_n)
495 LOGCALL_VOID(DB, "ChertTable::split_root", split_n);
496 /* gain a level */
497 ++level;
499 /* check level overflow - this isn't something that should ever happen
500 * but deserves more than an Assert()... */
501 if (level == BTREE_CURSOR_LEVELS) {
502 throw Xapian::DatabaseCorruptError("Btree has grown impossibly large (" STRINGIZE(BTREE_CURSOR_LEVELS) " levels)");
505 byte * q = zeroed_new(block_size);
506 C[level].p = q;
507 C[level].c = DIR_START;
508 C[level].n = base.next_free_block();
509 C[level].rewrite = true;
510 SET_REVISION(q, latest_revision_number + 1);
511 SET_LEVEL(q, level);
512 SET_DIR_END(q, DIR_START);
513 compact(q); /* to reset TOTAL_FREE, MAX_FREE */
515 /* form a null key in b with a pointer to the old root */
516 byte b[10]; /* 7 is exact */
517 Item_wr item(b);
518 item.form_null_key(split_n);
519 add_item(item, level);
/** enter_key(j, prevkey, newkey) is called after a block split.

    It enters in the block at level C[j] a separating key for the block
    at level C[j - 1]. The key itself is newkey. prevkey is the
    preceding key, and at level 1 newkey can be trimmed down to the
    first point of difference to prevkey for entry in C[j].

    This code looks longer than it really is. If j exceeds the number
    of B-tree levels the root block has split and we have to construct
    a new one, but this is a rare event.

    The key is constructed in b, with block number C[j - 1].n as tag,
    and this is added in with add_item. add_item may itself cause a
    block split, with a further call to enter_key. Hence the recursion.
*/
537 void
538 ChertTable::enter_key(int j, Key prevkey, Key newkey)
540 LOGCALL_VOID(DB, "ChertTable::enter_key", j | Literal("prevkey") | Literal("newkey"));
541 Assert(writable);
542 Assert(prevkey < newkey);
543 AssertRel(j,>=,1);
545 uint4 blocknumber = C[j - 1].n;
547 // FIXME update to use Key
548 // Keys are truncated here: but don't truncate the count at the end away.
549 const int newkey_len = newkey.length();
550 AssertRel(newkey_len,>,0);
551 int i;
553 if (j == 1) {
554 // Truncate the key to the minimal key which differs from prevkey,
555 // the preceding key in the block.
556 i = 0;
557 const int min_len = min(newkey_len, prevkey.length());
558 while (i < min_len && prevkey[i] == newkey[i]) {
559 i++;
562 // Want one byte of difference.
563 if (i < newkey_len) i++;
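        // e.g. if prevkey is "patch" and newkey is "python", the two differ at
        // the second byte, so i becomes 2 and only "py" needs to be stored as
        // the dividing key in the level 1 block.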
564 } else {
565 /* Can't truncate between branch levels, since the separated keys
566 * are in at the leaf level, and truncating again will change the
567 * branch point.
569 i = newkey_len;
572 byte b[UCHAR_MAX + 6];
573 Item_wr item(b);
574 Assert(i <= 256 - I2 - C2);
575 Assert(i <= (int)sizeof(b) - I2 - C2 - 4);
576 item.set_key_and_block(newkey, i, blocknumber);
578 // When j > 1 we can make the first key of block p null. This is probably
579 // worthwhile as it trades a small amount of CPU and RAM use for a small
580 // saving in disk use. Other redundant keys will still creep in though.
581 if (j > 1) {
582 byte * p = C[j - 1].p;
583 uint4 n = getint4(newkey.get_address(), newkey_len + K1 + C2);
584 int new_total_free = TOTAL_FREE(p) + newkey_len + C2;
585 // FIXME: incredibly icky going from key to item like this...
586 Item_wr(const_cast<byte*>(newkey.get_address()) - I2).form_null_key(n);
587 SET_TOTAL_FREE(p, new_total_free);
590 // The split block gets inserted into the parent after the pointer to the
591 // current child.
592 AssertEq(C[j].c, find_in_block(C[j].p, item.key(), false, C[j].c));
593 C[j].c += D2;
594 C[j].rewrite = true; /* a subtle point: this *is* required. */
595 add_item(item, j);
/** mid_point(p) finds the directory entry in c that determines the
    approximate mid point of the data in the block at p.
*/
603 ChertTable::mid_point(byte * p)
605 LOGCALL(DB, int, "ChertTable::mid_point", (void*)p);
606 int n = 0;
607 int dir_end = DIR_END(p);
608 int size = block_size - TOTAL_FREE(p) - dir_end;
609 for (int c = DIR_START; c < dir_end; c += D2) {
610 int l = Item(p, c).size();
611 n += 2 * l;
612 if (n >= size) {
613 if (l < n - size) RETURN(c);
614 RETURN(c + D2);
618 /* This shouldn't happen, as the sum of the item sizes should be the same
619 * as the value calculated in size, so assert but return a sane value just
620 * in case. */
621 Assert(false);
622 RETURN(dir_end);
/** add_item_to_block(p, kt_, c) adds item kt_ to the block at p.

    c is the offset in the directory that needs to be expanded to accommodate
    the new entry for the item. We know before this is called that there is
    enough contiguous room for the item in the block, so it's just a matter of
    shuffling up any directory entries after where we're inserting and copying
    in the item.
*/
634 void
635 ChertTable::add_item_to_block(byte * p, Item_wr kt_, int c)
637 LOGCALL_VOID(DB, "ChertTable::add_item_to_block", (void*)p | Literal("kt_") | c);
638 Assert(writable);
639 int dir_end = DIR_END(p);
640 int kt_len = kt_.size();
641 int needed = kt_len + D2;
642 int new_total = TOTAL_FREE(p) - needed;
643 int new_max = MAX_FREE(p) - needed;
645 Assert(new_total >= 0);
647 AssertRel(MAX_FREE(p),>=,needed);
649 AssertRel(DIR_START,<=,c);
650 AssertRel(c,<=,dir_end);
652 memmove(p + c + D2, p + c, dir_end - c);
653 dir_end += D2;
654 SET_DIR_END(p, dir_end);
656 int o = dir_end + new_max;
657 setD(p, c, o);
658 memmove(p + o, kt_.get_address(), kt_len);
660 SET_MAX_FREE(p, new_max);
662 SET_TOTAL_FREE(p, new_total);
/** ChertTable::add_item(kt_, j) adds item kt_ to the block at cursor level C[j].
 *
 *  If there is not enough room the block splits and the item is then
 *  added to the appropriate half.
 */
670 void
671 ChertTable::add_item(Item_wr kt_, int j)
673 LOGCALL_VOID(DB, "ChertTable::add_item", Literal("kt_") | j);
674 Assert(writable);
675 byte * p = C[j].p;
676 int c = C[j].c;
677 uint4 n;
679 int needed = kt_.size() + D2;
680 if (TOTAL_FREE(p) < needed) {
681 int m;
682 // Prepare to split p. After splitting, the block is in two halves, the
683 // lower half is split_p, the upper half p again. add_to_upper_half
684 // becomes true when the item gets added to p, false when it gets added
685 // to split_p.
687 if (seq_count < 0) {
688 // If we're not in sequential mode, we split at the mid point
689 // of the node.
690 m = mid_point(p);
691 } else {
692 // During sequential addition, split at the insert point
693 AssertRel(c,>=,DIR_START);
694 m = c;
697 uint4 split_n = C[j].n;
698 C[j].n = base.next_free_block();
700 memcpy(split_p, p, block_size); // replicate the whole block in split_p
701 SET_DIR_END(split_p, m);
702 compact(split_p); /* to reset TOTAL_FREE, MAX_FREE */
705 int residue = DIR_END(p) - m;
706 int new_dir_end = DIR_START + residue;
707 memmove(p + DIR_START, p + m, residue);
708 SET_DIR_END(p, new_dir_end);
711 compact(p); /* to reset TOTAL_FREE, MAX_FREE */
713 bool add_to_upper_half;
714 if (seq_count < 0) {
715 add_to_upper_half = (c >= m);
716 } else {
717 // And add item to lower half if split_p has room, otherwise upper
718 // half
719 add_to_upper_half = (TOTAL_FREE(split_p) < needed);
722 if (add_to_upper_half) {
723 c -= (m - DIR_START);
724 Assert(seq_count < 0 || c <= DIR_START + D2);
725 Assert(c >= DIR_START);
726 Assert(c <= DIR_END(p));
727 add_item_to_block(p, kt_, c);
728 n = C[j].n;
729 } else {
730 Assert(c >= DIR_START);
731 Assert(c <= DIR_END(split_p));
732 add_item_to_block(split_p, kt_, c);
733 n = split_n;
735 write_block(split_n, split_p);
737 // Check if we're splitting the root block.
738 if (j == level) split_root(split_n);
740 /* Enter a separating key at level j + 1 between */
741 /* the last key of block split_p, and the first key of block p */
742 enter_key(j + 1,
743 Item(split_p, DIR_END(split_p) - D2).key(),
744 Item(p, DIR_START).key());
745 } else {
746 AssertRel(TOTAL_FREE(p),>=,needed);
748 if (MAX_FREE(p) < needed) {
749 compact(p);
750 AssertRel(MAX_FREE(p),>=,needed);
753 add_item_to_block(p, kt_, c);
754 n = C[j].n;
756 if (j == 0) {
757 changed_n = n;
758 changed_c = c;
/** ChertTable::delete_item(j, repeatedly) is (almost) the converse of add_item.
 *
 *  If repeatedly is true, the process repeats at the next level when a
 *  block has been completely emptied, freeing the block and taking out
 *  the pointer to it. Emptied root blocks are also removed, which
 *  reduces the number of levels in the B-tree.
 */
769 void
770 ChertTable::delete_item(int j, bool repeatedly)
772 LOGCALL_VOID(DB, "ChertTable::delete_item", j | repeatedly);
773 Assert(writable);
774 byte * p = C[j].p;
775 int c = C[j].c;
776 AssertRel(DIR_START,<=,c);
777 AssertRel(c,<,DIR_END(p));
778 int kt_len = Item(p, c).size(); /* size of the item to be deleted */
779 int dir_end = DIR_END(p) - D2; /* directory length will go down by 2 bytes */
781 memmove(p + c, p + c + D2, dir_end - c);
782 SET_DIR_END(p, dir_end);
783 SET_MAX_FREE(p, MAX_FREE(p) + D2);
784 SET_TOTAL_FREE(p, TOTAL_FREE(p) + kt_len + D2);
786 if (!repeatedly) return;
787 if (j < level) {
788 if (dir_end == DIR_START) {
789 base.free_block(C[j].n);
790 C[j].rewrite = false;
791 C[j].n = BLK_UNUSED;
792 C[j + 1].rewrite = true; /* *is* necessary */
793 delete_item(j + 1, true);
795 } else {
796 Assert(j == level);
797 while (dir_end == DIR_START + D2 && level > 0) {
798 /* single item in the root block, so lose a level */
799 uint4 new_root = Item(p, DIR_START).block_given_by();
800 delete [] p;
801 C[level].p = 0;
802 base.free_block(C[level].n);
803 C[level].rewrite = false;
804 C[level].n = BLK_UNUSED;
805 level--;
807 block_to_cursor(C, level, new_root);
809 p = C[level].p;
810 dir_end = DIR_END(p); /* prepare for the loop */
/* debugging aid:
static addcount = 0;
*/
/** add_kt(found) adds the item (key-tag pair) at B->kt into the
    B-tree, using cursor C.

    found == find() is handed over as a parameter from Btree::add.
    Btree::alter() prepares for the alteration to the B-tree. Then
    there are a number of cases to consider:

    If an item with the same key is in the B-tree (found is true),
    the new kt replaces it.

    Then if kt is smaller than, or the same size as, the item it replaces,
    kt is put in the same place as the item it replaces, and the
    TOTAL_FREE measure is reduced.

    If kt is larger than the item it replaces it is put in the
    MAX_FREE space if there is room, and the directory entry and
    space counts are adjusted accordingly.

    - But if there is not room we do it the long way: the old item is
      deleted with delete_item and kt is added in with add_item.

    If the key of kt is not in the B-tree (found is false), the new
    kt is added in with add_item.
*/
845 ChertTable::add_kt(bool found)
847 LOGCALL(DB, int, "ChertTable::add_kt", found);
848 Assert(writable);
849 int components = 0;
853 printf("%d) %s ", addcount++, (found ? "replacing" : "adding"));
854 print_bytes(kt[I2] - K1 - C2, kt + I2 + K1); putchar('\n');
857 alter();
859 if (found) { /* replacement */
860 seq_count = SEQ_START_POINT;
861 sequential = false;
863 byte * p = C[0].p;
864 int c = C[0].c;
865 AssertRel(DIR_START,<=,c);
866 AssertRel(c,<,DIR_END(p));
867 Item item(p, c);
868 int kt_size = kt.size();
869 int needed = kt_size - item.size();
871 components = item.components_of();
873 if (needed <= 0) {
874 /* simple replacement */
875 memmove(const_cast<byte *>(item.get_address()),
876 kt.get_address(), kt_size);
877 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
878 } else {
879 /* new item into the block's freespace */
880 int new_max = MAX_FREE(p) - kt_size;
881 if (new_max >= 0) {
882 int o = DIR_END(p) + new_max;
883 memmove(p + o, kt.get_address(), kt_size);
884 setD(p, c, o);
885 SET_MAX_FREE(p, new_max);
886 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
887 } else {
888 /* do it the long way */
889 delete_item(0, false);
890 add_item(kt, 0);
893 } else {
894 /* addition */
895 if (changed_n == C[0].n && changed_c == C[0].c) {
896 if (seq_count < 0) seq_count++;
897 } else {
898 seq_count = SEQ_START_POINT;
899 sequential = false;
901 C[0].c += D2;
902 add_item(kt, 0);
904 RETURN(components);
/* delete_kt() corresponds to add_kt(found), but there are only
   two cases: if the key is not found nothing is done, and if it is
   found the corresponding item is deleted with delete_item.
*/
913 ChertTable::delete_kt()
915 LOGCALL(DB, int, "ChertTable::delete_kt", NO_ARGS);
916 Assert(writable);
918 bool found = find(C);
920 int components = 0;
921 seq_count = SEQ_START_POINT;
922 sequential = false;
926 printf("%d) %s ", addcount++, (found ? "deleting " : "ignoring "));
927 print_bytes(B->kt[I2] - K1 - C2, B->kt + I2 + K1); putchar('\n');
930 if (found) {
931 components = Item(C[0].p, C[0].c).components_of();
932 alter();
933 delete_item(0, true);
935 RETURN(components);
/* ChertTable::form_key(key) treats address kt as an item holder and fills in
   the key part:

           (I) K key c (C tag)

   The bracketed parts are left blank. The key is filled in with key_len bytes and
   K set accordingly. c is set to 1.
*/
947 void ChertTable::form_key(const string & key) const
949 LOGCALL_VOID(DB, "ChertTable::form_key", key);
950 kt.form_key(key);
/* ChertTable::add(key, tag) adds the key/tag item to the
   B-tree, replacing any existing item with the same key.

   For a long tag, we end up having to add m components, of the form

       key 1 m tag1
       key 2 m tag2
       ...
       key m m tagm

   and tag1+tag2+...+tagm are equal to tag. These in their turn may be replacing
   n components of the form

       key 1 n TAG1
       key 2 n TAG2
       ...
       key n n TAGn

   and n may be greater than, equal to, or less than m. These cases are dealt
   with in the code below. If m < n for example, we end up with a series of
   deletions.
*/
976 void
977 ChertTable::add(const string &key, string tag, bool already_compressed)
979 LOGCALL_VOID(DB, "ChertTable::add", key | tag | already_compressed);
980 Assert(writable);
982 if (handle < 0) create_and_open(block_size);
984 form_key(key);
986 bool compressed = false;
987 if (already_compressed) {
988 compressed = true;
989 } else if (compress_strategy != DONT_COMPRESS && tag.size() > COMPRESS_MIN) {
990 static_assert(DONT_COMPRESS != Z_DEFAULT_STRATEGY,
991 "DONT_COMPRESS clashes with zlib constant");
992 static_assert(DONT_COMPRESS != Z_FILTERED,
993 "DONT_COMPRESS clashes with zlib constant");
994 static_assert(DONT_COMPRESS != Z_HUFFMAN_ONLY,
995 "DONT_COMPRESS clashes with zlib constant");
996 #ifdef Z_RLE
997 static_assert(DONT_COMPRESS != Z_RLE,
998 "DONT_COMPRESS clashes with zlib constant");
999 #endif
1001 lazy_alloc_deflate_zstream();
1003 deflate_zstream->next_in =
1004 reinterpret_cast<Bytef *>(const_cast<char *>(tag.data()));
1005 deflate_zstream->avail_in = static_cast<uInt>(tag.size());
1007 // If compressed size is >= tag.size(), we don't want to compress.
1008 unsigned long blk_len = tag.size() - 1;
1009 unsigned char * blk = new unsigned char[blk_len];
1010 deflate_zstream->next_out = blk;
1011 deflate_zstream->avail_out = static_cast<uInt>(blk_len);
1013 int err = deflate(deflate_zstream, Z_FINISH);
1014 if (err == Z_STREAM_END) {
1015 // If deflate succeeded, then the output was at least one byte
1016 // smaller than the input.
1017 tag.assign(reinterpret_cast<const char *>(blk), deflate_zstream->total_out);
1018 compressed = true;
1019 } else {
1020 // Deflate failed - presumably the data wasn't compressible.
1023 delete [] blk;
1026 // sort of matching kt.append_chunk(), but setting the chunk
1027 const size_t cd = kt.key().length() + K1 + I2 + C2 + C2; // offset to the tag data
1028 const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
1029 size_t first_L = L; // - amount for tag1
1030 bool found = find(C);
1031 if (!found) {
1032 byte * p = C[0].p;
1033 size_t n = TOTAL_FREE(p) % (max_item_size + D2);
1034 if (n > D2 + cd) {
1035 n -= (D2 + cd);
    // if n >= last then fully filling this block won't produce
    // an extra item, so we might as well do this even if
    // full_compaction isn't active.

    // In the full_compaction case, it turns out we shouldn't always
    // try to fill every last byte.  Doing so can actually increase the
    // total space required (I believe this effect is due to longer
    // dividing keys being required in the index blocks).  Empirically,
    // n >= key.size() + K appears a good criterion for K ~= 34.  This
    // seems to save about 0.2% in total database size over always
    // splitting the tag.  It'll also give slightly faster retrieval
    // as we can avoid reading an extra block occasionally.
1048 size_t last = tag.length() % L;
1049 if (n >= last || (full_compaction && n >= key.size() + 34))
1050 first_L = n;
1054 // a null tag must be added in of course
1055 int m = tag.empty() ? 1 : (tag.length() - first_L + L - 1) / L + 1;
1056 // there are m items to add
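    // e.g. a 2500 byte tag with L == 1000 and first_L left at 1000 gives
    // m == (2500 - 1000 + 999) / 1000 + 1 == 3: chunks of 1000, 1000 and 500
    // bytes, each stored under the same key with component numbers 1, 2, 3.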
1057 /* FIXME: sort out this error higher up and turn this into
1058 * an assert.
1060 if (m >= BYTE_PAIR_RANGE)
1061 throw Xapian::UnimplementedError("Can't handle insanely large tags");
1063 int n = 0; // initialise to shut off warning
1064 // - and there will be n to delete
1065 int o = 0; // Offset into the tag
1066 size_t residue = tag.length(); // Bytes of the tag remaining to add in
1067 int replacement = false; // Has there been a replacement ?
1068 int i;
1069 kt.set_components_of(m);
1070 for (i = 1; i <= m; i++) {
1071 size_t l = (i == m ? residue : (i == 1 ? first_L : L));
1072 Assert(cd + l <= block_size);
1073 Assert(string::size_type(o + l) <= tag.length());
1074 kt.set_tag(cd, tag.data() + o, l, compressed);
1075 kt.set_component_of(i);
1077 o += l;
1078 residue -= l;
1080 if (i > 1) found = find(C);
1081 n = add_kt(found);
1082 if (n > 0) replacement = true;
1084 /* o == tag.length() here, and n may be zero */
1085 for (i = m + 1; i <= n; i++) {
1086 kt.set_component_of(i);
1087 delete_kt();
1089 if (!replacement) ++item_count;
1090 Btree_modified = true;
1091 if (cursor_created_since_last_modification) {
1092 cursor_created_since_last_modification = false;
1093 ++cursor_version;
/* ChertTable::del(key) returns false if the key is not in the B-tree,
   otherwise deletes it and returns true.

   Again, this is parallel to ChertTable::add, but simpler in form.
*/
1103 bool
1104 ChertTable::del(const string &key)
1106 LOGCALL(DB, bool, "ChertTable::del", key);
1107 Assert(writable);
1109 if (handle < 0) {
1110 if (handle == -2) {
1111 ChertTable::throw_database_closed();
1113 RETURN(false);
    // We can't delete a key which is too long for us to store.
1117 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1119 if (key.empty()) RETURN(false);
1120 form_key(key);
1122 int n = delete_kt(); /* there are n items to delete */
1123 if (n <= 0) RETURN(false);
1125 for (int i = 2; i <= n; i++) {
1126 kt.set_component_of(i);
1127 delete_kt();
1130 item_count--;
1131 Btree_modified = true;
1132 if (cursor_created_since_last_modification) {
1133 cursor_created_since_last_modification = false;
1134 ++cursor_version;
1136 RETURN(true);
1139 bool
1140 ChertTable::readahead_key(const string &key) const
1142 LOGCALL(DB, bool, "ChertTable::readahead_key", key);
1143 Assert(!key.empty());
1145 // Two cases:
1147 // handle = -1: Lazy table which isn't yet open
1149 // handle = -2: Table has been closed. Since the readahead is just a
1150 // hint, we can safely ignore it for a closed table.
1151 if (handle < 0)
1152 RETURN(false);
1154 // If the table only has one level, there are no branch blocks to preread.
1155 if (level == 0)
1156 RETURN(false);
1158 form_key(key);
1159 Key ktkey = kt.key();
1161 // We'll only readahead the first level, since descending the B-tree would
1162 // require actual reads that would likely hurt performance more than help.
1163 const byte * p = C[level].p;
1164 int c = find_in_block(p, ktkey, false, C[level].c);
1165 uint4 n = Item(p, c).block_given_by();
1166 // Don't preread if it's the block we last preread or already in the
1167 // cursor.
1168 if (n != last_readahead && n != C[level - 1].n) {
1169 /* Use the base bit_map_size not the bitmap's size, because the latter
1170 * is uninitialised in readonly mode.
1172 Assert(n / CHAR_BIT < base.get_bit_map_size());
1174 last_readahead = n;
1175 if (!io_readahead_block(handle, block_size, n))
1176 RETURN(false);
1178 RETURN(true);
1181 bool
1182 ChertTable::get_exact_entry(const string &key, string & tag) const
1184 LOGCALL(DB, bool, "ChertTable::get_exact_entry", key | tag);
1185 Assert(!key.empty());
1187 if (handle < 0) {
1188 if (handle == -2) {
1189 ChertTable::throw_database_closed();
1191 RETURN(false);
1194 // An oversized key can't exist, so attempting to search for it should fail.
1195 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1197 form_key(key);
1198 if (!find(C)) RETURN(false);
1200 (void)read_tag(C, &tag, false);
1201 RETURN(true);
1204 bool
1205 ChertTable::key_exists(const string &key) const
1207 LOGCALL(DB, bool, "ChertTable::key_exists", key);
1208 Assert(!key.empty());
1210 // An oversized key can't exist, so attempting to search for it should fail.
1211 if (key.size() > CHERT_BTREE_MAX_KEY_LEN) RETURN(false);
1213 form_key(key);
1214 RETURN(find(C));
1217 bool
1218 ChertTable::read_tag(Cursor * C_, string *tag, bool keep_compressed) const
1220 LOGCALL(DB, bool, "ChertTable::read_tag", Literal("C_") | tag | keep_compressed);
1221 Item item(C_[0].p, C_[0].c);
1223 /* n components to join */
1224 int n = item.components_of();
1226 tag->resize(0);
1227 // max_item_size also includes K1 + I2 + C2 + C2 bytes overhead and the key
1228 // (which is at least 1 byte long).
1229 if (n > 1) tag->reserve((max_item_size - (1 + K1 + I2 + C2 + C2)) * n);
1231 item.append_chunk(tag);
1232 bool compressed = item.get_compressed();
1234 for (int i = 2; i <= n; i++) {
1235 if (!next(C_, 0)) {
1236 throw Xapian::DatabaseCorruptError("Unexpected end of table when reading continuation of tag");
1238 (void)Item(C_[0].p, C_[0].c).append_chunk(tag);
1240 // At this point the cursor is on the last item - calling next will move
1241 // it to the next key (ChertCursor::get_tag() relies on this).
1242 if (!compressed || keep_compressed) RETURN(compressed);
1244 // FIXME: Perhaps we should decompress each chunk as we read it so we
1245 // don't need both the full compressed and uncompressed tags in memory
1246 // at once.
1248 string utag;
1249 // May not be enough for a compressed tag, but it's a reasonable guess.
1250 utag.reserve(tag->size() + tag->size() / 2);
1252 Bytef buf[8192];
1254 lazy_alloc_inflate_zstream();
1256 inflate_zstream->next_in =
1257 reinterpret_cast<Bytef*>(const_cast<char *>(tag->data()));
1258 inflate_zstream->avail_in = static_cast<uInt>(tag->size());
1260 int err = Z_OK;
1261 while (err != Z_STREAM_END) {
1262 inflate_zstream->next_out = buf;
1263 inflate_zstream->avail_out = static_cast<uInt>(sizeof(buf));
1264 err = inflate(inflate_zstream, Z_SYNC_FLUSH);
1265 if (err == Z_BUF_ERROR && inflate_zstream->avail_in == 0) {
1266 LOGLINE(DB, "Z_BUF_ERROR - faking checksum of " << inflate_zstream->adler);
1267 Bytef header2[4];
1268 aligned_write4(header2, inflate_zstream->adler);
1269 inflate_zstream->next_in = header2;
1270 inflate_zstream->avail_in = 4;
1271 err = inflate(inflate_zstream, Z_SYNC_FLUSH);
1272 if (err == Z_STREAM_END) break;
1275 if (err != Z_OK && err != Z_STREAM_END) {
1276 if (err == Z_MEM_ERROR) throw std::bad_alloc();
1277 string msg = "inflate failed";
1278 if (inflate_zstream->msg) {
1279 msg += " (";
1280 msg += inflate_zstream->msg;
1281 msg += ')';
1283 throw Xapian::DatabaseError(msg);
1286 utag.append(reinterpret_cast<const char *>(buf),
1287 inflate_zstream->next_out - buf);
1289 if (utag.size() != inflate_zstream->total_out) {
1290 string msg = "compressed tag didn't expand to the expected size: ";
1291 msg += str(utag.size());
1292 msg += " != ";
1293 // OpenBSD's zlib.h uses off_t instead of uLong for total_out.
1294 msg += str(size_t(inflate_zstream->total_out));
1295 throw Xapian::DatabaseCorruptError(msg);
1298 swap(*tag, utag);
1300 RETURN(false);
1303 void
1304 ChertTable::set_full_compaction(bool parity)
1306 LOGCALL_VOID(DB, "ChertTable::set_full_compaction", parity);
1307 Assert(writable);
1309 if (parity) seq_count = 0;
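    // Setting seq_count to 0 switches add_item() straight into its
    // sequential-addition splitting behaviour, which otherwise only kicks in
    // after enough consecutive sequential additions (see SEQ_START_POINT).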
1310 full_compaction = parity;
1313 ChertCursor * ChertTable::cursor_get() const {
1314 LOGCALL(DB, ChertCursor *, "ChertTable::cursor_get", NO_ARGS);
1315 if (handle < 0) {
1316 if (handle == -2) {
1317 ChertTable::throw_database_closed();
1319 RETURN(NULL);
1321 // FIXME Ick - casting away const is nasty
1322 RETURN(new ChertCursor(const_cast<ChertTable *>(this)));
/************ B-tree opening and closing ************/
1327 bool
1328 ChertTable::basic_open(bool revision_supplied, chert_revision_number_t revision_)
1330 LOGCALL(DB, bool, "ChertTable::basic_open", revision_supplied | revision_);
1331 int ch = 'X'; /* will be 'A' or 'B' */
1334 const size_t BTREE_BASES = 2;
1335 string err_msg;
1336 static const char basenames[BTREE_BASES] = { 'A', 'B' };
1338 ChertTable_base bases[BTREE_BASES];
1339 bool base_ok[BTREE_BASES];
1341 both_bases = true;
1342 bool valid_base = false;
1344 for (size_t i = 0; i < BTREE_BASES; ++i) {
1345 bool ok = bases[i].read(name, basenames[i], writable, err_msg);
1346 base_ok[i] = ok;
1347 if (ok) {
1348 valid_base = true;
1349 } else {
1350 both_bases = false;
1355 if (!valid_base) {
1356 if (handle >= 0) {
1357 ::close(handle);
1358 handle = -1;
1360 string message = "Error opening table '";
1361 message += name;
1362 message += "':\n";
1363 message += err_msg;
1364 throw Xapian::DatabaseOpeningError(message);
1367 if (revision_supplied) {
1368 bool found_revision = false;
1369 for (size_t i = 0; i < BTREE_BASES; ++i) {
1370 if (base_ok[i] && bases[i].get_revision() == revision_) {
1371 ch = basenames[i];
1372 found_revision = true;
1373 break;
1376 if (!found_revision) {
1377 /* Couldn't open the revision that was asked for.
1378 * This shouldn't throw an exception, but should just return
1379 * false to upper levels.
1381 RETURN(false);
1383 } else {
1384 chert_revision_number_t highest_revision = 0;
1385 for (size_t i = 0; i < BTREE_BASES; ++i) {
1386 if (base_ok[i] && bases[i].get_revision() >= highest_revision) {
1387 ch = basenames[i];
1388 highest_revision = bases[i].get_revision();
1393 ChertTable_base *basep = 0;
1394 ChertTable_base *other_base = 0;
1396 for (size_t i = 0; i < BTREE_BASES; ++i) {
1397 LOGLINE(DB, "Checking (ch == " << ch << ") against "
1398 "basenames[" << i << "] == " << basenames[i]);
1399 LOGLINE(DB, "bases[" << i << "].get_revision() == " <<
1400 bases[i].get_revision());
1401 LOGLINE(DB, "base_ok[" << i << "] == " << base_ok[i]);
1402 if (ch == basenames[i]) {
1403 basep = &bases[i];
1405 // FIXME: assuming only two bases for other_base
1406 size_t otherbase_num = 1 - i;
1407 if (base_ok[otherbase_num]) {
1408 other_base = &bases[otherbase_num];
1410 break;
1413 Assert(basep);
1415 /* basep now points to the most recent base block */
1417 /* Avoid copying the bitmap etc. - swap contents with the base
1418 * object in the vector, since it'll be destroyed anyway soon.
1420 base.swap(*basep);
1422 revision_number = base.get_revision();
1423 block_size = base.get_block_size();
1424 root = base.get_root();
1425 level = base.get_level();
1426 //bit_map_size = basep->get_bit_map_size();
1427 item_count = base.get_item_count();
1428 faked_root_block = base.get_have_fakeroot();
1429 sequential = base.get_sequential();
1431 if (other_base != 0) {
1432 latest_revision_number = other_base->get_revision();
1433 if (revision_number > latest_revision_number)
1434 latest_revision_number = revision_number;
1435 } else {
1436 latest_revision_number = revision_number;
1440 /* kt holds constructed items as well as keys */
1441 kt = Item_wr(zeroed_new(block_size));
1443 set_max_item_size(BLOCK_CAPACITY);
1445 base_letter = ch;
1447 if (cursor_created_since_last_modification) {
1448 cursor_created_since_last_modification = false;
1449 ++cursor_version;
1452 /* ready to open the main file */
1454 RETURN(true);
1457 void
1458 ChertTable::read_root()
1460 LOGCALL_VOID(DB, "ChertTable::read_root", NO_ARGS);
1461 if (faked_root_block) {
1462 /* root block for an unmodified database. */
1463 byte * p = C[0].p;
1464 Assert(p);
1466 /* clear block - shouldn't be necessary, but is a bit nicer,
1467 * and means that the same operations should always produce
1468 * the same database. */
1469 memset(p, 0, block_size);
1471 int o = block_size - I2 - K1 - C2 - C2;
1472 Item_wr(p + o).fake_root_item();
1474 setD(p, DIR_START, o); // its directory entry
1475 SET_DIR_END(p, DIR_START + D2);// the directory size
1477 o -= (DIR_START + D2);
1478 SET_MAX_FREE(p, o);
1479 SET_TOTAL_FREE(p, o);
1480 SET_LEVEL(p, 0);
1482 if (!writable) {
1483 /* reading - revision number doesn't matter as long as
1484 * it's not greater than the current one. */
1485 SET_REVISION(p, 0);
1486 C[0].n = 0;
1487 } else {
1488 /* writing - */
1489 SET_REVISION(p, latest_revision_number + 1);
1490 C[0].n = base.next_free_block();
1492 } else {
1493 /* using a root block stored on disk */
1494 block_to_cursor(C, level, root);
1496 if (REVISION(C[level].p) > revision_number) set_overwritten();
1497 /* although this is unlikely */
1501 bool
1502 ChertTable::do_open_to_write(bool revision_supplied,
1503 chert_revision_number_t revision_,
1504 bool create_db)
1506 LOGCALL(DB, bool, "ChertTable::do_open_to_write", revision_supplied | revision_ | create_db);
1507 if (handle == -2) {
1508 ChertTable::throw_database_closed();
1510 handle = io_open_block_wr(name + "DB", create_db);
1511 if (handle < 0) {
1512 // lazy doesn't make a lot of sense with create_db anyway, but ENOENT
1513 // with O_CREAT means a parent directory doesn't exist.
1514 if (lazy && !create_db && errno == ENOENT) {
1515 revision_number = revision_;
1516 RETURN(true);
1518 string message(create_db ? "Couldn't create " : "Couldn't open ");
1519 message += name;
1520 message += "DB read/write: ";
1521 errno_to_string(errno, message);
1522 throw Xapian::DatabaseOpeningError(message);
1525 if (!basic_open(revision_supplied, revision_)) {
1526 ::close(handle);
1527 handle = -1;
1528 if (!revision_supplied) {
1529 throw Xapian::DatabaseOpeningError("Failed to open for writing");
1531 /* When the revision is supplied, it's not an exceptional
1532 * case when open failed, so we just return false here.
1534 RETURN(false);
1537 writable = true;
1539 for (int j = 0; j <= level; j++) {
1540 C[j].n = BLK_UNUSED;
1541 C[j].p = new byte[block_size];
1543 split_p = new byte[block_size];
1544 read_root();
1546 buffer = zeroed_new(block_size);
1548 changed_n = 0;
1549 changed_c = DIR_START;
1550 seq_count = SEQ_START_POINT;
1552 RETURN(true);
1555 ChertTable::ChertTable(const char * tablename_, const string & path_,
1556 bool readonly_, int compress_strategy_, bool lazy_)
1557 : tablename(tablename_),
1558 revision_number(0),
1559 item_count(0),
1560 block_size(0),
1561 latest_revision_number(0),
1562 both_bases(false),
1563 base_letter('A'),
1564 faked_root_block(true),
1565 sequential(true),
1566 handle(-1),
1567 level(0),
1568 root(0),
1569 kt(0),
1570 buffer(0),
1571 base(),
1572 name(path_),
1573 seq_count(0),
1574 changed_n(0),
1575 changed_c(0),
1576 max_item_size(0),
1577 Btree_modified(false),
1578 full_compaction(false),
1579 writable(!readonly_),
1580 cursor_created_since_last_modification(false),
1581 cursor_version(0),
1582 split_p(0),
1583 compress_strategy(compress_strategy_),
1584 deflate_zstream(NULL),
1585 inflate_zstream(NULL),
1586 lazy(lazy_),
1587 last_readahead(BLK_UNUSED)
1589 LOGCALL_CTOR(DB, "ChertTable", tablename_ | path_ | readonly_ | compress_strategy_ | lazy_);
1592 bool
1593 ChertTable::really_empty() const
1595 if (handle < 0) {
1596 if (handle == -2) {
1597 ChertTable::throw_database_closed();
1599 return true;
1601 ChertCursor cur(const_cast<ChertTable*>(this));
1602 cur.find_entry(string());
1603 return !cur.next();
1606 void
1607 ChertTable::lazy_alloc_deflate_zstream() const {
1608 if (usual(deflate_zstream)) {
1609 if (usual(deflateReset(deflate_zstream) == Z_OK)) return;
1610 // Try to recover by deleting the stream and starting from scratch.
1611 delete deflate_zstream;
1614 deflate_zstream = new z_stream;
1616 deflate_zstream->zalloc = Z_NULL;
1617 deflate_zstream->zfree = Z_NULL;
1618 deflate_zstream->opaque = Z_NULL;
1620 // -15 means raw deflate with 32K LZ77 window (largest)
1621 // memLevel 9 is the highest (8 is default)
1622 int err;
1623 err = deflateInit2(deflate_zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
1624 -15, 9, compress_strategy);
1625 if (rare(err != Z_OK)) {
1626 if (err == Z_MEM_ERROR) {
1627 delete deflate_zstream;
1628 deflate_zstream = 0;
1629 throw std::bad_alloc();
1631 string msg = "deflateInit2 failed (";
1632 if (deflate_zstream->msg) {
1633 msg += deflate_zstream->msg;
1634 } else {
1635 msg += str(err);
1637 msg += ')';
1638 delete deflate_zstream;
1639 deflate_zstream = 0;
1640 throw Xapian::DatabaseError(msg);
1644 void
1645 ChertTable::lazy_alloc_inflate_zstream() const {
1646 if (usual(inflate_zstream)) {
1647 if (usual(inflateReset(inflate_zstream) == Z_OK)) return;
1648 // Try to recover by deleting the stream and starting from scratch.
1649 delete inflate_zstream;
1652 inflate_zstream = new z_stream;
1654 inflate_zstream->zalloc = Z_NULL;
1655 inflate_zstream->zfree = Z_NULL;
1656 inflate_zstream->opaque = Z_NULL;
1658 inflate_zstream->next_in = Z_NULL;
1659 inflate_zstream->avail_in = 0;
1661 int err = inflateInit2(inflate_zstream, -15);
1662 if (rare(err != Z_OK)) {
1663 if (err == Z_MEM_ERROR) {
1664 delete inflate_zstream;
1665 inflate_zstream = 0;
1666 throw std::bad_alloc();
1668 string msg = "inflateInit2 failed (";
1669 if (inflate_zstream->msg) {
1670 msg += inflate_zstream->msg;
1671 } else {
1672 msg += str(err);
1674 msg += ')';
1675 delete inflate_zstream;
1676 inflate_zstream = 0;
1677 throw Xapian::DatabaseError(msg);
1681 bool
1682 ChertTable::exists() const {
1683 LOGCALL(DB, bool, "ChertTable::exists", NO_ARGS);
1684 RETURN(file_exists(name + "DB") &&
1685 (file_exists(name + "baseA") || file_exists(name + "baseB")));
1688 void
1689 ChertTable::erase()
1691 LOGCALL_VOID(DB, "ChertTable::erase", NO_ARGS);
1692 close();
1694 (void)io_unlink(name + "baseA");
1695 (void)io_unlink(name + "baseB");
1696 (void)io_unlink(name + "DB");
1699 void
1700 ChertTable::set_block_size(unsigned int block_size_)
1702 LOGCALL_VOID(DB, "ChertTable::set_block_size", block_size_);
    // Block size must be in the range 2048..BYTE_PAIR_RANGE, and a power of two.
1704 if (block_size_ < 2048 || block_size_ > BYTE_PAIR_RANGE ||
1705 (block_size_ & (block_size_ - 1)) != 0) {
1706 block_size_ = CHERT_DEFAULT_BLOCK_SIZE;
1708 block_size = block_size_;
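    // Note that an out-of-range or non-power-of-two size is silently replaced
    // by the default rather than reported, so callers always end up with a
    // usable block size.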
1711 void
1712 ChertTable::create_and_open(unsigned int block_size_)
1714 LOGCALL_VOID(DB, "ChertTable::create_and_open", block_size_);
1715 if (handle == -2) {
1716 ChertTable::throw_database_closed();
1718 Assert(writable);
1719 close();
1721 set_block_size(block_size_);
1723 // FIXME: it would be good to arrange that this works such that there's
1724 // always a valid table in place if you run create_and_open() on an
1725 // existing table.
1727 /* write initial values to files */
1729 /* create the base file */
1730 ChertTable_base base_;
1731 base_.set_revision(revision_number);
1732 base_.set_block_size(block_size);
1733 base_.set_have_fakeroot(true);
1734 base_.set_sequential(true);
1735 // Doing a full sync here would be overly paranoid, as an empty table
1736 // contains no precious data and xapian-check can recreate lost base
1737 // files.
1738 base_.write_to_file(name + "baseA", 'A', string(), -1, NULL);
1740 /* remove the alternative base file, if any */
1741 (void)io_unlink(name + "baseB");
1743 // Any errors are thrown if revision_supplied is false.
1744 (void)do_open_to_write(false, 0, true);
1747 ChertTable::~ChertTable() {
1748 LOGCALL_DTOR(DB, "ChertTable");
1749 ChertTable::close();
1751 if (deflate_zstream) {
1752 // Errors which we care about have already been handled, so just ignore
1753 // any which get returned here.
1754 (void) deflateEnd(deflate_zstream);
1755 delete deflate_zstream;
1758 if (inflate_zstream) {
1759 // Errors which we care about have already been handled, so just ignore
1760 // any which get returned here.
1761 (void) inflateEnd(inflate_zstream);
1762 delete inflate_zstream;
1766 void ChertTable::close(bool permanent) {
1767 LOGCALL_VOID(DB, "ChertTable::close", permanent);
1769 if (handle >= 0) {
1770 // If an error occurs here, we just ignore it, since we're just
1771 // trying to free everything.
1772 (void)::close(handle);
1773 handle = -1;
1776 if (permanent) {
1777 handle = -2;
1778 // Don't delete the resources in the table, since they may
1779 // still be used to look up cached content.
1780 return;
1782 for (int j = level; j >= 0; j--) {
1783 delete [] C[j].p;
1784 C[j].p = 0;
1786 delete [] split_p;
1787 split_p = 0;
1789 delete [] kt.get_address();
1790 kt = 0;
1791 delete [] buffer;
1792 buffer = 0;
1795 void
1796 ChertTable::flush_db()
1798 LOGCALL_VOID(DB, "ChertTable::flush_db", NO_ARGS);
1799 Assert(writable);
1800 if (handle < 0) {
1801 if (handle == -2) {
1802 ChertTable::throw_database_closed();
1804 return;
1807 for (int j = level; j >= 0; j--) {
1808 if (C[j].rewrite) {
1809 write_block(C[j].n, C[j].p);
1813 if (Btree_modified) {
1814 faked_root_block = false;
1818 void
1819 ChertTable::commit(chert_revision_number_t revision, int changes_fd,
1820 const string * changes_tail)
1822 LOGCALL_VOID(DB, "ChertTable::commit", revision | changes_fd | changes_tail);
1823 Assert(writable);
1825 if (revision <= revision_number) {
1826 throw Xapian::DatabaseError("New revision too low");
1829 if (handle < 0) {
1830 if (handle == -2) {
1831 ChertTable::throw_database_closed();
1833 latest_revision_number = revision_number = revision;
1834 return;
1837 try {
1838 if (faked_root_block) {
1839 /* We will use a dummy bitmap. */
1840 base.clear_bit_map();
1843 base.set_revision(revision);
1844 base.set_root(C[level].n);
1845 base.set_level(level);
1846 base.set_item_count(item_count);
1847 base.set_have_fakeroot(faked_root_block);
1848 base.set_sequential(sequential);
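        // The new revision is written under the other base letter (A <-> B),
        // so the base file for the previous revision survives on disk until
        // write_block() deletes it when the next batch of changes starts.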
1850 base_letter = other_base_letter();
1852 both_bases = true;
1853 latest_revision_number = revision_number = revision;
1854 root = C[level].n;
1856 Btree_modified = false;
1858 for (int i = 0; i < BTREE_CURSOR_LEVELS; ++i) {
1859 C[i].n = BLK_UNUSED;
1860 C[i].c = -1;
1861 C[i].rewrite = false;
1864 // Save to "<table>.tmp" and then rename to "<table>.base<letter>" so
1865 // that a reader can't try to read a partially written base file.
1866 string tmp = name;
1867 tmp += "tmp";
1868 string basefile = name;
1869 basefile += "base";
1870 basefile += char(base_letter);
1871 base.write_to_file(tmp, base_letter, tablename, changes_fd, changes_tail);
1873 // Do this as late as possible to allow maximum time for writes to
1874 // happen, and so the calls to io_sync() are adjacent which may be
1875 // more efficient, at least with some Linux kernel versions.
1876 if (changes_tail ? !io_full_sync(handle) : !io_sync(handle)) {
1877 (void)::close(handle);
1878 handle = -1;
1879 (void)unlink(tmp.c_str());
1880 throw Xapian::DatabaseError("Can't commit new revision - failed to flush DB to disk");
1883 if (!io_tmp_rename(tmp, basefile)) {
1884 string msg("Couldn't update base file ");
1885 msg += basefile;
1886 throw Xapian::DatabaseError(msg, errno);
1888 base.commit();
1890 read_root();
1892 changed_n = 0;
1893 changed_c = DIR_START;
1894 seq_count = SEQ_START_POINT;
1895 } catch (...) {
1896 ChertTable::close();
1897 throw;
1901 void
1902 ChertTable::write_changed_blocks(int changes_fd)
1904 LOGCALL_VOID(DB, "ChertTable::write_changed_blocks", changes_fd);
1905 Assert(changes_fd >= 0);
1906 if (handle < 0) return;
1907 if (faked_root_block) return;
1909 string buf;
1910 pack_uint(buf, 2u); // Indicate the item is a list of blocks
1911 pack_string(buf, tablename);
1912 pack_uint(buf, block_size);
1913 io_write(changes_fd, buf.data(), buf.size());
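    // So the changeset stream for this table is: 2, tablename, block_size,
    // then (block number + 1, raw block contents) for each changed block,
    // terminated by the 0 written at the end of this function.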
1915 // Compare the old and new bitmaps to find blocks which have changed, and
1916 // write them to the file descriptor.
1917 uint4 n = 0;
1918 byte * p = new byte[block_size];
1919 try {
1920 base.calculate_last_block();
1921 while (base.find_changed_block(&n)) {
1922 buf.resize(0);
1923 pack_uint(buf, n + 1);
1924 io_write(changes_fd, buf.data(), buf.size());
1926 // Read block n.
1927 read_block(n, p);
1929 // Write block n to the file.
1930 io_write(changes_fd, reinterpret_cast<const char *>(p),
1931 block_size);
1932 ++n;
1934 delete[] p;
1935 p = 0;
1936 } catch (...) {
1937 delete[] p;
1938 throw;
1940 buf.resize(0);
1941 pack_uint(buf, 0u);
1942 io_write(changes_fd, buf.data(), buf.size());
void
ChertTable::cancel()
{
    LOGCALL_VOID(DB, "ChertTable::cancel", NO_ARGS);
    Assert(writable);

    if (handle < 0) {
        if (handle == -2) {
            ChertTable::throw_database_closed();
        }
        latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...
        return;
    }

    // This causes problems: if (!Btree_modified) return;

    string err_msg;
    if (!base.read(name, base_letter, writable, err_msg)) {
        throw Xapian::DatabaseCorruptError(string("Couldn't reread base ") + base_letter);
    }

    revision_number = base.get_revision();
    block_size = base.get_block_size();
    root = base.get_root();
    level = base.get_level();
    //bit_map_size = basep->get_bit_map_size();
    item_count = base.get_item_count();
    faked_root_block = base.get_have_fakeroot();
    sequential = base.get_sequential();

    latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...

    Btree_modified = false;

    for (int j = 0; j <= level; j++) {
        C[j].n = BLK_UNUSED;
        C[j].rewrite = false;
    }
    read_root();

    changed_n = 0;
    changed_c = DIR_START;
    seq_count = SEQ_START_POINT;

    if (cursor_created_since_last_modification) {
        cursor_created_since_last_modification = false;
        ++cursor_version;
    }
}

/************ B-tree reading ************/

bool
ChertTable::do_open_to_read(bool revision_supplied, chert_revision_number_t revision_)
{
    LOGCALL(DB, bool, "ChertTable::do_open_to_read", revision_supplied | revision_);
    if (handle == -2) {
        ChertTable::throw_database_closed();
    }
    handle = io_open_block_rd(name + "DB");
    if (handle < 0) {
        if (lazy) {
            // This table is optional when reading!
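            // (A lazy table, such as the spelling or synonym table, is only
            // created when it's first used, so its DB file may legitimately
            // not exist yet; treat it as empty at the requested revision.)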
            revision_number = revision_;
            RETURN(true);
        }
        string message("Couldn't open ");
        message += name;
        message += "DB to read: ";
        errno_to_string(errno, message);
        throw Xapian::DatabaseOpeningError(message);
    }

    if (!basic_open(revision_supplied, revision_)) {
        ::close(handle);
        handle = -1;
        if (revision_supplied) {
            // The requested revision was not available.
            // This could be because the database was modified underneath us,
            // or because a base file is missing.  Return false, and work out
            // what the problem was at a higher level.
            RETURN(false);
        }
        throw Xapian::DatabaseOpeningError("Failed to open table for reading");
    }

    for (int j = 0; j <= level; j++) {
        C[j].n = BLK_UNUSED;
        C[j].p = new byte[block_size];
    }

    read_root();
    RETURN(true);
}

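// Open the table at the latest revision.  Unlike the overload below, which
// takes an explicit revision and returns false on failure, any problem here
// is reported by throwing an exception.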
void
ChertTable::open()
{
    LOGCALL_VOID(DB, "ChertTable::open", NO_ARGS);
    LOGLINE(DB, "opening at path " << name);
    close();

    if (!writable) {
        // Any errors are thrown if revision_supplied is false.
        (void)do_open_to_read(false, 0);
        return;
    }

    // Any errors are thrown if revision_supplied is false.
    (void)do_open_to_write(false, 0);
}

bool
ChertTable::open(chert_revision_number_t revision)
{
    LOGCALL(DB, bool, "ChertTable::open", revision);
    LOGLINE(DB, "opening for particular revision at path " << name);
    close();

    if (!writable) {
        if (do_open_to_read(true, revision)) {
            AssertEq(revision_number, revision);
            RETURN(true);
        } else {
            close();
            RETURN(false);
        }
    }

    if (!do_open_to_write(true, revision)) {
        // Can't open at the requested revision.
        close();
        RETURN(false);
    }

    AssertEq(revision_number, revision);
    RETURN(true);
}

bool
ChertTable::prev_for_sequential(Cursor * C_, int /*dummy*/) const
{
    LOGCALL(DB, bool, "ChertTable::prev_for_sequential", Literal("C_") | Literal("/*dummy*/"));
    int c = C_[0].c;
    AssertRel(DIR_START,<=,c);
    AssertRel(c,<,DIR_END(C_[0].p));
    if (c == DIR_START) {
        byte * p = C_[0].p;
        Assert(p);
        uint4 n = C_[0].n;
        while (true) {
            if (n == 0) RETURN(false);
            n--;
            if (writable) {
                if (n == C[0].n) {
                    // Block is a leaf block in the built-in cursor
                    // (potentially in modified form).
                    memcpy(p, C[0].p, block_size);
                } else {
                    // Blocks in the built-in cursor may not have been written
                    // to disk yet, so we have to check that the block number
                    // isn't in the built-in cursor or we'll read an
                    // uninitialised block (for which GET_LEVEL(p) will
                    // probably return 0).
                    int j;
                    for (j = 1; j <= level; ++j) {
                        if (n == C[j].n) break;
                    }
                    if (j <= level) continue;

                    // Block isn't in the built-in cursor, so the form on disk
                    // is valid, so read it to check if it's the next level 0
                    // block.
                    read_block(n, p);
                }
            } else {
                read_block(n, p);
            }
            if (writable) AssertEq(revision_number, latest_revision_number);
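            // REVISION(p) is the revision stamped into the block when it was
            // last written.  A writer may see blocks it has itself written
            // for the next (still uncommitted) revision, hence the
            // "+ writable"; anything newer than that means the database has
            // been changed under us, so report it via set_overwritten().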
            if (REVISION(p) > revision_number + writable) {
                set_overwritten();
                RETURN(false);
            }
            if (GET_LEVEL(p) == 0) break;
        }
        c = DIR_END(p);
        C_[0].n = n;
        AssertRel(DIR_START,<,c);
    }
    c -= D2;
    C_[0].c = c;
    RETURN(true);
}

bool
ChertTable::next_for_sequential(Cursor * C_, int /*dummy*/) const
{
    LOGCALL(DB, bool, "ChertTable::next_for_sequential", Literal("C_") | Literal("/*dummy*/"));
    byte * p = C_[0].p;
    Assert(p);
    int c = C_[0].c;
    AssertRel(c,<,DIR_END(p));
    c += D2;
    Assert((unsigned)c < block_size);
    if (c == DIR_END(p)) {
        uint4 n = C_[0].n;
        while (true) {
            n++;
            if (n > base.get_last_block()) RETURN(false);
            if (writable) {
                if (n == C[0].n) {
                    // Block is a leaf block in the built-in cursor
                    // (potentially in modified form).
                    memcpy(p, C[0].p, block_size);
                } else {
                    // Blocks in the built-in cursor may not have been written
                    // to disk yet, so we have to check that the block number
                    // isn't in the built-in cursor or we'll read an
                    // uninitialised block (for which GET_LEVEL(p) will
                    // probably return 0).
                    int j;
                    for (j = 1; j <= level; ++j) {
                        if (n == C[j].n) break;
                    }
                    if (j <= level) continue;

                    // Block isn't in the built-in cursor, so the form on disk
                    // is valid, so read it to check if it's the next level 0
                    // block.
                    read_block(n, p);
                }
            } else {
                read_block(n, p);
            }
            if (writable) AssertEq(revision_number, latest_revision_number);
            if (REVISION(p) > revision_number + writable) {
                set_overwritten();
                RETURN(false);
            }
            if (GET_LEVEL(p) == 0) break;
        }
        c = DIR_START;
        C_[0].n = n;
    }
    C_[0].c = c;
    RETURN(true);
}

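// The default (non-sequential) movement helpers walk the tree recursively:
// they step the directory position in the block at level j, recurse to level
// j + 1 when that steps off the start (or end) of the block so a sibling
// block gets loaded, and then descend by loading the child block the new
// position points to.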
bool
ChertTable::prev_default(Cursor * C_, int j) const
{
    LOGCALL(DB, bool, "ChertTable::prev_default", Literal("C_") | j);
    byte * p = C_[j].p;
    int c = C_[j].c;
    AssertRel(DIR_START,<=,c);
    AssertRel(c,<,DIR_END(p));
    AssertRel((unsigned)DIR_END(p),<=,block_size);
    if (c == DIR_START) {
        if (j == level) RETURN(false);
        if (!prev_default(C_, j + 1)) RETURN(false);
        c = DIR_END(p);
        AssertRel(DIR_START,<,c);
    }
    c -= D2;
    C_[j].c = c;
    if (j > 0) {
        block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
    }
    RETURN(true);
}

bool
ChertTable::next_default(Cursor * C_, int j) const
{
    LOGCALL(DB, bool, "ChertTable::next_default", Literal("C_") | j);
    byte * p = C_[j].p;
    int c = C_[j].c;
    AssertRel(c,<,DIR_END(p));
    AssertRel((unsigned)DIR_END(p),<=,block_size);
    c += D2;
    if (j > 0) {
        AssertRel(DIR_START,<,c);
    } else {
        AssertRel(DIR_START,<=,c);
    }
    // Sometimes c can be DIR_END(p) + 2 here it appears...
    if (c >= DIR_END(p)) {
        if (j == level) RETURN(false);
        if (!next_default(C_, j + 1)) RETURN(false);
        c = DIR_START;
    }
    C_[j].c = c;
    if (j > 0) {
        block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
#ifdef BTREE_DEBUG_FULL
        printf("Block in ChertTable:next_default");
        report_block_full(j - 1, C_[j - 1].n, C_[j - 1].p);
#endif /* BTREE_DEBUG_FULL */
    }
    RETURN(true);
}

void
ChertTable::throw_database_closed()
{
    throw Xapian::DatabaseError("Database has been closed");
}

/** Compares this key with key2.

    The result is true if this key precedes key2.  The comparison is for byte
    sequence collating order, taking lengths into account.  So if the keys are
    made up of lower case ASCII letters we get alphabetical ordering.

    Now remember that items are added into the B-tree in fastest time
    when they are preordered by their keys.  This is therefore the piece
    of code that needs to be followed to arrange for the preordering.

    This is complicated by the fact that keys have two parts - a value
    and then a count.  We first compare the values, and only if they
    are equal do we compare the counts.
*/

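/* For example (purely illustrative): treating keys as (value, count) pairs,
   ("abc", 0) precedes ("abd", 0) because the values differ at the third byte;
   ("abc", 0) precedes ("abcd", 0) because a shorter value which is a prefix
   of a longer one sorts first; and ("abc", 0) precedes ("abc", 1) because
   equal values fall back to comparing the counts.
*/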
bool Key::operator<(Key key2) const
{
    LOGCALL(DB, bool, "Key::operator<", static_cast<const void*>(key2.p));
    int key1_len = length();
    int key2_len = key2.length();
    if (key1_len == key2_len) {
        // The keys are the same length, so we can compare the counts
        // in the same operation since they're stored as 2 byte
        // bigendian numbers.
        RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) < 0);
    }

    int k_smaller = (key2_len < key1_len ? key2_len : key1_len);

    // Compare the common part of the keys.
    int diff = memcmp(p + K1, key2.p + K1, k_smaller);
    if (diff != 0) RETURN(diff < 0);

    // We dealt with the "same length" case above so we never need to check
    // the count here.
    RETURN(key1_len < key2_len);
}

bool Key::operator==(Key key2) const
{
    LOGCALL(DB, bool, "Key::operator==", static_cast<const void*>(key2.p));
    int key1_len = length();
    if (key1_len != key2.length()) RETURN(false);
    // The keys are the same length, so we can compare the counts
    // in the same operation since they're stored as 2 byte
    // bigendian numbers.
    RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) == 0);
}