Merge branch 'honey-spelling-changes'
[xapian.git] / xapian-core / backends / honey / honey_compact.cc
blob9f70fa910019d5ae179cf36cd1e81b198b3a96e5
1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
3 */
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
29 #include <algorithm>
30 #include <memory>
31 #include <queue>
32 #include <type_traits>
34 #include <cstdio>
36 #include "safeerrno.h"
38 #include "backends/flint_lock.h"
39 #include "compression_stream.h"
40 #include "honey_cursor.h"
41 #include "honey_database.h"
42 #include "honey_defs.h"
43 #include "honey_postlist_encodings.h"
44 #include "honey_table.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
48 #include "pack.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifndef DISABLE_GPL_LIBXAPIAN
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
59 #endif
61 using namespace std;
63 #ifndef DISABLE_GPL_LIBXAPIAN
64 [[noreturn]]
65 static void
66 throw_database_corrupt(const char* item, const char* pos)
68 string message;
69 if (pos != NULL) {
70 message = "Value overflow unpacking termlist: ";
71 } else {
72 message = "Out of data unpacking termlist: ";
74 message += item;
75 throw Xapian::DatabaseCorruptError(message);
78 namespace GlassCompact {
/// Test if a glass postlist-table key is a user metadata key ("\0\xc0...").
static inline bool
is_user_metadata_key(const std::string& key)
{
    return key.size() > 1 && key[0] == '\0' && key[1] == '\xc0';
}
/// Test if a glass postlist-table key is a value statistics key ("\0\xd0...").
static inline bool
is_valuestats_key(const std::string& key)
{
    return key.size() > 1 && key[0] == '\0' && key[1] == '\xd0';
}
/// Test if a glass postlist-table key is a value chunk key ("\0\xd8...").
static inline bool
is_valuechunk_key(const std::string& key)
{
    return key.size() > 1 && key[0] == '\0' && key[1] == '\xd8';
}
/// Test if a glass postlist-table key is a doclen chunk key ("\0\xe0...").
static inline bool
is_doclenchunk_key(const std::string& key)
{
    return key.size() > 1 && key[0] == '\0' && key[1] == '\xe0';
}
106 inline static bool
107 termlist_key_is_values_used(const string& key)
109 const char* p = key.data();
110 const char* e = p + key.size();
111 Xapian::docid did;
112 if (unpack_uint_preserving_sort(&p, e, &did)) {
113 if (p == e)
114 return false;
115 if (e - p == 1 && *p == '\0')
116 return true;
118 throw Xapian::DatabaseCorruptError("termlist key format");
120 #endif
122 // Put all the helpers in a namespace to avoid symbols colliding with those of
123 // the same name in other flint-derived backends.
124 namespace HoneyCompact {
/// Key-type discriminators for honey postlist-table keys (value of key[1],
/// with the exception of KEY_POSTING_CHUNK which covers everything else).
enum {
    KEY_USER_METADATA = 0x00,	// User metadata entries.
    KEY_VALUE_STATS = 0x01,	// Per-slot value statistics.
    KEY_VALUE_CHUNK = 0xd8,	// Value stream chunks.
    KEY_DOCLEN_CHUNK = 0xe0,	// Document length chunks.
    KEY_POSTING_CHUNK = 0xff	// Term posting chunks.
};
134 /// Return one of the KEY_* constants, or a different value for an invalid key.
135 static inline int
136 key_type(const string& key)
138 if (key[0] != '\0')
139 return KEY_POSTING_CHUNK;
141 if (key.size() <= 1)
142 return -1;
144 switch (static_cast<unsigned char>(key[1])) {
145 case 0x01: case 0x02: case 0x03: case 0x04:
146 return KEY_VALUE_STATS;
149 // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK.
150 return static_cast<unsigned char>(key[1]);
153 template<typename T> class PostlistCursor;
155 #ifndef DISABLE_GPL_LIBXAPIAN
156 // Convert glass to honey.
// Cursor over a glass postlist table which rewrites each entry into honey's
// format as it advances (used when converting glass -> honey during
// compaction/merging).  Keys and tags are translated in place by next().
157 template<>
158 class PostlistCursor<const GlassTable&> : private GlassCursor {
// Amount added to every document id read from this input table, so that
// docids from multiple inputs don't collide when merging.
159 Xapian::docid offset;
161 public:
162 string key, tag;
163 Xapian::docid firstdid;
164 Xapian::docid chunk_lastdid;
165 Xapian::termcount tf, cf;
166 Xapian::termcount first_wdf;
167 Xapian::termcount wdf_max;
// Open a cursor on *in and position it on the first entry, applying
// docid offset offset_ to everything read.
169 PostlistCursor(const GlassTable *in, Xapian::docid offset_)
170 : GlassCursor(in), offset(offset_), firstdid(0)
172 rewind();
173 next();
// Advance to the next entry, translating its key and tag into honey
// form.  Returns false once the input is exhausted.
176 bool next() {
177 if (!GlassCursor::next()) return false;
178 // We put all chunks into the non-initial chunk form here, then fix up
179 // the first chunk for each term in the merged database as we merge.
180 read_tag();
181 key = current_key;
182 tag = current_tag;
183 tf = cf = 0;
184 if (GlassCompact::is_user_metadata_key(key)) {
185 key[1] = KEY_USER_METADATA;
186 return true;
188 if (GlassCompact::is_valuestats_key(key)) {
189 // Adjust key.
190 const char * p = key.data();
191 const char * end = p + key.length();
192 p += 2;
193 Xapian::valueno slot;
194 if (!unpack_uint_last(&p, end, &slot))
195 throw Xapian::DatabaseCorruptError("bad value stats key");
196 key = pack_honey_valuestats_key(slot);
197 return true;
199 if (GlassCompact::is_valuechunk_key(key)) {
200 const char * p = key.data();
201 const char * end = p + key.length();
202 p += 2;
203 Xapian::valueno slot;
204 if (!unpack_uint(&p, end, &slot))
205 throw Xapian::DatabaseCorruptError("bad value key");
206 Xapian::docid first_did;
207 if (!unpack_uint_preserving_sort(&p, end, &first_did))
208 throw Xapian::DatabaseCorruptError("bad value key");
209 first_did += offset;
// Honey keys value chunks by their *last* docid, so scan the chunk
// to find it before rebuilding the key.
211 Glass::ValueChunkReader reader(tag.data(), tag.size(), first_did);
212 Xapian::docid last_did = first_did;
213 while (reader.next(), !reader.at_end()) {
214 last_did = reader.get_docid();
217 key.assign("\0\xd8", 2);
218 pack_uint(key, slot);
219 pack_uint_preserving_sort(key, last_did);
221 // Add the docid delta across the chunk to the start of the tag.
222 string newtag;
223 pack_uint(newtag, last_did - first_did);
224 tag.insert(0, newtag);
226 return true;
229 if (GlassCompact::is_doclenchunk_key(key)) {
230 const char * d = key.data();
231 const char * e = d + key.size();
232 d += 2;
234 if (d == e) {
235 // This is an initial chunk, so adjust tag header.
236 d = tag.data();
237 e = d + tag.size();
238 if (!unpack_uint(&d, e, &tf) ||
239 !unpack_uint(&d, e, &cf) ||
240 !unpack_uint(&d, e, &firstdid)) {
241 throw Xapian::DatabaseCorruptError("Bad postlist key");
243 ++firstdid;
244 tag.erase(0, d - tag.data());
245 } else {
246 // Not an initial chunk, so adjust key.
247 size_t tmp = d - key.data();
248 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
249 throw Xapian::DatabaseCorruptError("Bad postlist key");
250 key.erase(tmp);
252 firstdid += offset;
254 d = tag.data();
255 e = d + tag.size();
257 // Convert doclen chunk to honey format.
258 string newtag;
260 // Skip the "last chunk" flag and increase_to_last.
261 if (d == e)
262 throw Xapian::DatabaseCorruptError("No last chunk flag in "
263 "glass docdata chunk");
264 ++d;
265 Xapian::docid increase_to_last;
266 if (!unpack_uint(&d, e, &increase_to_last))
267 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
268 "in glass docdata chunk");
// Build a fixed-width (4 bytes per docid) form first, filling docid
// gaps with \xff padding; it gets narrowed below based on doclen_max.
270 Xapian::termcount doclen_max = 0;
271 while (true) {
272 Xapian::termcount doclen;
273 if (!unpack_uint(&d, e, &doclen))
274 throw Xapian::DatabaseCorruptError("Decoding doclen in "
275 "glass docdata chunk");
276 if (doclen > doclen_max)
277 doclen_max = doclen;
278 unsigned char buf[4];
279 unaligned_write4(buf, doclen);
280 newtag.append(reinterpret_cast<char*>(buf), 4);
281 if (d == e)
282 break;
283 Xapian::docid gap_size;
284 if (!unpack_uint(&d, e, &gap_size))
285 throw Xapian::DatabaseCorruptError("Decoding docid "
286 "gap_size in glass "
287 "docdata chunk");
288 // FIXME: Split chunk if the gap_size is at all large.
289 newtag.append(4 * gap_size, '\xff');
292 Assert(!startswith(newtag, "\xff\xff\xff\xff"));
293 Assert(!endswith(newtag, "\xff\xff\xff\xff"));
295 chunk_lastdid = firstdid - 1 + newtag.size() / 4;
297 // Only encode document lengths using a whole number of bytes for
298 // now. We could allow arbitrary bit widths, but it complicates
299 // encoding and decoding so we should consider if the fairly small
300 // additional saving is worth it.
// The leading byte of the final tag records the per-entry width in
// bits (8/16/24/32).
// NOTE(review): the `else if (doclen_max >= 0xffffffff)` branch below
// can never be taken - it is only reached when doclen_max < 0xffffff,
// so its condition is always false.  It looks like the >= 0xffffffff
// test was meant to come *before* the >= 0xffffff one; as written,
// lengths >= 0xffffffff fall into the 32-bit branch.  Confirm against
// upstream xapian-core.
301 if (doclen_max >= 0xffff) {
302 if (doclen_max >= 0xffffff) {
303 newtag.insert(0, 1, char(32));
304 swap(tag, newtag);
305 } else if (doclen_max >= 0xffffffff) {
306 // FIXME: Handle these.
307 const char* m = "Document length values >= 0xffffffff not "
308 "currently handled";
309 throw Xapian::FeatureUnavailableError(m);
310 } else {
311 tag.assign(1, char(24));
312 for (size_t i = 1; i < newtag.size(); i += 4)
313 tag.append(newtag, i, 3);
315 } else {
316 if (doclen_max >= 0xff) {
317 tag.assign(1, char(16));
318 for (size_t i = 2; i < newtag.size(); i += 4)
319 tag.append(newtag, i, 2);
320 } else {
321 tag.assign(1, char(8));
322 for (size_t i = 3; i < newtag.size(); i += 4)
323 tag.append(newtag, i, 1);
327 return true;
330 // Adjust key if this is *NOT* an initial chunk.
331 // key is: pack_string_preserving_sort(key, tname)
332 // plus optionally: pack_uint_preserving_sort(key, did)
333 const char * d = key.data();
334 const char * e = d + key.size();
335 string tname;
336 if (!unpack_string_preserving_sort(&d, e, tname))
337 throw Xapian::DatabaseCorruptError("Bad postlist key");
339 if (d == e) {
340 // This is an initial chunk for a term, so adjust tag header.
341 d = tag.data();
342 e = d + tag.size();
343 if (!unpack_uint(&d, e, &tf) ||
344 !unpack_uint(&d, e, &cf) ||
345 !unpack_uint(&d, e, &firstdid)) {
346 throw Xapian::DatabaseCorruptError("Bad postlist key");
348 ++firstdid;
349 tag.erase(0, d - tag.data());
// wdf_max is only reset on a term's initial chunk - for later chunks
// of the same term it keeps accumulating, so the last chunk appears
// to carry the term-wide maximum (the merge code reads it from the
// final buffered chunk).
350 wdf_max = 0;
351 } else {
352 // Not an initial chunk, so adjust key.
353 size_t tmp = d - key.data();
354 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
355 throw Xapian::DatabaseCorruptError("Bad postlist key");
356 key.erase(tmp - 1);
358 firstdid += offset;
360 d = tag.data();
361 e = d + tag.size();
363 // Convert posting chunk to honey format, but without any header.
364 string newtag;
366 // Skip the "last chunk" flag; decode increase_to_last.
367 if (d == e)
368 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
369 "posting chunk");
370 ++d;
371 Xapian::docid increase_to_last;
372 if (!unpack_uint(&d, e, &increase_to_last))
373 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
374 "glass posting chunk");
375 chunk_lastdid = firstdid + increase_to_last;
376 if (!unpack_uint(&d, e, &first_wdf))
377 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
378 "posting chunk");
379 wdf_max = max(wdf_max, first_wdf);
// Re-encode the (docid delta, wdf) pairs for the rest of the chunk.
381 while (d != e) {
382 Xapian::docid delta;
383 if (!unpack_uint(&d, e, &delta))
384 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
385 "glass posting chunk");
386 pack_uint(newtag, delta);
387 Xapian::termcount wdf;
388 if (!unpack_uint(&d, e, &wdf))
389 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
390 "posting chunk");
391 pack_uint(newtag, wdf);
392 wdf_max = max(wdf_max, wdf);
395 swap(tag, newtag);
397 return true;
400 #endif
// Cursor over a honey postlist table which re-presents each entry in the
// non-initial chunk form with docids shifted by the input's offset (used
// when merging/compacting honey databases).
402 template<>
403 class PostlistCursor<const HoneyTable&> : private HoneyCursor {
// Amount added to every document id read from this input table.
404 Xapian::docid offset;
406 public:
407 string key, tag;
408 Xapian::docid firstdid;
409 Xapian::docid chunk_lastdid;
410 Xapian::termcount tf, cf;
411 Xapian::termcount first_wdf;
412 Xapian::termcount wdf_max;
// Open a cursor on *in positioned at the first entry, applying docid
// offset offset_.
414 PostlistCursor(const HoneyTable *in, Xapian::docid offset_)
415 : HoneyCursor(in), offset(offset_), firstdid(0)
417 rewind();
418 next();
// Advance to the next entry, stripping chunk headers and applying the
// docid offset.  Returns false once the input is exhausted.
421 bool next() {
422 if (!HoneyCursor::next()) return false;
423 // We put all chunks into the non-initial chunk form here, then fix up
424 // the first chunk for each term in the merged database as we merge.
425 read_tag();
426 key = current_key;
427 tag = current_tag;
// Only tf is reset here: cf decoded from a term's initial chunk header
// is (apparently deliberately) kept so that the have_wdfs test below
// still works while processing that term's later, non-initial chunks.
428 tf = 0;
429 switch (key_type(key)) {
430 case KEY_USER_METADATA:
431 case KEY_VALUE_STATS:
432 return true;
433 case KEY_VALUE_CHUNK: {
434 const char * p = key.data();
435 const char * end = p + key.length();
436 p += 2;
437 Xapian::valueno slot;
438 if (!unpack_uint(&p, end, &slot))
439 throw Xapian::DatabaseCorruptError("bad value key");
440 Xapian::docid did;
441 if (!unpack_uint_preserving_sort(&p, end, &did))
442 throw Xapian::DatabaseCorruptError("bad value key");
443 did += offset;
445 key.assign("\0\xd8", 2);
446 pack_uint(key, slot);
447 pack_uint_preserving_sort(key, did);
448 return true;
450 case KEY_DOCLEN_CHUNK: {
451 const char * p = key.data();
452 const char * end = p + key.length();
453 p += 2;
// A bare "\0\xe0" key implicitly means docid 1.
454 Xapian::docid did = 1;
455 if (p != end &&
456 (!unpack_uint_preserving_sort(&p, end, &did) || p != end)) {
457 throw Xapian::DatabaseCorruptError("Bad doclen key");
459 did += offset;
461 key.erase(2);
462 if (did != 1) {
463 pack_uint_preserving_sort(key, did);
465 return true;
467 case KEY_POSTING_CHUNK:
468 break;
469 default:
470 throw Xapian::DatabaseCorruptError("Bad postlist table key "
471 "type");
474 // Adjust key if this is *NOT* an initial chunk.
475 // key is: pack_string_preserving_sort(key, term)
476 // plus optionally: pack_uint_preserving_sort(key, did)
477 const char * d = key.data();
478 const char * e = d + key.size();
479 string term;
480 if (!unpack_string_preserving_sort(&d, e, term))
481 throw Xapian::DatabaseCorruptError("Bad postlist key");
483 if (d == e) {
484 // This is an initial chunk for a term, so remove tag header.
485 d = tag.data();
486 e = d + tag.size();
488 Xapian::docid lastdid;
489 if (!decode_initial_chunk_header(&d, e, tf, cf,
490 firstdid, lastdid, chunk_lastdid,
491 first_wdf, wdf_max)) {
492 throw Xapian::DatabaseCorruptError("Bad postlist initial "
493 "chunk header");
495 // Ignore lastdid - we'll need to recalculate it (at least when
496 // merging, and for simplicity we always do).
497 (void)lastdid;
498 tag.erase(0, d - tag.data());
499 } else {
500 // Not an initial chunk, so adjust key and remove tag header.
501 size_t tmp = d - key.data();
502 if (!unpack_uint_preserving_sort(&d, e, &chunk_lastdid) ||
503 d != e) {
504 throw Xapian::DatabaseCorruptError("Bad postlist key");
506 // -1 to remove the terminating zero byte too.
507 key.erase(tmp - 1);
509 d = tag.data();
510 e = d + tag.size();
// Work out whether this term's chunks store wdf values.  A term with
// flat wdf (every wdf equal) is stored without per-posting wdfs, which
// we detect by checking whether cf is fully accounted for by
// (tf - 1) * wdf_max + first_wdf.
513 bool have_wdfs = (cf != 0);
514 if (have_wdfs && tf > 2) {
515 Xapian::termcount remaining_cf_for_flat_wdf =
516 (tf - 1) * wdf_max;
517 // Check this matches and that it isn't a false match due
518 // to overflow of the multiplication above.
519 if (cf - first_wdf == remaining_cf_for_flat_wdf &&
520 usual(remaining_cf_for_flat_wdf / wdf_max == tf - 1)) {
521 // The wdf is flat so we don't need to store it.
522 have_wdfs = false;
526 if (have_wdfs) {
527 if (!decode_delta_chunk_header(&d, e, chunk_lastdid, firstdid,
528 first_wdf)) {
529 throw Xapian::DatabaseCorruptError("Bad postlist delta "
530 "chunk header");
532 tag.erase(0, d - tag.data());
533 } else {
534 if (!decode_delta_chunk_header_no_wdf(&d, e, chunk_lastdid,
535 firstdid)) {
536 throw Xapian::DatabaseCorruptError("Bad postlist delta "
537 "chunk header");
539 first_wdf = 0;
540 // Splice in a 0 wdf value after each docid delta.
541 string newtag;
542 for (size_t i = d - tag.data(); i < tag.size(); i += 4) {
543 newtag.append(tag, i, 4);
544 newtag.append(4, '\0');
546 tag = std::move(newtag);
549 firstdid += offset;
550 chunk_lastdid += offset;
551 return true;
555 template<>
556 class PostlistCursor<HoneyTable&> : public PostlistCursor<const HoneyTable&> {
557 public:
558 PostlistCursor(HoneyTable *in, Xapian::docid offset_)
559 : PostlistCursor<const HoneyTable&>(in, offset_) {}
/** Ordering functor used to build a min-heap of postlist cursors.
 *
 *  T must expose `key` (a string) and `firstdid` members.
 */
template<typename T>
class PostlistCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key.
     *
     *  Equal keys are tie-broken on firstdid (greater first) so that chunks
     *  for the same key come off the queue in ascending docid order.
     */
    bool operator()(const T* a, const T* b) const {
	if (a->key > b->key) return true;
	if (a->key != b->key) return false;
	return (a->firstdid > b->firstdid);
    }
};
574 static string
575 encode_valuestats(Xapian::doccount freq,
576 const string & lbound, const string & ubound)
578 string value;
579 pack_uint(value, freq);
580 pack_string(value, lbound);
581 // We don't store or count empty values, so neither of the bounds
582 // can be empty. So we can safely store an empty upper bound when
583 // the bounds are equal.
584 if (lbound != ubound) value += ubound;
585 return value;
// Merge the postlist tables of the inputs [b, e) into *out, renumbering
// docids by the per-input offsets.  A priority queue of per-input cursors
// yields entries in key order, and the key encoding guarantees the sections
// arrive in a fixed order: user metadata, value stats, value chunks, doclen
// chunks, then posting chunks - each handled by its own loop below.
588 // U : vector<HoneyTable*>::const_iterator
589 template<typename T, typename U> void
590 merge_postlists(Xapian::Compactor * compactor,
591 T * out, vector<Xapian::docid>::const_iterator offset,
592 U b, U e)
594 typedef decltype(**b) table_type; // E.g. HoneyTable
595 typedef PostlistCursor<table_type> cursor_type;
596 typedef PostlistCursorGt<cursor_type> gt_type;
597 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
598 for ( ; b != e; ++b, ++offset) {
599 auto in = *b;
600 if (in->empty()) {
601 // Skip empty tables.
602 continue;
605 pq.push(new cursor_type(in, *offset));
608 string last_key;
610 // Merge user metadata.
611 vector<string> tags;
612 while (!pq.empty()) {
613 cursor_type * cur = pq.top();
614 const string& key = cur->key;
615 if (key_type(key) != KEY_USER_METADATA) break;
617 if (key != last_key) {
// Key changed: flush the collected duplicates for the previous key,
// letting the compactor resolve conflicts if one was supplied.
618 if (!tags.empty()) {
619 if (tags.size() > 1 && compactor) {
620 Assert(!last_key.empty());
621 // FIXME: It would be better to merge all duplicates
622 // for a key in one call, but currently we don't in
623 // multipass mode.
624 const string & resolved_tag =
625 compactor->resolve_duplicate_metadata(last_key,
626 tags.size(),
627 &tags[0]);
628 out->add(last_key, resolved_tag);
629 } else {
630 Assert(!last_key.empty());
631 out->add(last_key, tags[0]);
633 tags.resize(0);
635 last_key = key;
637 tags.push_back(cur->tag);
639 pq.pop();
640 if (cur->next()) {
641 pq.push(cur);
642 } else {
643 delete cur;
// Flush the final user-metadata key.
646 if (!tags.empty()) {
647 if (tags.size() > 1 && compactor) {
648 Assert(!last_key.empty());
649 const string & resolved_tag =
650 compactor->resolve_duplicate_metadata(last_key,
651 tags.size(),
652 &tags[0]);
653 out->add(last_key, resolved_tag);
654 } else {
655 Assert(!last_key.empty());
656 out->add(last_key, tags[0]);
662 // Merge valuestats.
663 Xapian::doccount freq = 0;
664 string lbound, ubound;
666 while (!pq.empty()) {
667 cursor_type * cur = pq.top();
668 const string& key = cur->key;
669 if (key_type(key) != KEY_VALUE_STATS) break;
670 if (key != last_key) {
671 // For the first valuestats key, last_key will be the previous
672 // key we wrote, which we don't want to overwrite. This is the
673 // only time that freq will be 0, so check that.
674 if (freq) {
675 out->add(last_key, encode_valuestats(freq, lbound, ubound));
676 freq = 0;
678 last_key = key;
// Decode this input's stats for the slot and fold them into the
// running totals/bounds.
681 const string & tag = cur->tag;
683 const char * pos = tag.data();
684 const char * end = pos + tag.size();
686 Xapian::doccount f;
687 string l, u;
688 if (!unpack_uint(&pos, end, &f)) {
689 if (*pos == 0)
690 throw Xapian::DatabaseCorruptError("Incomplete stats item "
691 "in value table");
692 throw Xapian::RangeError("Frequency statistic in value table "
693 "is too large");
695 if (!unpack_string(&pos, end, l)) {
696 if (*pos == 0)
697 throw Xapian::DatabaseCorruptError("Incomplete stats item "
698 "in value table");
699 throw Xapian::RangeError("Lower bound in value table is too "
700 "large");
702 size_t len = end - pos;
703 if (len == 0) {
// An omitted upper bound means it equals the lower bound.
704 u = l;
705 } else {
706 u.assign(pos, len);
708 if (freq == 0) {
709 freq = f;
710 lbound = l;
711 ubound = u;
712 } else {
713 freq += f;
714 if (l < lbound) lbound = l;
715 if (u > ubound) ubound = u;
718 pq.pop();
719 if (cur->next()) {
720 pq.push(cur);
721 } else {
722 delete cur;
726 if (freq) {
727 out->add(last_key, encode_valuestats(freq, lbound, ubound));
731 // Merge valuestream chunks.
// Value chunk keys already embed the (offset-adjusted) docid, so the
// cursors deliver them in the right order and they can be copied verbatim.
732 while (!pq.empty()) {
733 cursor_type * cur = pq.top();
734 const string & key = cur->key;
735 if (key_type(key) != KEY_VALUE_CHUNK) break;
736 out->add(key, cur->tag);
737 pq.pop();
738 if (cur->next()) {
739 pq.push(cur);
740 } else {
741 delete cur;
745 // Merge doclen chunks.
746 while (!pq.empty()) {
747 cursor_type * cur = pq.top();
748 string & key = cur->key;
749 if (key_type(key) != KEY_DOCLEN_CHUNK) break;
750 pack_uint_preserving_sort(key, cur->chunk_lastdid);
751 out->add(key, cur->tag);
752 pq.pop();
753 if (cur->next()) {
754 pq.push(cur);
755 } else {
756 delete cur;
760 Xapian::termcount tf = 0, cf = 0; // Initialise to avoid warnings.
// Per-chunk data buffered for the current term until the term changes,
// at which point the whole term is written out with a fresh header.
762 struct HoneyPostListChunk {
763 Xapian::docid first, last;
764 Xapian::termcount first_wdf;
765 Xapian::termcount wdf_max;
766 string data;
768 HoneyPostListChunk(Xapian::docid first_,
769 Xapian::docid last_,
770 Xapian::termcount first_wdf_,
771 Xapian::termcount wdf_max_,
772 string&& data_)
773 : first(first_),
774 last(last_),
775 first_wdf(first_wdf_),
776 wdf_max(wdf_max_),
777 data(data_) {}
779 vector<HoneyPostListChunk> tags;
// Merge posting chunks.  A NULL cur signals "queue exhausted" so the
// final term still gets flushed before we break out.
780 while (true) {
781 cursor_type * cur = NULL;
782 if (!pq.empty()) {
783 cur = pq.top();
784 pq.pop();
786 if (cur) {
787 AssertEq(key_type(cur->key), KEY_POSTING_CHUNK);
789 if (cur == NULL || cur->key != last_key) {
790 if (!tags.empty()) {
791 Xapian::termcount first_wdf = tags[0].first_wdf;
792 Xapian::docid chunk_lastdid = tags[0].last;
793 Xapian::docid last_did = tags.back().last;
794 Xapian::termcount wdf_max = tags.back().wdf_max;
796 string first_tag;
797 encode_initial_chunk_header(tf, cf, tags[0].first, last_did,
798 chunk_lastdid,
799 first_wdf, wdf_max, first_tag);
// Detect flat wdf (see PostlistCursor) so we can drop the wdf
// values from the stored chunks.
800 bool have_wdfs = (cf != 0);
801 if (have_wdfs && tf > 2) {
802 Xapian::termcount remaining_cf_for_flat_wdf =
803 (tf - 1) * wdf_max;
804 // Check this matches and that it isn't a false match due
805 // to overflow of the multiplication above.
806 if (cf - first_wdf == remaining_cf_for_flat_wdf &&
807 usual(remaining_cf_for_flat_wdf / wdf_max == tf - 1)) {
808 // The wdf is flat so we don't need to store it.
809 have_wdfs = false;
813 if (tf > 2) {
814 if (have_wdfs) {
815 first_tag += tags[0].data;
816 } else {
817 const char* pos = tags[0].data.data();
818 const char* pos_end = pos + tags[0].data.size();
819 while (pos != pos_end) {
820 Xapian::docid delta;
821 if (!unpack_uint(&pos, pos_end, &delta))
822 throw_database_corrupt("Decoding docid "
823 "delta", pos);
824 pack_uint(first_tag, delta);
825 Xapian::termcount wdf;
826 if (!unpack_uint(&pos, pos_end, &wdf))
827 throw_database_corrupt("Decoding wdf",
828 pos);
829 // Only copy over the docid deltas, not the
830 // wdfs.
831 (void)wdf;
835 out->add(last_key, first_tag);
837 // If tf == 2, the data could be split over two tags when
838 // merging, but we only output an initial tag in this case.
839 if (tf > 2 && tags.size() > 1) {
840 string term;
841 const char* p = last_key.data();
842 const char* end = p + last_key.size();
843 if (!unpack_string_preserving_sort(&p, end, term) ||
844 p != end) {
845 throw Xapian::DatabaseCorruptError("Bad postlist "
846 "chunk key");
// Write out the remaining chunks, each keyed on its last docid
// and prefixed with a delta chunk header.
849 auto i = tags.begin();
850 while (++i != tags.end()) {
851 last_did = i->last;
852 const string& key = pack_honey_postlist_key(term,
853 last_did);
854 string tag;
855 if (have_wdfs) {
856 encode_delta_chunk_header(i->first,
857 last_did,
858 i->first_wdf,
859 tag);
860 tag += i->data;
861 } else {
862 encode_delta_chunk_header_no_wdf(i->first,
863 last_did,
864 tag);
865 const char* pos = i->data.data();
866 const char* pos_end = pos + i->data.size();
867 while (pos != pos_end) {
868 Xapian::docid delta;
869 if (!unpack_uint(&pos, pos_end, &delta))
870 throw_database_corrupt("Decoding docid "
871 "delta", pos);
872 pack_uint(tag, delta);
873 Xapian::termcount wdf;
874 if (!unpack_uint(&pos, pos_end, &wdf))
875 throw_database_corrupt("Decoding wdf",
876 pos);
877 // Only copy over the docid deltas, not the
878 // wdfs.
879 (void)wdf;
883 out->add(key, tag);
887 tags.clear();
888 if (cur == NULL) break;
889 tf = cf = 0;
890 last_key = cur->key;
// Accumulate tf/cf from each input's initial chunk for this term.
892 if (cur->tf) {
893 tf += cur->tf;
894 cf += cur->cf;
896 tags.push_back(HoneyPostListChunk(cur->firstdid,
897 cur->chunk_lastdid,
898 cur->first_wdf,
899 cur->wdf_max,
900 std::move(cur->tag)));
901 if (cur->next()) {
902 pq.push(cur);
903 } else {
904 delete cur;
909 template<typename T> struct MergeCursor;
911 #ifndef DISABLE_GPL_LIBXAPIAN
912 template<>
913 struct MergeCursor<const GlassTable&> : public GlassCursor {
914 explicit MergeCursor(const GlassTable *in) : GlassCursor(in) {
915 rewind();
916 next();
919 #endif
921 template<>
922 struct MergeCursor<const HoneyTable&> : public HoneyCursor {
923 explicit MergeCursor(const HoneyTable *in) : HoneyCursor(in) {
924 rewind();
925 next();
/** Ordering functor used to build a min-heap of merge cursors.
 *
 *  T must expose `after_end()` and a `current_key` string member.
 *  Cursors which are after the end sort greater than all others.
 */
template<typename T>
struct CursorGt {
    /// Return true if and only if a's key is strictly greater than b's key.
    bool operator()(const T* a, const T* b) const {
	if (b->after_end()) return false;
	if (a->after_end()) return true;
	return (a->current_key > b->current_key);
    }
};
939 #ifndef DISABLE_GPL_LIBXAPIAN
940 // Convert glass to honey.
// Merge glass spelling tables from [b, e) into a honey spelling table,
// translating the key prefixes (see the table in the loop below) and
// combining entries which end up sharing a key: fragment entries take the
// union of their word lists, word entries sum their frequencies.
941 static void
942 merge_spellings(HoneyTable* out,
943 vector<const GlassTable*>::const_iterator b,
944 vector<const GlassTable*>::const_iterator e)
946 typedef MergeCursor<const GlassTable&> cursor_type;
947 typedef CursorGt<cursor_type> gt_type;
948 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
949 for ( ; b != e; ++b) {
950 auto in = *b;
951 if (!in->empty()) {
952 pq.push(new cursor_type(in));
956 while (!pq.empty()) {
957 cursor_type * cur = pq.top();
958 pq.pop();
960 // Glass vs honey spelling key prefix map:
962 // Type Glass Honey
964 // bookend Bbd \x00bd
965 // head Hhe \x01he
966 // middle Mddl \x02ddl
967 // tail Til \x03il
968 // word Wword word
970 // In honey, if the word's first byte is <= '\x04', we add a prefix
971 // of '\x04' (which is unlikely in real world use but ensures that
972 // we can handle arbitrary strings).
974 // The prefixes for honey are chosen such that we save a byte for the
975 // key of all real-world words, and so that more first bytes are used
976 // so that a top-level array index is more effective, especially for
977 // random-access lookup of word entries (which we do to check the
978 // frequency of potential spelling candidates).
980 // The other prefixes are chosen such that we can do look up in key
981 // order at search time, which is more efficient for a cursor which can
982 // only efficiently move forwards.
984 // Note that the key ordering is the same for glass and honey, which
985 // makes translating during compaction simpler.
986 string key = cur->current_key;
987 switch (key[0]) {
988 case 'B':
989 key[0] = Honey::KEY_PREFIX_BOOKEND;
990 break;
991 case 'H':
992 key[0] = Honey::KEY_PREFIX_HEAD;
993 break;
994 case 'M':
995 key[0] = Honey::KEY_PREFIX_MIDDLE;
996 break;
997 case 'T':
998 key[0] = Honey::KEY_PREFIX_TAIL;
999 break;
1000 case 'W':
1001 if (static_cast<unsigned char>(key[1]) > Honey::KEY_PREFIX_WORD)
1002 key.erase(0, 1);
1003 else
1004 key[0] = Honey::KEY_PREFIX_WORD;
1005 break;
1006 default: {
1007 string m = "Bad spelling key prefix: ";
1008 m += static_cast<unsigned char>(key[0]);
1009 throw Xapian::DatabaseCorruptError(m);
1013 if (pq.empty() || pq.top()->current_key > key) {
1014 // No merging to do for this key so just copy the tag value,
1015 // adjusting if necessary. If we don't need to adjust it, just
1016 // copy the compressed value.
// Glass fragment tags embed the 2-byte fragment suffix which honey
// keeps in the key instead, so it gets stripped from the tag here
// (the first tag byte is an XOR-obfuscated length which is reduced
// by 2 to match).
1017 bool compressed;
1018 switch (key[0]) {
1019 case Honey::KEY_PREFIX_HEAD: {
1020 compressed = cur->read_tag(false);
1021 unsigned char len = cur->current_tag[0] ^ MAGIC_XOR_VALUE;
1022 cur->current_tag[0] = (len - 2) ^ MAGIC_XOR_VALUE;
1023 AssertEq(cur->current_tag[1], key[1]);
1024 AssertEq(cur->current_tag[2], key[2]);
1025 cur->current_tag.erase(1, 2);
1026 break;
1028 case Honey::KEY_PREFIX_TAIL: {
1029 compressed = cur->read_tag(false);
1030 unsigned char len = cur->current_tag[0] ^ MAGIC_XOR_VALUE;
1031 cur->current_tag[0] = (len - 2) ^ MAGIC_XOR_VALUE;
1032 AssertEq(cur->current_tag[len - 1], key[1]);
1033 AssertEq(cur->current_tag[len], key[2]);
1034 cur->current_tag.erase(len - 1, 2);
1035 break;
1037 case Honey::KEY_PREFIX_BOOKEND: {
1038 compressed = cur->read_tag(false);
1039 unsigned char len = cur->current_tag[0] ^ MAGIC_XOR_VALUE;
1040 cur->current_tag[0] = (len - 2) ^ MAGIC_XOR_VALUE;
1041 AssertEq(cur->current_tag[1], key[1]);
1042 AssertEq(cur->current_tag[len], key[2]);
1043 cur->current_tag.erase(len, 1);
1044 cur->current_tag.erase(1, 1);
1045 break;
1047 default:
1048 compressed = cur->read_tag(true);
1049 break;
1051 out->add(key, cur->current_tag, compressed);
1052 if (cur->next()) {
1053 pq.push(cur);
1054 } else {
1055 delete cur;
1057 continue;
1060 // Merge tag values with the same key:
1061 string tag;
1062 if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
1063 // We just want the union of words, so copy over the first instance
1064 // and skip any identical ones.
1065 priority_queue<PrefixCompressedStringItor *,
1066 vector<PrefixCompressedStringItor *>,
1067 PrefixCompressedStringItorGt> pqtag;
1068 // Stick all the cursor_type pointers in a vector because their
1069 // current_tag members must remain valid while we're merging their
1070 // tags, but we need to call next() on them all afterwards.
1071 vector<cursor_type *> vec;
1072 vec.reserve(pq.size());
1074 while (true) {
1075 cur->read_tag();
1076 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
1077 vec.push_back(cur);
1078 if (pq.empty() || pq.top()->current_key != key) break;
1079 cur = pq.top();
1080 pq.pop();
// Iterate the word lists in parallel in sorted order, writing each
// distinct word once.
1083 PrefixCompressedStringWriter wr(tag, key);
1084 string lastword;
1085 while (!pqtag.empty()) {
1086 PrefixCompressedStringItor * it = pqtag.top();
1087 pqtag.pop();
1088 string word = **it;
1089 if (word != lastword) {
1090 lastword = word;
1091 wr.append(lastword);
1093 ++*it;
1094 if (!it->at_end()) {
1095 pqtag.push(it);
1096 } else {
1097 delete it;
1101 for (auto i : vec) {
1102 cur = i;
1103 if (cur->next()) {
1104 pq.push(cur);
1105 } else {
1106 delete cur;
1109 } else {
1110 // We want to sum the frequencies from tags for the same key.
1111 Xapian::termcount tot_freq = 0;
1112 while (true) {
1113 cur->read_tag();
1114 Xapian::termcount freq;
1115 const char * p = cur->current_tag.data();
1116 const char * end = p + cur->current_tag.size();
1117 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
1118 throw_database_corrupt("Bad spelling word freq", p);
1120 tot_freq += freq;
1121 if (cur->next()) {
1122 pq.push(cur);
1123 } else {
1124 delete cur;
1126 if (pq.empty() || pq.top()->current_key != key) break;
1127 cur = pq.top();
1128 pq.pop();
1130 tag.resize(0);
1131 pack_uint_last(tag, tot_freq);
1133 out->add(key, tag);
1136 #endif
// Merge honey spelling tables from [b, e) into *out.  Keys are already in
// honey form so no prefix translation is needed; entries sharing a key are
// combined (union of word lists for fragment keys, summed frequencies for
// word keys).
1138 static void
1139 merge_spellings(HoneyTable* out,
1140 vector<const HoneyTable*>::const_iterator b,
1141 vector<const HoneyTable*>::const_iterator e)
1143 typedef MergeCursor<const HoneyTable&> cursor_type;
1144 typedef CursorGt<cursor_type> gt_type;
1145 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
1146 for ( ; b != e; ++b) {
1147 auto in = *b;
1148 if (!in->empty()) {
1149 pq.push(new cursor_type(in));
1153 while (!pq.empty()) {
1154 cursor_type * cur = pq.top();
1155 pq.pop();
1157 string key = cur->current_key;
1158 if (pq.empty() || pq.top()->current_key > key) {
1159 // No need to merge the tags, just copy the (possibly compressed)
1160 // tag value.
1161 bool compressed = cur->read_tag(true);
1162 out->add(key, cur->current_tag, compressed);
1163 if (cur->next()) {
1164 pq.push(cur);
1165 } else {
1166 delete cur;
1168 continue;
1171 // Merge tag values with the same key:
1172 string tag;
1173 if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
1174 // We just want the union of words, so copy over the first instance
1175 // and skip any identical ones.
1176 priority_queue<PrefixCompressedStringItor *,
1177 vector<PrefixCompressedStringItor *>,
1178 PrefixCompressedStringItorGt> pqtag;
1179 // Stick all the cursor_type pointers in a vector because their
1180 // current_tag members must remain valid while we're merging their
1181 // tags, but we need to call next() on them all afterwards.
1182 vector<cursor_type *> vec;
1183 vec.reserve(pq.size());
1185 while (true) {
1186 cur->read_tag();
1187 pqtag.push(new PrefixCompressedStringItor(cur->current_tag,
1188 key));
1189 vec.push_back(cur);
1190 if (pq.empty() || pq.top()->current_key != key) break;
1191 cur = pq.top();
1192 pq.pop();
// Iterate the word lists in parallel in sorted order, writing each
// distinct word once.
1195 PrefixCompressedStringWriter wr(tag, key);
1196 string lastword;
1197 while (!pqtag.empty()) {
1198 PrefixCompressedStringItor * it = pqtag.top();
1199 pqtag.pop();
1200 string word = **it;
1201 if (word != lastword) {
1202 lastword = word;
1203 wr.append(lastword);
1205 ++*it;
1206 if (!it->at_end()) {
1207 pqtag.push(it);
1208 } else {
1209 delete it;
1213 for (auto i : vec) {
1214 cur = i;
1215 if (cur->next()) {
1216 pq.push(cur);
1217 } else {
1218 delete cur;
1221 } else {
1222 // We want to sum the frequencies from tags for the same key.
1223 Xapian::termcount tot_freq = 0;
1224 while (true) {
1225 cur->read_tag();
1226 Xapian::termcount freq;
1227 const char * p = cur->current_tag.data();
1228 const char * end = p + cur->current_tag.size();
1229 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
1230 throw_database_corrupt("Bad spelling word freq", p);
1232 tot_freq += freq;
1233 if (cur->next()) {
1234 pq.push(cur);
1235 } else {
1236 delete cur;
1238 if (pq.empty() || pq.top()->current_key != key) break;
1239 cur = pq.top();
1240 pq.pop();
1242 tag.resize(0);
1243 pack_uint_last(tag, tot_freq);
1245 out->add(key, tag);
1249 // U : vector<HoneyTable*>::const_iterator
1250 template<typename T, typename U> void
1251 merge_synonyms(T* out, U b, U e)
1253 typedef decltype(**b) table_type; // E.g. HoneyTable
1254 typedef MergeCursor<table_type> cursor_type;
1255 typedef CursorGt<cursor_type> gt_type;
1256 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
1257 for ( ; b != e; ++b) {
1258 auto in = *b;
1259 if (!in->empty()) {
1260 pq.push(new cursor_type(in));
1264 while (!pq.empty()) {
1265 cursor_type * cur = pq.top();
1266 pq.pop();
1268 string key = cur->current_key;
1269 if (pq.empty() || pq.top()->current_key > key) {
1270 // No need to merge the tags, just copy the (possibly compressed)
1271 // tag value.
1272 bool compressed = cur->read_tag(true);
1273 out->add(key, cur->current_tag, compressed);
1274 if (cur->next()) {
1275 pq.push(cur);
1276 } else {
1277 delete cur;
1279 continue;
1282 // Merge tag values with the same key:
1283 string tag;
1285 // We just want the union of words, so copy over the first instance
1286 // and skip any identical ones.
1287 priority_queue<ByteLengthPrefixedStringItor *,
1288 vector<ByteLengthPrefixedStringItor *>,
1289 ByteLengthPrefixedStringItorGt> pqtag;
1290 vector<cursor_type *> vec;
1292 while (true) {
1293 cur->read_tag();
1294 pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
1295 vec.push_back(cur);
1296 if (pq.empty() || pq.top()->current_key != key) break;
1297 cur = pq.top();
1298 pq.pop();
1301 string lastword;
1302 while (!pqtag.empty()) {
1303 ByteLengthPrefixedStringItor * it = pqtag.top();
1304 pqtag.pop();
1305 if (**it != lastword) {
1306 lastword = **it;
1307 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
1308 tag += lastword;
1310 ++*it;
1311 if (!it->at_end()) {
1312 pqtag.push(it);
1313 } else {
1314 delete it;
1318 for (auto i : vec) {
1319 cur = i;
1320 if (cur->next()) {
1321 pq.push(cur);
1322 } else {
1323 delete cur;
1327 out->add(key, tag);
1331 template<typename T, typename U> void
1332 multimerge_postlists(Xapian::Compactor * compactor,
1333 T* out, const char * tmpdir,
1334 const vector<U*>& in,
1335 vector<Xapian::docid> off)
1337 if (in.size() <= 3) {
1338 merge_postlists(compactor, out, off.begin(), in.begin(), in.end());
1339 return;
1341 unsigned int c = 0;
1342 vector<HoneyTable *> tmp;
1343 tmp.reserve(in.size() / 2);
1345 vector<Xapian::docid> newoff;
1346 newoff.resize(in.size() / 2);
1347 for (unsigned int i = 0, j; i < in.size(); i = j) {
1348 j = i + 2;
1349 if (j == in.size() - 1) ++j;
1351 string dest = tmpdir;
1352 char buf[64];
1353 sprintf(buf, "/tmp%u_%u.", c, i / 2);
1354 dest += buf;
1356 HoneyTable * tmptab = new HoneyTable("postlist", dest, false);
1358 // Use maximum blocksize for temporary tables. And don't compress
1359 // entries in temporary tables, even if the final table would do
1360 // so. Any already compressed entries will get copied in
1361 // compressed form. (FIXME: HoneyTable has no blocksize)
1362 Honey::RootInfo root_info;
1363 root_info.init(65536, 0);
1364 const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1365 tmptab->create_and_open(flags, root_info);
1367 merge_postlists(compactor, tmptab, off.begin() + i,
1368 in.begin() + i, in.begin() + j);
1369 tmp.push_back(tmptab);
1370 tmptab->flush_db();
1371 tmptab->commit(1, &root_info);
1372 AssertRel(root_info.get_blocksize(),==,65536);
1374 swap(off, newoff);
1375 ++c;
1378 while (tmp.size() > 3) {
1379 vector<HoneyTable *> tmpout;
1380 tmpout.reserve(tmp.size() / 2);
1381 vector<Xapian::docid> newoff;
1382 newoff.resize(tmp.size() / 2);
1383 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
1384 j = i + 2;
1385 if (j == tmp.size() - 1) ++j;
1387 string dest = tmpdir;
1388 char buf[64];
1389 sprintf(buf, "/tmp%u_%u.", c, i / 2);
1390 dest += buf;
1392 HoneyTable * tmptab = new HoneyTable("postlist", dest, false);
1394 // Use maximum blocksize for temporary tables. And don't compress
1395 // entries in temporary tables, even if the final table would do
1396 // so. Any already compressed entries will get copied in
1397 // compressed form. (FIXME: HoneyTable has no blocksize)
1398 Honey::RootInfo root_info;
1399 root_info.init(65536, 0);
1400 const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1401 tmptab->create_and_open(flags, root_info);
1403 merge_postlists(compactor, tmptab, off.begin() + i,
1404 tmp.begin() + i, tmp.begin() + j);
1405 if (c > 0) {
1406 for (unsigned int k = i; k < j; ++k) {
1407 // FIXME: unlink(tmp[k]->get_path().c_str());
1408 delete tmp[k];
1409 tmp[k] = NULL;
1412 tmpout.push_back(tmptab);
1413 tmptab->flush_db();
1414 tmptab->commit(1, &root_info);
1415 AssertRel(root_info.get_blocksize(),==,65536);
1417 swap(tmp, tmpout);
1418 swap(off, newoff);
1419 ++c;
1421 merge_postlists(compactor, out, off.begin(), tmp.begin(), tmp.end());
1422 if (c > 0) {
1423 for (size_t k = 0; k < tmp.size(); ++k) {
1424 // FIXME: unlink(tmp[k]->get_path().c_str());
1425 delete tmp[k];
1426 tmp[k] = NULL;
1431 template<typename T> class PositionCursor;
1433 #ifndef DISABLE_GPL_LIBXAPIAN
1434 template<>
1435 class PositionCursor<const GlassTable&> : private GlassCursor {
1436 Xapian::docid offset;
1438 public:
1439 string key;
1440 Xapian::docid firstdid;
1442 PositionCursor(const GlassTable *in, Xapian::docid offset_)
1443 : GlassCursor(in), offset(offset_), firstdid(0) {
1444 rewind();
1445 next();
1448 bool next() {
1449 if (!GlassCursor::next()) return false;
1450 read_tag();
1451 const char * d = current_key.data();
1452 const char * e = d + current_key.size();
1453 string term;
1454 Xapian::docid did;
1455 if (!unpack_string_preserving_sort(&d, e, term) ||
1456 !unpack_uint_preserving_sort(&d, e, &did) ||
1457 d != e) {
1458 throw Xapian::DatabaseCorruptError("Bad position key");
1461 key.resize(0);
1462 pack_string_preserving_sort(key, term);
1463 pack_uint_preserving_sort(key, did + offset);
1464 return true;
1467 const string & get_tag() const {
1468 return current_tag;
1471 #endif
1473 template<>
1474 class PositionCursor<const HoneyTable&> : private HoneyCursor {
1475 Xapian::docid offset;
1477 public:
1478 string key;
1479 Xapian::docid firstdid;
1481 PositionCursor(const HoneyTable *in, Xapian::docid offset_)
1482 : HoneyCursor(in), offset(offset_), firstdid(0) {
1483 rewind();
1484 next();
1487 bool next() {
1488 if (!HoneyCursor::next()) return false;
1489 read_tag();
1490 const char * d = current_key.data();
1491 const char * e = d + current_key.size();
1492 string term;
1493 Xapian::docid did;
1494 if (!unpack_string_preserving_sort(&d, e, term) ||
1495 !unpack_uint_preserving_sort(&d, e, &did) ||
1496 d != e) {
1497 throw Xapian::DatabaseCorruptError("Bad position key");
1500 key.resize(0);
1501 pack_string_preserving_sort(key, term);
1502 pack_uint_preserving_sort(key, did + offset);
1503 return true;
1506 const string & get_tag() const {
1507 return current_tag;
/// Ordering functor so a priority_queue of cursors yields smallest key first.
template<typename T>
class PositionCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key. */
    bool operator()(const T* a, const T* b) const {
	return b->key < a->key;
    }
};
1521 template<typename T, typename U> void
1522 merge_positions(T* out, const vector<U*> & inputs,
1523 const vector<Xapian::docid> & offset)
1525 typedef decltype(*inputs[0]) table_type; // E.g. HoneyTable
1526 typedef PositionCursor<table_type> cursor_type;
1527 typedef PositionCursorGt<cursor_type> gt_type;
1528 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
1529 for (size_t i = 0; i < inputs.size(); ++i) {
1530 auto in = inputs[i];
1531 if (in->empty()) {
1532 // Skip empty tables.
1533 continue;
1536 pq.push(new cursor_type(in, offset[i]));
1539 while (!pq.empty()) {
1540 cursor_type * cur = pq.top();
1541 pq.pop();
1542 out->add(cur->key, cur->get_tag());
1543 if (cur->next()) {
1544 pq.push(cur);
1545 } else {
1546 delete cur;
1551 template<typename T, typename U> void
1552 merge_docid_keyed(T *out, const vector<U*> & inputs,
1553 const vector<Xapian::docid> & offset,
1554 int = 0)
1556 for (size_t i = 0; i < inputs.size(); ++i) {
1557 Xapian::docid off = offset[i];
1559 auto in = inputs[i];
1560 if (in->empty()) continue;
1562 HoneyCursor cur(in);
1563 cur.rewind();
1565 string key;
1566 while (cur.next()) {
1567 // Adjust the key if this isn't the first database.
1568 if (off) {
1569 Xapian::docid did;
1570 const char * d = cur.current_key.data();
1571 const char * e = d + cur.current_key.size();
1572 if (!unpack_uint_preserving_sort(&d, e, &did)) {
1573 string msg = "Bad key in ";
1574 msg += inputs[i]->get_path();
1575 throw Xapian::DatabaseCorruptError(msg);
1577 did += off;
1578 key.resize(0);
1579 pack_uint_preserving_sort(key, did);
1580 if (d != e) {
1581 // Copy over anything extra in the key (e.g. the zero byte
1582 // at the end of "used value slots" in the termlist table).
1583 key.append(d, e - d);
1585 } else {
1586 key = cur.current_key;
1588 bool compressed = cur.read_tag(true);
1589 out->add(key, cur.current_tag, compressed);
1594 #ifndef DISABLE_GPL_LIBXAPIAN
// Merge docid-keyed tables from glass-format inputs, applying per-input
// docid offsets.  For table_type == Honey::TERMLIST, each glass termlist
// entry is converted to the honey termlist encoding, folding the separate
// glass "value slots used" entry (which follows the termlist entry for the
// same docid) into the start of the honey tag.  Other table types are
// copied with only the key's docid adjusted.
1595 template<typename T> void
1596 merge_docid_keyed(T *out, const vector<const GlassTable*> & inputs,
1597 const vector<Xapian::docid> & offset,
1598 int table_type = 0)
1600 for (size_t i = 0; i < inputs.size(); ++i) {
1601 Xapian::docid off = offset[i];
1603 auto in = inputs[i];
1604 if (in->empty()) continue;
1606 GlassCursor cur(in);
1607 cur.rewind();
1609 string key;
1610 while (cur.next()) {
// Jumped to from below when the termlist branch has already advanced
// the cursor while peeking for a "value slots used" entry, so calling
// cur.next() again here would skip an entry.
1611 next_without_next:
1612 // Adjust the key if this isn't the first database.
1613 if (off) {
1614 Xapian::docid did;
1615 const char * d = cur.current_key.data();
1616 const char * e = d + cur.current_key.size();
1617 if (!unpack_uint_preserving_sort(&d, e, &did)) {
1618 string msg = "Bad key in ";
1619 msg += inputs[i]->get_path();
1620 throw Xapian::DatabaseCorruptError(msg);
1622 did += off;
1623 key.resize(0);
1624 pack_uint_preserving_sort(key, did);
1625 if (d != e) {
1626 // Copy over anything extra in the key (e.g. the zero byte
1627 // at the end of "used value slots" in the termlist table).
1628 key.append(d, e - d);
1630 } else {
1631 key = cur.current_key;
// Termlist entries need re-encoding; everything else is copied as-is
// in the else branch at the bottom.
1633 if (table_type == Honey::TERMLIST) {
// In glass the "value slots used" entry sorts after the termlist
// entry for the same docid, so seeing one here means its termlist
// entry was missing.
1634 if (termlist_key_is_values_used(key)) {
1635 throw Xapian::DatabaseCorruptError("value slots used "
1636 "entry without a "
1637 "corresponding "
1638 "termlist entry");
1640 cur.read_tag();
1641 string tag = cur.current_tag;
// Peek at the next entry: if it's this docid's "value slots used"
// entry, decode it now so the slot data can be folded into the
// honey tag built below.
1643 bool next_result = cur.next();
1644 bool next_already_done = true;
1645 unsigned bitmap_slots_used = 0;
1646 string encoded_slots_used;
1647 if (next_result &&
1648 termlist_key_is_values_used(cur.current_key)) {
1649 next_already_done = false;
1650 cur.read_tag();
1651 const string& valtag = cur.current_tag;
1653 const char* p = valtag.data();
1654 const char* end = p + valtag.size();
1656 Xapian::VecCOW<Xapian::termpos> slots;
// The slot list is delta-encoded: first slot, then gaps - 1.
1658 Xapian::valueno first_slot;
1659 if (!unpack_uint(&p, end, &first_slot)) {
1660 throw_database_corrupt("Value slot encoding corrupt",
1663 slots.push_back(first_slot);
1664 Xapian::valueno last_slot = first_slot;
1665 while (p != end) {
1666 Xapian::valueno slot;
1667 if (!unpack_uint(&p, end, &slot)) {
1668 throw_database_corrupt("Value slot encoding "
1669 "corrupt", p);
1671 slot += last_slot + 1;
1672 last_slot = slot;
1674 slots.push_back(slot);
1677 if (slots.back() <= 6) {
1678 // Encode as a bitmap if only slots in the range 0-6
1679 // are used.
1680 for (auto slot : slots) {
1681 bitmap_slots_used |= 1 << slot;
1683 } else {
// Otherwise encode last_slot, then (for more than one slot)
// an interpolative encoding of the full slot list.
1684 string enc;
1685 pack_uint(enc, last_slot);
1686 if (slots.size() > 1) {
1687 BitWriter slots_used(enc);
1688 slots_used.encode(first_slot, last_slot);
1689 slots_used.encode(slots.size() - 2,
1690 last_slot - first_slot);
1691 slots_used.encode_interpolative(slots, 0,
1692 slots.size() - 1);
1693 encoded_slots_used = slots_used.freeze();
1694 } else {
1695 encoded_slots_used = std::move(enc);
// Build the honey termlist tag: first byte is either the slot
// bitmap, or 0x80 marking length-prefixed encoded slot data.
1700 const char* pos = tag.data();
1701 const char* end = pos + tag.size();
1703 string newtag;
1704 if (encoded_slots_used.empty()) {
1705 newtag += char(bitmap_slots_used);
1706 } else {
1707 auto size = encoded_slots_used.size();
1708 if (size < 0x80) {
1709 newtag += char(0x80 | size);
1710 } else {
// Size too big for one byte: 0x80 escape then packed size.
1711 newtag += '\x80';
1712 pack_uint(newtag, size);
1714 newtag += encoded_slots_used;
// An empty glass tag means no terms; otherwise re-encode the
// doclen/termlist-size header and then each term entry.
1717 if (pos != end) {
1718 Xapian::termcount doclen;
1719 if (!unpack_uint(&pos, end, &doclen)) {
1720 throw_database_corrupt("doclen", pos);
1722 Xapian::termcount termlist_size;
1723 if (!unpack_uint(&pos, end, &termlist_size)) {
1724 throw_database_corrupt("termlist length", pos);
1726 pack_uint(newtag, termlist_size - 1);
1727 pack_uint(newtag, doclen);
// Copy each term: a prefix-reuse byte (which may also embed a
// wdf repeat count - see the div/mod below), the appended
// suffix, and the wdf where not embedded.
1729 string current_term;
1730 while (pos != end) {
1731 Xapian::termcount current_wdf = 0;
1733 if (!current_term.empty()) {
1734 size_t reuse = static_cast<unsigned char>(*pos++);
1735 newtag += char(reuse);
1737 if (reuse > current_term.size()) {
// reuse byte overloads: quotient is the wdf, remainder is
// the actual number of reused prefix bytes.
1738 current_wdf = reuse / (current_term.size() + 1);
1739 reuse = reuse % (current_term.size() + 1);
1741 current_term.resize(reuse);
1743 if (pos == end)
1744 throw_database_corrupt("term", NULL);
1747 size_t append = static_cast<unsigned char>(*pos++);
1748 if (size_t(end - pos) < append)
1749 throw_database_corrupt("term", NULL);
1751 current_term.append(pos, append);
1752 pos += append;
1754 if (current_wdf) {
1755 --current_wdf;
1756 } else {
// wdf wasn't embedded in the reuse byte - unpack it and
// copy it across explicitly.
1757 if (!unpack_uint(&pos, end, &current_wdf)) {
1758 throw_database_corrupt("wdf", pos);
1760 pack_uint(newtag, current_wdf);
1763 newtag += char(append);
1764 newtag.append(current_term.end() - append,
1765 current_term.end());
1768 if (!newtag.empty())
1769 out->add(key, newtag);
// If the peek above already consumed the next entry, process it
// without calling cur.next() again.
1770 if (!next_result) break;
1771 if (next_already_done) goto next_without_next;
1772 } else {
// Non-termlist tables: copy the tag, preserving compression.
1773 bool compressed = cur.read_tag(true);
1774 out->add(key, cur.current_tag, compressed);
1779 #endif
1783 using namespace HoneyCompact;
1785 void
1786 HoneyDatabase::compact(Xapian::Compactor* compactor,
1787 const char* destdir,
1788 int fd,
1789 int source_backend,
1790 const vector<const Xapian::Database::Internal*>& sources,
1791 const vector<Xapian::docid>& offset,
1792 size_t block_size,
1793 Xapian::Compactor::compaction_level compaction,
1794 unsigned flags,
1795 Xapian::docid last_docid)
1797 struct table_list {
1798 // The "base name" of the table.
1799 char name[9];
1800 // The type.
1801 Honey::table_type type;
1802 // Create tables after position lazily.
1803 bool lazy;
1806 static const table_list tables[] = {
1807 // name type lazy
1808 { "postlist", Honey::POSTLIST, false },
1809 { "docdata", Honey::DOCDATA, true },
1810 { "termlist", Honey::TERMLIST, false },
1811 { "position", Honey::POSITION, true },
1812 { "spelling", Honey::SPELLING, true },
1813 { "synonym", Honey::SYNONYM, true }
1815 const table_list * tables_end = tables +
1816 (sizeof(tables) / sizeof(tables[0]));
1818 const int FLAGS = Xapian::DB_DANGEROUS;
1820 bool single_file = (flags & Xapian::DBCOMPACT_SINGLE_FILE);
1821 bool multipass = (flags & Xapian::DBCOMPACT_MULTIPASS);
1822 if (single_file) {
1823 // FIXME: Support this combination - we need to put temporary files
1824 // somewhere.
1825 multipass = false;
1828 if (single_file) {
1829 for (size_t i = 0; i != sources.size(); ++i) {
1830 bool has_uncommitted_changes;
1831 if (source_backend == Xapian::DB_BACKEND_GLASS) {
1832 auto db = static_cast<const GlassDatabase*>(sources[i]);
1833 has_uncommitted_changes = db->has_uncommitted_changes();
1834 } else {
1835 auto db = static_cast<const HoneyDatabase*>(sources[i]);
1836 has_uncommitted_changes = db->has_uncommitted_changes();
1838 if (has_uncommitted_changes) {
1839 const char * m =
1840 "Can't compact from a WritableDatabase with uncommitted "
1841 "changes - either call commit() first, or create a new "
1842 "Database object from the filename on disk";
1843 throw Xapian::InvalidOperationError(m);
1848 if (block_size < HONEY_MIN_BLOCKSIZE ||
1849 block_size > HONEY_MAX_BLOCKSIZE ||
1850 (block_size & (block_size - 1)) != 0) {
1851 block_size = HONEY_DEFAULT_BLOCKSIZE;
1854 FlintLock lock(destdir ? destdir : "");
1855 if (!single_file) {
1856 string explanation;
1857 FlintLock::reason why = lock.lock(true, false, explanation);
1858 if (why != FlintLock::SUCCESS) {
1859 lock.throw_databaselockerror(why, destdir, explanation);
1863 unique_ptr<HoneyVersion> version_file_out;
1864 if (single_file) {
1865 if (destdir) {
1866 fd = open(destdir, O_RDWR|O_CREAT|O_TRUNC|O_BINARY|O_CLOEXEC, 0666);
1867 if (fd < 0) {
1868 throw Xapian::DatabaseCreateError("open() failed", errno);
1871 version_file_out.reset(new HoneyVersion(fd));
1872 } else {
1873 fd = -1;
1874 version_file_out.reset(new HoneyVersion(destdir));
1877 version_file_out->create(block_size);
1878 for (size_t i = 0; i != sources.size(); ++i) {
1879 if (source_backend == Xapian::DB_BACKEND_GLASS) {
1880 #ifdef DISABLE_GPL_LIBXAPIAN
1881 Assert(false);
1882 #else
1883 auto db = static_cast<const GlassDatabase*>(sources[i]);
1884 auto& v_in = db->version_file;
1885 auto& v_out = version_file_out;
1886 v_out->merge_stats(v_in.get_doccount(),
1887 v_in.get_doclength_lower_bound(),
1888 v_in.get_doclength_upper_bound(),
1889 v_in.get_wdf_upper_bound(),
1890 v_in.get_total_doclen(),
1891 v_in.get_spelling_wordfreq_upper_bound());
1892 #endif
1893 } else {
1894 auto db = static_cast<const HoneyDatabase*>(sources[i]);
1895 version_file_out->merge_stats(db->version_file);
1899 string fl_serialised;
1900 #if 0
1901 if (single_file) {
1902 HoneyFreeList fl;
1903 fl.set_first_unused_block(1); // FIXME: Assumption?
1904 fl.pack(fl_serialised);
1906 #endif
1908 // Set to true if stat() failed (which can happen if the files are > 2GB
1909 // and off_t is 32 bit) or one of the totals overflowed.
1910 bool bad_totals = false;
1911 off_t in_total = 0, out_total = 0;
1913 // FIXME: sort out indentation.
1914 if (source_backend == Xapian::DB_BACKEND_GLASS) {
1915 #ifdef DISABLE_GPL_LIBXAPIAN
1916 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1917 #else
1918 vector<HoneyTable *> tabs;
1919 tabs.reserve(tables_end - tables);
1920 off_t prev_size = block_size;
1921 for (const table_list * t = tables; t < tables_end; ++t) {
1922 // The postlist table requires an N-way merge, adjusting the
1923 // headers of various blocks. The spelling and synonym tables also
1924 // need special handling. The other tables have keys sorted in
1925 // docid order, so we can merge them by simply copying all the keys
1926 // from each source table in turn.
1927 if (compactor)
1928 compactor->set_status(t->name, string());
1930 string dest;
1931 if (!single_file) {
1932 dest = destdir;
1933 dest += '/';
1934 dest += t->name;
1935 dest += '.';
1938 bool output_will_exist = !t->lazy;
1940 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
1941 // on certain systems).
1942 bool bad_stat = false;
1944 // We can't currently report input sizes if there's a single file DB
1945 // amongst the inputs.
1946 bool single_file_in = false;
1948 off_t in_size = 0;
1950 vector<const GlassTable*> inputs;
1951 inputs.reserve(sources.size());
1952 size_t inputs_present = 0;
1953 for (auto src : sources) {
1954 auto db = static_cast<const GlassDatabase*>(src);
1955 const GlassTable * table;
1956 switch (t->type) {
1957 case Honey::POSTLIST:
1958 table = &(db->postlist_table);
1959 break;
1960 case Honey::DOCDATA:
1961 table = &(db->docdata_table);
1962 break;
1963 case Honey::TERMLIST:
1964 table = &(db->termlist_table);
1965 break;
1966 case Honey::POSITION:
1967 table = &(db->position_table);
1968 break;
1969 case Honey::SPELLING:
1970 table = &(db->spelling_table);
1971 break;
1972 case Honey::SYNONYM:
1973 table = &(db->synonym_table);
1974 break;
1975 default:
1976 Assert(false);
1977 return;
1980 if (db->single_file()) {
1981 if (t->lazy && table->empty()) {
1982 // Essentially doesn't exist.
1983 } else {
1984 // FIXME: Find actual size somehow?
1985 // in_size += table->size() / 1024;
1986 single_file_in = true;
1987 bad_totals = true;
1988 output_will_exist = true;
1989 ++inputs_present;
1991 } else {
1992 off_t db_size = file_size(table->get_path());
1993 if (errno == 0) {
1994 // FIXME: check overflow and set bad_totals
1995 in_total += db_size;
1996 in_size += db_size / 1024;
1997 output_will_exist = true;
1998 ++inputs_present;
1999 } else if (errno != ENOENT) {
2000 // We get ENOENT for an optional table.
2001 bad_totals = bad_stat = true;
2002 output_will_exist = true;
2003 ++inputs_present;
2006 inputs.push_back(table);
2009 // If any inputs lack a termlist table, suppress it in the output.
2010 if (t->type == Honey::TERMLIST && inputs_present != sources.size()) {
2011 if (inputs_present != 0) {
2012 if (compactor) {
2013 string m = str(inputs_present);
2014 m += " of ";
2015 m += str(sources.size());
2016 m += " inputs present, so suppressing output";
2017 compactor->set_status(t->name, m);
2019 continue;
2021 output_will_exist = false;
2024 if (!output_will_exist) {
2025 if (compactor)
2026 compactor->set_status(t->name, "doesn't exist");
2027 continue;
2030 HoneyTable * out;
2031 off_t table_start_offset = -1;
2032 if (single_file) {
2033 if (t == tables) {
2034 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2035 // space for version file. It's tricky to exactly know the
2036 // size of the version file beforehand.
2037 table_start_offset = HONEY_VERSION_MAX_SIZE;
2038 if (lseek(fd, table_start_offset, SEEK_SET) < 0)
2039 throw Xapian::DatabaseError("lseek() failed", errno);
2040 } else {
2041 table_start_offset = lseek(fd, 0, SEEK_CUR);
2043 out = new HoneyTable(t->name, fd, version_file_out->get_offset(),
2044 false, false);
2045 } else {
2046 out = new HoneyTable(t->name, dest, false, t->lazy);
2048 tabs.push_back(out);
2049 Honey::RootInfo * root_info = version_file_out->root_to_set(t->type);
2050 if (single_file) {
2051 root_info->set_free_list(fl_serialised);
2052 root_info->set_offset(table_start_offset);
2053 out->open(FLAGS,
2054 version_file_out->get_root(t->type),
2055 version_file_out->get_revision());
2056 } else {
2057 out->create_and_open(FLAGS, *root_info);
2060 out->set_full_compaction(compaction != compactor->STANDARD);
2061 if (compaction == compactor->FULLER) out->set_max_item_size(1);
2063 switch (t->type) {
2064 case Honey::POSTLIST: {
2065 if (multipass && inputs.size() > 3) {
2066 multimerge_postlists(compactor, out, destdir,
2067 inputs, offset);
2068 } else {
2069 merge_postlists(compactor, out, offset.begin(),
2070 inputs.begin(), inputs.end());
2072 break;
2074 case Honey::SPELLING:
2075 merge_spellings(out, inputs.cbegin(), inputs.cend());
2076 break;
2077 case Honey::SYNONYM:
2078 merge_synonyms(out, inputs.begin(), inputs.end());
2079 break;
2080 case Honey::POSITION:
2081 merge_positions(out, inputs, offset);
2082 break;
2083 default:
2084 // DocData, Termlist
2085 merge_docid_keyed(out, inputs, offset, t->type);
2086 break;
2089 // Commit as revision 1.
2090 out->flush_db();
2091 out->commit(1, root_info);
2092 out->sync();
2093 if (single_file) fl_serialised = root_info->get_free_list();
2095 off_t out_size = 0;
2096 if (!bad_stat && !single_file_in) {
2097 off_t db_size;
2098 if (single_file) {
2099 db_size = file_size(fd);
2100 } else {
2101 db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2103 if (errno == 0) {
2104 if (single_file) {
2105 off_t old_prev_size = max(prev_size, off_t(block_size));
2106 prev_size = db_size;
2107 db_size -= old_prev_size;
2109 // FIXME: check overflow and set bad_totals
2110 out_total += db_size;
2111 out_size = db_size / 1024;
2112 } else if (errno != ENOENT) {
2113 bad_totals = bad_stat = true;
2116 if (bad_stat) {
2117 if (compactor)
2118 compactor->set_status(t->name,
2119 "Done (couldn't stat all the DB files)");
2120 } else if (single_file_in) {
2121 if (compactor)
2122 compactor->set_status(t->name,
2123 "Done (table sizes unknown for single "
2124 "file DB input)");
2125 } else {
2126 string status;
2127 if (out_size == in_size) {
2128 status = "Size unchanged (";
2129 } else {
2130 off_t delta;
2131 if (out_size < in_size) {
2132 delta = in_size - out_size;
2133 status = "Reduced by ";
2134 } else {
2135 delta = out_size - in_size;
2136 status = "INCREASED by ";
2138 if (in_size) {
2139 status += str(100 * delta / in_size);
2140 status += "% ";
2142 status += str(delta);
2143 status += "K (";
2144 status += str(in_size);
2145 status += "K -> ";
2147 status += str(out_size);
2148 status += "K)";
2149 if (compactor)
2150 compactor->set_status(t->name, status);
2154 // If compacting to a single file output and all the tables are empty, pad
2155 // the output so that it isn't mistaken for a stub database when we try to
2156 // open it. For this it needs to be a multiple of 2KB in size.
2157 if (single_file && prev_size < off_t(block_size)) {
2158 #ifdef HAVE_FTRUNCATE
2159 if (ftruncate(fd, block_size) < 0) {
2160 throw Xapian::DatabaseError("Failed to set size of output "
2161 "database", errno);
2163 #else
2164 const off_t off = block_size - 1;
2165 if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2166 throw Xapian::DatabaseError("Failed to set size of output "
2167 "database", errno);
2169 #endif
2172 if (single_file) {
2173 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2174 throw Xapian::DatabaseError("lseek() failed", errno);
2177 version_file_out->set_last_docid(last_docid);
2178 string tmpfile = version_file_out->write(1, FLAGS);
2179 if (single_file) {
2180 off_t version_file_size = lseek(fd, 0, SEEK_CUR);
2181 if (version_file_size < 0) {
2182 throw Xapian::DatabaseError("lseek() failed", errno);
2184 if (version_file_size > HONEY_VERSION_MAX_SIZE) {
2185 throw Xapian::DatabaseError("Didn't allow enough space for "
2186 "version file data");
2189 for (unsigned j = 0; j != tabs.size(); ++j) {
2190 tabs[j]->sync();
2192 // Commit with revision 1.
2193 version_file_out->sync(tmpfile, 1, FLAGS);
2194 for (unsigned j = 0; j != tabs.size(); ++j) {
2195 delete tabs[j];
2197 #endif
2198 } else {
2199 vector<HoneyTable *> tabs;
2200 tabs.reserve(tables_end - tables);
2201 off_t prev_size = block_size;
2202 for (const table_list * t = tables; t < tables_end; ++t) {
2203 // The postlist table requires an N-way merge, adjusting the
2204 // headers of various blocks. The spelling and synonym tables also
2205 // need special handling. The other tables have keys sorted in
2206 // docid order, so we can merge them by simply copying all the keys
2207 // from each source table in turn.
2208 if (compactor)
2209 compactor->set_status(t->name, string());
2211 string dest;
2212 if (!single_file) {
2213 dest = destdir;
2214 dest += '/';
2215 dest += t->name;
2216 dest += '.';
2219 bool output_will_exist = !t->lazy;
2221 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2222 // on certain systems).
2223 bool bad_stat = false;
2225 // We can't currently report input sizes if there's a single file DB
2226 // amongst the inputs.
2227 bool single_file_in = false;
2229 off_t in_size = 0;
2231 vector<const HoneyTable*> inputs;
2232 inputs.reserve(sources.size());
2233 size_t inputs_present = 0;
2234 for (auto src : sources) {
2235 auto db = static_cast<const HoneyDatabase*>(src);
2236 const HoneyTable * table;
2237 switch (t->type) {
2238 case Honey::POSTLIST:
2239 table = &(db->postlist_table);
2240 break;
2241 case Honey::DOCDATA:
2242 table = &(db->docdata_table);
2243 break;
2244 case Honey::TERMLIST:
2245 table = &(db->termlist_table);
2246 break;
2247 case Honey::POSITION:
2248 table = &(db->position_table);
2249 break;
2250 case Honey::SPELLING:
2251 table = &(db->spelling_table);
2252 break;
2253 case Honey::SYNONYM:
2254 table = &(db->synonym_table);
2255 break;
2256 default:
2257 Assert(false);
2258 return;
2261 if (db->single_file()) {
2262 if (t->lazy && table->empty()) {
2263 // Essentially doesn't exist.
2264 } else {
2265 // FIXME: Find actual size somehow?
2266 // in_size += table->size() / 1024;
2267 single_file_in = true;
2268 bad_totals = true;
2269 output_will_exist = true;
2270 ++inputs_present;
2272 } else {
// --- Per-input size accounting for the current table ---
// file_size() reports failure via errno; errno == 0 means the input
// table file exists and its size was read successfully.
2273 off_t db_size = file_size(table->get_path());
2274 if (errno == 0) {
2275 // FIXME: check overflow and set bad_totals
2276 in_total += db_size;
// Per-table input size is tracked in KiB to match the "K" units used in
// the status messages built below.
2277 in_size += db_size / 1024;
2278 output_will_exist = true;
2279 ++inputs_present;
2280 } else if (errno != ENOENT) {
2281 // We get ENOENT for an optional table.
// Any stat failure other than "file absent" poisons both the per-table
// size report and the grand totals.
2282 bad_totals = bad_stat = true;
2283 output_will_exist = true;
2284 ++inputs_present;
2287 inputs.push_back(table);
2290 // If any inputs lack a termlist table, suppress it in the output.
2291 if (t->type == Honey::TERMLIST && inputs_present != sources.size()) {
2292 if (inputs_present != 0) {
2293 if (compactor) {
2294 string m = str(inputs_present);
2295 m += " of ";
2296 m += str(sources.size());
2297 m += " inputs present, so suppressing output";
2298 compactor->set_status(t->name, m);
2300 continue;
2302 output_will_exist = false;
// No input had this table at all - report and skip to the next table.
2305 if (!output_will_exist) {
2306 if (compactor)
2307 compactor->set_status(t->name, "doesn't exist");
2308 continue;
// --- Create the output table ---
2311 HoneyTable * out;
2312 off_t table_start_offset = -1;
2313 if (single_file) {
2314 if (t == tables) {
2315 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2316 // space for version file. It's tricky to exactly know the
2317 // size of the version file beforehand.
2318 table_start_offset = HONEY_VERSION_MAX_SIZE;
2319 if (lseek(fd, table_start_offset, SEEK_SET) < 0)
2320 throw Xapian::DatabaseError("lseek() failed", errno);
2321 } else {
// Subsequent tables start at the current file position, i.e.
// wherever the previous table's data ended.
2322 table_start_offset = lseek(fd, 0, SEEK_CUR);
2324 out = new HoneyTable(t->name, fd, version_file_out->get_offset(),
2325 false, false);
2326 } else {
2327 out = new HoneyTable(t->name, dest, false, t->lazy);
// Track every created table so it can be synced and deleted in the
// single-file finalisation pass below.
2329 tabs.push_back(out);
2330 Honey::RootInfo * root_info = version_file_out->root_to_set(t->type);
2331 if (single_file) {
// Thread the serialised free list saved from the previous table
// (fl_serialised is updated again after this table commits).
2332 root_info->set_free_list(fl_serialised);
2333 root_info->set_offset(table_start_offset);
2334 out->open(FLAGS,
2335 version_file_out->get_root(t->type),
2336 version_file_out->get_revision());
2337 } else {
2338 out->create_and_open(FLAGS, *root_info);
// Any non-STANDARD mode gets full compaction; FULLER additionally
// caps the item size to pack entries as tightly as possible.
2341 out->set_full_compaction(compaction != compactor->STANDARD);
2342 if (compaction == compactor->FULLER) out->set_max_item_size(1);
// --- Merge the inputs into the output, dispatching on table type ---
2344 switch (t->type) {
2345 case Honey::POSTLIST: {
// With more than 3 inputs (and multipass enabled), merge in
// multiple passes rather than one wide N-way merge.
2346 if (multipass && inputs.size() > 3) {
2347 multimerge_postlists(compactor, out, destdir,
2348 inputs, offset);
2349 } else {
2350 merge_postlists(compactor, out, offset.begin(),
2351 inputs.begin(), inputs.end());
2353 break;
2355 case Honey::SPELLING:
2356 merge_spellings(out, inputs.begin(), inputs.end());
2357 break;
2358 case Honey::SYNONYM:
2359 merge_synonyms(out, inputs.begin(), inputs.end());
2360 break;
2361 case Honey::POSITION:
2362 merge_positions(out, inputs, offset);
2363 break;
2364 default:
2365 // DocData, Termlist
2366 merge_docid_keyed(out, inputs, offset);
2367 break;
2370 // Commit as revision 1.
2371 out->flush_db();
2372 out->commit(1, root_info);
2373 out->sync();
// Capture this table's free list so the next table in the single file
// picks it up via set_free_list() above.
2374 if (single_file) fl_serialised = root_info->get_free_list();
// --- Output size accounting (unavailable for single-file input) ---
2376 off_t out_size = 0;
2377 if (!bad_stat && !single_file_in) {
2378 off_t db_size;
2379 if (single_file) {
2380 db_size = file_size(fd);
2381 } else {
2382 db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2384 if (errno == 0) {
2385 if (single_file) {
// For single-file output, this table's size is the growth of
// the whole file since the previous table finished (prev_size
// starts below block_size, hence the max() clamp).
2386 off_t old_prev_size = max(prev_size, off_t(block_size));
2387 prev_size = db_size;
2388 db_size -= old_prev_size;
2390 // FIXME: check overflow and set bad_totals
2391 out_total += db_size;
2392 out_size = db_size / 1024;
2393 } else if (errno != ENOENT) {
2394 bad_totals = bad_stat = true;
// --- Per-table status report via the Compactor callback (if set) ---
2397 if (bad_stat) {
2398 if (compactor)
2399 compactor->set_status(t->name,
2400 "Done (couldn't stat all the DB files)");
2401 } else if (single_file_in) {
2402 if (compactor)
2403 compactor->set_status(t->name,
2404 "Done (table sizes unknown for single "
2405 "file DB input)");
2406 } else {
2407 string status;
2408 if (out_size == in_size) {
2409 status = "Size unchanged (";
2410 } else {
2411 off_t delta;
2412 if (out_size < in_size) {
2413 delta = in_size - out_size;
2414 status = "Reduced by ";
2415 } else {
2416 delta = out_size - in_size;
2417 status = "INCREASED by ";
// Guard against division by zero when the input size rounded to 0K.
2419 if (in_size) {
2420 status += str(100 * delta / in_size);
2421 status += "% ";
2423 status += str(delta);
2424 status += "K (";
2425 status += str(in_size);
2426 status += "K -> ";
2428 status += str(out_size);
2429 status += "K)";
2430 if (compactor)
2431 compactor->set_status(t->name, status);
2435 // If compacting to a single file output and all the tables are empty, pad
2436 // the output so that it isn't mistaken for a stub database when we try to
2437 // open it. For this it needs to be a multiple of 2KB in size.
2438 if (single_file && prev_size < off_t(block_size)) {
2439 #ifdef HAVE_FTRUNCATE
2440 if (ftruncate(fd, block_size) < 0) {
2441 throw Xapian::DatabaseError("Failed to set size of output "
2442 "database", errno);
2444 #else
// No ftruncate(): extend the file by seeking to the last byte of the
// desired size and writing a single NUL byte there.
2445 const off_t off = block_size - 1;
2446 if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2447 throw Xapian::DatabaseError("Failed to set size of output "
2448 "database", errno);
2450 #endif
// --- Single-file finalisation: write the version file at the start of
// the output, then sync and free every table created above ---
2453 if (single_file) {
2454 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2455 throw Xapian::DatabaseError("lseek() failed", errno);
2458 version_file_out->set_last_docid(last_docid);
2459 string tmpfile = version_file_out->write(1, FLAGS);
2460 for (unsigned j = 0; j != tabs.size(); ++j) {
2461 tabs[j]->sync();
2463 // Commit with revision 1.
2464 version_file_out->sync(tmpfile, 1, FLAGS);
2465 for (unsigned j = 0; j != tabs.size(); ++j) {
2466 delete tabs[j];
// NOTE(review): the lock appears to be held only for multi-file output
// (its acquisition is outside this view) - hence the conditional release.
2470 if (!single_file) lock.release();
// --- Overall size-change summary across all tables ---
2472 if (!bad_totals && compactor) {
2473 string status;
// Byte totals are converted to KiB here to match the "K" units below.
2474 in_total /= 1024;
2475 out_total /= 1024;
2476 if (out_total == in_total) {
2477 status = "Size unchanged (";
2478 } else {
2479 off_t delta;
2480 if (out_total < in_total) {
2481 delta = in_total - out_total;
2482 status = "Reduced by ";
2483 } else {
2484 delta = out_total - in_total;
2485 status = "INCREASED by ";
// Guard against division by zero when the total input size is 0K.
2487 if (in_total) {
2488 status += str(100 * delta / in_total);
2489 status += "% ";
2491 status += str(delta);
2492 status += "K (";
2493 status += str(in_total);
2494 status += "K -> ";
2496 status += str(out_total);
2497 status += "K)";
2498 compactor->set_status("Total", status);