Drop special handling for Compaq C++
[xapian.git] / xapian-core / backends / honey / honey_compact.cc
blob1a9a7654805c6a2322ca635c9722402e5100ee8e
1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
3 */
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
29 #include <algorithm>
30 #include <memory>
31 #include <queue>
32 #include <type_traits>
34 #include <cerrno>
35 #include <cstdio>
37 #include "backends/flint_lock.h"
38 #include "compression_stream.h"
39 #include "honey_cursor.h"
40 #include "honey_database.h"
41 #include "honey_defs.h"
42 #include "honey_postlist_encodings.h"
43 #include "honey_table.h"
44 #include "honey_values.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
48 #include "pack.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifdef XAPIAN_HAS_GLASS_BACKEND
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
59 #endif
61 using namespace std;
62 using Honey::encode_valuestats;
64 [[noreturn]]
65 static void
66 throw_database_corrupt(const char* item, const char* pos)
68 string message;
69 if (pos != NULL) {
70 message = "Value overflow unpacking termlist: ";
71 } else {
72 message = "Out of data unpacking termlist: ";
74 message += item;
75 throw Xapian::DatabaseCorruptError(message);
78 #ifdef XAPIAN_HAS_GLASS_BACKEND
79 namespace GlassCompact {
/// Test whether a glass key is a user metadata key (prefix "\x00\xc0").
static inline bool
is_user_metadata_key(const std::string & key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/// Test whether a glass key is a value statistics key (prefix "\x00\xd0").
static inline bool
is_valuestats_key(const std::string & key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/// Test whether a glass key is a value chunk key (prefix "\x00\xd8").
static inline bool
is_valuechunk_key(const std::string & key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/// Test whether a glass key is a document length chunk key (prefix "\x00\xe0").
static inline bool
is_doclenchunk_key(const std::string & key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
107 inline static bool
108 termlist_key_is_values_used(const string& key)
110 const char* p = key.data();
111 const char* e = p + key.size();
112 Xapian::docid did;
113 if (unpack_uint_preserving_sort(&p, e, &did)) {
114 if (p == e)
115 return false;
116 if (e - p == 1 && *p == '\0')
117 return true;
119 throw Xapian::DatabaseCorruptError("termlist key format");
121 #endif
123 // Put all the helpers in a namespace to avoid symbols colliding with those of
124 // the same name in other flint-derived backends.
125 namespace HoneyCompact {
127 /// Return a Honey::KEY_* constant, or a different value for an invalid key.
128 static inline int
129 key_type(const string& key)
131 if (key[0] != '\0')
132 return Honey::KEY_POSTING_CHUNK;
134 if (key.size() <= 1)
135 return -1;
137 unsigned char ch = key[1];
138 if (ch >= Honey::KEY_VALUE_STATS && ch <= Honey::KEY_VALUE_STATS_HI)
139 return Honey::KEY_VALUE_STATS;
140 if (ch >= Honey::KEY_VALUE_CHUNK && ch <= Honey::KEY_VALUE_CHUNK_HI)
141 return Honey::KEY_VALUE_CHUNK;
142 if (ch >= Honey::KEY_DOCLEN_CHUNK && ch <= Honey::KEY_DOCLEN_CHUNK_HI)
143 return Honey::KEY_DOCLEN_CHUNK;
145 // Handle Honey::KEY_USER_METADATA, Honey::KEY_POSTING_CHUNK, and currently
146 // unused values.
147 return ch;
150 template<typename T> class PostlistCursor;
#ifdef XAPIAN_HAS_GLASS_BACKEND
// Convert glass to honey.
template<>
class PostlistCursor<const GlassTable&> : private GlassCursor {
    /// Docid offset to apply to entries from this input table when merging.
    Xapian::docid offset;

  public:
    string key, tag;
    Xapian::docid firstdid;
    Xapian::docid chunk_lastdid;
    Xapian::termcount tf, cf;
    Xapian::termcount first_wdf;
    Xapian::termcount wdf_max;
    bool have_wdfs;

    PostlistCursor(const GlassTable *in, Xapian::docid offset_)
	: GlassCursor(in), offset(offset_), firstdid(0)
    {
	rewind();
    }

    /** Advance to the next entry, translating it to honey form.
     *
     *  @return false once the input table is exhausted.
     */
    bool next() {
	if (!GlassCursor::next()) return false;
	// We put all chunks into the non-initial chunk form here, then fix up
	// the first chunk for each term in the merged database as we merge.
	read_tag();
	key = current_key;
	tag = current_tag;
	tf = cf = 0;
	if (GlassCompact::is_user_metadata_key(key)) {
	    key[1] = Honey::KEY_USER_METADATA;
	    return true;
	}
	if (GlassCompact::is_valuestats_key(key)) {
	    // Adjust key.
	    const char * p = key.data();
	    const char * end = p + key.length();
	    p += 2;
	    Xapian::valueno slot;
	    if (!unpack_uint_last(&p, end, &slot))
		throw Xapian::DatabaseCorruptError("bad value stats key");
	    // FIXME: pack_uint_last() is not a good encoding for keys, as the
	    // encoded values do not in general sort the same way as the
	    // numeric values.  The first 256 values will be in order at least.
	    //
	    // We could just buffer up the stats for all the slots - it's
	    // unlikely there are going to be very many.  Another option is
	    // to use multiple cursors or seek a single cursor around.
	    AssertRel(slot, <=, 256);
	    key = Honey::make_valuestats_key(slot);
	    return true;
	}
	if (GlassCompact::is_valuechunk_key(key)) {
	    const char * p = key.data();
	    const char * end = p + key.length();
	    p += 2;
	    Xapian::valueno slot;
	    if (!unpack_uint(&p, end, &slot))
		throw Xapian::DatabaseCorruptError("bad value key");
	    // FIXME: pack_uint() is not a good encoding for keys, as the
	    // encoded values do not in general sort the same way as the
	    // numeric values.  The first 128 values will be in order at least.
	    //
	    // Buffering up this data for all slots is potentially
	    // prohibitively costly.  We probably need to use multiple cursors
	    // or seek a single cursor around.
	    AssertRel(slot, <=, 128);
	    Xapian::docid first_did;
	    if (!unpack_uint_preserving_sort(&p, end, &first_did))
		throw Xapian::DatabaseCorruptError("bad value key");
	    first_did += offset;

	    // Honey value chunk keys hold the *last* docid, so scan the chunk
	    // to find it.
	    Glass::ValueChunkReader reader(tag.data(), tag.size(), first_did);
	    Xapian::docid last_did = first_did;
	    while (reader.next(), !reader.at_end()) {
		last_did = reader.get_docid();
	    }

	    key = Honey::make_valuechunk_key(slot, last_did);

	    // Add the docid delta across the chunk to the start of the tag.
	    string newtag;
	    pack_uint(newtag, last_did - first_did);
	    tag.insert(0, newtag);

	    return true;
	}

	if (GlassCompact::is_doclenchunk_key(key)) {
	    const char * d = key.data();
	    const char * e = d + key.size();
	    d += 2;

	    if (d == e) {
		// This is an initial chunk, so adjust tag header.
		d = tag.data();
		e = d + tag.size();
		if (!unpack_uint(&d, e, &tf) ||
		    !unpack_uint(&d, e, &cf) ||
		    !unpack_uint(&d, e, &firstdid)) {
		    throw Xapian::DatabaseCorruptError("Bad postlist key");
		}
		++firstdid;
		tag.erase(0, d - tag.data());
	    } else {
		// Not an initial chunk, just unpack firstdid.
		if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
		    throw Xapian::DatabaseCorruptError("Bad postlist key");
	    }
	    firstdid += offset;

	    // Set key to placeholder value which just indicates that this is
	    // a doclen chunk.
	    static const char doclen_key_prefix[2] = {
		0, char(Honey::KEY_DOCLEN_CHUNK)
	    };
	    key.assign(doclen_key_prefix, 2);

	    d = tag.data();
	    e = d + tag.size();

	    // Convert doclen chunk to honey format.
	    string newtag;

	    // Skip the "last chunk" flag and increase_to_last.
	    if (d == e)
		throw Xapian::DatabaseCorruptError("No last chunk flag in "
						   "glass docdata chunk");
	    ++d;
	    Xapian::docid increase_to_last;
	    if (!unpack_uint(&d, e, &increase_to_last))
		throw Xapian::DatabaseCorruptError("Decoding last docid delta "
						   "in glass docdata chunk");

	    // First decode all lengths to fixed 4-byte entries, padding docid
	    // gaps with 0xff bytes, then narrow to the smallest whole-byte
	    // width which can hold doclen_max.
	    Xapian::termcount doclen_max = 0;
	    while (true) {
		Xapian::termcount doclen;
		if (!unpack_uint(&d, e, &doclen))
		    throw Xapian::DatabaseCorruptError("Decoding doclen in "
						       "glass docdata chunk");
		if (doclen > doclen_max)
		    doclen_max = doclen;
		unsigned char buf[4];
		unaligned_write4(buf, doclen);
		newtag.append(reinterpret_cast<char*>(buf), 4);
		if (d == e)
		    break;
		Xapian::docid gap_size;
		if (!unpack_uint(&d, e, &gap_size))
		    throw Xapian::DatabaseCorruptError("Decoding docid "
						       "gap_size in glass "
						       "docdata chunk");
		// FIXME: Split chunk if the gap_size is at all large.
		newtag.append(4 * gap_size, '\xff');
	    }

	    Assert(!startswith(newtag, "\xff\xff\xff\xff"));
	    Assert(!endswith(newtag, "\xff\xff\xff\xff"));

	    AssertEq(newtag.size() % 4, 0);
	    chunk_lastdid = firstdid - 1 + newtag.size() / 4;

	    // Only encode document lengths using a whole number of bytes for
	    // now.  We could allow arbitrary bit widths, but it complicates
	    // encoding and decoding so we should consider if the fairly small
	    // additional saving is worth it.
	    //
	    // FIX: test the widest range first - previously `>= 0xffffffff`
	    // was only tested after `>= 0xffffff` had already matched, making
	    // that branch unreachable, so a doclen of 0xffffffff (which is
	    // indistinguishable from the 0xff gap padding) was silently
	    // mis-encoded instead of being rejected.
	    if (doclen_max >= 0xffff) {
		if (doclen_max >= 0xffffffff) {
		    // FIXME: Handle these.
		    const char* m = "Document length values >= 0xffffffff not "
				    "currently handled";
		    throw Xapian::FeatureUnavailableError(m);
		} else if (doclen_max >= 0xffffff) {
		    newtag.insert(0, 1, char(32));
		    swap(tag, newtag);
		} else {
		    tag.assign(1, char(24));
		    for (size_t i = 1; i < newtag.size(); i += 4)
			tag.append(newtag, i, 3);
		}
	    } else {
		if (doclen_max >= 0xff) {
		    tag.assign(1, char(16));
		    for (size_t i = 2; i < newtag.size(); i += 4)
			tag.append(newtag, i, 2);
		} else {
		    tag.assign(1, char(8));
		    for (size_t i = 3; i < newtag.size(); i += 4)
			tag.append(newtag, i, 1);
		}
	    }

	    return true;
	}

	// Adjust key if this is *NOT* an initial chunk.
	// key is: pack_string_preserving_sort(key, tname)
	// plus optionally: pack_uint_preserving_sort(key, did)
	const char * d = key.data();
	const char * e = d + key.size();
	string tname;
	if (!unpack_string_preserving_sort(&d, e, tname))
	    throw Xapian::DatabaseCorruptError("Bad postlist key");

	if (d == e) {
	    // This is an initial chunk for a term, so adjust tag header.
	    d = tag.data();
	    e = d + tag.size();
	    if (!unpack_uint(&d, e, &tf) ||
		!unpack_uint(&d, e, &cf) ||
		!unpack_uint(&d, e, &firstdid)) {
		throw Xapian::DatabaseCorruptError("Bad postlist key");
	    }
	    ++firstdid;
	    have_wdfs = (cf != 0);
	    tag.erase(0, d - tag.data());
	    // wdf_max accumulates across all this term's chunks, so only
	    // reset it on the initial chunk.
	    wdf_max = 0;
	} else {
	    // Not an initial chunk, so adjust key.
	    size_t tmp = d - key.data();
	    if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
		throw Xapian::DatabaseCorruptError("Bad postlist key");
	    key.erase(tmp - 1);
	}
	firstdid += offset;

	d = tag.data();
	e = d + tag.size();

	// Convert posting chunk to honey format, but without any header.
	string newtag;

	// Skip the "last chunk" flag; decode increase_to_last.
	if (d == e)
	    throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
					       "posting chunk");
	++d;
	Xapian::docid increase_to_last;
	if (!unpack_uint(&d, e, &increase_to_last))
	    throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
					       "glass posting chunk");
	chunk_lastdid = firstdid + increase_to_last;
	if (!unpack_uint(&d, e, &first_wdf))
	    throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
					       "posting chunk");
	if ((first_wdf != 0) != have_wdfs) {
	    // FIXME: Also need to adjust document length, total document
	    // length.
	    // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
	    // first_wdf = 1;
	    // ++cf;
	    throw Xapian::DatabaseError("Honey does not support a term having "
					"both zero and non-zero wdf");
	}
	wdf_max = max(wdf_max, first_wdf);

	while (d != e) {
	    Xapian::docid delta;
	    if (!unpack_uint(&d, e, &delta))
		throw Xapian::DatabaseCorruptError("Decoding docid delta in "
						   "glass posting chunk");
	    pack_uint(newtag, delta);
	    Xapian::termcount wdf;
	    if (!unpack_uint(&d, e, &wdf))
		throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
						   "posting chunk");
	    if ((wdf != 0) != have_wdfs) {
		// FIXME: Also need to adjust document length, total document
		// length.
		// Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
		// wdf = 1;
		// ++cf;
		throw Xapian::DatabaseError("Honey does not support a term "
					    "having both zero and non-zero "
					    "wdf");
	    }
	    if (have_wdfs) {
		pack_uint(newtag, wdf);
		wdf_max = max(wdf_max, wdf);
	    }
	}

	swap(tag, newtag);

	return true;
    }
};
#endif
// Merge honey (e.g. when merging/compacting honey databases).
template<>
class PostlistCursor<const HoneyTable&> : private HoneyCursor {
    // Docid offset to apply to entries from this input table when merging.
    Xapian::docid offset;

  public:
    string key, tag;
    Xapian::docid firstdid;
    Xapian::docid chunk_lastdid;
    Xapian::termcount tf, cf;
    Xapian::termcount first_wdf;
    Xapian::termcount wdf_max;
    bool have_wdfs;

    PostlistCursor(const HoneyTable *in, Xapian::docid offset_)
	: HoneyCursor(in), offset(offset_), firstdid(0)
    {
	rewind();
    }

    // Advance to the next entry, stripping chunk headers and applying the
    // docid offset.  Returns false once the table is exhausted.
    bool next() {
	if (!HoneyCursor::next()) return false;
	// We put all chunks into the non-initial chunk form here, then fix up
	// the first chunk for each term in the merged database as we merge.
	read_tag();
	key = current_key;
	tag = current_tag;
	// NB: cf is deliberately NOT reset here - it persists from a term's
	// initial chunk so the non-initial chunk branch below can use (and
	// then zero) it.
	tf = 0;
	switch (key_type(key)) {
	    case Honey::KEY_USER_METADATA:
	    case Honey::KEY_VALUE_STATS:
		// Passed through unchanged.
		return true;
	    case Honey::KEY_VALUE_CHUNK: {
		const char * p = key.data();
		const char * end = p + key.length();
		p += 2;
		Xapian::valueno slot;
		// Small slot numbers are encoded directly in the key's
		// second byte; larger ones follow it.
		if (p[-1] != char(Honey::KEY_VALUE_CHUNK_HI)) {
		    slot = p[-1] - Honey::KEY_VALUE_CHUNK;
		} else {
		    if (!unpack_uint_preserving_sort(&p, end, &slot))
			throw Xapian::DatabaseCorruptError("bad value key");
		}
		Xapian::docid did;
		if (!unpack_uint_preserving_sort(&p, end, &did))
		    throw Xapian::DatabaseCorruptError("bad value key");
		// Rebuild the key with the offset docid.
		key = Honey::make_valuechunk_key(slot, did + offset);
		return true;
	    }
	    case Honey::KEY_DOCLEN_CHUNK: {
		Xapian::docid did = Honey::docid_from_key(key);
		if (did == 0)
		    throw Xapian::DatabaseCorruptError("Bad doclen key");
		chunk_lastdid = did + offset;
		// Truncate to the placeholder 2-byte prefix; the merged key
		// is regenerated by merge_postlists().
		key.resize(2);
		return true;
	    }
	    case Honey::KEY_POSTING_CHUNK:
		break;
	    default:
		throw Xapian::DatabaseCorruptError("Bad postlist table key "
						   "type");
	}

	// Adjust key if this is *NOT* an initial chunk.
	// key is: pack_string_preserving_sort(key, term)
	// plus optionally: pack_uint_preserving_sort(key, did)
	const char * d = key.data();
	const char * e = d + key.size();
	string term;
	if (!unpack_string_preserving_sort(&d, e, term))
	    throw Xapian::DatabaseCorruptError("Bad postlist key");

	if (d == e) {
	    // This is an initial chunk for a term, so remove tag header.
	    d = tag.data();
	    e = d + tag.size();

	    Xapian::docid lastdid;
	    if (!decode_initial_chunk_header(&d, e, tf, cf,
					     firstdid, lastdid, chunk_lastdid,
					     first_wdf, wdf_max)) {
		throw Xapian::DatabaseCorruptError("Bad postlist initial "
						   "chunk header");
	    }
	    // Ignore lastdid - we'll need to recalculate it (at least when
	    // merging, and for simplicity we always do).
	    (void)lastdid;
	    tag.erase(0, d - tag.data());

	    // wdfs are stored explicitly unless cf is 0 (all wdfs zero) or
	    // the wdf is 1 for all but the first entry.
	    have_wdfs = (cf != 0) && (cf != tf - 1 + first_wdf);
	    if (have_wdfs && tf > 2) {
		Xapian::termcount remaining_cf_for_flat_wdf =
		    (tf - 1) * wdf_max;
		// Check this matches and that it isn't a false match due
		// to overflow of the multiplication above.
		if (cf - first_wdf == remaining_cf_for_flat_wdf &&
		    usual(remaining_cf_for_flat_wdf / wdf_max == tf - 1)) {
		    // The wdf is flat so we don't need to store it.
		    have_wdfs = false;
		}
	    }
	} else {
	    if (cf > 0) {
		// The cf we report should only be non-zero for initial chunks
		// (of which there may be several for the same term from
		// different instances of this class when merging databases)
		// and gets summed to give the total cf for the term, so we
		// need to zero it here, after potentially using it to
		// calculate first_wdf.
		//
		// If cf is zero for a term, first_wdf will be zero for all
		// chunks so doesn't need setting specially here.
		if (!have_wdfs)
		    first_wdf = (cf - first_wdf) / (tf - 1);
		cf = 0;
	    }

	    // Not an initial chunk, so adjust key and remove tag header.
	    size_t tmp = d - key.data();
	    if (!unpack_uint_preserving_sort(&d, e, &chunk_lastdid) ||
		d != e) {
		throw Xapian::DatabaseCorruptError("Bad postlist key");
	    }
	    // -1 to remove the terminating zero byte too.
	    key.erase(tmp - 1);

	    d = tag.data();
	    e = d + tag.size();
	    if (have_wdfs) {
		if (!decode_delta_chunk_header(&d, e, chunk_lastdid, firstdid,
					       first_wdf)) {
		    throw Xapian::DatabaseCorruptError("Bad postlist delta "
						       "chunk header");
		}
	    } else {
		if (!decode_delta_chunk_header_no_wdf(&d, e, chunk_lastdid,
						      firstdid)) {
		    throw Xapian::DatabaseCorruptError("Bad postlist delta "
						       "chunk header");
		}
	    }
	    tag.erase(0, d - tag.data());
	}
	// Renumber docids by this input's offset.
	firstdid += offset;
	chunk_lastdid += offset;
	return true;
    }
};
// A non-const HoneyTable& simply reuses the const cursor implementation -
// this specialisation exists so decltype(**b) in merge_postlists() resolves
// for both const and non-const table iterators.
template<>
class PostlistCursor<HoneyTable&> : public PostlistCursor<const HoneyTable&> {
  public:
    PostlistCursor(HoneyTable *in, Xapian::docid offset_)
	: PostlistCursor<const HoneyTable&>(in, offset_) {}
};
template<typename T>
class PostlistCursorGt {
  public:
    /** Ordering functor for the merge priority queue.
     *
     *  Returns true iff a should sort after b: primarily by key, with ties
     *  broken by firstdid (so chunks for the same term come out in docid
     *  order).
     */
    bool operator()(const T* a, const T* b) const {
	if (a->key != b->key)
	    return a->key > b->key;
	return a->firstdid > b->firstdid;
    }
};
608 // U : vector<HoneyTable*>::const_iterator
609 template<typename T, typename U> void
610 merge_postlists(Xapian::Compactor * compactor,
611 T * out, vector<Xapian::docid>::const_iterator offset,
612 U b, U e)
614 typedef decltype(**b) table_type; // E.g. HoneyTable
615 typedef PostlistCursor<table_type> cursor_type;
616 typedef PostlistCursorGt<cursor_type> gt_type;
617 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
618 for ( ; b != e; ++b, ++offset) {
619 auto in = *b;
620 auto cursor = new cursor_type(in, *offset);
621 if (cursor->next()) {
622 pq.push(cursor);
623 } else {
624 // Skip empty tables.
625 delete cursor;
629 string last_key;
631 // Merge user metadata.
632 vector<string> tags;
633 while (!pq.empty()) {
634 cursor_type * cur = pq.top();
635 const string& key = cur->key;
636 if (key_type(key) != Honey::KEY_USER_METADATA) break;
638 if (key != last_key) {
639 if (!tags.empty()) {
640 if (tags.size() > 1 && compactor) {
641 Assert(!last_key.empty());
642 // FIXME: It would be better to merge all duplicates
643 // for a key in one call, but currently we don't in
644 // multipass mode.
645 const string & resolved_tag =
646 compactor->resolve_duplicate_metadata(last_key,
647 tags.size(),
648 &tags[0]);
649 if (!resolved_tag.empty())
650 out->add(last_key, resolved_tag);
651 } else {
652 Assert(!last_key.empty());
653 out->add(last_key, tags[0]);
655 tags.resize(0);
657 last_key = key;
659 tags.push_back(cur->tag);
661 pq.pop();
662 if (cur->next()) {
663 pq.push(cur);
664 } else {
665 delete cur;
668 if (!tags.empty()) {
669 if (tags.size() > 1 && compactor) {
670 Assert(!last_key.empty());
671 const string & resolved_tag =
672 compactor->resolve_duplicate_metadata(last_key,
673 tags.size(),
674 &tags[0]);
675 if (!resolved_tag.empty())
676 out->add(last_key, resolved_tag);
677 } else {
678 Assert(!last_key.empty());
679 out->add(last_key, tags[0]);
685 // Merge valuestats.
686 Xapian::doccount freq = 0;
687 string lbound, ubound;
689 while (!pq.empty()) {
690 cursor_type * cur = pq.top();
691 const string& key = cur->key;
692 if (key_type(key) != Honey::KEY_VALUE_STATS) break;
693 if (key != last_key) {
694 // For the first valuestats key, last_key will be the previous
695 // key we wrote, which we don't want to overwrite. This is the
696 // only time that freq will be 0, so check that.
697 if (freq) {
698 out->add(last_key, encode_valuestats(freq, lbound, ubound));
699 freq = 0;
701 last_key = key;
704 const string & tag = cur->tag;
706 const char * pos = tag.data();
707 const char * end = pos + tag.size();
709 Xapian::doccount f;
710 string l, u;
711 if (!unpack_uint(&pos, end, &f)) {
712 if (*pos == 0)
713 throw Xapian::DatabaseCorruptError("Incomplete stats item "
714 "in value table");
715 throw Xapian::RangeError("Frequency statistic in value table "
716 "is too large");
718 if (!unpack_string(&pos, end, l)) {
719 if (*pos == 0)
720 throw Xapian::DatabaseCorruptError("Incomplete stats item "
721 "in value table");
722 throw Xapian::RangeError("Lower bound in value table is too "
723 "large");
725 size_t len = end - pos;
726 if (len == 0) {
727 u = l;
728 } else {
729 u.assign(pos, len);
731 if (freq == 0) {
732 freq = f;
733 lbound = l;
734 ubound = u;
735 } else {
736 freq += f;
737 if (l < lbound) lbound = l;
738 if (u > ubound) ubound = u;
741 pq.pop();
742 if (cur->next()) {
743 pq.push(cur);
744 } else {
745 delete cur;
749 if (freq) {
750 out->add(last_key, encode_valuestats(freq, lbound, ubound));
754 // Merge valuestream chunks.
755 while (!pq.empty()) {
756 cursor_type * cur = pq.top();
757 const string & key = cur->key;
758 if (key_type(key) != Honey::KEY_VALUE_CHUNK) break;
759 out->add(key, cur->tag);
760 pq.pop();
761 if (cur->next()) {
762 pq.push(cur);
763 } else {
764 delete cur;
768 // Merge doclen chunks.
769 while (!pq.empty()) {
770 cursor_type * cur = pq.top();
771 if (key_type(cur->key) != Honey::KEY_DOCLEN_CHUNK) break;
772 string tag = std::move(cur->tag);
773 auto chunk_lastdid = cur->chunk_lastdid;
774 pq.pop();
775 if (cur->next()) {
776 pq.push(cur);
777 } else {
778 delete cur;
780 while (!pq.empty()) {
781 cur = pq.top();
782 if (key_type(cur->key) != Honey::KEY_DOCLEN_CHUNK)
783 break;
784 if (tag[0] != cur->tag[0]) {
785 // Different width values in the two tags, so punt for now.
786 // FIXME: We would ideally optimise the total size here.
787 break;
789 size_t byte_width = tag[0] / 8;
790 auto new_size = tag.size();
791 Xapian::docid gap_size = cur->firstdid - chunk_lastdid - 1;
792 new_size += gap_size * byte_width;
793 if (new_size >= HONEY_DOCLEN_CHUNK_MAX) {
794 // The gap spans beyond HONEY_DOCLEN_CHUNK_MAX.
795 break;
797 new_size += cur->tag.size() - 1;
798 auto full_new_size = new_size;
799 if (new_size > HONEY_DOCLEN_CHUNK_MAX) {
800 if (byte_width > 1) {
801 // HONEY_DOCLEN_CHUNK_MAX should be one more than a
802 // multiple of 12 so for widths 1,2,3,4 we can fix the
803 // initial byte which indicates the width for the chunk
804 // plus an exact number of entries.
805 auto m = (new_size - HONEY_DOCLEN_CHUNK_MAX) % byte_width;
806 (void)m;
807 AssertEq(m, 0);
809 new_size = HONEY_DOCLEN_CHUNK_MAX;
811 tag.reserve(new_size);
812 tag.append(byte_width * gap_size, '\xff');
813 if (new_size != full_new_size) {
814 // Partial copy.
815 auto copy_size = new_size - tag.size();
816 tag.append(cur->tag, 1, copy_size);
817 cur->tag.erase(1, copy_size);
818 copy_size /= byte_width;
819 cur->firstdid += copy_size;
820 chunk_lastdid += gap_size;
821 chunk_lastdid += copy_size;
822 break;
825 tag.append(cur->tag, 1, string::npos);
826 chunk_lastdid = cur->chunk_lastdid;
828 pq.pop();
829 if (cur->next()) {
830 pq.push(cur);
831 } else {
832 delete cur;
835 out->add(Honey::make_doclenchunk_key(chunk_lastdid), tag);
838 Xapian::termcount tf = 0, cf = 0; // Initialise to avoid warnings.
840 struct HoneyPostListChunk {
841 Xapian::docid first, last;
842 Xapian::termcount first_wdf;
843 Xapian::termcount wdf_max;
844 Xapian::doccount tf;
845 Xapian::termcount cf;
846 bool have_wdfs;
847 string data;
849 HoneyPostListChunk(Xapian::docid first_,
850 Xapian::docid last_,
851 Xapian::termcount first_wdf_,
852 Xapian::termcount wdf_max_,
853 Xapian::doccount tf_,
854 Xapian::termcount cf_,
855 bool have_wdfs_,
856 string&& data_)
857 : first(first_),
858 last(last_),
859 first_wdf(first_wdf_),
860 wdf_max(wdf_max_),
861 tf(tf_),
862 cf(cf_),
863 have_wdfs(have_wdfs_),
864 data(data_) {}
866 void append_postings_to(string& tag, bool want_wdfs) {
867 if (want_wdfs && !have_wdfs) {
868 // Need to add wdfs which were implicit.
869 auto wdf = (cf - first_wdf) / (tf - 1);
870 const char* pos = data.data();
871 const char* pos_end = pos + data.size();
872 while (pos != pos_end) {
873 Xapian::docid delta;
874 if (!unpack_uint(&pos, pos_end, &delta))
875 throw_database_corrupt("Decoding docid delta", pos);
876 pack_uint(tag, delta);
877 pack_uint(tag, wdf);
879 } else if (have_wdfs && !want_wdfs) {
880 // Need to remove wdfs which are now implicit.
881 const char* pos = data.data();
882 const char* pos_end = pos + data.size();
883 while (pos != pos_end) {
884 Xapian::docid delta;
885 if (!unpack_uint(&pos, pos_end, &delta))
886 throw_database_corrupt("Decoding docid delta", pos);
887 pack_uint(tag, delta);
888 Xapian::termcount wdf;
889 if (!unpack_uint(&pos, pos_end, &wdf))
890 throw_database_corrupt("Decoding wdf", pos);
891 (void)wdf;
893 } else {
894 tag += data;
898 vector<HoneyPostListChunk> tags;
899 while (true) {
900 cursor_type * cur = NULL;
901 if (!pq.empty()) {
902 cur = pq.top();
903 pq.pop();
905 if (cur) {
906 AssertEq(key_type(cur->key), Honey::KEY_POSTING_CHUNK);
908 if (cur == NULL || cur->key != last_key) {
909 if (!tags.empty()) {
910 Xapian::termcount first_wdf = tags[0].first_wdf;
911 Xapian::docid chunk_lastdid = tags[0].last;
912 Xapian::docid last_did = tags.back().last;
913 Xapian::termcount wdf_max = tags.back().wdf_max;
915 bool have_wdfs = true;
916 if (cf == 0) {
917 // wdf must always be zero.
918 have_wdfs = false;
919 } else if (tf <= 2) {
920 // We only need to store cf and first_wdf to know all wdfs.
921 have_wdfs = false;
922 } else if (cf == tf - 1 + first_wdf) {
923 // wdf must be 1 for second and subsequent entries.
924 have_wdfs = false;
925 } else {
926 Xapian::termcount remaining_cf_for_flat_wdf =
927 (tf - 1) * wdf_max;
928 // Check this matches and that it isn't a false match due
929 // to overflow of the multiplication above.
930 if (cf - first_wdf == remaining_cf_for_flat_wdf &&
931 usual(remaining_cf_for_flat_wdf / wdf_max == tf - 1)) {
932 // The wdf is flat so we don't need to store it.
933 have_wdfs = false;
937 Xapian::docid splice_last = 0;
938 if (!have_wdfs && tags[0].have_wdfs && tf > 2 &&
939 tags.size() > 1) {
940 // Stripping wdfs will approximately halve the size so
941 // double up the chunks.
942 splice_last = chunk_lastdid;
943 chunk_lastdid = tags[1].last;
946 string first_tag;
947 encode_initial_chunk_header(tf, cf, tags[0].first, last_did,
948 chunk_lastdid,
949 first_wdf, wdf_max, first_tag);
951 if (tf > 2) {
952 tags[0].append_postings_to(first_tag, have_wdfs);
953 if (!have_wdfs && splice_last) {
954 pack_uint(first_tag, tags[1].first - splice_last);
955 tags[1].append_postings_to(first_tag, have_wdfs);
958 out->add(last_key, first_tag);
960 // If tf == 2, the data could be split over two tags when
961 // merging, but we only output an initial tag in this case.
962 if (tf > 2 && tags.size() > 1) {
963 string term;
964 const char* p = last_key.data();
965 const char* end = p + last_key.size();
966 if (!unpack_string_preserving_sort(&p, end, term) ||
967 p != end) {
968 throw Xapian::DatabaseCorruptError("Bad postlist "
969 "chunk key");
972 auto i = tags.begin();
973 if (splice_last) {
974 splice_last = 0;
975 ++i;
977 while (++i != tags.end()) {
978 last_did = i->last;
979 string tag;
980 if (have_wdfs) {
981 encode_delta_chunk_header(i->first,
982 last_did,
983 i->first_wdf,
984 tag);
985 tag += i->data;
986 } else {
987 if (i->have_wdfs && i + 1 != tags.end()) {
988 splice_last = last_did;
989 last_did = (i + 1)->last;
992 encode_delta_chunk_header_no_wdf(i->first,
993 last_did,
994 tag);
995 tag += i->data;
996 if (splice_last) {
997 ++i;
998 pack_uint(tag, i->first - splice_last);
999 splice_last = 0;
1000 tag += i->data;
1004 out->add(pack_honey_postlist_key(term, last_did), tag);
1008 tags.clear();
1009 if (cur == NULL) break;
1010 tf = cf = 0;
1011 last_key = cur->key;
1014 if (tf && cur->tf && (cf == 0) != (cur->cf == 0)) {
1015 // FIXME: Also need to adjust document length, total document
1016 // length.
1017 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
1018 throw Xapian::DatabaseError("Honey does not support a term having "
1019 "both zero and non-zero wdf");
1021 tf += cur->tf;
1022 cf += cur->cf;
1024 tags.push_back(HoneyPostListChunk(cur->firstdid,
1025 cur->chunk_lastdid,
1026 cur->first_wdf,
1027 cur->wdf_max,
1028 cur->tf,
1029 cur->cf,
1030 cur->have_wdfs,
1031 std::move(cur->tag)));
1032 if (cur->next()) {
1033 pq.push(cur);
1034 } else {
1035 delete cur;
1040 template<typename T> struct MergeCursor;
#ifdef XAPIAN_HAS_GLASS_BACKEND
// Cursor over a glass input table for simple key-order merges (spelling,
// synonym, etc).  Construction positions the cursor before the first entry.
template<>
struct MergeCursor<const GlassTable&> : public GlassCursor {
    explicit MergeCursor(const GlassTable *in) : GlassCursor(in) {
	rewind();
    }
};
#endif
// Cursor over a honey input table for simple key-order merges (spelling,
// synonym, etc).  Construction positions the cursor before the first entry.
template<>
struct MergeCursor<const HoneyTable&> : public HoneyCursor {
    explicit MergeCursor(const HoneyTable *in) : HoneyCursor(in) {
	rewind();
    }
};
template<typename T>
struct CursorGt {
    /** Ordering functor for the merge priority queue.
     *
     *  Cursors which have run off the end sort after everything else;
     *  otherwise ordering is by current key.
     */
    bool operator()(const T* a, const T* b) const {
	if (b->after_end())
	    return false;
	return a->after_end() || a->current_key > b->current_key;
    }
};
#ifdef XAPIAN_HAS_GLASS_BACKEND
// Convert glass to honey.
//
// N-way merge of glass spelling tables into a honey spelling table,
// translating the key scheme as we go.  One MergeCursor per non-empty input
// is kept in a priority queue ordered by current key, so entries come out in
// key order; cursors are heap-allocated and deleted once exhausted.
static void
merge_spellings(HoneyTable* out,
                vector<const GlassTable*>::const_iterator b,
                vector<const GlassTable*>::const_iterator e)
{
    typedef MergeCursor<const GlassTable&> cursor_type;
    typedef CursorGt<cursor_type> gt_type;
    priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
    for ( ; b != e; ++b) {
        auto in = *b;
        auto cursor = new cursor_type(in);
        if (cursor->next()) {
            pq.push(cursor);
        } else {
            // Skip empty tables.
            delete cursor;
        }
    }

    while (!pq.empty()) {
        cursor_type * cur = pq.top();
        pq.pop();

        // Glass vs honey spelling key prefix map:
        //
        //      Type          Glass       Honey
        //
        //      bookend       Bbd         \x00bd
        //      head          Hhe         \x01he
        //      middle        Mddl        \x02ddl
        //      tail          Til         \x03il
        //      word          Wword       word
        //
        // In honey, if the word's first byte is <= '\x04', we add a prefix
        // of '\x04' (which is unlikely in real world use but ensures that
        // we can handle arbitrary strings).
        //
        // The prefixes for honey are chosen such that we save a byte for the
        // key of all real-world words, and so that more first bytes are used
        // so that a top-level array index is more effective, especially for
        // random-access lookup of word entries (which we do to check the
        // frequency of potential spelling candidates).
        //
        // The other prefixes are chosen such that we can do look up in key
        // order at search time, which is more efficient for a cursor which can
        // only efficiently move forwards.
        //
        // Note that the key ordering is the same for glass and honey, which
        // makes translating during compaction simpler.
        string key = cur->current_key;
        switch (key[0]) {
            case 'B':
                key[0] = Honey::KEY_PREFIX_BOOKEND;
                break;
            case 'H':
                key[0] = Honey::KEY_PREFIX_HEAD;
                break;
            case 'M':
                key[0] = Honey::KEY_PREFIX_MIDDLE;
                break;
            case 'T':
                key[0] = Honey::KEY_PREFIX_TAIL;
                break;
            case 'W':
                // Word entries drop the 'W' prefix unless the word itself
                // starts with a byte <= KEY_PREFIX_WORD, in which case the
                // prefix byte is kept (rewritten) to avoid ambiguity.
                if (static_cast<unsigned char>(key[1]) > Honey::KEY_PREFIX_WORD)
                    key.erase(0, 1);
                else
                    key[0] = Honey::KEY_PREFIX_WORD;
                break;
            default: {
                string m = "Bad spelling key prefix: ";
                m += static_cast<unsigned char>(key[0]);
                throw Xapian::DatabaseCorruptError(m);
            }
        }

        if (pq.empty() || pq.top()->current_key > key) {
            // No merging to do for this key so just copy the tag value,
            // adjusting if necessary.  If we don't need to adjust it, just
            // copy the compressed value.
            bool compressed;
            switch (key[0]) {
                case Honey::KEY_PREFIX_HEAD: {
                    // Honey omits the first two bytes from the first entry
                    // since we know what those must be from the key
                    // (subsequent entries won't include those anyway, since
                    // they'll be part of the reused prefix).
                    compressed = cur->read_tag(false);
                    unsigned char len = cur->current_tag[0] ^ MAGIC_XOR_VALUE;
                    cur->current_tag[0] = (len - 2) ^ MAGIC_XOR_VALUE;
                    AssertEq(cur->current_tag[1], key[1]);
                    AssertEq(cur->current_tag[2], key[2]);
                    cur->current_tag.erase(1, 2);
                    break;
                }
                case Honey::KEY_PREFIX_TAIL:
                case Honey::KEY_PREFIX_BOOKEND: {
                    // Need to repack because honey omits the last 2 or 1 bytes
                    // from each entry (2 for tail, 1 for bookend) since we
                    // know what those must be from the key.
                    compressed = cur->read_tag(false);
                    PrefixCompressedStringItor spell_in(cur->current_tag);
                    string new_tag;
                    PrefixCompressedStringWriter spell_out(new_tag, key);
                    while (!spell_in.at_end()) {
                        spell_out.append(*spell_in);
                        ++spell_in;
                    }
                    cur->current_tag = std::move(new_tag);
                    break;
                }
                default:
                    compressed = cur->read_tag(true);
                    break;
            }
            out->add(key, cur->current_tag, compressed);
            if (cur->next()) {
                pq.push(cur);
            } else {
                delete cur;
            }
            continue;
        }

        // Merge tag values with the same key:
        string tag;
        if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
            // We just want the union of words, so copy over the first instance
            // and skip any identical ones.
            priority_queue<PrefixCompressedStringItor *,
                           vector<PrefixCompressedStringItor *>,
                           PrefixCompressedStringItorGt> pqtag;
            // Stick all the cursor_type pointers in a vector because their
            // current_tag members must remain valid while we're merging their
            // tags, but we need to call next() on them all afterwards.
            vector<cursor_type *> vec;
            vec.reserve(pq.size());

            // Pull every cursor positioned on this key off the main queue.
            while (true) {
                cur->read_tag();
                pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
                vec.push_back(cur);
                if (pq.empty() || pq.top()->current_key != key) break;
                cur = pq.top();
                pq.pop();
            }

            PrefixCompressedStringWriter wr(tag, key);
            string lastword;
            // Merge the word lists, emitting each distinct word once (the
            // inner priority queue yields words in sorted order, so
            // duplicates are adjacent).
            while (!pqtag.empty()) {
                PrefixCompressedStringItor * it = pqtag.top();
                pqtag.pop();
                string word = **it;
                if (word != lastword) {
                    lastword = word;
                    wr.append(lastword);
                }
                ++*it;
                if (!it->at_end()) {
                    pqtag.push(it);
                } else {
                    delete it;
                }
            }

            // Now it's safe to advance the cursors we parked in vec.
            for (auto i : vec) {
                cur = i;
                if (cur->next()) {
                    pq.push(cur);
                } else {
                    delete cur;
                }
            }
        } else {
            // We want to sum the frequencies from tags for the same key.
            Xapian::termcount tot_freq = 0;
            while (true) {
                cur->read_tag();
                Xapian::termcount freq;
                const char * p = cur->current_tag.data();
                const char * end = p + cur->current_tag.size();
                if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
                    throw_database_corrupt("Bad spelling word freq", p);
                }
                tot_freq += freq;
                if (cur->next()) {
                    pq.push(cur);
                } else {
                    delete cur;
                }
                if (pq.empty() || pq.top()->current_key != key) break;
                cur = pq.top();
                pq.pop();
            }
            tag.resize(0);
            pack_uint_last(tag, tot_freq);
        }
        out->add(key, tag);
    }
}
#endif
// N-way merge of honey spelling tables into a single honey spelling table.
// The keys are already in the honey scheme, so no key translation is needed;
// entries with a unique key are copied through (compressed form preserved),
// while entries sharing a key have their tags merged.
static void
merge_spellings(HoneyTable* out,
                vector<const HoneyTable*>::const_iterator b,
                vector<const HoneyTable*>::const_iterator e)
{
    typedef MergeCursor<const HoneyTable&> cursor_type;
    typedef CursorGt<cursor_type> gt_type;
    priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
    for ( ; b != e; ++b) {
        auto in = *b;
        auto cursor = new cursor_type(in);
        if (cursor->next()) {
            pq.push(cursor);
        } else {
            // Skip empty tables.
            delete cursor;
        }
    }

    while (!pq.empty()) {
        cursor_type * cur = pq.top();
        pq.pop();

        string key = cur->current_key;
        if (pq.empty() || pq.top()->current_key > key) {
            // No need to merge the tags, just copy the (possibly compressed)
            // tag value.
            bool compressed = cur->read_tag(true);
            out->add(key, cur->current_tag, compressed);
            if (cur->next()) {
                pq.push(cur);
            } else {
                delete cur;
            }
            continue;
        }

        // Merge tag values with the same key:
        string tag;
        if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
            // We just want the union of words, so copy over the first instance
            // and skip any identical ones.
            priority_queue<PrefixCompressedStringItor *,
                           vector<PrefixCompressedStringItor *>,
                           PrefixCompressedStringItorGt> pqtag;
            // Stick all the cursor_type pointers in a vector because their
            // current_tag members must remain valid while we're merging their
            // tags, but we need to call next() on them all afterwards.
            vector<cursor_type *> vec;
            vec.reserve(pq.size());

            // Gather every cursor positioned on this key.
            while (true) {
                cur->read_tag();
                pqtag.push(new PrefixCompressedStringItor(cur->current_tag,
                                                          key));
                vec.push_back(cur);
                if (pq.empty() || pq.top()->current_key != key) break;
                cur = pq.top();
                pq.pop();
            }

            PrefixCompressedStringWriter wr(tag, key);
            string lastword;
            // The inner queue yields words in sorted order, so identical
            // words from different inputs are adjacent and easy to skip.
            while (!pqtag.empty()) {
                PrefixCompressedStringItor * it = pqtag.top();
                pqtag.pop();
                string word = **it;
                if (word != lastword) {
                    lastword = word;
                    wr.append(lastword);
                }
                ++*it;
                if (!it->at_end()) {
                    pqtag.push(it);
                } else {
                    delete it;
                }
            }

            // Now it's safe to advance the cursors we parked in vec.
            for (auto i : vec) {
                cur = i;
                if (cur->next()) {
                    pq.push(cur);
                } else {
                    delete cur;
                }
            }
        } else {
            // We want to sum the frequencies from tags for the same key.
            Xapian::termcount tot_freq = 0;
            while (true) {
                cur->read_tag();
                Xapian::termcount freq;
                const char * p = cur->current_tag.data();
                const char * end = p + cur->current_tag.size();
                if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
                    throw_database_corrupt("Bad spelling word freq", p);
                }
                tot_freq += freq;
                if (cur->next()) {
                    pq.push(cur);
                } else {
                    delete cur;
                }
                if (pq.empty() || pq.top()->current_key != key) break;
                cur = pq.top();
                pq.pop();
            }
            tag.resize(0);
            pack_uint_last(tag, tot_freq);
        }
        out->add(key, tag);
    }
}
// U : vector<HoneyTable*>::const_iterator
//
// N-way merge of synonym tables.  Entries with distinct keys are copied
// straight through (compressed form preserved); entries sharing a key have
// their byte-length-prefixed synonym lists merged, keeping a single copy of
// each duplicate string.
template<typename T, typename U> void
merge_synonyms(T* out, U b, U e)
{
    typedef decltype(**b) table_type; // E.g. HoneyTable
    typedef MergeCursor<table_type> cursor_type;
    typedef CursorGt<cursor_type> gt_type;
    priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
    for ( ; b != e; ++b) {
        auto in = *b;
        auto cursor = new cursor_type(in);
        if (cursor->next()) {
            pq.push(cursor);
        } else {
            // Skip empty tables.
            delete cursor;
        }
    }

    while (!pq.empty()) {
        cursor_type * cur = pq.top();
        pq.pop();

        string key = cur->current_key;
        if (pq.empty() || pq.top()->current_key > key) {
            // No need to merge the tags, just copy the (possibly compressed)
            // tag value.
            bool compressed = cur->read_tag(true);
            out->add(key, cur->current_tag, compressed);
            if (cur->next()) {
                pq.push(cur);
            } else {
                delete cur;
            }
            continue;
        }

        // Merge tag values with the same key:
        string tag;

        // We just want the union of words, so copy over the first instance
        // and skip any identical ones.
        priority_queue<ByteLengthPrefixedStringItor *,
                       vector<ByteLengthPrefixedStringItor *>,
                       ByteLengthPrefixedStringItorGt> pqtag;
        // Park the cursors whose current_tag is being iterated; we advance
        // them only after the merge so the iterators stay valid.
        vector<cursor_type *> vec;

        while (true) {
            cur->read_tag();
            pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
            vec.push_back(cur);
            if (pq.empty() || pq.top()->current_key != key) break;
            cur = pq.top();
            pq.pop();
        }

        string lastword;
        // Emit each distinct synonym once, re-encoding as XOR-obfuscated
        // length byte followed by the string (matching the input format).
        while (!pqtag.empty()) {
            ByteLengthPrefixedStringItor * it = pqtag.top();
            pqtag.pop();
            if (**it != lastword) {
                lastword = **it;
                tag += uint8_t(lastword.size() ^ MAGIC_XOR_VALUE);
                tag += lastword;
            }
            ++*it;
            if (!it->at_end()) {
                pqtag.push(it);
            } else {
                delete it;
            }
        }

        // Now advance the parked cursors.
        for (auto i : vec) {
            cur = i;
            if (cur->next()) {
                pq.push(cur);
            } else {
                delete cur;
            }
        }

        out->add(key, tag);
    }
}
1473 template<typename T, typename U> void
1474 multimerge_postlists(Xapian::Compactor * compactor,
1475 T* out, const char * tmpdir,
1476 const vector<U*>& in,
1477 vector<Xapian::docid> off)
1479 if (in.size() <= 3) {
1480 merge_postlists(compactor, out, off.begin(), in.begin(), in.end());
1481 return;
1483 unsigned int c = 0;
1484 vector<HoneyTable *> tmp;
1485 tmp.reserve(in.size() / 2);
1487 vector<Xapian::docid> newoff;
1488 newoff.resize(in.size() / 2);
1489 for (unsigned int i = 0, j; i < in.size(); i = j) {
1490 j = i + 2;
1491 if (j == in.size() - 1) ++j;
1493 string dest = tmpdir;
1494 char buf[64];
1495 sprintf(buf, "/tmp%u_%u.", c, i / 2);
1496 dest += buf;
1498 HoneyTable * tmptab = new HoneyTable("postlist", dest, false);
1500 // Don't compress entries in temporary tables, even if the final
1501 // table would do so. Any already compressed entries will get
1502 // copied in compressed form.
1503 Honey::RootInfo root_info;
1504 root_info.init(0);
1505 const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1506 tmptab->create_and_open(flags, root_info);
1508 merge_postlists(compactor, tmptab, off.begin() + i,
1509 in.begin() + i, in.begin() + j);
1510 tmp.push_back(tmptab);
1511 tmptab->flush_db();
1512 tmptab->commit(1, &root_info);
1514 swap(off, newoff);
1515 ++c;
1518 while (tmp.size() > 3) {
1519 vector<HoneyTable *> tmpout;
1520 tmpout.reserve(tmp.size() / 2);
1521 vector<Xapian::docid> newoff;
1522 newoff.resize(tmp.size() / 2);
1523 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
1524 j = i + 2;
1525 if (j == tmp.size() - 1) ++j;
1527 string dest = tmpdir;
1528 char buf[64];
1529 sprintf(buf, "/tmp%u_%u.", c, i / 2);
1530 dest += buf;
1532 HoneyTable * tmptab = new HoneyTable("postlist", dest, false);
1534 // Don't compress entries in temporary tables, even if the final
1535 // table would do so. Any already compressed entries will get
1536 // copied in compressed form.
1537 Honey::RootInfo root_info;
1538 root_info.init(0);
1539 const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1540 tmptab->create_and_open(flags, root_info);
1542 merge_postlists(compactor, tmptab, off.begin() + i,
1543 tmp.begin() + i, tmp.begin() + j);
1544 if (c > 0) {
1545 for (unsigned int k = i; k < j; ++k) {
1546 // FIXME: unlink(tmp[k]->get_path().c_str());
1547 delete tmp[k];
1548 tmp[k] = NULL;
1551 tmpout.push_back(tmptab);
1552 tmptab->flush_db();
1553 tmptab->commit(1, &root_info);
1555 swap(tmp, tmpout);
1556 swap(off, newoff);
1557 ++c;
1559 merge_postlists(compactor, out, off.begin(), tmp.begin(), tmp.end());
1560 if (c > 0) {
1561 for (size_t k = 0; k < tmp.size(); ++k) {
1562 // FIXME: unlink(tmp[k]->get_path().c_str());
1563 delete tmp[k];
1564 tmp[k] = NULL;
/// Cursor adaptor for merging position tables; specialised per input table
/// type below.
template<typename T> class PositionCursor;
#ifdef XAPIAN_HAS_GLASS_BACKEND
/** Cursor over a glass position table which rebases docids for merging.
 *
 *  next() unpacks each (term, docid) key, adds this input's docid offset,
 *  and repacks the result into @a key ready to be written to the output.
 */
template<>
class PositionCursor<const GlassTable&> : private GlassCursor {
    /// Docid offset to add to each docid read from this input.
    Xapian::docid offset;

  public:
    /// The rebased key for the current entry (valid after next() succeeds).
    string key;
    // NOTE(review): initialised to 0 but not otherwise used in this class -
    // presumably for the benefit of external users; confirm before removing.
    Xapian::docid firstdid;

    PositionCursor(const GlassTable *in, Xapian::docid offset_)
        : GlassCursor(in), offset(offset_), firstdid(0) {
        rewind();
    }

    /// Advance to the next entry, rebuilding @a key; returns false at end.
    bool next() {
        if (!GlassCursor::next()) return false;
        read_tag();
        const char * d = current_key.data();
        const char * e = d + current_key.size();
        string term;
        Xapian::docid did;
        // Key must be exactly a packed term followed by a packed docid.
        if (!unpack_string_preserving_sort(&d, e, term) ||
            !unpack_uint_preserving_sort(&d, e, &did) ||
            d != e) {
            throw Xapian::DatabaseCorruptError("Bad position key");
        }

        key.resize(0);
        pack_string_preserving_sort(key, term);
        pack_uint_preserving_sort(key, did + offset);
        return true;
    }

    /// Tag (positionlist data) of the current entry, copied through as-is.
    const string & get_tag() const {
        return current_tag;
    }
};
#endif
/** Cursor over a honey position table which rebases docids for merging.
 *
 *  next() unpacks each (term, docid) key, adds this input's docid offset,
 *  and repacks the result into @a key ready to be written to the output.
 */
template<>
class PositionCursor<const HoneyTable&> : private HoneyCursor {
    /// Docid offset to add to each docid read from this input.
    Xapian::docid offset;

  public:
    /// The rebased key for the current entry (valid after next() succeeds).
    string key;
    // NOTE(review): initialised to 0 but not otherwise used in this class -
    // presumably for the benefit of external users; confirm before removing.
    Xapian::docid firstdid;

    PositionCursor(const HoneyTable *in, Xapian::docid offset_)
        : HoneyCursor(in), offset(offset_), firstdid(0) {
        rewind();
    }

    /// Advance to the next entry, rebuilding @a key; returns false at end.
    bool next() {
        if (!HoneyCursor::next()) return false;
        read_tag();
        const char * d = current_key.data();
        const char * e = d + current_key.size();
        string term;
        Xapian::docid did;
        // Key must be exactly a packed term followed by a packed docid.
        if (!unpack_string_preserving_sort(&d, e, term) ||
            !unpack_uint_preserving_sort(&d, e, &did) ||
            d != e) {
            throw Xapian::DatabaseCorruptError("Bad position key");
        }

        key.resize(0);
        pack_string_preserving_sort(key, term);
        pack_uint_preserving_sort(key, did + offset);
        return true;
    }

    /// Tag (positionlist data) of the current entry, copied through as-is.
    const string & get_tag() const {
        return current_tag;
    }
};
template<typename T>
class PositionCursorGt {
  public:
    /** Ordering functor for the position-merge priority queue.
     *
     *  Returns true if and only if lhs's key sorts strictly after rhs's,
     *  so the queue surfaces the smallest key first.
     */
    bool operator()(const T* lhs, const T* rhs) const {
        return rhs->key < lhs->key;
    }
};
1657 template<typename T, typename U> void
1658 merge_positions(T* out, const vector<U*> & inputs,
1659 const vector<Xapian::docid> & offset)
1661 typedef decltype(*inputs[0]) table_type; // E.g. HoneyTable
1662 typedef PositionCursor<table_type> cursor_type;
1663 typedef PositionCursorGt<cursor_type> gt_type;
1664 priority_queue<cursor_type *, vector<cursor_type *>, gt_type> pq;
1665 for (size_t i = 0; i < inputs.size(); ++i) {
1666 auto in = inputs[i];
1667 auto cursor = new cursor_type(in, offset[i]);
1668 if (cursor->next()) {
1669 pq.push(cursor);
1670 } else {
1671 // Skip empty tables.
1672 delete cursor;
1676 while (!pq.empty()) {
1677 cursor_type * cur = pq.top();
1678 pq.pop();
1679 out->add(cur->key, cur->get_tag());
1680 if (cur->next()) {
1681 pq.push(cur);
1682 } else {
1683 delete cur;
1688 template<typename T, typename U> void
1689 merge_docid_keyed(T *out, const vector<U*> & inputs,
1690 const vector<Xapian::docid> & offset,
1691 int = 0)
1693 for (size_t i = 0; i < inputs.size(); ++i) {
1694 Xapian::docid off = offset[i];
1696 auto in = inputs[i];
1697 HoneyCursor cur(in);
1698 cur.rewind();
1700 string key;
1701 while (cur.next()) {
1702 // Adjust the key if this isn't the first database.
1703 if (off) {
1704 Xapian::docid did;
1705 const char * d = cur.current_key.data();
1706 const char * e = d + cur.current_key.size();
1707 if (!unpack_uint_preserving_sort(&d, e, &did)) {
1708 string msg = "Bad key in ";
1709 msg += inputs[i]->get_path();
1710 throw Xapian::DatabaseCorruptError(msg);
1712 did += off;
1713 key.resize(0);
1714 pack_uint_preserving_sort(key, did);
1715 if (d != e) {
1716 // Copy over anything extra in the key (e.g. the zero byte
1717 // at the end of "used value slots" in the termlist table).
1718 key.append(d, e - d);
1720 } else {
1721 key = cur.current_key;
1723 bool compressed = cur.read_tag(true);
1724 out->add(key, cur.current_tag, compressed);
#ifdef XAPIAN_HAS_GLASS_BACKEND
// Merge docid-keyed glass tables into a honey table.
//
// For most table types this rebases the docid at the start of each key and
// copies the tag through unchanged.  For the termlist table (table_type ==
// Honey::TERMLIST) each glass termlist entry - together with its optional
// adjacent "values used" entry - is re-encoded into the honey termlist
// format, which stores the used value slots at the front of the tag.
template<typename T> void
merge_docid_keyed(T *out, const vector<const GlassTable*> & inputs,
                  const vector<Xapian::docid> & offset,
                  int table_type = 0)
{
    for (size_t i = 0; i < inputs.size(); ++i) {
        Xapian::docid off = offset[i];

        auto in = inputs[i];
        if (in->empty()) continue;

        GlassCursor cur(in);
        cur.rewind();

        string key;
        while (cur.next()) {
next_without_next:
            // Adjust the key if this isn't the first database.
            if (off) {
                Xapian::docid did;
                const char * d = cur.current_key.data();
                const char * e = d + cur.current_key.size();
                if (!unpack_uint_preserving_sort(&d, e, &did)) {
                    string msg = "Bad key in ";
                    msg += inputs[i]->get_path();
                    throw Xapian::DatabaseCorruptError(msg);
                }
                did += off;
                key.resize(0);
                pack_uint_preserving_sort(key, did);
                if (d != e) {
                    // Copy over anything extra in the key (e.g. the zero byte
                    // at the end of "used value slots" in the termlist table).
                    key.append(d, e - d);
                }
            } else {
                key = cur.current_key;
            }
            if (table_type == Honey::TERMLIST) {
                if (termlist_key_is_values_used(key)) {
                    // A "values used" entry is only ever consumed below,
                    // paired with the termlist entry preceding it - seeing
                    // one here means its termlist entry was missing.
                    throw Xapian::DatabaseCorruptError("value slots used "
                                                       "entry without a "
                                                       "corresponding "
                                                       "termlist entry");
                }
                cur.read_tag();
                string tag = cur.current_tag;

                // Peek at the next entry to see if it's the "values used"
                // entry for this same document.
                bool next_result = cur.next();
                bool next_already_done = true;
                unsigned bitmap_slots_used = 0;
                string encoded_slots_used;
                if (next_result &&
                    termlist_key_is_values_used(cur.current_key)) {
                    next_already_done = false;
                    cur.read_tag();
                    const string& valtag = cur.current_tag;

                    const char* p = valtag.data();
                    const char* end = p + valtag.size();

                    Xapian::VecCOW<Xapian::termpos> slots;

                    // Decode the glass slot list: first slot, then
                    // (delta - 1) between consecutive used slots.
                    Xapian::valueno first_slot;
                    if (!unpack_uint(&p, end, &first_slot)) {
                        throw_database_corrupt("Value slot encoding corrupt",
                                               p);
                    }
                    slots.push_back(first_slot);
                    Xapian::valueno last_slot = first_slot;
                    while (p != end) {
                        Xapian::valueno slot;
                        if (!unpack_uint(&p, end, &slot)) {
                            throw_database_corrupt("Value slot encoding "
                                                   "corrupt", p);
                        }
                        slot += last_slot + 1;
                        last_slot = slot;

                        slots.push_back(slot);
                    }

                    if (slots.back() <= 6) {
                        // Encode as a bitmap if only slots in the range 0-6
                        // are used.
                        for (auto slot : slots) {
                            bitmap_slots_used |= 1 << slot;
                        }
                    } else {
                        string enc;
                        pack_uint(enc, last_slot);
                        if (slots.size() > 1) {
                            // Several slots: interpolatively encode the full
                            // list after the packed last slot.
                            BitWriter slots_used(enc);
                            slots_used.encode(first_slot, last_slot);
                            slots_used.encode(slots.size() - 2,
                                              last_slot - first_slot);
                            slots_used.encode_interpolative(slots, 0,
                                                            slots.size() - 1);
                            encoded_slots_used = slots_used.freeze();
                        } else {
                            encoded_slots_used = std::move(enc);
                        }
                    }
                }

                const char* pos = tag.data();
                const char* end = pos + tag.size();

                string newtag;
                if (encoded_slots_used.empty()) {
                    // Bitmap form (0 if no value slots are used at all).
                    newtag += char(bitmap_slots_used);
                } else {
                    auto size = encoded_slots_used.size();
                    if (size < 0x80) {
                        // Top bit set distinguishes a length byte from the
                        // bitmap form (whose bits 0-6 only are used).
                        newtag += char(0x80 | size);
                    } else {
                        // Long form: 0x80 marker then packed length.
                        newtag += '\x80';
                        pack_uint(newtag, size);
                    }
                    newtag += encoded_slots_used;
                }

                if (pos != end) {
                    Xapian::termcount doclen;
                    if (!unpack_uint(&pos, end, &doclen)) {
                        throw_database_corrupt("doclen", pos);
                    }
                    Xapian::termcount termlist_size;
                    if (!unpack_uint(&pos, end, &termlist_size)) {
                        throw_database_corrupt("termlist length", pos);
                    }
                    pack_uint(newtag, termlist_size - 1);
                    pack_uint(newtag, doclen);

                    // Re-encode the prefix-compressed (term, wdf) entries.
                    string current_term;
                    while (pos != end) {
                        Xapian::termcount current_wdf = 0;

                        if (!current_term.empty()) {
                            size_t reuse = static_cast<unsigned char>(*pos++);
                            newtag += char(reuse);

                            if (reuse > current_term.size()) {
                                // Reuse values beyond the previous term's
                                // length also carry the wdf, folded in as
                                // reuse = wdf * (len + 1) + actual_reuse.
                                current_wdf = reuse / (current_term.size() + 1);
                                reuse = reuse % (current_term.size() + 1);
                            }
                            current_term.resize(reuse);

                            if (pos == end)
                                throw_database_corrupt("term", NULL);
                        }

                        size_t append = static_cast<unsigned char>(*pos++);
                        if (size_t(end - pos) < append)
                            throw_database_corrupt("term", NULL);

                        current_term.append(pos, append);
                        pos += append;

                        if (current_wdf) {
                            // wdf was folded into the reuse byte above
                            // (stored offset by one, hence the decrement).
                            --current_wdf;
                        } else {
                            if (!unpack_uint(&pos, end, &current_wdf)) {
                                throw_database_corrupt("wdf", pos);
                            }
                            pack_uint(newtag, current_wdf);
                        }

                        newtag += char(append);
                        newtag.append(current_term.end() - append,
                                      current_term.end());
                    }
                }
                if (!newtag.empty())
                    out->add(key, newtag);
                if (!next_result) break;
                // The cursor was already advanced when we peeked above, so
                // re-enter the loop body without calling next() again.
                if (next_already_done) goto next_without_next;
            } else {
                bool compressed = cur.read_tag(true);
                out->add(key, cur.current_tag, compressed);
            }
        }
    }
}
#endif
1918 using namespace HoneyCompact;
1920 void
1921 HoneyDatabase::compact(Xapian::Compactor* compactor,
1922 const char* destdir,
1923 int fd,
1924 int source_backend,
1925 const vector<const Xapian::Database::Internal*>& sources,
1926 const vector<Xapian::docid>& offset,
1927 Xapian::Compactor::compaction_level compaction,
1928 unsigned flags,
1929 Xapian::docid last_docid)
1931 // Currently unused for honey.
1932 (void)compaction;
1934 struct table_list {
1935 // The "base name" of the table.
1936 char name[9];
1937 // The type.
1938 Honey::table_type type;
1939 // Create tables after position lazily.
1940 bool lazy;
1943 static const table_list tables[] = {
1944 // name type lazy
1945 { "postlist", Honey::POSTLIST, false },
1946 { "docdata", Honey::DOCDATA, true },
1947 { "termlist", Honey::TERMLIST, false },
1948 { "position", Honey::POSITION, true },
1949 { "spelling", Honey::SPELLING, true },
1950 { "synonym", Honey::SYNONYM, true }
1952 const table_list * tables_end = tables +
1953 (sizeof(tables) / sizeof(tables[0]));
1955 const int FLAGS = Xapian::DB_DANGEROUS;
1957 bool single_file = (flags & Xapian::DBCOMPACT_SINGLE_FILE);
1958 bool multipass = (flags & Xapian::DBCOMPACT_MULTIPASS);
1959 if (single_file) {
1960 // FIXME: Support this combination - we need to put temporary files
1961 // somewhere.
1962 multipass = false;
1965 if (single_file) {
1966 for (size_t i = 0; i != sources.size(); ++i) {
1967 bool has_uncommitted_changes;
1968 if (source_backend == Xapian::DB_BACKEND_GLASS) {
1969 #ifdef XAPIAN_HAS_GLASS_BACKEND
1970 auto db = static_cast<const GlassDatabase*>(sources[i]);
1971 has_uncommitted_changes = db->has_uncommitted_changes();
1972 #else
1973 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1974 #endif
1975 } else {
1976 auto db = static_cast<const HoneyDatabase*>(sources[i]);
1977 has_uncommitted_changes = db->has_uncommitted_changes();
1979 if (has_uncommitted_changes) {
1980 const char * m =
1981 "Can't compact from a WritableDatabase with uncommitted "
1982 "changes - either call commit() first, or create a new "
1983 "Database object from the filename on disk";
1984 throw Xapian::InvalidOperationError(m);
1989 FlintLock lock(destdir ? destdir : "");
1990 if (!single_file) {
1991 string explanation;
1992 FlintLock::reason why = lock.lock(true, false, explanation);
1993 if (why != FlintLock::SUCCESS) {
1994 lock.throw_databaselockerror(why, destdir, explanation);
1998 unique_ptr<HoneyVersion> version_file_out;
1999 if (single_file) {
2000 if (destdir) {
2001 fd = open(destdir, O_RDWR|O_CREAT|O_TRUNC|O_BINARY|O_CLOEXEC, 0666);
2002 if (fd < 0) {
2003 throw Xapian::DatabaseCreateError("open() failed", errno);
2006 version_file_out.reset(new HoneyVersion(fd));
2007 } else {
2008 fd = -1;
2009 version_file_out.reset(new HoneyVersion(destdir));
2012 // Set to true if stat() failed (which can happen if the files are > 2GB
2013 // and off_t is 32 bit) or one of the totals overflowed.
2014 bool bad_totals = false;
2015 off_t in_total = 0, out_total = 0;
2017 version_file_out->create();
2018 for (size_t i = 0; i != sources.size(); ++i) {
2019 bool source_single_file = false;
2020 if (source_backend == Xapian::DB_BACKEND_GLASS) {
2021 #ifdef XAPIAN_HAS_GLASS_BACKEND
2022 auto db = static_cast<const GlassDatabase*>(sources[i]);
2023 auto& v_in = db->version_file;
2024 auto& v_out = version_file_out;
2025 v_out->merge_stats(v_in.get_doccount(),
2026 v_in.get_doclength_lower_bound(),
2027 v_in.get_doclength_upper_bound(),
2028 v_in.get_wdf_upper_bound(),
2029 v_in.get_total_doclen(),
2030 v_in.get_spelling_wordfreq_upper_bound());
2031 source_single_file = db->single_file();
2032 #else
2033 Assert(false);
2034 #endif
2035 } else {
2036 auto db = static_cast<const HoneyDatabase*>(sources[i]);
2037 version_file_out->merge_stats(db->version_file);
2038 source_single_file = db->single_file();
2040 if (source_single_file) {
2041 // Add single file input DB sizes to in_total here. For other
2042 // databases, we sum per-table below.
2043 string path;
2044 sources[i]->get_backend_info(&path);
2045 off_t db_size = file_size(path);
2046 if (errno == 0) {
2047 // FIXME: check overflow and set bad_totals
2048 in_total += db_size;
2049 } else {
2050 bad_totals = true;
2055 string fl_serialised;
2056 #if 0
2057 if (single_file) {
2058 HoneyFreeList fl;
2059 fl.set_first_unused_block(1); // FIXME: Assumption?
2060 fl.pack(fl_serialised);
2062 #endif
2064 // FIXME: sort out indentation.
2065 if (source_backend == Xapian::DB_BACKEND_GLASS) {
2066 #ifndef XAPIAN_HAS_GLASS_BACKEND
2067 throw Xapian::FeatureUnavailableError("Glass backend disabled");
2068 #else
2069 vector<HoneyTable *> tabs;
2070 tabs.reserve(tables_end - tables);
2071 off_t prev_size = 0;
2072 for (const table_list * t = tables; t < tables_end; ++t) {
2073 // The postlist table requires an N-way merge, adjusting the
2074 // headers of various blocks. The spelling and synonym tables also
2075 // need special handling. The other tables have keys sorted in
2076 // docid order, so we can merge them by simply copying all the keys
2077 // from each source table in turn.
2078 if (compactor)
2079 compactor->set_status(t->name, string());
2081 string dest;
2082 if (!single_file) {
2083 dest = destdir;
2084 dest += '/';
2085 dest += t->name;
2086 dest += '.';
2089 bool output_will_exist = !t->lazy;
2091 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2092 // on certain systems).
2093 bool bad_stat = false;
2095 // We can't currently report input sizes if there's a single file DB
2096 // amongst the inputs.
2097 bool single_file_in = false;
2099 off_t in_size = 0;
2101 vector<const GlassTable*> inputs;
2102 inputs.reserve(sources.size());
2103 size_t inputs_present = 0;
2104 for (auto src : sources) {
2105 auto db = static_cast<const GlassDatabase*>(src);
2106 const GlassTable * table;
2107 switch (t->type) {
2108 case Honey::POSTLIST:
2109 table = &(db->postlist_table);
2110 break;
2111 case Honey::DOCDATA:
2112 table = &(db->docdata_table);
2113 break;
2114 case Honey::TERMLIST:
2115 table = &(db->termlist_table);
2116 break;
2117 case Honey::POSITION:
2118 table = &(db->position_table);
2119 break;
2120 case Honey::SPELLING:
2121 table = &(db->spelling_table);
2122 break;
2123 case Honey::SYNONYM:
2124 table = &(db->synonym_table);
2125 break;
2126 default:
2127 Assert(false);
2128 return;
2131 if (db->single_file()) {
2132 if (t->lazy && !GlassCursor(table).next()) {
2133 // Lazy table with no entries, so essentially doesn't
2134 // exist.
2135 } else {
2136 // FIXME: Find actual size somehow?
2137 // in_size += table->size() / 1024;
2138 single_file_in = true;
2139 output_will_exist = true;
2140 ++inputs_present;
2142 } else {
2143 off_t db_size = file_size(table->get_path());
2144 if (errno == 0) {
2145 // FIXME: check overflow and set bad_totals
2146 in_total += db_size;
2147 in_size += db_size / 1024;
2148 output_will_exist = true;
2149 ++inputs_present;
2150 } else if (errno != ENOENT) {
2151 // We get ENOENT for an optional table.
2152 bad_totals = bad_stat = true;
2153 output_will_exist = true;
2154 ++inputs_present;
2157 inputs.push_back(table);
2160 // If any inputs lack a termlist table, suppress it in the output.
2161 if (t->type == Honey::TERMLIST && inputs_present != sources.size()) {
2162 if (inputs_present != 0) {
2163 if (compactor) {
2164 string m = str(inputs_present);
2165 m += " of ";
2166 m += str(sources.size());
2167 m += " inputs present, so suppressing output";
2168 compactor->set_status(t->name, m);
2170 continue;
2172 output_will_exist = false;
2175 if (!output_will_exist) {
2176 if (compactor)
2177 compactor->set_status(t->name, "doesn't exist");
2178 continue;
2181 HoneyTable * out;
2182 off_t table_start_offset = -1;
2183 if (single_file) {
2184 if (t == tables) {
2185 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2186 // space for version file. It's tricky to exactly know the
2187 // size of the version file beforehand.
2188 table_start_offset = HONEY_VERSION_MAX_SIZE;
2189 if (lseek(fd, table_start_offset, SEEK_SET) < 0)
2190 throw Xapian::DatabaseError("lseek() failed", errno);
2191 } else {
2192 table_start_offset = lseek(fd, 0, SEEK_CUR);
2194 out = new HoneyTable(t->name, fd, version_file_out->get_offset(),
2195 false, false);
2196 } else {
2197 out = new HoneyTable(t->name, dest, false, t->lazy);
2199 tabs.push_back(out);
2200 Honey::RootInfo * root_info = version_file_out->root_to_set(t->type);
2201 if (single_file) {
2202 root_info->set_free_list(fl_serialised);
2203 root_info->set_offset(table_start_offset);
2204 out->open(FLAGS,
2205 version_file_out->get_root(t->type),
2206 version_file_out->get_revision());
2207 } else {
2208 out->create_and_open(FLAGS, *root_info);
2211 switch (t->type) {
2212 case Honey::POSTLIST: {
2213 if (multipass && inputs.size() > 3) {
2214 multimerge_postlists(compactor, out, destdir,
2215 inputs, offset);
2216 } else {
2217 merge_postlists(compactor, out, offset.begin(),
2218 inputs.begin(), inputs.end());
2220 break;
2222 case Honey::SPELLING:
2223 merge_spellings(out, inputs.cbegin(), inputs.cend());
2224 break;
2225 case Honey::SYNONYM:
2226 merge_synonyms(out, inputs.begin(), inputs.end());
2227 break;
2228 case Honey::POSITION:
2229 merge_positions(out, inputs, offset);
2230 break;
2231 default:
2232 // DocData, Termlist
2233 merge_docid_keyed(out, inputs, offset, t->type);
2234 break;
2237 // Commit as revision 1.
2238 out->flush_db();
2239 out->commit(1, root_info);
2240 out->sync();
2241 if (single_file) fl_serialised = root_info->get_free_list();
2243 off_t out_size = 0;
2244 if (!bad_stat && !single_file_in) {
2245 off_t db_size;
2246 if (single_file) {
2247 db_size = file_size(fd);
2248 } else {
2249 db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2251 if (errno == 0) {
2252 if (single_file) {
2253 off_t old_prev_size = prev_size;
2254 prev_size = db_size;
2255 db_size -= old_prev_size;
2257 // FIXME: check overflow and set bad_totals
2258 out_total += db_size;
2259 out_size = db_size / 1024;
2260 } else if (errno != ENOENT) {
2261 bad_totals = bad_stat = true;
2264 if (bad_stat) {
2265 if (compactor)
2266 compactor->set_status(t->name,
2267 "Done (couldn't stat all the DB files)");
2268 } else if (single_file_in) {
2269 if (compactor)
2270 compactor->set_status(t->name,
2271 "Done (table sizes unknown for single "
2272 "file DB input)");
2273 } else {
2274 string status;
2275 if (out_size == in_size) {
2276 status = "Size unchanged (";
2277 } else {
2278 off_t delta;
2279 if (out_size < in_size) {
2280 delta = in_size - out_size;
2281 status = "Reduced by ";
2282 } else {
2283 delta = out_size - in_size;
2284 status = "INCREASED by ";
2286 if (in_size) {
2287 status += str(100 * delta / in_size);
2288 status += "% ";
2290 status += str(delta);
2291 status += "K (";
2292 status += str(in_size);
2293 status += "K -> ";
2295 status += str(out_size);
2296 status += "K)";
2297 if (compactor)
2298 compactor->set_status(t->name, status);
2302 // If compacting to a single file output and all the tables are empty, pad
2303 // the output so that it isn't mistaken for a stub database when we try to
2304 // open it. For this it needs to be at least HONEY_MIN_DB_SIZE in size.
2305 if (single_file && prev_size < HONEY_MIN_DB_SIZE) {
2306 out_total = HONEY_MIN_DB_SIZE;
2307 #ifdef HAVE_FTRUNCATE
2308 if (ftruncate(fd, HONEY_MIN_DB_SIZE) < 0) {
2309 throw Xapian::DatabaseError("Failed to set size of output "
2310 "database", errno);
2312 #else
2313 const off_t off = HONEY_MIN_DB_SIZE - 1;
2314 if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2315 throw Xapian::DatabaseError("Failed to set size of output "
2316 "database", errno);
2318 #endif
2321 if (single_file) {
2322 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2323 throw Xapian::DatabaseError("lseek() failed", errno);
2326 version_file_out->set_last_docid(last_docid);
2327 string tmpfile = version_file_out->write(1, FLAGS);
2328 if (single_file) {
2329 off_t version_file_size = lseek(fd, 0, SEEK_CUR);
2330 if (version_file_size < 0) {
2331 throw Xapian::DatabaseError("lseek() failed", errno);
2333 if (version_file_size > HONEY_VERSION_MAX_SIZE) {
2334 throw Xapian::DatabaseError("Didn't allow enough space for "
2335 "version file data");
2338 for (unsigned j = 0; j != tabs.size(); ++j) {
2339 tabs[j]->sync();
2341 // Commit with revision 1.
2342 version_file_out->sync(tmpfile, 1, FLAGS);
2343 for (unsigned j = 0; j != tabs.size(); ++j) {
2344 delete tabs[j];
2346 #endif
2347 } else {
2348 vector<HoneyTable *> tabs;
2349 tabs.reserve(tables_end - tables);
2350 off_t prev_size = HONEY_MIN_DB_SIZE;
2351 for (const table_list * t = tables; t < tables_end; ++t) {
2352 // The postlist table requires an N-way merge, adjusting the
2353 // headers of various blocks. The spelling and synonym tables also
2354 // need special handling. The other tables have keys sorted in
2355 // docid order, so we can merge them by simply copying all the keys
2356 // from each source table in turn.
2357 if (compactor)
2358 compactor->set_status(t->name, string());
2360 string dest;
2361 if (!single_file) {
2362 dest = destdir;
2363 dest += '/';
2364 dest += t->name;
2365 dest += '.';
2368 bool output_will_exist = !t->lazy;
2370 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2371 // on certain systems).
2372 bool bad_stat = false;
2374 // We can't currently report input sizes if there's a single file DB
2375 // amongst the inputs.
2376 bool single_file_in = false;
2378 off_t in_size = 0;
2380 vector<const HoneyTable*> inputs;
2381 inputs.reserve(sources.size());
2382 size_t inputs_present = 0;
2383 for (auto src : sources) {
2384 auto db = static_cast<const HoneyDatabase*>(src);
2385 const HoneyTable * table;
2386 switch (t->type) {
2387 case Honey::POSTLIST:
2388 table = &(db->postlist_table);
2389 break;
2390 case Honey::DOCDATA:
2391 table = &(db->docdata_table);
2392 break;
2393 case Honey::TERMLIST:
2394 table = &(db->termlist_table);
2395 break;
2396 case Honey::POSITION:
2397 table = &(db->position_table);
2398 break;
2399 case Honey::SPELLING:
2400 table = &(db->spelling_table);
2401 break;
2402 case Honey::SYNONYM:
2403 table = &(db->synonym_table);
2404 break;
2405 default:
2406 Assert(false);
2407 return;
2410 if (db->single_file()) {
2411 if (t->lazy && !HoneyCursor(table).next()) {
2412 // Lazy table with no entries, so essentially doesn't
2413 // exist.
2414 } else {
2415 // FIXME: Find actual size somehow?
2416 // in_size += table->size() / 1024;
2417 single_file_in = true;
2418 output_will_exist = true;
2419 ++inputs_present;
2421 } else {
2422 off_t db_size = file_size(table->get_path());
2423 if (errno == 0) {
2424 // FIXME: check overflow and set bad_totals
2425 in_total += db_size;
2426 in_size += db_size / 1024;
2427 output_will_exist = true;
2428 ++inputs_present;
2429 } else if (errno != ENOENT) {
2430 // We get ENOENT for an optional table.
2431 bad_totals = bad_stat = true;
2432 output_will_exist = true;
2433 ++inputs_present;
2436 inputs.push_back(table);
2439 // If any inputs lack a termlist table, suppress it in the output.
2440 if (t->type == Honey::TERMLIST && inputs_present != sources.size()) {
2441 if (inputs_present != 0) {
2442 if (compactor) {
2443 string m = str(inputs_present);
2444 m += " of ";
2445 m += str(sources.size());
2446 m += " inputs present, so suppressing output";
2447 compactor->set_status(t->name, m);
2449 continue;
2451 output_will_exist = false;
2454 if (!output_will_exist) {
2455 if (compactor)
2456 compactor->set_status(t->name, "doesn't exist");
2457 continue;
2460 HoneyTable * out;
2461 off_t table_start_offset = -1;
2462 if (single_file) {
2463 if (t == tables) {
2464 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2465 // space for version file. It's tricky to exactly know the
2466 // size of the version file beforehand.
2467 table_start_offset = HONEY_VERSION_MAX_SIZE;
2468 if (lseek(fd, table_start_offset, SEEK_SET) < 0)
2469 throw Xapian::DatabaseError("lseek() failed", errno);
2470 } else {
2471 table_start_offset = lseek(fd, 0, SEEK_CUR);
2473 out = new HoneyTable(t->name, fd, version_file_out->get_offset(),
2474 false, false);
2475 } else {
2476 out = new HoneyTable(t->name, dest, false, t->lazy);
2478 tabs.push_back(out);
2479 Honey::RootInfo * root_info = version_file_out->root_to_set(t->type);
2480 if (single_file) {
2481 root_info->set_free_list(fl_serialised);
2482 root_info->set_offset(table_start_offset);
2483 out->open(FLAGS,
2484 version_file_out->get_root(t->type),
2485 version_file_out->get_revision());
2486 } else {
2487 out->create_and_open(FLAGS, *root_info);
2490 switch (t->type) {
2491 case Honey::POSTLIST: {
2492 if (multipass && inputs.size() > 3) {
2493 multimerge_postlists(compactor, out, destdir,
2494 inputs, offset);
2495 } else {
2496 merge_postlists(compactor, out, offset.begin(),
2497 inputs.begin(), inputs.end());
2499 break;
2501 case Honey::SPELLING:
2502 merge_spellings(out, inputs.begin(), inputs.end());
2503 break;
2504 case Honey::SYNONYM:
2505 merge_synonyms(out, inputs.begin(), inputs.end());
2506 break;
2507 case Honey::POSITION:
2508 merge_positions(out, inputs, offset);
2509 break;
2510 default:
2511 // DocData, Termlist
2512 merge_docid_keyed(out, inputs, offset);
2513 break;
2516 // Commit as revision 1.
2517 out->flush_db();
2518 out->commit(1, root_info);
2519 out->sync();
2520 if (single_file) fl_serialised = root_info->get_free_list();
2522 off_t out_size = 0;
2523 if (!bad_stat && !single_file_in) {
2524 off_t db_size;
2525 if (single_file) {
2526 db_size = file_size(fd);
2527 } else {
2528 db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2530 if (errno == 0) {
2531 if (single_file) {
2532 off_t old_prev_size = prev_size;
2533 prev_size = db_size;
2534 db_size -= old_prev_size;
2536 // FIXME: check overflow and set bad_totals
2537 out_total += db_size;
2538 out_size = db_size / 1024;
2539 } else if (errno != ENOENT) {
2540 bad_totals = bad_stat = true;
2543 if (bad_stat) {
2544 if (compactor)
2545 compactor->set_status(t->name,
2546 "Done (couldn't stat all the DB files)");
2547 } else if (single_file_in) {
2548 if (compactor)
2549 compactor->set_status(t->name,
2550 "Done (table sizes unknown for single "
2551 "file DB input)");
2552 } else {
2553 string status;
2554 if (out_size == in_size) {
2555 status = "Size unchanged (";
2556 } else {
2557 off_t delta;
2558 if (out_size < in_size) {
2559 delta = in_size - out_size;
2560 status = "Reduced by ";
2561 } else {
2562 delta = out_size - in_size;
2563 status = "INCREASED by ";
2565 if (in_size) {
2566 status += str(100 * delta / in_size);
2567 status += "% ";
2569 status += str(delta);
2570 status += "K (";
2571 status += str(in_size);
2572 status += "K -> ";
2574 status += str(out_size);
2575 status += "K)";
2576 if (compactor)
2577 compactor->set_status(t->name, status);
2581 // If compacting to a single file output and all the tables are empty, pad
2582 // the output so that it isn't mistaken for a stub database when we try to
2583 // open it. For this it needs to be at least HONEY_MIN_DB_SIZE in size.
2584 if (single_file && prev_size < HONEY_MIN_DB_SIZE) {
2585 out_total = HONEY_MIN_DB_SIZE;
2586 #ifdef HAVE_FTRUNCATE
2587 if (ftruncate(fd, HONEY_MIN_DB_SIZE) < 0) {
2588 throw Xapian::DatabaseError("Failed to set size of output "
2589 "database", errno);
2591 #else
2592 const off_t off = HONEY_MIN_DB_SIZE - 1;
2593 if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2594 throw Xapian::DatabaseError("Failed to set size of output "
2595 "database", errno);
2597 #endif
2600 if (single_file) {
2601 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2602 throw Xapian::DatabaseError("lseek() failed", errno);
2605 version_file_out->set_last_docid(last_docid);
2606 string tmpfile = version_file_out->write(1, FLAGS);
2607 for (unsigned j = 0; j != tabs.size(); ++j) {
2608 tabs[j]->sync();
2610 // Commit with revision 1.
2611 version_file_out->sync(tmpfile, 1, FLAGS);
2612 for (unsigned j = 0; j != tabs.size(); ++j) {
2613 delete tabs[j];
2617 if (!single_file) lock.release();
2619 if (!bad_totals && compactor) {
2620 string status;
2621 in_total /= 1024;
2622 out_total /= 1024;
2623 if (out_total == in_total) {
2624 status = "Size unchanged (";
2625 } else {
2626 off_t delta;
2627 if (out_total < in_total) {
2628 delta = in_total - out_total;
2629 status = "Reduced by ";
2630 } else {
2631 delta = out_total - in_total;
2632 status = "INCREASED by ";
2634 if (in_total) {
2635 status += str(100 * delta / in_total);
2636 status += "% ";
2638 status += str(delta);
2639 status += "K (";
2640 status += str(in_total);
2641 status += "K -> ";
2643 status += str(out_total);
2644 status += "K)";
2645 compactor->set_status("Total", status);