1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
32 #include <type_traits>
37 #include "backends/flint_lock.h"
38 #include "compression_stream.h"
39 #include "honey_cursor.h"
40 #include "honey_database.h"
41 #include "honey_defs.h"
42 #include "honey_postlist_encodings.h"
43 #include "honey_table.h"
44 #include "honey_values.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifdef XAPIAN_HAS_GLASS_BACKEND
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
62 using Honey::encode_valuestats
;
66 throw_database_corrupt(const char* item
, const char* pos
)
70 message
= "Value overflow unpacking termlist: ";
72 message
= "Out of data unpacking termlist: ";
75 throw Xapian::DatabaseCorruptError(message
);
78 #ifdef XAPIAN_HAS_GLASS_BACKEND
79 namespace GlassCompact
{
/** Return true if @a key is a glass user-metadata key.
 *
 *  Glass stores user metadata under keys beginning with the two bytes
 *  0x00 0xc0.
 */
static inline bool
is_user_metadata_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/** Return true if @a key is a glass value-statistics key.
 *
 *  Glass stores value statistics under keys beginning with the two bytes
 *  0x00 0xd0.
 */
static inline bool
is_valuestats_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/** Return true if @a key is a glass value-chunk key.
 *
 *  Glass stores valuestream chunks under keys beginning with the two bytes
 *  0x00 0xd8.
 */
static inline bool
is_valuechunk_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/** Return true if @a key is a glass document-length chunk key.
 *
 *  Glass stores document-length chunks under keys beginning with the two
 *  bytes 0x00 0xe0.
 */
static inline bool
is_doclenchunk_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
108 termlist_key_is_values_used(const string
& key
)
110 const char* p
= key
.data();
111 const char* e
= p
+ key
.size();
113 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
116 if (e
- p
== 1 && *p
== '\0')
119 throw Xapian::DatabaseCorruptError("termlist key format");
123 // Put all the helpers in a namespace to avoid symbols colliding with those of
124 // the same name in other flint-derived backends.
125 namespace HoneyCompact
{
127 /// Return a Honey::KEY_* constant, or a different value for an invalid key.
129 key_type(const string
& key
)
132 return Honey::KEY_POSTING_CHUNK
;
137 unsigned char ch
= key
[1];
138 if (ch
>= Honey::KEY_VALUE_STATS
&& ch
<= Honey::KEY_VALUE_STATS_HI
)
139 return Honey::KEY_VALUE_STATS
;
140 if (ch
>= Honey::KEY_VALUE_CHUNK
&& ch
<= Honey::KEY_VALUE_CHUNK_HI
)
141 return Honey::KEY_VALUE_CHUNK
;
142 if (ch
>= Honey::KEY_DOCLEN_CHUNK
&& ch
<= Honey::KEY_DOCLEN_CHUNK_HI
)
143 return Honey::KEY_DOCLEN_CHUNK
;
145 // Handle Honey::KEY_USER_METADATA, Honey::KEY_POSTING_CHUNK, and currently
150 template<typename T
> class PostlistCursor
;
152 #ifdef XAPIAN_HAS_GLASS_BACKEND
153 // Convert glass to honey.
155 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
156 Xapian::docid offset
;
160 Xapian::docid firstdid
;
161 Xapian::docid chunk_lastdid
;
162 Xapian::termcount tf
, cf
;
163 Xapian::termcount first_wdf
;
164 Xapian::termcount wdf_max
;
167 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
168 : GlassCursor(in
), offset(offset_
), firstdid(0)
174 if (!GlassCursor::next()) return false;
175 // We put all chunks into the non-initial chunk form here, then fix up
176 // the first chunk for each term in the merged database as we merge.
181 if (GlassCompact::is_user_metadata_key(key
)) {
182 key
[1] = Honey::KEY_USER_METADATA
;
185 if (GlassCompact::is_valuestats_key(key
)) {
187 const char * p
= key
.data();
188 const char * end
= p
+ key
.length();
190 Xapian::valueno slot
;
191 if (!unpack_uint_last(&p
, end
, &slot
))
192 throw Xapian::DatabaseCorruptError("bad value stats key");
193 // FIXME: pack_uint_last() is not a good encoding for keys, as the
194 // encoded values do not in general sort the same way as the
195 // numeric values. The first 256 values will be in order at least.
197 // We could just buffer up the stats for all the slots - it's
198 // unlikely there are going to be very many. Another option is
199 // to use multiple cursors or seek a single cursor around.
200 AssertRel(slot
, <=, 256);
201 key
= Honey::make_valuestats_key(slot
);
204 if (GlassCompact::is_valuechunk_key(key
)) {
205 const char * p
= key
.data();
206 const char * end
= p
+ key
.length();
208 Xapian::valueno slot
;
209 if (!unpack_uint(&p
, end
, &slot
))
210 throw Xapian::DatabaseCorruptError("bad value key");
211 // FIXME: pack_uint() is not a good encoding for keys, as the
212 // encoded values do not in general sort the same way as the
213 // numeric values. The first 128 values will be in order at least.
215 // Buffering up this data for all slots is potentially prohibitively
216 // costly. We probably need to use multiple cursors or seek a single
218 AssertRel(slot
, <=, 128);
219 Xapian::docid first_did
;
220 if (!unpack_uint_preserving_sort(&p
, end
, &first_did
))
221 throw Xapian::DatabaseCorruptError("bad value key");
224 Glass::ValueChunkReader
reader(tag
.data(), tag
.size(), first_did
);
225 Xapian::docid last_did
= first_did
;
226 while (reader
.next(), !reader
.at_end()) {
227 last_did
= reader
.get_docid();
230 key
= Honey::make_valuechunk_key(slot
, last_did
);
232 // Add the docid delta across the chunk to the start of the tag.
234 pack_uint(newtag
, last_did
- first_did
);
235 tag
.insert(0, newtag
);
240 if (GlassCompact::is_doclenchunk_key(key
)) {
241 const char * d
= key
.data();
242 const char * e
= d
+ key
.size();
246 // This is an initial chunk, so adjust tag header.
249 if (!unpack_uint(&d
, e
, &tf
) ||
250 !unpack_uint(&d
, e
, &cf
) ||
251 !unpack_uint(&d
, e
, &firstdid
)) {
252 throw Xapian::DatabaseCorruptError("Bad postlist key");
255 tag
.erase(0, d
- tag
.data());
257 // Not an initial chunk, just unpack firstdid.
258 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
259 throw Xapian::DatabaseCorruptError("Bad postlist key");
263 // Set key to placeholder value which just indicates that this is a
265 static const char doclen_key_prefix
[2] = {
266 0, char(Honey::KEY_DOCLEN_CHUNK
)
268 key
.assign(doclen_key_prefix
, 2);
273 // Convert doclen chunk to honey format.
276 // Skip the "last chunk" flag and increase_to_last.
278 throw Xapian::DatabaseCorruptError("No last chunk flag in "
279 "glass docdata chunk");
281 Xapian::docid increase_to_last
;
282 if (!unpack_uint(&d
, e
, &increase_to_last
))
283 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
284 "in glass docdata chunk");
286 Xapian::termcount doclen_max
= 0;
288 Xapian::termcount doclen
;
289 if (!unpack_uint(&d
, e
, &doclen
))
290 throw Xapian::DatabaseCorruptError("Decoding doclen in "
291 "glass docdata chunk");
292 if (doclen
> doclen_max
)
294 unsigned char buf
[4];
295 unaligned_write4(buf
, doclen
);
296 newtag
.append(reinterpret_cast<char*>(buf
), 4);
299 Xapian::docid gap_size
;
300 if (!unpack_uint(&d
, e
, &gap_size
))
301 throw Xapian::DatabaseCorruptError("Decoding docid "
304 // FIXME: Split chunk if the gap_size is at all large.
305 newtag
.append(4 * gap_size
, '\xff');
308 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
309 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
311 AssertEq(newtag
.size() % 4, 0);
312 chunk_lastdid
= firstdid
- 1 + newtag
.size() / 4;
314 // Only encode document lengths using a whole number of bytes for
315 // now. We could allow arbitrary bit widths, but it complicates
316 // encoding and decoding so we should consider if the fairly small
317 // additional saving is worth it.
318 if (doclen_max
>= 0xffff) {
319 if (doclen_max
>= 0xffffff) {
320 newtag
.insert(0, 1, char(32));
322 } else if (doclen_max
>= 0xffffffff) {
323 // FIXME: Handle these.
324 const char* m
= "Document length values >= 0xffffffff not "
326 throw Xapian::FeatureUnavailableError(m
);
328 tag
.assign(1, char(24));
329 for (size_t i
= 1; i
< newtag
.size(); i
+= 4)
330 tag
.append(newtag
, i
, 3);
333 if (doclen_max
>= 0xff) {
334 tag
.assign(1, char(16));
335 for (size_t i
= 2; i
< newtag
.size(); i
+= 4)
336 tag
.append(newtag
, i
, 2);
338 tag
.assign(1, char(8));
339 for (size_t i
= 3; i
< newtag
.size(); i
+= 4)
340 tag
.append(newtag
, i
, 1);
347 // Adjust key if this is *NOT* an initial chunk.
348 // key is: pack_string_preserving_sort(key, tname)
349 // plus optionally: pack_uint_preserving_sort(key, did)
350 const char * d
= key
.data();
351 const char * e
= d
+ key
.size();
353 if (!unpack_string_preserving_sort(&d
, e
, tname
))
354 throw Xapian::DatabaseCorruptError("Bad postlist key");
357 // This is an initial chunk for a term, so adjust tag header.
360 if (!unpack_uint(&d
, e
, &tf
) ||
361 !unpack_uint(&d
, e
, &cf
) ||
362 !unpack_uint(&d
, e
, &firstdid
)) {
363 throw Xapian::DatabaseCorruptError("Bad postlist key");
366 have_wdfs
= (cf
!= 0);
367 tag
.erase(0, d
- tag
.data());
370 // Not an initial chunk, so adjust key.
371 size_t tmp
= d
- key
.data();
372 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
373 throw Xapian::DatabaseCorruptError("Bad postlist key");
381 // Convert posting chunk to honey format, but without any header.
384 // Skip the "last chunk" flag; decode increase_to_last.
386 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
389 Xapian::docid increase_to_last
;
390 if (!unpack_uint(&d
, e
, &increase_to_last
))
391 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
392 "glass posting chunk");
393 chunk_lastdid
= firstdid
+ increase_to_last
;
394 if (!unpack_uint(&d
, e
, &first_wdf
))
395 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
397 if ((first_wdf
!= 0) != have_wdfs
) {
398 // FIXME: Also need to adjust document length, total document length.
399 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
402 throw Xapian::DatabaseError("Honey does not support a term having "
403 "both zero and non-zero wdf");
405 wdf_max
= max(wdf_max
, first_wdf
);
409 if (!unpack_uint(&d
, e
, &delta
))
410 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
411 "glass posting chunk");
412 pack_uint(newtag
, delta
);
413 Xapian::termcount wdf
;
414 if (!unpack_uint(&d
, e
, &wdf
))
415 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
417 if ((wdf
!= 0) != have_wdfs
) {
418 // FIXME: Also need to adjust document length, total document length.
419 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
422 throw Xapian::DatabaseError("Honey does not support a term "
423 "having both zero and non-zero "
427 pack_uint(newtag
, wdf
);
428 wdf_max
= max(wdf_max
, wdf
);
440 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
441 Xapian::docid offset
;
445 Xapian::docid firstdid
;
446 Xapian::docid chunk_lastdid
;
447 Xapian::termcount tf
, cf
;
448 Xapian::termcount first_wdf
;
449 Xapian::termcount wdf_max
;
452 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
453 : HoneyCursor(in
), offset(offset_
), firstdid(0)
459 if (!HoneyCursor::next()) return false;
460 // We put all chunks into the non-initial chunk form here, then fix up
461 // the first chunk for each term in the merged database as we merge.
466 switch (key_type(key
)) {
467 case Honey::KEY_USER_METADATA
:
468 case Honey::KEY_VALUE_STATS
:
470 case Honey::KEY_VALUE_CHUNK
: {
471 const char * p
= key
.data();
472 const char * end
= p
+ key
.length();
474 Xapian::valueno slot
;
475 if (p
[-1] != char(Honey::KEY_VALUE_CHUNK_HI
)) {
476 slot
= p
[-1] - Honey::KEY_VALUE_CHUNK
;
478 if (!unpack_uint_preserving_sort(&p
, end
, &slot
))
479 throw Xapian::DatabaseCorruptError("bad value key");
482 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
483 throw Xapian::DatabaseCorruptError("bad value key");
484 key
= Honey::make_valuechunk_key(slot
, did
+ offset
);
487 case Honey::KEY_DOCLEN_CHUNK
: {
488 Xapian::docid did
= Honey::docid_from_key(key
);
490 throw Xapian::DatabaseCorruptError("Bad doclen key");
491 chunk_lastdid
= did
+ offset
;
495 case Honey::KEY_POSTING_CHUNK
:
498 throw Xapian::DatabaseCorruptError("Bad postlist table key "
502 // Adjust key if this is *NOT* an initial chunk.
503 // key is: pack_string_preserving_sort(key, term)
504 // plus optionally: pack_uint_preserving_sort(key, did)
505 const char * d
= key
.data();
506 const char * e
= d
+ key
.size();
508 if (!unpack_string_preserving_sort(&d
, e
, term
))
509 throw Xapian::DatabaseCorruptError("Bad postlist key");
512 // This is an initial chunk for a term, so remove tag header.
516 Xapian::docid lastdid
;
517 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
518 firstdid
, lastdid
, chunk_lastdid
,
519 first_wdf
, wdf_max
)) {
520 throw Xapian::DatabaseCorruptError("Bad postlist initial "
523 // Ignore lastdid - we'll need to recalculate it (at least when
524 // merging, and for simplicity we always do).
526 tag
.erase(0, d
- tag
.data());
528 have_wdfs
= (cf
!= 0) && (cf
!= tf
- 1 + first_wdf
);
529 if (have_wdfs
&& tf
> 2) {
530 Xapian::termcount remaining_cf_for_flat_wdf
=
532 // Check this matches and that it isn't a false match due
533 // to overflow of the multiplication above.
534 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
535 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
536 // The wdf is flat so we don't need to store it.
542 // The cf we report should only be non-zero for initial chunks
543 // (of which there may be several for the same term from
544 // different instances of this class when merging databases)
545 // and gets summed to give the total cf for the term, so we
546 // need to zero it here, after potentially using it to
547 // calculate first_wdf.
549 // If cf is zero for a term, first_wdf will be zero for all
550 // chunks so doesn't need setting specially here.
552 first_wdf
= (cf
- first_wdf
) / (tf
- 1);
556 // Not an initial chunk, so adjust key and remove tag header.
557 size_t tmp
= d
- key
.data();
558 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
560 throw Xapian::DatabaseCorruptError("Bad postlist key");
562 // -1 to remove the terminating zero byte too.
569 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
571 throw Xapian::DatabaseCorruptError("Bad postlist delta "
575 if (!decode_delta_chunk_header_no_wdf(&d
, e
, chunk_lastdid
,
577 throw Xapian::DatabaseCorruptError("Bad postlist delta "
581 tag
.erase(0, d
- tag
.data());
584 chunk_lastdid
+= offset
;
590 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
592 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
593 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
/// Compare postlist cursors for use in a priority_queue.
template<typename T>
class PostlistCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key.
     *
     *  Ties on the key are broken by comparing firstdid, so with this as
     *  the comparator of a std::priority_queue (a max-heap) the cursor with
     *  the smallest (key, firstdid) pair is at the top.
     */
    bool operator()(const T* a, const T* b) const {
        if (a->key > b->key) return true;
        if (a->key != b->key) return false;
        return (a->firstdid > b->firstdid);
    }
};
608 // U : vector<HoneyTable*>::const_iterator
609 template<typename T
, typename U
> void
610 merge_postlists(Xapian::Compactor
* compactor
,
611 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
614 typedef decltype(**b
) table_type
; // E.g. HoneyTable
615 typedef PostlistCursor
<table_type
> cursor_type
;
616 typedef PostlistCursorGt
<cursor_type
> gt_type
;
617 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
618 for ( ; b
!= e
; ++b
, ++offset
) {
620 auto cursor
= new cursor_type(in
, *offset
);
621 if (cursor
->next()) {
624 // Skip empty tables.
631 // Merge user metadata.
633 while (!pq
.empty()) {
634 cursor_type
* cur
= pq
.top();
635 const string
& key
= cur
->key
;
636 if (key_type(key
) != Honey::KEY_USER_METADATA
) break;
638 if (key
!= last_key
) {
640 if (tags
.size() > 1 && compactor
) {
641 Assert(!last_key
.empty());
642 // FIXME: It would be better to merge all duplicates
643 // for a key in one call, but currently we don't in
645 const string
& resolved_tag
=
646 compactor
->resolve_duplicate_metadata(last_key
,
649 if (!resolved_tag
.empty())
650 out
->add(last_key
, resolved_tag
);
652 Assert(!last_key
.empty());
653 out
->add(last_key
, tags
[0]);
659 tags
.push_back(cur
->tag
);
669 if (tags
.size() > 1 && compactor
) {
670 Assert(!last_key
.empty());
671 const string
& resolved_tag
=
672 compactor
->resolve_duplicate_metadata(last_key
,
675 if (!resolved_tag
.empty())
676 out
->add(last_key
, resolved_tag
);
678 Assert(!last_key
.empty());
679 out
->add(last_key
, tags
[0]);
686 Xapian::doccount freq
= 0;
687 string lbound
, ubound
;
689 while (!pq
.empty()) {
690 cursor_type
* cur
= pq
.top();
691 const string
& key
= cur
->key
;
692 if (key_type(key
) != Honey::KEY_VALUE_STATS
) break;
693 if (key
!= last_key
) {
694 // For the first valuestats key, last_key will be the previous
695 // key we wrote, which we don't want to overwrite. This is the
696 // only time that freq will be 0, so check that.
698 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
704 const string
& tag
= cur
->tag
;
706 const char * pos
= tag
.data();
707 const char * end
= pos
+ tag
.size();
711 if (!unpack_uint(&pos
, end
, &f
)) {
713 throw Xapian::DatabaseCorruptError("Incomplete stats item "
715 throw Xapian::RangeError("Frequency statistic in value table "
718 if (!unpack_string(&pos
, end
, l
)) {
720 throw Xapian::DatabaseCorruptError("Incomplete stats item "
722 throw Xapian::RangeError("Lower bound in value table is too "
725 size_t len
= end
- pos
;
737 if (l
< lbound
) lbound
= l
;
738 if (u
> ubound
) ubound
= u
;
750 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
754 // Merge valuestream chunks.
755 while (!pq
.empty()) {
756 cursor_type
* cur
= pq
.top();
757 const string
& key
= cur
->key
;
758 if (key_type(key
) != Honey::KEY_VALUE_CHUNK
) break;
759 out
->add(key
, cur
->tag
);
768 // Merge doclen chunks.
769 while (!pq
.empty()) {
770 cursor_type
* cur
= pq
.top();
771 if (key_type(cur
->key
) != Honey::KEY_DOCLEN_CHUNK
) break;
772 string tag
= std::move(cur
->tag
);
773 auto chunk_lastdid
= cur
->chunk_lastdid
;
780 while (!pq
.empty()) {
782 if (key_type(cur
->key
) != Honey::KEY_DOCLEN_CHUNK
)
784 if (tag
[0] != cur
->tag
[0]) {
785 // Different width values in the two tags, so punt for now.
786 // FIXME: We would ideally optimise the total size here.
789 size_t byte_width
= tag
[0] / 8;
790 auto new_size
= tag
.size();
791 Xapian::docid gap_size
= cur
->firstdid
- chunk_lastdid
- 1;
792 new_size
+= gap_size
* byte_width
;
793 if (new_size
>= HONEY_DOCLEN_CHUNK_MAX
) {
794 // The gap spans beyond HONEY_DOCLEN_CHUNK_MAX.
797 new_size
+= cur
->tag
.size() - 1;
798 auto full_new_size
= new_size
;
799 if (new_size
> HONEY_DOCLEN_CHUNK_MAX
) {
800 if (byte_width
> 1) {
801 // HONEY_DOCLEN_CHUNK_MAX should be one more than a
802 // multiple of 12 so for widths 1,2,3,4 we can fix the
803 // initial byte which indicates the width for the chunk
804 // plus an exact number of entries.
805 auto m
= (new_size
- HONEY_DOCLEN_CHUNK_MAX
) % byte_width
;
809 new_size
= HONEY_DOCLEN_CHUNK_MAX
;
811 tag
.reserve(new_size
);
812 tag
.append(byte_width
* gap_size
, '\xff');
813 if (new_size
!= full_new_size
) {
815 auto copy_size
= new_size
- tag
.size();
816 tag
.append(cur
->tag
, 1, copy_size
);
817 cur
->tag
.erase(1, copy_size
);
818 copy_size
/= byte_width
;
819 cur
->firstdid
+= copy_size
;
820 chunk_lastdid
+= gap_size
;
821 chunk_lastdid
+= copy_size
;
825 tag
.append(cur
->tag
, 1, string::npos
);
826 chunk_lastdid
= cur
->chunk_lastdid
;
835 out
->add(Honey::make_doclenchunk_key(chunk_lastdid
), tag
);
838 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
840 struct HoneyPostListChunk
{
841 Xapian::docid first
, last
;
842 Xapian::termcount first_wdf
;
843 Xapian::termcount wdf_max
;
845 Xapian::termcount cf
;
849 HoneyPostListChunk(Xapian::docid first_
,
851 Xapian::termcount first_wdf_
,
852 Xapian::termcount wdf_max_
,
853 Xapian::doccount tf_
,
854 Xapian::termcount cf_
,
859 first_wdf(first_wdf_
),
863 have_wdfs(have_wdfs_
),
866 void append_postings_to(string
& tag
, bool want_wdfs
) {
867 if (want_wdfs
&& !have_wdfs
) {
868 // Need to add wdfs which were implicit.
869 auto wdf
= (cf
- first_wdf
) / (tf
- 1);
870 const char* pos
= data
.data();
871 const char* pos_end
= pos
+ data
.size();
872 while (pos
!= pos_end
) {
874 if (!unpack_uint(&pos
, pos_end
, &delta
))
875 throw_database_corrupt("Decoding docid delta", pos
);
876 pack_uint(tag
, delta
);
879 } else if (have_wdfs
&& !want_wdfs
) {
880 // Need to remove wdfs which are now implicit.
881 const char* pos
= data
.data();
882 const char* pos_end
= pos
+ data
.size();
883 while (pos
!= pos_end
) {
885 if (!unpack_uint(&pos
, pos_end
, &delta
))
886 throw_database_corrupt("Decoding docid delta", pos
);
887 pack_uint(tag
, delta
);
888 Xapian::termcount wdf
;
889 if (!unpack_uint(&pos
, pos_end
, &wdf
))
890 throw_database_corrupt("Decoding wdf", pos
);
898 vector
<HoneyPostListChunk
> tags
;
900 cursor_type
* cur
= NULL
;
906 AssertEq(key_type(cur
->key
), Honey::KEY_POSTING_CHUNK
);
908 if (cur
== NULL
|| cur
->key
!= last_key
) {
910 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
911 Xapian::docid chunk_lastdid
= tags
[0].last
;
912 Xapian::docid last_did
= tags
.back().last
;
913 Xapian::termcount wdf_max
= tags
.back().wdf_max
;
915 bool have_wdfs
= true;
917 // wdf must always be zero.
919 } else if (tf
<= 2) {
920 // We only need to store cf and first_wdf to know all wdfs.
922 } else if (cf
== tf
- 1 + first_wdf
) {
923 // wdf must be 1 for second and subsequent entries.
926 Xapian::termcount remaining_cf_for_flat_wdf
=
928 // Check this matches and that it isn't a false match due
929 // to overflow of the multiplication above.
930 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
931 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
932 // The wdf is flat so we don't need to store it.
937 Xapian::docid splice_last
= 0;
938 if (!have_wdfs
&& tags
[0].have_wdfs
&& tf
> 2 &&
940 // Stripping wdfs will approximately halve the size so
941 // double up the chunks.
942 splice_last
= chunk_lastdid
;
943 chunk_lastdid
= tags
[1].last
;
947 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
949 first_wdf
, wdf_max
, first_tag
);
952 tags
[0].append_postings_to(first_tag
, have_wdfs
);
953 if (!have_wdfs
&& splice_last
) {
954 pack_uint(first_tag
, tags
[1].first
- splice_last
);
955 tags
[1].append_postings_to(first_tag
, have_wdfs
);
958 out
->add(last_key
, first_tag
);
960 // If tf == 2, the data could be split over two tags when
961 // merging, but we only output an initial tag in this case.
962 if (tf
> 2 && tags
.size() > 1) {
964 const char* p
= last_key
.data();
965 const char* end
= p
+ last_key
.size();
966 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
968 throw Xapian::DatabaseCorruptError("Bad postlist "
972 auto i
= tags
.begin();
977 while (++i
!= tags
.end()) {
981 encode_delta_chunk_header(i
->first
,
987 if (i
->have_wdfs
&& i
+ 1 != tags
.end()) {
988 splice_last
= last_did
;
989 last_did
= (i
+ 1)->last
;
992 encode_delta_chunk_header_no_wdf(i
->first
,
998 pack_uint(tag
, i
->first
- splice_last
);
1004 out
->add(pack_honey_postlist_key(term
, last_did
), tag
);
1009 if (cur
== NULL
) break;
1011 last_key
= cur
->key
;
1014 if (tf
&& cur
->tf
&& (cf
== 0) != (cur
->cf
== 0)) {
1015 // FIXME: Also need to adjust document length, total document
1017 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
1018 throw Xapian::DatabaseError("Honey does not support a term having "
1019 "both zero and non-zero wdf");
1024 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
1031 std::move(cur
->tag
)));
1040 template<typename T
> struct MergeCursor
;
1042 #ifdef XAPIAN_HAS_GLASS_BACKEND
1044 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
1045 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
1052 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
1053 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
/// Compare table cursors by current key for use in a priority_queue.
template<typename T>
struct CursorGt {
    /// Return true if and only if a's key is strictly greater than b's key.
    bool operator()(const T* a, const T* b) const {
        // An exhausted (after_end()) cursor compares greater than any live
        // cursor, so it never reaches the top of the min-ordered queue.
        if (b->after_end()) return false;
        if (a->after_end()) return true;
        return (a->current_key > b->current_key);
    }
};
1068 #ifdef XAPIAN_HAS_GLASS_BACKEND
1069 // Convert glass to honey.
1071 merge_spellings(HoneyTable
* out
,
1072 vector
<const GlassTable
*>::const_iterator b
,
1073 vector
<const GlassTable
*>::const_iterator e
)
1075 typedef MergeCursor
<const GlassTable
&> cursor_type
;
1076 typedef CursorGt
<cursor_type
> gt_type
;
1077 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1078 for ( ; b
!= e
; ++b
) {
1080 auto cursor
= new cursor_type(in
);
1081 if (cursor
->next()) {
1084 // Skip empty tables.
1089 while (!pq
.empty()) {
1090 cursor_type
* cur
= pq
.top();
1093 // Glass vs honey spelling key prefix map:
1097 // bookend Bbd \x00bd
1099 // middle Mddl \x02ddl
1103 // In honey, if the word's first byte is <= '\x04', we add a prefix
1104 // of '\x04' (which is unlikely in real world use but ensures that
1105 // we can handle arbitrary strings).
1107 // The prefixes for honey are chosen such that we save a byte for the
1108 // key of all real-world words, and so that more first bytes are used
1109 // so that a top-level array index is more effective, especially for
1110 // random-access lookup of word entries (which we do to check the
1111 // frequency of potential spelling candidates).
1113 // The other prefixes are chosen such that we can do look up in key
1114 // order at search time, which is more efficient for a cursor which can
1115 // only efficiently move forwards.
1117 // Note that the key ordering is the same for glass and honey, which
1118 // makes translating during compaction simpler.
1119 string key
= cur
->current_key
;
1122 key
[0] = Honey::KEY_PREFIX_BOOKEND
;
1125 key
[0] = Honey::KEY_PREFIX_HEAD
;
1128 key
[0] = Honey::KEY_PREFIX_MIDDLE
;
1131 key
[0] = Honey::KEY_PREFIX_TAIL
;
1134 if (static_cast<unsigned char>(key
[1]) > Honey::KEY_PREFIX_WORD
)
1137 key
[0] = Honey::KEY_PREFIX_WORD
;
1140 string m
= "Bad spelling key prefix: ";
1141 m
+= static_cast<unsigned char>(key
[0]);
1142 throw Xapian::DatabaseCorruptError(m
);
1146 if (pq
.empty() || pq
.top()->current_key
> key
) {
1147 // No merging to do for this key so just copy the tag value,
1148 // adjusting if necessary. If we don't need to adjust it, just
1149 // copy the compressed value.
1152 case Honey::KEY_PREFIX_HEAD
: {
1153 // Honey omits the first two bytes from the first entry
1154 // since we know what those most be from the key
1155 // (subsequent entries won't include those anyway, since
1156 // they'll be part of the reused prefix).
1157 compressed
= cur
->read_tag(false);
1158 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1159 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1160 AssertEq(cur
->current_tag
[1], key
[1]);
1161 AssertEq(cur
->current_tag
[2], key
[2]);
1162 cur
->current_tag
.erase(1, 2);
1165 case Honey::KEY_PREFIX_TAIL
:
1166 case Honey::KEY_PREFIX_BOOKEND
: {
1167 // Need to repack because honey omits the last 2 or 1 bytes
1168 // from each entry (2 for tail, 1 for bookend) since we
1169 // know what those must be from the key.
1170 compressed
= cur
->read_tag(false);
1171 PrefixCompressedStringItor
spell_in(cur
->current_tag
);
1173 PrefixCompressedStringWriter
spell_out(new_tag
, key
);
1174 while (!spell_in
.at_end()) {
1175 spell_out
.append(*spell_in
);
1178 cur
->current_tag
= std::move(new_tag
);
1182 compressed
= cur
->read_tag(true);
1185 out
->add(key
, cur
->current_tag
, compressed
);
1194 // Merge tag values with the same key:
1196 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1197 // We just want the union of words, so copy over the first instance
1198 // and skip any identical ones.
1199 priority_queue
<PrefixCompressedStringItor
*,
1200 vector
<PrefixCompressedStringItor
*>,
1201 PrefixCompressedStringItorGt
> pqtag
;
1202 // Stick all the cursor_type pointers in a vector because their
1203 // current_tag members must remain valid while we're merging their
1204 // tags, but we need to call next() on them all afterwards.
1205 vector
<cursor_type
*> vec
;
1206 vec
.reserve(pq
.size());
1210 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1212 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1217 PrefixCompressedStringWriter
wr(tag
, key
);
1219 while (!pqtag
.empty()) {
1220 PrefixCompressedStringItor
* it
= pqtag
.top();
1223 if (word
!= lastword
) {
1225 wr
.append(lastword
);
1228 if (!it
->at_end()) {
1235 for (auto i
: vec
) {
1244 // We want to sum the frequencies from tags for the same key.
1245 Xapian::termcount tot_freq
= 0;
1248 Xapian::termcount freq
;
1249 const char * p
= cur
->current_tag
.data();
1250 const char * end
= p
+ cur
->current_tag
.size();
1251 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1252 throw_database_corrupt("Bad spelling word freq", p
);
1260 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1265 pack_uint_last(tag
, tot_freq
);
1273 merge_spellings(HoneyTable
* out
,
1274 vector
<const HoneyTable
*>::const_iterator b
,
1275 vector
<const HoneyTable
*>::const_iterator e
)
1277 typedef MergeCursor
<const HoneyTable
&> cursor_type
;
1278 typedef CursorGt
<cursor_type
> gt_type
;
1279 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1280 for ( ; b
!= e
; ++b
) {
1282 auto cursor
= new cursor_type(in
);
1283 if (cursor
->next()) {
1286 // Skip empty tables.
1291 while (!pq
.empty()) {
1292 cursor_type
* cur
= pq
.top();
1295 string key
= cur
->current_key
;
1296 if (pq
.empty() || pq
.top()->current_key
> key
) {
1297 // No need to merge the tags, just copy the (possibly compressed)
1299 bool compressed
= cur
->read_tag(true);
1300 out
->add(key
, cur
->current_tag
, compressed
);
1309 // Merge tag values with the same key:
1311 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1312 // We just want the union of words, so copy over the first instance
1313 // and skip any identical ones.
1314 priority_queue
<PrefixCompressedStringItor
*,
1315 vector
<PrefixCompressedStringItor
*>,
1316 PrefixCompressedStringItorGt
> pqtag
;
1317 // Stick all the cursor_type pointers in a vector because their
1318 // current_tag members must remain valid while we're merging their
1319 // tags, but we need to call next() on them all afterwards.
1320 vector
<cursor_type
*> vec
;
1321 vec
.reserve(pq
.size());
1325 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
,
1328 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1333 PrefixCompressedStringWriter
wr(tag
, key
);
1335 while (!pqtag
.empty()) {
1336 PrefixCompressedStringItor
* it
= pqtag
.top();
1339 if (word
!= lastword
) {
1341 wr
.append(lastword
);
1344 if (!it
->at_end()) {
1351 for (auto i
: vec
) {
1360 // We want to sum the frequencies from tags for the same key.
1361 Xapian::termcount tot_freq
= 0;
1364 Xapian::termcount freq
;
1365 const char * p
= cur
->current_tag
.data();
1366 const char * end
= p
+ cur
->current_tag
.size();
1367 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1368 throw_database_corrupt("Bad spelling word freq", p
);
1376 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1381 pack_uint_last(tag
, tot_freq
);
1387 // U : vector<HoneyTable*>::const_iterator
1388 template<typename T
, typename U
> void
1389 merge_synonyms(T
* out
, U b
, U e
)
1391 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1392 typedef MergeCursor
<table_type
> cursor_type
;
1393 typedef CursorGt
<cursor_type
> gt_type
;
1394 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1395 for ( ; b
!= e
; ++b
) {
1397 auto cursor
= new cursor_type(in
);
1398 if (cursor
->next()) {
1401 // Skip empty tables.
1406 while (!pq
.empty()) {
1407 cursor_type
* cur
= pq
.top();
1410 string key
= cur
->current_key
;
1411 if (pq
.empty() || pq
.top()->current_key
> key
) {
1412 // No need to merge the tags, just copy the (possibly compressed)
1414 bool compressed
= cur
->read_tag(true);
1415 out
->add(key
, cur
->current_tag
, compressed
);
1424 // Merge tag values with the same key:
1427 // We just want the union of words, so copy over the first instance
1428 // and skip any identical ones.
1429 priority_queue
<ByteLengthPrefixedStringItor
*,
1430 vector
<ByteLengthPrefixedStringItor
*>,
1431 ByteLengthPrefixedStringItorGt
> pqtag
;
1432 vector
<cursor_type
*> vec
;
1436 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1438 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1444 while (!pqtag
.empty()) {
1445 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1447 if (**it
!= lastword
) {
1449 tag
+= uint8_t(lastword
.size() ^ MAGIC_XOR_VALUE
);
1453 if (!it
->at_end()) {
1460 for (auto i
: vec
) {
1473 template<typename T
, typename U
> void
1474 multimerge_postlists(Xapian::Compactor
* compactor
,
1475 T
* out
, const char * tmpdir
,
1476 const vector
<U
*>& in
,
1477 vector
<Xapian::docid
> off
)
1479 if (in
.size() <= 3) {
1480 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1484 vector
<HoneyTable
*> tmp
;
1485 tmp
.reserve(in
.size() / 2);
1487 vector
<Xapian::docid
> newoff
;
1488 newoff
.resize(in
.size() / 2);
1489 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1491 if (j
== in
.size() - 1) ++j
;
1493 string dest
= tmpdir
;
1495 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1498 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1500 // Don't compress entries in temporary tables, even if the final
1501 // table would do so. Any already compressed entries will get
1502 // copied in compressed form.
1503 Honey::RootInfo root_info
;
1505 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1506 tmptab
->create_and_open(flags
, root_info
);
1508 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1509 in
.begin() + i
, in
.begin() + j
);
1510 tmp
.push_back(tmptab
);
1512 tmptab
->commit(1, &root_info
);
1518 while (tmp
.size() > 3) {
1519 vector
<HoneyTable
*> tmpout
;
1520 tmpout
.reserve(tmp
.size() / 2);
1521 vector
<Xapian::docid
> newoff
;
1522 newoff
.resize(tmp
.size() / 2);
1523 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1525 if (j
== tmp
.size() - 1) ++j
;
1527 string dest
= tmpdir
;
1529 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1532 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1534 // Don't compress entries in temporary tables, even if the final
1535 // table would do so. Any already compressed entries will get
1536 // copied in compressed form.
1537 Honey::RootInfo root_info
;
1539 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1540 tmptab
->create_and_open(flags
, root_info
);
1542 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1543 tmp
.begin() + i
, tmp
.begin() + j
);
1545 for (unsigned int k
= i
; k
< j
; ++k
) {
1546 // FIXME: unlink(tmp[k]->get_path().c_str());
1551 tmpout
.push_back(tmptab
);
1553 tmptab
->commit(1, &root_info
);
1559 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1561 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1562 // FIXME: unlink(tmp[k]->get_path().c_str());
1569 template<typename T
> class PositionCursor
;
1571 #ifdef XAPIAN_HAS_GLASS_BACKEND
1573 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
1574 Xapian::docid offset
;
1578 Xapian::docid firstdid
;
1580 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1581 : GlassCursor(in
), offset(offset_
), firstdid(0) {
1586 if (!GlassCursor::next()) return false;
1588 const char * d
= current_key
.data();
1589 const char * e
= d
+ current_key
.size();
1592 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1593 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1595 throw Xapian::DatabaseCorruptError("Bad position key");
1599 pack_string_preserving_sort(key
, term
);
1600 pack_uint_preserving_sort(key
, did
+ offset
);
1604 const string
& get_tag() const {
1611 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
1612 Xapian::docid offset
;
1616 Xapian::docid firstdid
;
1618 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1619 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
1624 if (!HoneyCursor::next()) return false;
1626 const char * d
= current_key
.data();
1627 const char * e
= d
+ current_key
.size();
1630 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1631 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1633 throw Xapian::DatabaseCorruptError("Bad position key");
1637 pack_string_preserving_sort(key
, term
);
1638 pack_uint_preserving_sort(key
, did
+ offset
);
1642 const string
& get_tag() const {
1647 template<typename T
>
1648 class PositionCursorGt
{
1650 /** Return true if and only if a's key is strictly greater than b's key.
1652 bool operator()(const T
* a
, const T
* b
) const {
1653 return a
->key
> b
->key
;
1657 template<typename T
, typename U
> void
1658 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1659 const vector
<Xapian::docid
> & offset
)
1661 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1662 typedef PositionCursor
<table_type
> cursor_type
;
1663 typedef PositionCursorGt
<cursor_type
> gt_type
;
1664 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1665 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1666 auto in
= inputs
[i
];
1667 auto cursor
= new cursor_type(in
, offset
[i
]);
1668 if (cursor
->next()) {
1671 // Skip empty tables.
1676 while (!pq
.empty()) {
1677 cursor_type
* cur
= pq
.top();
1679 out
->add(cur
->key
, cur
->get_tag());
1688 template<typename T
, typename U
> void
1689 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1690 const vector
<Xapian::docid
> & offset
,
1693 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1694 Xapian::docid off
= offset
[i
];
1696 auto in
= inputs
[i
];
1697 HoneyCursor
cur(in
);
1701 while (cur
.next()) {
1702 // Adjust the key if this isn't the first database.
1705 const char * d
= cur
.current_key
.data();
1706 const char * e
= d
+ cur
.current_key
.size();
1707 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1708 string msg
= "Bad key in ";
1709 msg
+= inputs
[i
]->get_path();
1710 throw Xapian::DatabaseCorruptError(msg
);
1714 pack_uint_preserving_sort(key
, did
);
1716 // Copy over anything extra in the key (e.g. the zero byte
1717 // at the end of "used value slots" in the termlist table).
1718 key
.append(d
, e
- d
);
1721 key
= cur
.current_key
;
1723 bool compressed
= cur
.read_tag(true);
1724 out
->add(key
, cur
.current_tag
, compressed
);
1729 #ifdef XAPIAN_HAS_GLASS_BACKEND
1730 template<typename T
> void
1731 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1732 const vector
<Xapian::docid
> & offset
,
1735 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1736 Xapian::docid off
= offset
[i
];
1738 auto in
= inputs
[i
];
1739 if (in
->empty()) continue;
1741 GlassCursor
cur(in
);
1745 while (cur
.next()) {
1747 // Adjust the key if this isn't the first database.
1750 const char * d
= cur
.current_key
.data();
1751 const char * e
= d
+ cur
.current_key
.size();
1752 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1753 string msg
= "Bad key in ";
1754 msg
+= inputs
[i
]->get_path();
1755 throw Xapian::DatabaseCorruptError(msg
);
1759 pack_uint_preserving_sort(key
, did
);
1761 // Copy over anything extra in the key (e.g. the zero byte
1762 // at the end of "used value slots" in the termlist table).
1763 key
.append(d
, e
- d
);
1766 key
= cur
.current_key
;
1768 if (table_type
== Honey::TERMLIST
) {
1769 if (termlist_key_is_values_used(key
)) {
1770 throw Xapian::DatabaseCorruptError("value slots used "
1776 string tag
= cur
.current_tag
;
1778 bool next_result
= cur
.next();
1779 bool next_already_done
= true;
1780 unsigned bitmap_slots_used
= 0;
1781 string encoded_slots_used
;
1783 termlist_key_is_values_used(cur
.current_key
)) {
1784 next_already_done
= false;
1786 const string
& valtag
= cur
.current_tag
;
1788 const char* p
= valtag
.data();
1789 const char* end
= p
+ valtag
.size();
1791 Xapian::VecCOW
<Xapian::termpos
> slots
;
1793 Xapian::valueno first_slot
;
1794 if (!unpack_uint(&p
, end
, &first_slot
)) {
1795 throw_database_corrupt("Value slot encoding corrupt",
1798 slots
.push_back(first_slot
);
1799 Xapian::valueno last_slot
= first_slot
;
1801 Xapian::valueno slot
;
1802 if (!unpack_uint(&p
, end
, &slot
)) {
1803 throw_database_corrupt("Value slot encoding "
1806 slot
+= last_slot
+ 1;
1809 slots
.push_back(slot
);
1812 if (slots
.back() <= 6) {
1813 // Encode as a bitmap if only slots in the range 0-6
1815 for (auto slot
: slots
) {
1816 bitmap_slots_used
|= 1 << slot
;
1820 pack_uint(enc
, last_slot
);
1821 if (slots
.size() > 1) {
1822 BitWriter
slots_used(enc
);
1823 slots_used
.encode(first_slot
, last_slot
);
1824 slots_used
.encode(slots
.size() - 2,
1825 last_slot
- first_slot
);
1826 slots_used
.encode_interpolative(slots
, 0,
1828 encoded_slots_used
= slots_used
.freeze();
1830 encoded_slots_used
= std::move(enc
);
1835 const char* pos
= tag
.data();
1836 const char* end
= pos
+ tag
.size();
1839 if (encoded_slots_used
.empty()) {
1840 newtag
+= char(bitmap_slots_used
);
1842 auto size
= encoded_slots_used
.size();
1844 newtag
+= char(0x80 | size
);
1847 pack_uint(newtag
, size
);
1849 newtag
+= encoded_slots_used
;
1853 Xapian::termcount doclen
;
1854 if (!unpack_uint(&pos
, end
, &doclen
)) {
1855 throw_database_corrupt("doclen", pos
);
1857 Xapian::termcount termlist_size
;
1858 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1859 throw_database_corrupt("termlist length", pos
);
1861 pack_uint(newtag
, termlist_size
- 1);
1862 pack_uint(newtag
, doclen
);
1864 string current_term
;
1865 while (pos
!= end
) {
1866 Xapian::termcount current_wdf
= 0;
1868 if (!current_term
.empty()) {
1869 size_t reuse
= static_cast<unsigned char>(*pos
++);
1870 newtag
+= char(reuse
);
1872 if (reuse
> current_term
.size()) {
1873 current_wdf
= reuse
/ (current_term
.size() + 1);
1874 reuse
= reuse
% (current_term
.size() + 1);
1876 current_term
.resize(reuse
);
1879 throw_database_corrupt("term", NULL
);
1882 size_t append
= static_cast<unsigned char>(*pos
++);
1883 if (size_t(end
- pos
) < append
)
1884 throw_database_corrupt("term", NULL
);
1886 current_term
.append(pos
, append
);
1892 if (!unpack_uint(&pos
, end
, ¤t_wdf
)) {
1893 throw_database_corrupt("wdf", pos
);
1895 pack_uint(newtag
, current_wdf
);
1898 newtag
+= char(append
);
1899 newtag
.append(current_term
.end() - append
,
1900 current_term
.end());
1903 if (!newtag
.empty())
1904 out
->add(key
, newtag
);
1905 if (!next_result
) break;
1906 if (next_already_done
) goto next_without_next
;
1908 bool compressed
= cur
.read_tag(true);
1909 out
->add(key
, cur
.current_tag
, compressed
);
1918 using namespace HoneyCompact
;
1921 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1922 const char* destdir
,
1925 const vector
<const Xapian::Database::Internal
*>& sources
,
1926 const vector
<Xapian::docid
>& offset
,
1927 Xapian::Compactor::compaction_level compaction
,
1929 Xapian::docid last_docid
)
1931 // Currently unused for honey.
1935 // The "base name" of the table.
1938 Honey::table_type type
;
1939 // Create tables after position lazily.
1943 static const table_list tables
[] = {
1945 { "postlist", Honey::POSTLIST
, false },
1946 { "docdata", Honey::DOCDATA
, true },
1947 { "termlist", Honey::TERMLIST
, false },
1948 { "position", Honey::POSITION
, true },
1949 { "spelling", Honey::SPELLING
, true },
1950 { "synonym", Honey::SYNONYM
, true }
1952 const table_list
* tables_end
= tables
+
1953 (sizeof(tables
) / sizeof(tables
[0]));
1955 const int FLAGS
= Xapian::DB_DANGEROUS
;
1957 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1958 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1960 // FIXME: Support this combination - we need to put temporary files
1966 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1967 bool has_uncommitted_changes
;
1968 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1969 #ifdef XAPIAN_HAS_GLASS_BACKEND
1970 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1971 has_uncommitted_changes
= db
->has_uncommitted_changes();
1973 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1976 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1977 has_uncommitted_changes
= db
->has_uncommitted_changes();
1979 if (has_uncommitted_changes
) {
1981 "Can't compact from a WritableDatabase with uncommitted "
1982 "changes - either call commit() first, or create a new "
1983 "Database object from the filename on disk";
1984 throw Xapian::InvalidOperationError(m
);
1989 FlintLock
lock(destdir
? destdir
: "");
1992 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1993 if (why
!= FlintLock::SUCCESS
) {
1994 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1998 unique_ptr
<HoneyVersion
> version_file_out
;
2001 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
2003 throw Xapian::DatabaseCreateError("open() failed", errno
);
2006 version_file_out
.reset(new HoneyVersion(fd
));
2009 version_file_out
.reset(new HoneyVersion(destdir
));
2012 // Set to true if stat() failed (which can happen if the files are > 2GB
2013 // and off_t is 32 bit) or one of the totals overflowed.
2014 bool bad_totals
= false;
2015 off_t in_total
= 0, out_total
= 0;
2017 version_file_out
->create();
2018 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
2019 bool source_single_file
= false;
2020 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
2021 #ifdef XAPIAN_HAS_GLASS_BACKEND
2022 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
2023 auto& v_in
= db
->version_file
;
2024 auto& v_out
= version_file_out
;
2025 v_out
->merge_stats(v_in
.get_doccount(),
2026 v_in
.get_doclength_lower_bound(),
2027 v_in
.get_doclength_upper_bound(),
2028 v_in
.get_wdf_upper_bound(),
2029 v_in
.get_total_doclen(),
2030 v_in
.get_spelling_wordfreq_upper_bound());
2031 source_single_file
= db
->single_file();
2036 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
2037 version_file_out
->merge_stats(db
->version_file
);
2038 source_single_file
= db
->single_file();
2040 if (source_single_file
) {
2041 // Add single file input DB sizes to in_total here. For other
2042 // databases, we sum per-table below.
2044 sources
[i
]->get_backend_info(&path
);
2045 off_t db_size
= file_size(path
);
2047 // FIXME: check overflow and set bad_totals
2048 in_total
+= db_size
;
2055 string fl_serialised
;
2059 fl
.set_first_unused_block(1); // FIXME: Assumption?
2060 fl
.pack(fl_serialised
);
2064 // FIXME: sort out indentation.
2065 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
2066 #ifndef XAPIAN_HAS_GLASS_BACKEND
2067 throw Xapian::FeatureUnavailableError("Glass backend disabled");
2069 vector
<HoneyTable
*> tabs
;
2070 tabs
.reserve(tables_end
- tables
);
2071 off_t prev_size
= 0;
2072 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2073 // The postlist table requires an N-way merge, adjusting the
2074 // headers of various blocks. The spelling and synonym tables also
2075 // need special handling. The other tables have keys sorted in
2076 // docid order, so we can merge them by simply copying all the keys
2077 // from each source table in turn.
2079 compactor
->set_status(t
->name
, string());
2089 bool output_will_exist
= !t
->lazy
;
2091 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2092 // on certain systems).
2093 bool bad_stat
= false;
2095 // We can't currently report input sizes if there's a single file DB
2096 // amongst the inputs.
2097 bool single_file_in
= false;
2101 vector
<const GlassTable
*> inputs
;
2102 inputs
.reserve(sources
.size());
2103 size_t inputs_present
= 0;
2104 for (auto src
: sources
) {
2105 auto db
= static_cast<const GlassDatabase
*>(src
);
2106 const GlassTable
* table
;
2108 case Honey::POSTLIST
:
2109 table
= &(db
->postlist_table
);
2111 case Honey::DOCDATA
:
2112 table
= &(db
->docdata_table
);
2114 case Honey::TERMLIST
:
2115 table
= &(db
->termlist_table
);
2117 case Honey::POSITION
:
2118 table
= &(db
->position_table
);
2120 case Honey::SPELLING
:
2121 table
= &(db
->spelling_table
);
2123 case Honey::SYNONYM
:
2124 table
= &(db
->synonym_table
);
2131 if (db
->single_file()) {
2132 if (t
->lazy
&& !GlassCursor(table
).next()) {
2133 // Lazy table with no entries, so essentially doesn't
2136 // FIXME: Find actual size somehow?
2137 // in_size += table->size() / 1024;
2138 single_file_in
= true;
2139 output_will_exist
= true;
2143 off_t db_size
= file_size(table
->get_path());
2145 // FIXME: check overflow and set bad_totals
2146 in_total
+= db_size
;
2147 in_size
+= db_size
/ 1024;
2148 output_will_exist
= true;
2150 } else if (errno
!= ENOENT
) {
2151 // We get ENOENT for an optional table.
2152 bad_totals
= bad_stat
= true;
2153 output_will_exist
= true;
2157 inputs
.push_back(table
);
2160 // If any inputs lack a termlist table, suppress it in the output.
2161 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2162 if (inputs_present
!= 0) {
2164 string m
= str(inputs_present
);
2166 m
+= str(sources
.size());
2167 m
+= " inputs present, so suppressing output";
2168 compactor
->set_status(t
->name
, m
);
2172 output_will_exist
= false;
2175 if (!output_will_exist
) {
2177 compactor
->set_status(t
->name
, "doesn't exist");
2182 off_t table_start_offset
= -1;
2185 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2186 // space for version file. It's tricky to exactly know the
2187 // size of the version file beforehand.
2188 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2189 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2190 throw Xapian::DatabaseError("lseek() failed", errno
);
2192 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2194 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2197 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2199 tabs
.push_back(out
);
2200 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2202 root_info
->set_free_list(fl_serialised
);
2203 root_info
->set_offset(table_start_offset
);
2205 version_file_out
->get_root(t
->type
),
2206 version_file_out
->get_revision());
2208 out
->create_and_open(FLAGS
, *root_info
);
2212 case Honey::POSTLIST
: {
2213 if (multipass
&& inputs
.size() > 3) {
2214 multimerge_postlists(compactor
, out
, destdir
,
2217 merge_postlists(compactor
, out
, offset
.begin(),
2218 inputs
.begin(), inputs
.end());
2222 case Honey::SPELLING
:
2223 merge_spellings(out
, inputs
.cbegin(), inputs
.cend());
2225 case Honey::SYNONYM
:
2226 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2228 case Honey::POSITION
:
2229 merge_positions(out
, inputs
, offset
);
2232 // DocData, Termlist
2233 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
2237 // Commit as revision 1.
2239 out
->commit(1, root_info
);
2241 if (single_file
) fl_serialised
= root_info
->get_free_list();
2244 if (!bad_stat
&& !single_file_in
) {
2247 db_size
= file_size(fd
);
2249 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2253 off_t old_prev_size
= prev_size
;
2254 prev_size
= db_size
;
2255 db_size
-= old_prev_size
;
2257 // FIXME: check overflow and set bad_totals
2258 out_total
+= db_size
;
2259 out_size
= db_size
/ 1024;
2260 } else if (errno
!= ENOENT
) {
2261 bad_totals
= bad_stat
= true;
2266 compactor
->set_status(t
->name
,
2267 "Done (couldn't stat all the DB files)");
2268 } else if (single_file_in
) {
2270 compactor
->set_status(t
->name
,
2271 "Done (table sizes unknown for single "
2275 if (out_size
== in_size
) {
2276 status
= "Size unchanged (";
2279 if (out_size
< in_size
) {
2280 delta
= in_size
- out_size
;
2281 status
= "Reduced by ";
2283 delta
= out_size
- in_size
;
2284 status
= "INCREASED by ";
2287 status
+= str(100 * delta
/ in_size
);
2290 status
+= str(delta
);
2292 status
+= str(in_size
);
2295 status
+= str(out_size
);
2298 compactor
->set_status(t
->name
, status
);
2302 // If compacting to a single file output and all the tables are empty, pad
2303 // the output so that it isn't mistaken for a stub database when we try to
2304 // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2305 if (single_file
&& prev_size
< HONEY_MIN_DB_SIZE
) {
2306 out_total
= HONEY_MIN_DB_SIZE
;
2307 #ifdef HAVE_FTRUNCATE
2308 if (ftruncate(fd
, HONEY_MIN_DB_SIZE
) < 0) {
2309 throw Xapian::DatabaseError("Failed to set size of output "
2313 const off_t off
= HONEY_MIN_DB_SIZE
- 1;
2314 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2315 throw Xapian::DatabaseError("Failed to set size of output "
2322 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2323 throw Xapian::DatabaseError("lseek() failed", errno
);
2326 version_file_out
->set_last_docid(last_docid
);
2327 string tmpfile
= version_file_out
->write(1, FLAGS
);
2329 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2330 if (version_file_size
< 0) {
2331 throw Xapian::DatabaseError("lseek() failed", errno
);
2333 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2334 throw Xapian::DatabaseError("Didn't allow enough space for "
2335 "version file data");
2338 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2341 // Commit with revision 1.
2342 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2343 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2348 vector
<HoneyTable
*> tabs
;
2349 tabs
.reserve(tables_end
- tables
);
2350 off_t prev_size
= HONEY_MIN_DB_SIZE
;
2351 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2352 // The postlist table requires an N-way merge, adjusting the
2353 // headers of various blocks. The spelling and synonym tables also
2354 // need special handling. The other tables have keys sorted in
2355 // docid order, so we can merge them by simply copying all the keys
2356 // from each source table in turn.
2358 compactor
->set_status(t
->name
, string());
2368 bool output_will_exist
= !t
->lazy
;
2370 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2371 // on certain systems).
2372 bool bad_stat
= false;
2374 // We can't currently report input sizes if there's a single file DB
2375 // amongst the inputs.
2376 bool single_file_in
= false;
2380 vector
<const HoneyTable
*> inputs
;
2381 inputs
.reserve(sources
.size());
2382 size_t inputs_present
= 0;
2383 for (auto src
: sources
) {
2384 auto db
= static_cast<const HoneyDatabase
*>(src
);
2385 const HoneyTable
* table
;
2387 case Honey::POSTLIST
:
2388 table
= &(db
->postlist_table
);
2390 case Honey::DOCDATA
:
2391 table
= &(db
->docdata_table
);
2393 case Honey::TERMLIST
:
2394 table
= &(db
->termlist_table
);
2396 case Honey::POSITION
:
2397 table
= &(db
->position_table
);
2399 case Honey::SPELLING
:
2400 table
= &(db
->spelling_table
);
2402 case Honey::SYNONYM
:
2403 table
= &(db
->synonym_table
);
2410 if (db
->single_file()) {
2411 if (t
->lazy
&& !HoneyCursor(table
).next()) {
2412 // Lazy table with no entries, so essentially doesn't
2415 // FIXME: Find actual size somehow?
2416 // in_size += table->size() / 1024;
2417 single_file_in
= true;
2418 output_will_exist
= true;
2422 off_t db_size
= file_size(table
->get_path());
2424 // FIXME: check overflow and set bad_totals
2425 in_total
+= db_size
;
2426 in_size
+= db_size
/ 1024;
2427 output_will_exist
= true;
2429 } else if (errno
!= ENOENT
) {
2430 // We get ENOENT for an optional table.
2431 bad_totals
= bad_stat
= true;
2432 output_will_exist
= true;
2436 inputs
.push_back(table
);
2439 // If any inputs lack a termlist table, suppress it in the output.
2440 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2441 if (inputs_present
!= 0) {
2443 string m
= str(inputs_present
);
2445 m
+= str(sources
.size());
2446 m
+= " inputs present, so suppressing output";
2447 compactor
->set_status(t
->name
, m
);
2451 output_will_exist
= false;
2454 if (!output_will_exist
) {
2456 compactor
->set_status(t
->name
, "doesn't exist");
2461 off_t table_start_offset
= -1;
2464 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2465 // space for version file. It's tricky to exactly know the
2466 // size of the version file beforehand.
2467 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2468 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2469 throw Xapian::DatabaseError("lseek() failed", errno
);
2471 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2473 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2476 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2478 tabs
.push_back(out
);
2479 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2481 root_info
->set_free_list(fl_serialised
);
2482 root_info
->set_offset(table_start_offset
);
2484 version_file_out
->get_root(t
->type
),
2485 version_file_out
->get_revision());
2487 out
->create_and_open(FLAGS
, *root_info
);
2491 case Honey::POSTLIST
: {
2492 if (multipass
&& inputs
.size() > 3) {
2493 multimerge_postlists(compactor
, out
, destdir
,
2496 merge_postlists(compactor
, out
, offset
.begin(),
2497 inputs
.begin(), inputs
.end());
2501 case Honey::SPELLING
:
2502 merge_spellings(out
, inputs
.begin(), inputs
.end());
2504 case Honey::SYNONYM
:
2505 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2507 case Honey::POSITION
:
2508 merge_positions(out
, inputs
, offset
);
2511 // DocData, Termlist
2512 merge_docid_keyed(out
, inputs
, offset
);
2516 // Commit as revision 1.
2518 out
->commit(1, root_info
);
2520 if (single_file
) fl_serialised
= root_info
->get_free_list();
2523 if (!bad_stat
&& !single_file_in
) {
2526 db_size
= file_size(fd
);
2528 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2532 off_t old_prev_size
= prev_size
;
2533 prev_size
= db_size
;
2534 db_size
-= old_prev_size
;
2536 // FIXME: check overflow and set bad_totals
2537 out_total
+= db_size
;
2538 out_size
= db_size
/ 1024;
2539 } else if (errno
!= ENOENT
) {
2540 bad_totals
= bad_stat
= true;
2545 compactor
->set_status(t
->name
,
2546 "Done (couldn't stat all the DB files)");
2547 } else if (single_file_in
) {
2549 compactor
->set_status(t
->name
,
2550 "Done (table sizes unknown for single "
2554 if (out_size
== in_size
) {
2555 status
= "Size unchanged (";
2558 if (out_size
< in_size
) {
2559 delta
= in_size
- out_size
;
2560 status
= "Reduced by ";
2562 delta
= out_size
- in_size
;
2563 status
= "INCREASED by ";
2566 status
+= str(100 * delta
/ in_size
);
2569 status
+= str(delta
);
2571 status
+= str(in_size
);
2574 status
+= str(out_size
);
2577 compactor
->set_status(t
->name
, status
);
2581 // If compacting to a single file output and all the tables are empty, pad
2582 // the output so that it isn't mistaken for a stub database when we try to
2583 // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2584 if (single_file
&& prev_size
< HONEY_MIN_DB_SIZE
) {
2585 out_total
= HONEY_MIN_DB_SIZE
;
2586 #ifdef HAVE_FTRUNCATE
2587 if (ftruncate(fd
, HONEY_MIN_DB_SIZE
) < 0) {
2588 throw Xapian::DatabaseError("Failed to set size of output "
2592 const off_t off
= HONEY_MIN_DB_SIZE
- 1;
2593 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2594 throw Xapian::DatabaseError("Failed to set size of output "
2601 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2602 throw Xapian::DatabaseError("lseek() failed", errno
);
2605 version_file_out
->set_last_docid(last_docid
);
2606 string tmpfile
= version_file_out
->write(1, FLAGS
);
2607 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2610 // Commit with revision 1.
2611 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2612 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2617 if (!single_file
) lock
.release();
2619 if (!bad_totals
&& compactor
) {
2623 if (out_total
== in_total
) {
2624 status
= "Size unchanged (";
2627 if (out_total
< in_total
) {
2628 delta
= in_total
- out_total
;
2629 status
= "Reduced by ";
2631 delta
= out_total
- in_total
;
2632 status
= "INCREASED by ";
2635 status
+= str(100 * delta
/ in_total
);
2638 status
+= str(delta
);
2640 status
+= str(in_total
);
2643 status
+= str(out_total
);
2645 compactor
->set_status("Total", status
);