/** @file honey_compact.cc
 * @brief Compact a honey database, or merge and compact several.
 */
/* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
32 #include <type_traits>
36 #include "safeerrno.h"
38 #include "backends/flint_lock.h"
39 #include "compression_stream.h"
40 #include "honey_cursor.h"
41 #include "honey_database.h"
42 #include "honey_defs.h"
43 #include "honey_postlist_encodings.h"
44 #include "honey_table.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifndef DISABLE_GPL_LIBXAPIAN
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
63 #ifndef DISABLE_GPL_LIBXAPIAN
66 throw_database_corrupt(const char* item
, const char* pos
)
70 message
= "Value overflow unpacking termlist: ";
72 message
= "Out of data unpacking termlist: ";
75 throw Xapian::DatabaseCorruptError(message
);
78 namespace GlassCompact
{
/// Test whether a glass postlist-table key holds user metadata
/// (key prefix "\0\xc0").
static inline bool
is_user_metadata_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/// Test whether a glass postlist-table key holds value statistics
/// (key prefix "\0\xd0").
static inline bool
is_valuestats_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/// Test whether a glass postlist-table key holds a value stream chunk
/// (key prefix "\0\xd8").
static inline bool
is_valuechunk_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/// Test whether a glass postlist-table key holds a document-length chunk
/// (key prefix "\0\xe0").
static inline bool
is_doclenchunk_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
107 termlist_key_is_values_used(const string
& key
)
109 const char* p
= key
.data();
110 const char* e
= p
+ key
.size();
112 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
115 if (e
- p
== 1 && *p
== '\0')
118 throw Xapian::DatabaseCorruptError("termlist key format");
122 // Put all the helpers in a namespace to avoid symbols colliding with those of
123 // the same name in other flint-derived backends.
124 namespace HoneyCompact
{
/// First-byte / second-byte tags identifying each category of entry in the
/// honey postlist table (posting chunks use the encoded term instead).
enum {
    KEY_USER_METADATA = 0x00,
    KEY_VALUE_STATS = 0x01,
    KEY_VALUE_CHUNK = 0xd8,
    KEY_DOCLEN_CHUNK = 0xe0,
    KEY_POSTING_CHUNK = 0xff
};
134 /// Return one of the KEY_* constants, or a different value for an invalid key.
136 key_type(const string
& key
)
139 return KEY_POSTING_CHUNK
;
144 switch (static_cast<unsigned char>(key
[1])) {
145 case 0x01: case 0x02: case 0x03: case 0x04:
146 return KEY_VALUE_STATS
;
149 // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK.
150 return static_cast<unsigned char>(key
[1]);
153 template<typename T
> class PostlistCursor
;
155 #ifndef DISABLE_GPL_LIBXAPIAN
156 // Convert glass to honey.
158 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
159 Xapian::docid offset
;
163 Xapian::docid firstdid
;
164 Xapian::docid chunk_lastdid
;
165 Xapian::termcount tf
, cf
;
166 Xapian::termcount first_wdf
;
167 Xapian::termcount wdf_max
;
169 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
170 : GlassCursor(in
), offset(offset_
), firstdid(0)
177 if (!GlassCursor::next()) return false;
178 // We put all chunks into the non-initial chunk form here, then fix up
179 // the first chunk for each term in the merged database as we merge.
184 if (GlassCompact::is_user_metadata_key(key
)) {
185 key
[1] = KEY_USER_METADATA
;
188 if (GlassCompact::is_valuestats_key(key
)) {
190 const char * p
= key
.data();
191 const char * end
= p
+ key
.length();
193 Xapian::valueno slot
;
194 if (!unpack_uint_last(&p
, end
, &slot
))
195 throw Xapian::DatabaseCorruptError("bad value stats key");
196 key
= pack_honey_valuestats_key(slot
);
199 if (GlassCompact::is_valuechunk_key(key
)) {
200 const char * p
= key
.data();
201 const char * end
= p
+ key
.length();
203 Xapian::valueno slot
;
204 if (!unpack_uint(&p
, end
, &slot
))
205 throw Xapian::DatabaseCorruptError("bad value key");
206 Xapian::docid first_did
;
207 if (!unpack_uint_preserving_sort(&p
, end
, &first_did
))
208 throw Xapian::DatabaseCorruptError("bad value key");
211 Glass::ValueChunkReader
reader(tag
.data(), tag
.size(), first_did
);
212 Xapian::docid last_did
= first_did
;
213 while (reader
.next(), !reader
.at_end()) {
214 last_did
= reader
.get_docid();
217 key
.assign("\0\xd8", 2);
218 pack_uint(key
, slot
);
219 pack_uint_preserving_sort(key
, last_did
);
221 // Add the docid delta across the chunk to the start of the tag.
223 pack_uint(newtag
, last_did
- first_did
);
224 tag
.insert(0, newtag
);
229 if (GlassCompact::is_doclenchunk_key(key
)) {
230 const char * d
= key
.data();
231 const char * e
= d
+ key
.size();
235 // This is an initial chunk, so adjust tag header.
238 if (!unpack_uint(&d
, e
, &tf
) ||
239 !unpack_uint(&d
, e
, &cf
) ||
240 !unpack_uint(&d
, e
, &firstdid
)) {
241 throw Xapian::DatabaseCorruptError("Bad postlist key");
244 tag
.erase(0, d
- tag
.data());
246 // Not an initial chunk, so adjust key.
247 size_t tmp
= d
- key
.data();
248 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
249 throw Xapian::DatabaseCorruptError("Bad postlist key");
257 // Convert doclen chunk to honey format.
260 // Skip the "last chunk" flag and increase_to_last.
262 throw Xapian::DatabaseCorruptError("No last chunk flag in "
263 "glass docdata chunk");
265 Xapian::docid increase_to_last
;
266 if (!unpack_uint(&d
, e
, &increase_to_last
))
267 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
268 "in glass docdata chunk");
270 Xapian::termcount doclen_max
= 0;
272 Xapian::termcount doclen
;
273 if (!unpack_uint(&d
, e
, &doclen
))
274 throw Xapian::DatabaseCorruptError("Decoding doclen in "
275 "glass docdata chunk");
276 if (doclen
> doclen_max
)
278 unsigned char buf
[4];
279 unaligned_write4(buf
, doclen
);
280 newtag
.append(reinterpret_cast<char*>(buf
), 4);
283 Xapian::docid gap_size
;
284 if (!unpack_uint(&d
, e
, &gap_size
))
285 throw Xapian::DatabaseCorruptError("Decoding docid "
288 // FIXME: Split chunk if the gap_size is at all large.
289 newtag
.append(4 * gap_size
, '\xff');
292 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
293 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
295 chunk_lastdid
= firstdid
- 1 + newtag
.size() / 4;
297 // Only encode document lengths using a whole number of bytes for
298 // now. We could allow arbitrary bit widths, but it complicates
299 // encoding and decoding so we should consider if the fairly small
300 // additional saving is worth it.
301 if (doclen_max
>= 0xffff) {
302 if (doclen_max
>= 0xffffff) {
303 newtag
.insert(0, 1, char(32));
305 } else if (doclen_max
>= 0xffffffff) {
306 // FIXME: Handle these.
307 const char* m
= "Document length values >= 0xffffffff not "
309 throw Xapian::FeatureUnavailableError(m
);
311 tag
.assign(1, char(24));
312 for (size_t i
= 1; i
< newtag
.size(); i
+= 4)
313 tag
.append(newtag
, i
, 3);
316 if (doclen_max
>= 0xff) {
317 tag
.assign(1, char(16));
318 for (size_t i
= 2; i
< newtag
.size(); i
+= 4)
319 tag
.append(newtag
, i
, 2);
321 tag
.assign(1, char(8));
322 for (size_t i
= 3; i
< newtag
.size(); i
+= 4)
323 tag
.append(newtag
, i
, 1);
330 // Adjust key if this is *NOT* an initial chunk.
331 // key is: pack_string_preserving_sort(key, tname)
332 // plus optionally: pack_uint_preserving_sort(key, did)
333 const char * d
= key
.data();
334 const char * e
= d
+ key
.size();
336 if (!unpack_string_preserving_sort(&d
, e
, tname
))
337 throw Xapian::DatabaseCorruptError("Bad postlist key");
340 // This is an initial chunk for a term, so adjust tag header.
343 if (!unpack_uint(&d
, e
, &tf
) ||
344 !unpack_uint(&d
, e
, &cf
) ||
345 !unpack_uint(&d
, e
, &firstdid
)) {
346 throw Xapian::DatabaseCorruptError("Bad postlist key");
349 tag
.erase(0, d
- tag
.data());
352 // Not an initial chunk, so adjust key.
353 size_t tmp
= d
- key
.data();
354 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
355 throw Xapian::DatabaseCorruptError("Bad postlist key");
363 // Convert posting chunk to honey format, but without any header.
366 // Skip the "last chunk" flag; decode increase_to_last.
368 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
371 Xapian::docid increase_to_last
;
372 if (!unpack_uint(&d
, e
, &increase_to_last
))
373 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
374 "glass posting chunk");
375 chunk_lastdid
= firstdid
+ increase_to_last
;
376 if (!unpack_uint(&d
, e
, &first_wdf
))
377 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
379 wdf_max
= max(wdf_max
, first_wdf
);
383 if (!unpack_uint(&d
, e
, &delta
))
384 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
385 "glass posting chunk");
386 pack_uint(newtag
, delta
);
387 Xapian::termcount wdf
;
388 if (!unpack_uint(&d
, e
, &wdf
))
389 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
391 pack_uint(newtag
, wdf
);
392 wdf_max
= max(wdf_max
, wdf
);
403 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
404 Xapian::docid offset
;
408 Xapian::docid firstdid
;
409 Xapian::docid chunk_lastdid
;
410 Xapian::termcount tf
, cf
;
411 Xapian::termcount first_wdf
;
412 Xapian::termcount wdf_max
;
414 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
415 : HoneyCursor(in
), offset(offset_
), firstdid(0)
422 if (!HoneyCursor::next()) return false;
423 // We put all chunks into the non-initial chunk form here, then fix up
424 // the first chunk for each term in the merged database as we merge.
429 switch (key_type(key
)) {
430 case KEY_USER_METADATA
:
431 case KEY_VALUE_STATS
:
433 case KEY_VALUE_CHUNK
: {
434 const char * p
= key
.data();
435 const char * end
= p
+ key
.length();
437 Xapian::valueno slot
;
438 if (!unpack_uint(&p
, end
, &slot
))
439 throw Xapian::DatabaseCorruptError("bad value key");
441 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
442 throw Xapian::DatabaseCorruptError("bad value key");
445 key
.assign("\0\xd8", 2);
446 pack_uint(key
, slot
);
447 pack_uint_preserving_sort(key
, did
);
450 case KEY_DOCLEN_CHUNK
: {
451 const char * p
= key
.data();
452 const char * end
= p
+ key
.length();
454 Xapian::docid did
= 1;
456 (!unpack_uint_preserving_sort(&p
, end
, &did
) || p
!= end
)) {
457 throw Xapian::DatabaseCorruptError("Bad doclen key");
463 pack_uint_preserving_sort(key
, did
);
467 case KEY_POSTING_CHUNK
:
470 throw Xapian::DatabaseCorruptError("Bad postlist table key "
474 // Adjust key if this is *NOT* an initial chunk.
475 // key is: pack_string_preserving_sort(key, term)
476 // plus optionally: pack_uint_preserving_sort(key, did)
477 const char * d
= key
.data();
478 const char * e
= d
+ key
.size();
480 if (!unpack_string_preserving_sort(&d
, e
, term
))
481 throw Xapian::DatabaseCorruptError("Bad postlist key");
484 // This is an initial chunk for a term, so remove tag header.
488 Xapian::docid lastdid
;
489 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
490 firstdid
, lastdid
, chunk_lastdid
,
491 first_wdf
, wdf_max
)) {
492 throw Xapian::DatabaseCorruptError("Bad postlist initial "
495 // Ignore lastdid - we'll need to recalculate it (at least when
496 // merging, and for simplicity we always do).
498 tag
.erase(0, d
- tag
.data());
500 // Not an initial chunk, so adjust key and remove tag header.
501 size_t tmp
= d
- key
.data();
502 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
504 throw Xapian::DatabaseCorruptError("Bad postlist key");
506 // -1 to remove the terminating zero byte too.
513 bool have_wdfs
= (cf
!= 0);
514 if (have_wdfs
&& tf
> 2) {
515 Xapian::termcount remaining_cf_for_flat_wdf
=
517 // Check this matches and that it isn't a false match due
518 // to overflow of the multiplication above.
519 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
520 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
521 // The wdf is flat so we don't need to store it.
527 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
529 throw Xapian::DatabaseCorruptError("Bad postlist delta "
532 tag
.erase(0, d
- tag
.data());
534 if (!decode_delta_chunk_header_no_wdf(&d
, e
, chunk_lastdid
,
536 throw Xapian::DatabaseCorruptError("Bad postlist delta "
540 // Splice in a 0 wdf value after each docid delta.
542 for (size_t i
= d
- tag
.data(); i
< tag
.size(); i
+= 4) {
543 newtag
.append(tag
, i
, 4);
544 newtag
.append(4, '\0');
546 tag
= std::move(newtag
);
550 chunk_lastdid
+= offset
;
556 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
558 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
559 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
template<typename T>
class PostlistCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key.
     *
     *  Equal keys are ordered by firstdid so the cursor covering the
     *  earlier docid range sorts first.
     */
    bool operator()(const T* a, const T* b) const {
	if (a->key != b->key)
	    return a->key > b->key;
	return a->firstdid > b->firstdid;
    }
};
575 encode_valuestats(Xapian::doccount freq
,
576 const string
& lbound
, const string
& ubound
)
579 pack_uint(value
, freq
);
580 pack_string(value
, lbound
);
581 // We don't store or count empty values, so neither of the bounds
582 // can be empty. So we can safely store an empty upper bound when
583 // the bounds are equal.
584 if (lbound
!= ubound
) value
+= ubound
;
588 // U : vector<HoneyTable*>::const_iterator
589 template<typename T
, typename U
> void
590 merge_postlists(Xapian::Compactor
* compactor
,
591 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
594 typedef decltype(**b
) table_type
; // E.g. HoneyTable
595 typedef PostlistCursor
<table_type
> cursor_type
;
596 typedef PostlistCursorGt
<cursor_type
> gt_type
;
597 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
598 for ( ; b
!= e
; ++b
, ++offset
) {
601 // Skip empty tables.
605 pq
.push(new cursor_type(in
, *offset
));
610 // Merge user metadata.
612 while (!pq
.empty()) {
613 cursor_type
* cur
= pq
.top();
614 const string
& key
= cur
->key
;
615 if (key_type(key
) != KEY_USER_METADATA
) break;
617 if (key
!= last_key
) {
619 if (tags
.size() > 1 && compactor
) {
620 Assert(!last_key
.empty());
621 // FIXME: It would be better to merge all duplicates
622 // for a key in one call, but currently we don't in
624 const string
& resolved_tag
=
625 compactor
->resolve_duplicate_metadata(last_key
,
628 out
->add(last_key
, resolved_tag
);
630 Assert(!last_key
.empty());
631 out
->add(last_key
, tags
[0]);
637 tags
.push_back(cur
->tag
);
647 if (tags
.size() > 1 && compactor
) {
648 Assert(!last_key
.empty());
649 const string
& resolved_tag
=
650 compactor
->resolve_duplicate_metadata(last_key
,
653 out
->add(last_key
, resolved_tag
);
655 Assert(!last_key
.empty());
656 out
->add(last_key
, tags
[0]);
663 Xapian::doccount freq
= 0;
664 string lbound
, ubound
;
666 while (!pq
.empty()) {
667 cursor_type
* cur
= pq
.top();
668 const string
& key
= cur
->key
;
669 if (key_type(key
) != KEY_VALUE_STATS
) break;
670 if (key
!= last_key
) {
671 // For the first valuestats key, last_key will be the previous
672 // key we wrote, which we don't want to overwrite. This is the
673 // only time that freq will be 0, so check that.
675 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
681 const string
& tag
= cur
->tag
;
683 const char * pos
= tag
.data();
684 const char * end
= pos
+ tag
.size();
688 if (!unpack_uint(&pos
, end
, &f
)) {
690 throw Xapian::DatabaseCorruptError("Incomplete stats item "
692 throw Xapian::RangeError("Frequency statistic in value table "
695 if (!unpack_string(&pos
, end
, l
)) {
697 throw Xapian::DatabaseCorruptError("Incomplete stats item "
699 throw Xapian::RangeError("Lower bound in value table is too "
702 size_t len
= end
- pos
;
714 if (l
< lbound
) lbound
= l
;
715 if (u
> ubound
) ubound
= u
;
727 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
731 // Merge valuestream chunks.
732 while (!pq
.empty()) {
733 cursor_type
* cur
= pq
.top();
734 const string
& key
= cur
->key
;
735 if (key_type(key
) != KEY_VALUE_CHUNK
) break;
736 out
->add(key
, cur
->tag
);
745 // Merge doclen chunks.
746 while (!pq
.empty()) {
747 cursor_type
* cur
= pq
.top();
748 string
& key
= cur
->key
;
749 if (key_type(key
) != KEY_DOCLEN_CHUNK
) break;
750 pack_uint_preserving_sort(key
, cur
->chunk_lastdid
);
751 out
->add(key
, cur
->tag
);
760 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
762 struct HoneyPostListChunk
{
763 Xapian::docid first
, last
;
764 Xapian::termcount first_wdf
;
765 Xapian::termcount wdf_max
;
768 HoneyPostListChunk(Xapian::docid first_
,
770 Xapian::termcount first_wdf_
,
771 Xapian::termcount wdf_max_
,
775 first_wdf(first_wdf_
),
779 vector
<HoneyPostListChunk
> tags
;
781 cursor_type
* cur
= NULL
;
787 AssertEq(key_type(cur
->key
), KEY_POSTING_CHUNK
);
789 if (cur
== NULL
|| cur
->key
!= last_key
) {
791 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
792 Xapian::docid chunk_lastdid
= tags
[0].last
;
793 Xapian::docid last_did
= tags
.back().last
;
794 Xapian::termcount wdf_max
= tags
.back().wdf_max
;
797 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
799 first_wdf
, wdf_max
, first_tag
);
800 bool have_wdfs
= (cf
!= 0);
801 if (have_wdfs
&& tf
> 2) {
802 Xapian::termcount remaining_cf_for_flat_wdf
=
804 // Check this matches and that it isn't a false match due
805 // to overflow of the multiplication above.
806 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
807 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
808 // The wdf is flat so we don't need to store it.
815 first_tag
+= tags
[0].data
;
817 const char* pos
= tags
[0].data
.data();
818 const char* pos_end
= pos
+ tags
[0].data
.size();
819 while (pos
!= pos_end
) {
821 if (!unpack_uint(&pos
, pos_end
, &delta
))
822 throw_database_corrupt("Decoding docid "
824 pack_uint(first_tag
, delta
);
825 Xapian::termcount wdf
;
826 if (!unpack_uint(&pos
, pos_end
, &wdf
))
827 throw_database_corrupt("Decoding wdf",
829 // Only copy over the docid deltas, not the
835 out
->add(last_key
, first_tag
);
837 // If tf == 2, the data could be split over two tags when
838 // merging, but we only output an initial tag in this case.
839 if (tf
> 2 && tags
.size() > 1) {
841 const char* p
= last_key
.data();
842 const char* end
= p
+ last_key
.size();
843 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
845 throw Xapian::DatabaseCorruptError("Bad postlist "
849 auto i
= tags
.begin();
850 while (++i
!= tags
.end()) {
852 const string
& key
= pack_honey_postlist_key(term
,
856 encode_delta_chunk_header(i
->first
,
862 encode_delta_chunk_header_no_wdf(i
->first
,
865 const char* pos
= i
->data
.data();
866 const char* pos_end
= pos
+ i
->data
.size();
867 while (pos
!= pos_end
) {
869 if (!unpack_uint(&pos
, pos_end
, &delta
))
870 throw_database_corrupt("Decoding docid "
872 pack_uint(tag
, delta
);
873 Xapian::termcount wdf
;
874 if (!unpack_uint(&pos
, pos_end
, &wdf
))
875 throw_database_corrupt("Decoding wdf",
877 // Only copy over the docid deltas, not the
888 if (cur
== NULL
) break;
896 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
900 std::move(cur
->tag
)));
909 template<typename T
> struct MergeCursor
;
911 #ifndef DISABLE_GPL_LIBXAPIAN
913 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
914 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
922 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
923 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
template<typename T>
class CursorGt {
  public:
    /// Return true if and only if a's key is strictly greater than b's key.
    ///
    /// A cursor which has run off the end sorts after every live cursor.
    bool operator()(const T* a, const T* b) const {
	if (a->after_end())
	    return !b->after_end();
	if (b->after_end())
	    return false;
	return a->current_key > b->current_key;
    }
};
939 #ifndef DISABLE_GPL_LIBXAPIAN
940 // Convert glass to honey.
942 merge_spellings(HoneyTable
* out
,
943 vector
<const GlassTable
*>::const_iterator b
,
944 vector
<const GlassTable
*>::const_iterator e
)
946 typedef MergeCursor
<const GlassTable
&> cursor_type
;
947 typedef CursorGt
<cursor_type
> gt_type
;
948 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
949 for ( ; b
!= e
; ++b
) {
952 pq
.push(new cursor_type(in
));
956 while (!pq
.empty()) {
957 cursor_type
* cur
= pq
.top();
960 // Glass vs honey spelling key prefix map:
964 // bookend Bbd \x00bd
966 // middle Mddl \x02ddl
970 // In honey, if the word's first byte is <= '\x04', we add a prefix
971 // of '\x04' (which is unlikely in real world use but ensures that
972 // we can handle arbitrary strings).
974 // The prefixes for honey are chosen such that we save a byte for the
975 // key of all real-world words, and so that more first bytes are used
976 // so that a top-level array index is more effective, especially for
977 // random-access lookup of word entries (which we do to check the
978 // frequency of potential spelling candidates).
980 // The other prefixes are chosen such that we can do look up in key
981 // order at search time, which is more efficient for a cursor which can
982 // only efficiently move forwards.
984 // Note that the key ordering is the same for glass and honey, which
985 // makes translating during compaction simpler.
986 string key
= cur
->current_key
;
989 key
[0] = Honey::KEY_PREFIX_BOOKEND
;
992 key
[0] = Honey::KEY_PREFIX_HEAD
;
995 key
[0] = Honey::KEY_PREFIX_MIDDLE
;
998 key
[0] = Honey::KEY_PREFIX_TAIL
;
1001 if (static_cast<unsigned char>(key
[1]) > Honey::KEY_PREFIX_WORD
)
1004 key
[0] = Honey::KEY_PREFIX_WORD
;
1007 string m
= "Bad spelling key prefix: ";
1008 m
+= static_cast<unsigned char>(key
[0]);
1009 throw Xapian::DatabaseCorruptError(m
);
1013 if (pq
.empty() || pq
.top()->current_key
> key
) {
1014 // No merging to do for this key so just copy the tag value,
1015 // adjusting if necessary. If we don't need to adjust it, just
1016 // copy the compressed value.
1019 case Honey::KEY_PREFIX_HEAD
: {
1020 compressed
= cur
->read_tag(false);
1021 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1022 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1023 AssertEq(cur
->current_tag
[1], key
[1]);
1024 AssertEq(cur
->current_tag
[2], key
[2]);
1025 cur
->current_tag
.erase(1, 2);
1028 case Honey::KEY_PREFIX_TAIL
: {
1029 compressed
= cur
->read_tag(false);
1030 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1031 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1032 AssertEq(cur
->current_tag
[len
- 1], key
[1]);
1033 AssertEq(cur
->current_tag
[len
], key
[2]);
1034 cur
->current_tag
.erase(len
- 1, 2);
1037 case Honey::KEY_PREFIX_BOOKEND
: {
1038 compressed
= cur
->read_tag(false);
1039 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1040 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1041 AssertEq(cur
->current_tag
[1], key
[1]);
1042 AssertEq(cur
->current_tag
[len
], key
[2]);
1043 cur
->current_tag
.erase(len
, 1);
1044 cur
->current_tag
.erase(1, 1);
1048 compressed
= cur
->read_tag(true);
1051 out
->add(key
, cur
->current_tag
, compressed
);
1060 // Merge tag values with the same key:
1062 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1063 // We just want the union of words, so copy over the first instance
1064 // and skip any identical ones.
1065 priority_queue
<PrefixCompressedStringItor
*,
1066 vector
<PrefixCompressedStringItor
*>,
1067 PrefixCompressedStringItorGt
> pqtag
;
1068 // Stick all the cursor_type pointers in a vector because their
1069 // current_tag members must remain valid while we're merging their
1070 // tags, but we need to call next() on them all afterwards.
1071 vector
<cursor_type
*> vec
;
1072 vec
.reserve(pq
.size());
1076 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1078 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1083 PrefixCompressedStringWriter
wr(tag
, key
);
1085 while (!pqtag
.empty()) {
1086 PrefixCompressedStringItor
* it
= pqtag
.top();
1089 if (word
!= lastword
) {
1091 wr
.append(lastword
);
1094 if (!it
->at_end()) {
1101 for (auto i
: vec
) {
1110 // We want to sum the frequencies from tags for the same key.
1111 Xapian::termcount tot_freq
= 0;
1114 Xapian::termcount freq
;
1115 const char * p
= cur
->current_tag
.data();
1116 const char * end
= p
+ cur
->current_tag
.size();
1117 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1118 throw_database_corrupt("Bad spelling word freq", p
);
1126 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1131 pack_uint_last(tag
, tot_freq
);
1139 merge_spellings(HoneyTable
* out
,
1140 vector
<const HoneyTable
*>::const_iterator b
,
1141 vector
<const HoneyTable
*>::const_iterator e
)
1143 typedef MergeCursor
<const HoneyTable
&> cursor_type
;
1144 typedef CursorGt
<cursor_type
> gt_type
;
1145 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1146 for ( ; b
!= e
; ++b
) {
1149 pq
.push(new cursor_type(in
));
1153 while (!pq
.empty()) {
1154 cursor_type
* cur
= pq
.top();
1157 string key
= cur
->current_key
;
1158 if (pq
.empty() || pq
.top()->current_key
> key
) {
1159 // No need to merge the tags, just copy the (possibly compressed)
1161 bool compressed
= cur
->read_tag(true);
1162 out
->add(key
, cur
->current_tag
, compressed
);
1171 // Merge tag values with the same key:
1173 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1174 // We just want the union of words, so copy over the first instance
1175 // and skip any identical ones.
1176 priority_queue
<PrefixCompressedStringItor
*,
1177 vector
<PrefixCompressedStringItor
*>,
1178 PrefixCompressedStringItorGt
> pqtag
;
1179 // Stick all the cursor_type pointers in a vector because their
1180 // current_tag members must remain valid while we're merging their
1181 // tags, but we need to call next() on them all afterwards.
1182 vector
<cursor_type
*> vec
;
1183 vec
.reserve(pq
.size());
1187 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
,
1190 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1195 PrefixCompressedStringWriter
wr(tag
, key
);
1197 while (!pqtag
.empty()) {
1198 PrefixCompressedStringItor
* it
= pqtag
.top();
1201 if (word
!= lastword
) {
1203 wr
.append(lastword
);
1206 if (!it
->at_end()) {
1213 for (auto i
: vec
) {
1222 // We want to sum the frequencies from tags for the same key.
1223 Xapian::termcount tot_freq
= 0;
1226 Xapian::termcount freq
;
1227 const char * p
= cur
->current_tag
.data();
1228 const char * end
= p
+ cur
->current_tag
.size();
1229 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1230 throw_database_corrupt("Bad spelling word freq", p
);
1238 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1243 pack_uint_last(tag
, tot_freq
);
1249 // U : vector<HoneyTable*>::const_iterator
1250 template<typename T
, typename U
> void
1251 merge_synonyms(T
* out
, U b
, U e
)
1253 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1254 typedef MergeCursor
<table_type
> cursor_type
;
1255 typedef CursorGt
<cursor_type
> gt_type
;
1256 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1257 for ( ; b
!= e
; ++b
) {
1260 pq
.push(new cursor_type(in
));
1264 while (!pq
.empty()) {
1265 cursor_type
* cur
= pq
.top();
1268 string key
= cur
->current_key
;
1269 if (pq
.empty() || pq
.top()->current_key
> key
) {
1270 // No need to merge the tags, just copy the (possibly compressed)
1272 bool compressed
= cur
->read_tag(true);
1273 out
->add(key
, cur
->current_tag
, compressed
);
1282 // Merge tag values with the same key:
1285 // We just want the union of words, so copy over the first instance
1286 // and skip any identical ones.
1287 priority_queue
<ByteLengthPrefixedStringItor
*,
1288 vector
<ByteLengthPrefixedStringItor
*>,
1289 ByteLengthPrefixedStringItorGt
> pqtag
;
1290 vector
<cursor_type
*> vec
;
1294 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1296 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1302 while (!pqtag
.empty()) {
1303 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1305 if (**it
!= lastword
) {
1307 tag
+= byte(lastword
.size() ^ MAGIC_XOR_VALUE
);
1311 if (!it
->at_end()) {
1318 for (auto i
: vec
) {
1331 template<typename T
, typename U
> void
1332 multimerge_postlists(Xapian::Compactor
* compactor
,
1333 T
* out
, const char * tmpdir
,
1334 const vector
<U
*>& in
,
1335 vector
<Xapian::docid
> off
)
1337 if (in
.size() <= 3) {
1338 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1342 vector
<HoneyTable
*> tmp
;
1343 tmp
.reserve(in
.size() / 2);
1345 vector
<Xapian::docid
> newoff
;
1346 newoff
.resize(in
.size() / 2);
1347 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1349 if (j
== in
.size() - 1) ++j
;
1351 string dest
= tmpdir
;
1353 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1356 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1358 // Use maximum blocksize for temporary tables. And don't compress
1359 // entries in temporary tables, even if the final table would do
1360 // so. Any already compressed entries will get copied in
1361 // compressed form. (FIXME: HoneyTable has no blocksize)
1362 Honey::RootInfo root_info
;
1363 root_info
.init(65536, 0);
1364 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1365 tmptab
->create_and_open(flags
, root_info
);
1367 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1368 in
.begin() + i
, in
.begin() + j
);
1369 tmp
.push_back(tmptab
);
1371 tmptab
->commit(1, &root_info
);
1372 AssertRel(root_info
.get_blocksize(),==,65536);
1378 while (tmp
.size() > 3) {
1379 vector
<HoneyTable
*> tmpout
;
1380 tmpout
.reserve(tmp
.size() / 2);
1381 vector
<Xapian::docid
> newoff
;
1382 newoff
.resize(tmp
.size() / 2);
1383 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1385 if (j
== tmp
.size() - 1) ++j
;
1387 string dest
= tmpdir
;
1389 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1392 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1394 // Use maximum blocksize for temporary tables. And don't compress
1395 // entries in temporary tables, even if the final table would do
1396 // so. Any already compressed entries will get copied in
1397 // compressed form. (FIXME: HoneyTable has no blocksize)
1398 Honey::RootInfo root_info
;
1399 root_info
.init(65536, 0);
1400 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1401 tmptab
->create_and_open(flags
, root_info
);
1403 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1404 tmp
.begin() + i
, tmp
.begin() + j
);
1406 for (unsigned int k
= i
; k
< j
; ++k
) {
1407 // FIXME: unlink(tmp[k]->get_path().c_str());
1412 tmpout
.push_back(tmptab
);
1414 tmptab
->commit(1, &root_info
);
1415 AssertRel(root_info
.get_blocksize(),==,65536);
1421 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1423 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1424 // FIXME: unlink(tmp[k]->get_path().c_str());
1431 template<typename T
> class PositionCursor
;
1433 #ifndef DISABLE_GPL_LIBXAPIAN
// Cursor over a glass position table which rewrites each key so that the
// document id is rebased by this input's offset (see the
// pack_uint_preserving_sort(key, did + offset) call below).  Used when
// merging position data from several source databases into one output.
1435 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
// Amount added to each docid read from this input when forming output keys.
1436 Xapian::docid offset
;
// Initialised to 0 by the constructor.
1440 Xapian::docid firstdid
;
// Construct a cursor over table `in`; offset_ is this input's docid offset.
1442 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1443 : GlassCursor(in
), offset(offset_
), firstdid(0) {
// Stop when the underlying glass cursor is exhausted.
// NOTE(review): the enclosing method's header is elided in this view —
// presumably next(); confirm against the full file.
1449 if (!GlassCursor::next()) return false;
// Unpack <term, docid> from the current key; reject malformed keys.
1451 const char * d
= current_key
.data();
1452 const char * e
= d
+ current_key
.size();
1455 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1456 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1458 throw Xapian::DatabaseCorruptError("Bad position key");
// Re-pack the key with the docid shifted by this input's offset so keys
// from different inputs don't collide in the merged output.
1462 pack_string_preserving_sort(key
, term
);
1463 pack_uint_preserving_sort(key
, did
+ offset
);
// Positional data payload for the current entry.
1467 const string
& get_tag() const {
// Honey-input counterpart of the glass PositionCursor specialisation:
// rewrites each position key so the docid is rebased by this input's offset.
1474 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
// Amount added to each docid read from this input when forming output keys.
1475 Xapian::docid offset
;
// Initialised to 0 by the constructor.
1479 Xapian::docid firstdid
;
// Construct a cursor over table `in`; offset_ is this input's docid offset.
1481 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1482 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
// Stop when the underlying honey cursor is exhausted.
// NOTE(review): the enclosing method's header is elided in this view —
// presumably next(); confirm against the full file.
1488 if (!HoneyCursor::next()) return false;
// Unpack <term, docid> from the current key; reject malformed keys.
1490 const char * d
= current_key
.data();
1491 const char * e
= d
+ current_key
.size();
1494 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1495 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1497 throw Xapian::DatabaseCorruptError("Bad position key");
// Re-pack the key with the docid shifted by this input's offset.
1501 pack_string_preserving_sort(key
, term
);
1502 pack_uint_preserving_sort(key
, did
+ offset
);
// Positional data payload for the current entry.
1506 const string
& get_tag() const {
// Ordering functor for the merge priority queue in merge_positions():
// comparing with > makes std::priority_queue yield the cursor with the
// smallest key first (a min-heap on key).
1511 template<typename T
>
1512 class PositionCursorGt
{
1514 /** Return true if and only if a's key is strictly greater than b's key.
1516 bool operator()(const T
* a
, const T
* b
) const {
1517 return a
->key
> b
->key
;
// K-way merge of the position tables `inputs` into `out`, rebasing docids
// by the per-input `offset` values.  One PositionCursor per non-empty input
// is pushed into a priority queue ordered so the smallest key is on top,
// and entries are emitted in key order.
1521 template<typename T
, typename U
> void
1522 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1523 const vector
<Xapian::docid
> & offset
)
1525 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1526 typedef PositionCursor
<table_type
> cursor_type
;
1527 typedef PositionCursorGt
<cursor_type
> gt_type
;
1528 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1529 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1530 auto in
= inputs
[i
];
1532 // Skip empty tables.
// NOTE(review): cursors are allocated with new here; the matching delete
// is in lines elided from this view — confirm they are freed when a
// cursor is exhausted and popped.
1536 pq
.push(new cursor_type(in
, offset
[i
]));
// Repeatedly take the cursor with the smallest key and copy its entry.
1539 while (!pq
.empty()) {
1540 cursor_type
* cur
= pq
.top();
1542 out
->add(cur
->key
, cur
->get_tag());
// Merge docid-keyed tables (e.g. docdata/termlist) from honey-format inputs
// into `out`.  Keys are already sorted by docid within each input, so each
// source table is simply copied in turn, rebasing the docid at the start of
// each key by that input's offset.
1551 template<typename T
, typename U
> void
1552 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1553 const vector
<Xapian::docid
> & offset
,
1556 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1557 Xapian::docid off
= offset
[i
];
1559 auto in
= inputs
[i
];
1560 if (in
->empty()) continue;
1562 HoneyCursor
cur(in
);
1566 while (cur
.next()) {
1567 // Adjust the key if this isn't the first database.
// Unpack the docid prefix of the key; a failure here means the input
// table is corrupt, so report which file it came from.
1570 const char * d
= cur
.current_key
.data();
1571 const char * e
= d
+ cur
.current_key
.size();
1572 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1573 string msg
= "Bad key in ";
1574 msg
+= inputs
[i
]->get_path();
1575 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the key with the rebased docid.
1579 pack_uint_preserving_sort(key
, did
);
1581 // Copy over anything extra in the key (e.g. the zero byte
1582 // at the end of "used value slots" in the termlist table).
1583 key
.append(d
, e
- d
);
// First database: the key can be used unchanged.
1586 key
= cur
.current_key
;
// read_tag(true) fetches the tag without uncompressing it; the returned
// flag is forwarded to add() so compressed entries are copied as-is.
1588 bool compressed
= cur
.read_tag(true);
1589 out
->add(key
, cur
.current_tag
, compressed
);
1594 #ifndef DISABLE_GPL_LIBXAPIAN
// Glass-input variant of merge_docid_keyed.  As well as rebasing the docid
// at the start of each key, termlist entries are re-encoded into the honey
// format: each docid's separate "used value slots" entry is folded into the
// start of that docid's termlist tag, and the term/wdf stream is rewritten.
1595 template<typename T
> void
1596 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1597 const vector
<Xapian::docid
> & offset
,
1600 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1601 Xapian::docid off
= offset
[i
];
1603 auto in
= inputs
[i
];
1604 if (in
->empty()) continue;
1606 GlassCursor
cur(in
);
1610 while (cur
.next()) {
1612 // Adjust the key if this isn't the first database.
// Unpack the docid prefix of the key; a failure means the input table
// is corrupt, so report which file it came from.
1615 const char * d
= cur
.current_key
.data();
1616 const char * e
= d
+ cur
.current_key
.size();
1617 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1618 string msg
= "Bad key in ";
1619 msg
+= inputs
[i
]->get_path();
1620 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the key with the rebased docid.
1624 pack_uint_preserving_sort(key
, did
);
1626 // Copy over anything extra in the key (e.g. the zero byte
1627 // at the end of "used value slots" in the termlist table).
1628 key
.append(d
, e
- d
);
1631 key
= cur
.current_key
;
// Termlist table needs conversion from glass to honey encoding.
1633 if (table_type
== Honey::TERMLIST
) {
// A "value slots used" entry with no preceding termlist entry for the
// same docid indicates corruption.
1634 if (termlist_key_is_values_used(key
)) {
1635 throw Xapian::DatabaseCorruptError("value slots used "
1641 string tag
= cur
.current_tag
;
// Peek at the next entry: in glass the "value slots used" entry for a
// docid immediately follows its termlist entry.
1643 bool next_result
= cur
.next();
1644 bool next_already_done
= true;
1645 unsigned bitmap_slots_used
= 0;
1646 string encoded_slots_used
;
1648 termlist_key_is_values_used(cur
.current_key
)) {
1649 next_already_done
= false;
// Decode the glass "value slots used" tag: first_slot then a sequence
// of (delta - 1) encoded gaps between ascending slot numbers.
1651 const string
& valtag
= cur
.current_tag
;
1653 const char* p
= valtag
.data();
1654 const char* end
= p
+ valtag
.size();
1656 Xapian::VecCOW
<Xapian::termpos
> slots
;
1658 Xapian::valueno first_slot
;
1659 if (!unpack_uint(&p
, end
, &first_slot
)) {
1660 throw_database_corrupt("Value slot encoding corrupt",
1663 slots
.push_back(first_slot
);
1664 Xapian::valueno last_slot
= first_slot
;
1666 Xapian::valueno slot
;
1667 if (!unpack_uint(&p
, end
, &slot
)) {
1668 throw_database_corrupt("Value slot encoding "
1671 slot
+= last_slot
+ 1;
1674 slots
.push_back(slot
);
1677 if (slots
.back() <= 6) {
1678 // Encode as a bitmap if only slots in the range 0-6
// One bit per used slot, stored in a single header byte below.
1680 for (auto slot
: slots
) {
1681 bitmap_slots_used
|= 1 << slot
;
// Otherwise encode as last_slot plus (for more than one slot) an
// interpolatively-coded list of the used slots.
1685 pack_uint(enc
, last_slot
);
1686 if (slots
.size() > 1) {
1687 BitWriter
slots_used(enc
);
1688 slots_used
.encode(first_slot
, last_slot
);
1689 slots_used
.encode(slots
.size() - 2,
1690 last_slot
- first_slot
);
1691 slots_used
.encode_interpolative(slots
, 0,
1693 encoded_slots_used
= slots_used
.freeze();
1695 encoded_slots_used
= std::move(enc
);
// Now rewrite the termlist tag itself into the honey layout.
1700 const char* pos
= tag
.data();
1701 const char* end
= pos
+ tag
.size();
// Header byte: either the 0-6 slot bitmap, or 0x80|size followed by the
// encoded slot data (branch conditions partially elided in this view —
// confirm the exact header format against the full file).
1704 if (encoded_slots_used
.empty()) {
1705 newtag
+= char(bitmap_slots_used
);
1707 auto size
= encoded_slots_used
.size();
1709 newtag
+= char(0x80 | size
);
1712 pack_uint(newtag
, size
);
1714 newtag
+= encoded_slots_used
;
// Glass stores doclen then termlist_size; honey stores termlist_size - 1
// then doclen (see the pack_uint calls below).
1718 Xapian::termcount doclen
;
1719 if (!unpack_uint(&pos
, end
, &doclen
)) {
1720 throw_database_corrupt("doclen", pos
);
1722 Xapian::termcount termlist_size
;
1723 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1724 throw_database_corrupt("termlist length", pos
);
1726 pack_uint(newtag
, termlist_size
- 1);
1727 pack_uint(newtag
, doclen
);
// Re-encode the prefix-compressed term stream.  Each entry carries a
// "reuse" byte (shared-prefix length, with wdf folded in when it exceeds
// the previous term's length) and an "append" byte (new suffix length).
1729 string current_term
;
1730 while (pos
!= end
) {
1731 Xapian::termcount current_wdf
= 0;
1733 if (!current_term
.empty()) {
1734 size_t reuse
= static_cast<unsigned char>(*pos
++);
1735 newtag
+= char(reuse
);
1737 if (reuse
> current_term
.size()) {
// reuse byte also encodes the wdf: value = wdf * (len + 1) + prefix.
1738 current_wdf
= reuse
/ (current_term
.size() + 1);
1739 reuse
= reuse
% (current_term
.size() + 1);
1741 current_term
.resize(reuse
);
1744 throw_database_corrupt("term", NULL
);
1747 size_t append
= static_cast<unsigned char>(*pos
++);
1748 if (size_t(end
- pos
) < append
)
1749 throw_database_corrupt("term", NULL
);
1751 current_term
.append(pos
, append
);
// NOTE(review): "¤t_wdf" on the next unpack_uint call is mojibake of
// "&current_wdf" (HTML entity "&curren;" corruption) — fix the encoding
// in the master copy of this file.
1757 if (!unpack_uint(&pos
, end
, &current_wdf
)) {
1758 throw_database_corrupt("wdf", pos
);
1760 pack_uint(newtag
, current_wdf
);
1763 newtag
+= char(append
);
1764 newtag
.append(current_term
.end() - append
,
1765 current_term
.end());
// Only emit the entry if the rewritten tag is non-empty.
1768 if (!newtag
.empty())
1769 out
->add(key
, newtag
);
1770 if (!next_result
) break;
1771 if (next_already_done
) goto next_without_next
;
// Non-termlist tables: copy the tag through, preserving compression.
1773 bool compressed
= cur
.read_tag(true);
1774 out
->add(key
, cur
.current_tag
, compressed
);
1783 using namespace HoneyCompact
;
1786 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1787 const char* destdir
,
1790 const vector
<const Xapian::Database::Internal
*>& sources
,
1791 const vector
<Xapian::docid
>& offset
,
1793 Xapian::Compactor::compaction_level compaction
,
1795 Xapian::docid last_docid
)
1798 // The "base name" of the table.
1801 Honey::table_type type
;
1802 // Create tables after position lazily.
1806 static const table_list tables
[] = {
1808 { "postlist", Honey::POSTLIST
, false },
1809 { "docdata", Honey::DOCDATA
, true },
1810 { "termlist", Honey::TERMLIST
, false },
1811 { "position", Honey::POSITION
, true },
1812 { "spelling", Honey::SPELLING
, true },
1813 { "synonym", Honey::SYNONYM
, true }
1815 const table_list
* tables_end
= tables
+
1816 (sizeof(tables
) / sizeof(tables
[0]));
1818 const int FLAGS
= Xapian::DB_DANGEROUS
;
1820 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1821 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1823 // FIXME: Support this combination - we need to put temporary files
1829 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1830 bool has_uncommitted_changes
;
1831 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1832 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1833 has_uncommitted_changes
= db
->has_uncommitted_changes();
1835 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1836 has_uncommitted_changes
= db
->has_uncommitted_changes();
1838 if (has_uncommitted_changes
) {
1840 "Can't compact from a WritableDatabase with uncommitted "
1841 "changes - either call commit() first, or create a new "
1842 "Database object from the filename on disk";
1843 throw Xapian::InvalidOperationError(m
);
1848 if (block_size
< HONEY_MIN_BLOCKSIZE
||
1849 block_size
> HONEY_MAX_BLOCKSIZE
||
1850 (block_size
& (block_size
- 1)) != 0) {
1851 block_size
= HONEY_DEFAULT_BLOCKSIZE
;
1854 FlintLock
lock(destdir
? destdir
: "");
1857 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1858 if (why
!= FlintLock::SUCCESS
) {
1859 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1863 unique_ptr
<HoneyVersion
> version_file_out
;
1866 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
1868 throw Xapian::DatabaseCreateError("open() failed", errno
);
1871 version_file_out
.reset(new HoneyVersion(fd
));
1874 version_file_out
.reset(new HoneyVersion(destdir
));
1877 version_file_out
->create(block_size
);
1878 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1879 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1880 #ifdef DISABLE_GPL_LIBXAPIAN
1883 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1884 auto& v_in
= db
->version_file
;
1885 auto& v_out
= version_file_out
;
1886 v_out
->merge_stats(v_in
.get_doccount(),
1887 v_in
.get_doclength_lower_bound(),
1888 v_in
.get_doclength_upper_bound(),
1889 v_in
.get_wdf_upper_bound(),
1890 v_in
.get_total_doclen(),
1891 v_in
.get_spelling_wordfreq_upper_bound());
1894 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1895 version_file_out
->merge_stats(db
->version_file
);
1899 string fl_serialised
;
1903 fl
.set_first_unused_block(1); // FIXME: Assumption?
1904 fl
.pack(fl_serialised
);
1908 // Set to true if stat() failed (which can happen if the files are > 2GB
1909 // and off_t is 32 bit) or one of the totals overflowed.
1910 bool bad_totals
= false;
1911 off_t in_total
= 0, out_total
= 0;
1913 // FIXME: sort out indentation.
1914 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1915 #ifdef DISABLE_GPL_LIBXAPIAN
1916 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1918 vector
<HoneyTable
*> tabs
;
1919 tabs
.reserve(tables_end
- tables
);
1920 off_t prev_size
= block_size
;
1921 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
1922 // The postlist table requires an N-way merge, adjusting the
1923 // headers of various blocks. The spelling and synonym tables also
1924 // need special handling. The other tables have keys sorted in
1925 // docid order, so we can merge them by simply copying all the keys
1926 // from each source table in turn.
1928 compactor
->set_status(t
->name
, string());
1938 bool output_will_exist
= !t
->lazy
;
1940 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
1941 // on certain systems).
1942 bool bad_stat
= false;
1944 // We can't currently report input sizes if there's a single file DB
1945 // amongst the inputs.
1946 bool single_file_in
= false;
1950 vector
<const GlassTable
*> inputs
;
1951 inputs
.reserve(sources
.size());
1952 size_t inputs_present
= 0;
1953 for (auto src
: sources
) {
1954 auto db
= static_cast<const GlassDatabase
*>(src
);
1955 const GlassTable
* table
;
1957 case Honey::POSTLIST
:
1958 table
= &(db
->postlist_table
);
1960 case Honey::DOCDATA
:
1961 table
= &(db
->docdata_table
);
1963 case Honey::TERMLIST
:
1964 table
= &(db
->termlist_table
);
1966 case Honey::POSITION
:
1967 table
= &(db
->position_table
);
1969 case Honey::SPELLING
:
1970 table
= &(db
->spelling_table
);
1972 case Honey::SYNONYM
:
1973 table
= &(db
->synonym_table
);
1980 if (db
->single_file()) {
1981 if (t
->lazy
&& table
->empty()) {
1982 // Essentially doesn't exist.
1984 // FIXME: Find actual size somehow?
1985 // in_size += table->size() / 1024;
1986 single_file_in
= true;
1988 output_will_exist
= true;
1992 off_t db_size
= file_size(table
->get_path());
1994 // FIXME: check overflow and set bad_totals
1995 in_total
+= db_size
;
1996 in_size
+= db_size
/ 1024;
1997 output_will_exist
= true;
1999 } else if (errno
!= ENOENT
) {
2000 // We get ENOENT for an optional table.
2001 bad_totals
= bad_stat
= true;
2002 output_will_exist
= true;
2006 inputs
.push_back(table
);
2009 // If any inputs lack a termlist table, suppress it in the output.
2010 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2011 if (inputs_present
!= 0) {
2013 string m
= str(inputs_present
);
2015 m
+= str(sources
.size());
2016 m
+= " inputs present, so suppressing output";
2017 compactor
->set_status(t
->name
, m
);
2021 output_will_exist
= false;
2024 if (!output_will_exist
) {
2026 compactor
->set_status(t
->name
, "doesn't exist");
2031 off_t table_start_offset
= -1;
2034 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2035 // space for version file. It's tricky to exactly know the
2036 // size of the version file beforehand.
2037 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2038 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2039 throw Xapian::DatabaseError("lseek() failed", errno
);
2041 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2043 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2046 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2048 tabs
.push_back(out
);
2049 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2051 root_info
->set_free_list(fl_serialised
);
2052 root_info
->set_offset(table_start_offset
);
2054 version_file_out
->get_root(t
->type
),
2055 version_file_out
->get_revision());
2057 out
->create_and_open(FLAGS
, *root_info
);
2060 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2061 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2064 case Honey::POSTLIST
: {
2065 if (multipass
&& inputs
.size() > 3) {
2066 multimerge_postlists(compactor
, out
, destdir
,
2069 merge_postlists(compactor
, out
, offset
.begin(),
2070 inputs
.begin(), inputs
.end());
2074 case Honey::SPELLING
:
2075 merge_spellings(out
, inputs
.cbegin(), inputs
.cend());
2077 case Honey::SYNONYM
:
2078 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2080 case Honey::POSITION
:
2081 merge_positions(out
, inputs
, offset
);
2084 // DocData, Termlist
2085 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
2089 // Commit as revision 1.
2091 out
->commit(1, root_info
);
2093 if (single_file
) fl_serialised
= root_info
->get_free_list();
2096 if (!bad_stat
&& !single_file_in
) {
2099 db_size
= file_size(fd
);
2101 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2105 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2106 prev_size
= db_size
;
2107 db_size
-= old_prev_size
;
2109 // FIXME: check overflow and set bad_totals
2110 out_total
+= db_size
;
2111 out_size
= db_size
/ 1024;
2112 } else if (errno
!= ENOENT
) {
2113 bad_totals
= bad_stat
= true;
2118 compactor
->set_status(t
->name
,
2119 "Done (couldn't stat all the DB files)");
2120 } else if (single_file_in
) {
2122 compactor
->set_status(t
->name
,
2123 "Done (table sizes unknown for single "
2127 if (out_size
== in_size
) {
2128 status
= "Size unchanged (";
2131 if (out_size
< in_size
) {
2132 delta
= in_size
- out_size
;
2133 status
= "Reduced by ";
2135 delta
= out_size
- in_size
;
2136 status
= "INCREASED by ";
2139 status
+= str(100 * delta
/ in_size
);
2142 status
+= str(delta
);
2144 status
+= str(in_size
);
2147 status
+= str(out_size
);
2150 compactor
->set_status(t
->name
, status
);
2154 // If compacting to a single file output and all the tables are empty, pad
2155 // the output so that it isn't mistaken for a stub database when we try to
2156 // open it. For this it needs to be a multiple of 2KB in size.
2157 if (single_file
&& prev_size
< off_t(block_size
)) {
2158 #ifdef HAVE_FTRUNCATE
2159 if (ftruncate(fd
, block_size
) < 0) {
2160 throw Xapian::DatabaseError("Failed to set size of output "
2164 const off_t off
= block_size
- 1;
2165 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2166 throw Xapian::DatabaseError("Failed to set size of output "
2173 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2174 throw Xapian::DatabaseError("lseek() failed", errno
);
2177 version_file_out
->set_last_docid(last_docid
);
2178 string tmpfile
= version_file_out
->write(1, FLAGS
);
2180 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2181 if (version_file_size
< 0) {
2182 throw Xapian::DatabaseError("lseek() failed", errno
);
2184 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2185 throw Xapian::DatabaseError("Didn't allow enough space for "
2186 "version file data");
2189 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2192 // Commit with revision 1.
2193 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2194 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2199 vector
<HoneyTable
*> tabs
;
2200 tabs
.reserve(tables_end
- tables
);
2201 off_t prev_size
= block_size
;
2202 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2203 // The postlist table requires an N-way merge, adjusting the
2204 // headers of various blocks. The spelling and synonym tables also
2205 // need special handling. The other tables have keys sorted in
2206 // docid order, so we can merge them by simply copying all the keys
2207 // from each source table in turn.
2209 compactor
->set_status(t
->name
, string());
2219 bool output_will_exist
= !t
->lazy
;
2221 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2222 // on certain systems).
2223 bool bad_stat
= false;
2225 // We can't currently report input sizes if there's a single file DB
2226 // amongst the inputs.
2227 bool single_file_in
= false;
2231 vector
<const HoneyTable
*> inputs
;
2232 inputs
.reserve(sources
.size());
2233 size_t inputs_present
= 0;
2234 for (auto src
: sources
) {
2235 auto db
= static_cast<const HoneyDatabase
*>(src
);
2236 const HoneyTable
* table
;
2238 case Honey::POSTLIST
:
2239 table
= &(db
->postlist_table
);
2241 case Honey::DOCDATA
:
2242 table
= &(db
->docdata_table
);
2244 case Honey::TERMLIST
:
2245 table
= &(db
->termlist_table
);
2247 case Honey::POSITION
:
2248 table
= &(db
->position_table
);
2250 case Honey::SPELLING
:
2251 table
= &(db
->spelling_table
);
2253 case Honey::SYNONYM
:
2254 table
= &(db
->synonym_table
);
2261 if (db
->single_file()) {
2262 if (t
->lazy
&& table
->empty()) {
2263 // Essentially doesn't exist.
2265 // FIXME: Find actual size somehow?
2266 // in_size += table->size() / 1024;
2267 single_file_in
= true;
2269 output_will_exist
= true;
2273 off_t db_size
= file_size(table
->get_path());
2275 // FIXME: check overflow and set bad_totals
2276 in_total
+= db_size
;
2277 in_size
+= db_size
/ 1024;
2278 output_will_exist
= true;
2280 } else if (errno
!= ENOENT
) {
2281 // We get ENOENT for an optional table.
2282 bad_totals
= bad_stat
= true;
2283 output_will_exist
= true;
2287 inputs
.push_back(table
);
2290 // If any inputs lack a termlist table, suppress it in the output.
2291 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2292 if (inputs_present
!= 0) {
2294 string m
= str(inputs_present
);
2296 m
+= str(sources
.size());
2297 m
+= " inputs present, so suppressing output";
2298 compactor
->set_status(t
->name
, m
);
2302 output_will_exist
= false;
2305 if (!output_will_exist
) {
2307 compactor
->set_status(t
->name
, "doesn't exist");
2312 off_t table_start_offset
= -1;
2315 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2316 // space for version file. It's tricky to exactly know the
2317 // size of the version file beforehand.
2318 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2319 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2320 throw Xapian::DatabaseError("lseek() failed", errno
);
2322 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2324 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2327 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2329 tabs
.push_back(out
);
2330 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2332 root_info
->set_free_list(fl_serialised
);
2333 root_info
->set_offset(table_start_offset
);
2335 version_file_out
->get_root(t
->type
),
2336 version_file_out
->get_revision());
2338 out
->create_and_open(FLAGS
, *root_info
);
2341 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2342 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2345 case Honey::POSTLIST
: {
2346 if (multipass
&& inputs
.size() > 3) {
2347 multimerge_postlists(compactor
, out
, destdir
,
2350 merge_postlists(compactor
, out
, offset
.begin(),
2351 inputs
.begin(), inputs
.end());
2355 case Honey::SPELLING
:
2356 merge_spellings(out
, inputs
.begin(), inputs
.end());
2358 case Honey::SYNONYM
:
2359 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2361 case Honey::POSITION
:
2362 merge_positions(out
, inputs
, offset
);
2365 // DocData, Termlist
2366 merge_docid_keyed(out
, inputs
, offset
);
2370 // Commit as revision 1.
2372 out
->commit(1, root_info
);
2374 if (single_file
) fl_serialised
= root_info
->get_free_list();
2377 if (!bad_stat
&& !single_file_in
) {
2380 db_size
= file_size(fd
);
2382 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2386 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2387 prev_size
= db_size
;
2388 db_size
-= old_prev_size
;
2390 // FIXME: check overflow and set bad_totals
2391 out_total
+= db_size
;
2392 out_size
= db_size
/ 1024;
2393 } else if (errno
!= ENOENT
) {
2394 bad_totals
= bad_stat
= true;
2399 compactor
->set_status(t
->name
,
2400 "Done (couldn't stat all the DB files)");
2401 } else if (single_file_in
) {
2403 compactor
->set_status(t
->name
,
2404 "Done (table sizes unknown for single "
2408 if (out_size
== in_size
) {
2409 status
= "Size unchanged (";
2412 if (out_size
< in_size
) {
2413 delta
= in_size
- out_size
;
2414 status
= "Reduced by ";
2416 delta
= out_size
- in_size
;
2417 status
= "INCREASED by ";
2420 status
+= str(100 * delta
/ in_size
);
2423 status
+= str(delta
);
2425 status
+= str(in_size
);
2428 status
+= str(out_size
);
2431 compactor
->set_status(t
->name
, status
);
2435 // If compacting to a single file output and all the tables are empty, pad
2436 // the output so that it isn't mistaken for a stub database when we try to
2437 // open it. For this it needs to be a multiple of 2KB in size.
2438 if (single_file
&& prev_size
< off_t(block_size
)) {
2439 #ifdef HAVE_FTRUNCATE
2440 if (ftruncate(fd
, block_size
) < 0) {
2441 throw Xapian::DatabaseError("Failed to set size of output "
2445 const off_t off
= block_size
- 1;
2446 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2447 throw Xapian::DatabaseError("Failed to set size of output "
2454 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2455 throw Xapian::DatabaseError("lseek() failed", errno
);
2458 version_file_out
->set_last_docid(last_docid
);
2459 string tmpfile
= version_file_out
->write(1, FLAGS
);
2460 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2463 // Commit with revision 1.
2464 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2465 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2470 if (!single_file
) lock
.release();
2472 if (!bad_totals
&& compactor
) {
2476 if (out_total
== in_total
) {
2477 status
= "Size unchanged (";
2480 if (out_total
< in_total
) {
2481 delta
= in_total
- out_total
;
2482 status
= "Reduced by ";
2484 delta
= out_total
- in_total
;
2485 status
= "INCREASED by ";
2488 status
+= str(100 * delta
/ in_total
);
2491 status
+= str(delta
);
2493 status
+= str(in_total
);
2496 status
+= str(out_total
);
2498 compactor
->set_status("Total", status
);