1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
33 #include <type_traits>
40 #include "safeerrno.h"
42 #include "backends/flint_lock.h"
43 #include "compression_stream.h"
44 #include "honey_cursor.h"
45 #include "honey_database.h"
46 #include "honey_defs.h"
47 #include "honey_postlist_encodings.h"
48 #include "honey_table.h"
49 #include "honey_version.h"
50 #include "filetests.h"
51 #include "internaltypes.h"
53 #include "backends/valuestats.h"
54 #include "wordaccess.h"
56 #include "../byte_length_strings.h"
57 #include "../prefix_compressed_strings.h"
59 #ifndef DISABLE_GPL_LIBXAPIAN
60 # include "../glass/glass_database.h"
61 # include "../glass/glass_table.h"
62 # include "../glass/glass_values.h"
67 #ifndef DISABLE_GPL_LIBXAPIAN
70 throw_database_corrupt(const char* item
, const char* pos
)
74 message
= "Value overflow unpacking termlist: ";
76 message
= "Out of data unpacking termlist: ";
79 throw Xapian::DatabaseCorruptError(message
);
82 namespace GlassCompact
{
/// Test whether a glass postlist-table key holds user metadata ("\0\xc0" prefix).
static inline bool
is_user_metadata_key(const std::string& key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/// Test whether a glass postlist-table key holds value statistics ("\0\xd0" prefix).
static inline bool
is_valuestats_key(const std::string& key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/// Test whether a glass postlist-table key holds a value-stream chunk ("\0\xd8" prefix).
static inline bool
is_valuechunk_key(const std::string& key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/// Test whether a glass postlist-table key holds a document-length chunk ("\0\xe0" prefix).
static inline bool
is_doclenchunk_key(const std::string& key)
{
    if (key.size() < 2)
	return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
111 termlist_key_is_values_used(const string
& key
)
113 const char* p
= key
.data();
114 const char* e
= p
+ key
.size();
116 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
119 if (e
- p
== 1 && *p
== '\0')
122 throw Xapian::DatabaseCorruptError("termlist key format");
126 // Put all the helpers in a namespace to avoid symbols colliding with those of
127 // the same name in other flint-derived backends.
128 namespace HoneyCompact
{
// Categories of postlist-table entry, identified by key[1] (for keys whose
// first byte is zero).  The honey values are chosen so that key_type() can
// usually just return the byte itself.
enum {
    KEY_USER_METADATA = 0x00,
    KEY_VALUE_STATS = 0x01,
    KEY_VALUE_CHUNK = 0xd8,
    KEY_DOCLEN_CHUNK = 0xe0,
    KEY_POSTING_CHUNK = 0xff
};
138 /// Return one of the KEY_* constants, or a different value for an invalid key.
140 key_type(const string
& key
)
143 return KEY_POSTING_CHUNK
;
148 switch (static_cast<unsigned char>(key
[1])) {
149 case 0x01: case 0x02: case 0x03: case 0x04:
150 return KEY_VALUE_STATS
;
153 // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK.
154 return static_cast<unsigned char>(key
[1]);
159 bool read_only
= true;
160 mutable size_t buf_end
= 0;
161 mutable char buf
[4096];
167 if (fd
>= 0) close(fd
);
170 bool open(const std::string
& path
, bool read_only_
) {
171 if (fd
>= 0) close(fd
);
172 read_only
= read_only_
;
174 // FIXME: add new io_open_stream_rd() etc?
175 fd
= io_open_block_rd(path
);
177 // FIXME: Always create anew for now...
178 fd
= io_open_block_wr(path
, true);
183 off_t
get_pos() const {
185 lseek(fd
, 0, SEEK_CUR
) - buf_end
:
186 lseek(fd
, 0, SEEK_CUR
) + buf_end
;
190 if (buf_end
) return false;
192 if (fd
== -1 || fstat(fd
, &sbuf
) < 0) return true;
193 return (sbuf
.st_size
== 0);
196 void write(unsigned char ch
) {
197 if (buf_end
== sizeof(buf
)) {
199 io_write(fd
, buf
, buf_end
);
205 void write(const char* p
, size_t len
) {
206 if (buf_end
+ len
<= sizeof(buf
)) {
207 memcpy(buf
+ buf_end
, p
, len
);
215 iov
[0].iov_base
= buf
;
216 iov
[0].iov_len
= buf_end
;
217 iov
[1].iov_base
= const_cast<char*>(p
);
218 iov
[1].iov_len
= len
;
219 ssize_t n_
= writev(fd
, iov
, 2);
222 if (n
== buf_end
+ len
) {
232 io_write(fd
, p
, len
);
237 memmove(buf
, buf
+ n
, buf_end
);
240 io_write(fd
, buf
, buf_end
);
241 if (len
>= sizeof(buf
)) {
242 // If it's bigger than our buffer, just write it directly.
243 io_write(fd
, p
, len
);
255 ssize_t r
= ::read(fd
, buf
, sizeof(buf
));
256 if (r
== 0) return EOF
;
258 if (errno
== EINTR
) goto retry
;
259 throw Xapian::DatabaseError("read failed", errno
);
261 if (size_t(r
) < sizeof(buf
)) {
262 memmove(buf
+ sizeof(buf
) - r
, buf
, r
);
266 return static_cast<unsigned char>(buf
[sizeof(buf
) - buf_end
--]);
269 bool read(char* p
, size_t len
) const {
270 if (len
<= buf_end
) {
271 memcpy(p
, buf
+ sizeof(buf
) - buf_end
, len
);
275 memcpy(p
, buf
+ sizeof(buf
) - buf_end
, buf_end
);
279 // FIXME: refill buffer if len < sizeof(buf)
280 return ::read(fd
, p
, len
) == ssize_t(len
);
284 if (!read_only
&& buf_end
) {
285 io_write(fd
, buf
, buf_end
);
296 lseek(fd
, 0, SEEK_SET
);
301 template<typename T
> class PostlistCursor
;
303 #ifndef DISABLE_GPL_LIBXAPIAN
304 // Convert glass to honey.
306 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
307 Xapian::docid offset
;
311 Xapian::docid firstdid
;
312 Xapian::docid chunk_lastdid
;
313 Xapian::termcount tf
, cf
;
314 Xapian::termcount first_wdf
;
316 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
317 : GlassCursor(in
), offset(offset_
), firstdid(0)
324 if (!GlassCursor::next()) return false;
325 // We put all chunks into the non-initial chunk form here, then fix up
326 // the first chunk for each term in the merged database as we merge.
331 if (GlassCompact::is_user_metadata_key(key
)) {
332 key
[1] = KEY_USER_METADATA
;
335 if (GlassCompact::is_valuestats_key(key
)) {
337 const char * p
= key
.data();
338 const char * end
= p
+ key
.length();
340 Xapian::valueno slot
;
341 if (!unpack_uint_last(&p
, end
, &slot
))
342 throw Xapian::DatabaseCorruptError("bad value stats key");
343 key
= pack_honey_valuestats_key(slot
);
346 if (GlassCompact::is_valuechunk_key(key
)) {
347 const char * p
= key
.data();
348 const char * end
= p
+ key
.length();
350 Xapian::valueno slot
;
351 if (!unpack_uint(&p
, end
, &slot
))
352 throw Xapian::DatabaseCorruptError("bad value key");
353 Xapian::docid first_did
;
354 if (!unpack_uint_preserving_sort(&p
, end
, &first_did
))
355 throw Xapian::DatabaseCorruptError("bad value key");
358 Glass::ValueChunkReader
reader(tag
.data(), tag
.size(), first_did
);
359 Xapian::docid last_did
= first_did
;
360 while (reader
.next(), !reader
.at_end()) {
361 last_did
= reader
.get_docid();
364 key
.assign("\0\xd8", 2);
365 pack_uint(key
, slot
);
366 pack_uint_preserving_sort(key
, last_did
);
368 // Add the docid delta across the chunk to the start of the tag.
370 pack_uint(newtag
, last_did
- first_did
);
371 tag
.insert(0, newtag
);
376 if (GlassCompact::is_doclenchunk_key(key
)) {
377 const char * d
= key
.data();
378 const char * e
= d
+ key
.size();
382 // This is an initial chunk, so adjust tag header.
385 if (!unpack_uint(&d
, e
, &tf
) ||
386 !unpack_uint(&d
, e
, &cf
) ||
387 !unpack_uint(&d
, e
, &firstdid
)) {
388 throw Xapian::DatabaseCorruptError("Bad postlist key");
391 tag
.erase(0, d
- tag
.data());
393 // Not an initial chunk, so adjust key.
394 size_t tmp
= d
- key
.data();
395 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
396 throw Xapian::DatabaseCorruptError("Bad postlist key");
404 // Convert doclen chunk to honey format.
407 // Skip the "last chunk" flag and increase_to_last.
409 throw Xapian::DatabaseCorruptError("No last chunk flag in "
410 "glass docdata chunk");
412 Xapian::docid increase_to_last
;
413 if (!unpack_uint(&d
, e
, &increase_to_last
))
414 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
415 "in glass docdata chunk");
417 Xapian::termcount doclen_max
= 0;
419 Xapian::termcount doclen
;
420 if (!unpack_uint(&d
, e
, &doclen
))
421 throw Xapian::DatabaseCorruptError("Decoding doclen in "
422 "glass docdata chunk");
423 if (doclen
> doclen_max
)
425 unsigned char buf
[4];
426 unaligned_write4(buf
, doclen
);
427 newtag
.append(reinterpret_cast<char*>(buf
), 4);
430 Xapian::docid gap_size
;
431 if (!unpack_uint(&d
, e
, &gap_size
))
432 throw Xapian::DatabaseCorruptError("Decoding docid "
435 // FIXME: Split chunk if the gap_size is at all large.
436 newtag
.append(4 * gap_size
, '\xff');
439 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
440 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
442 chunk_lastdid
= firstdid
- 1 + newtag
.size() / 4;
444 // Only encode document lengths using a whole number of bytes for
445 // now. We could allow arbitrary bit widths, but it complicates
446 // encoding and decoding so we should consider if the fairly small
447 // additional saving is worth it.
448 if (doclen_max
>= 0xffff) {
449 if (doclen_max
>= 0xffffff) {
450 newtag
.insert(0, 1, char(32));
452 } else if (doclen_max
>= 0xffffffff) {
453 // FIXME: Handle these.
454 const char* m
= "Document length values >= 0xffffffff not "
456 throw Xapian::FeatureUnavailableError(m
);
458 tag
.assign(1, char(24));
459 for (size_t i
= 1; i
< newtag
.size(); i
+= 4)
460 tag
.append(newtag
, i
, 3);
463 if (doclen_max
>= 0xff) {
464 tag
.assign(1, char(16));
465 for (size_t i
= 2; i
< newtag
.size(); i
+= 4)
466 tag
.append(newtag
, i
, 2);
468 tag
.assign(1, char(8));
469 for (size_t i
= 3; i
< newtag
.size(); i
+= 4)
470 tag
.append(newtag
, i
, 1);
477 // Adjust key if this is *NOT* an initial chunk.
478 // key is: pack_string_preserving_sort(key, tname)
479 // plus optionally: pack_uint_preserving_sort(key, did)
480 const char * d
= key
.data();
481 const char * e
= d
+ key
.size();
483 if (!unpack_string_preserving_sort(&d
, e
, tname
))
484 throw Xapian::DatabaseCorruptError("Bad postlist key");
487 // This is an initial chunk for a term, so adjust tag header.
490 if (!unpack_uint(&d
, e
, &tf
) ||
491 !unpack_uint(&d
, e
, &cf
) ||
492 !unpack_uint(&d
, e
, &firstdid
)) {
493 throw Xapian::DatabaseCorruptError("Bad postlist key");
496 tag
.erase(0, d
- tag
.data());
498 // Not an initial chunk, so adjust key.
499 size_t tmp
= d
- key
.data();
500 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
501 throw Xapian::DatabaseCorruptError("Bad postlist key");
509 // Convert posting chunk to honey format, but without any header.
512 // Skip the "last chunk" flag; decode increase_to_last.
514 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
517 Xapian::docid increase_to_last
;
518 if (!unpack_uint(&d
, e
, &increase_to_last
))
519 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
520 "glass posting chunk");
521 chunk_lastdid
= firstdid
+ increase_to_last
;
522 if (!unpack_uint(&d
, e
, &first_wdf
))
523 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
528 if (!unpack_uint(&d
, e
, &delta
))
529 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
530 "glass posting chunk");
531 pack_uint(newtag
, delta
);
532 Xapian::termcount wdf
;
533 if (!unpack_uint(&d
, e
, &wdf
))
534 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
536 pack_uint(newtag
, wdf
);
547 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
548 Xapian::docid offset
;
552 Xapian::docid firstdid
;
553 Xapian::docid chunk_lastdid
;
554 Xapian::termcount tf
, cf
;
555 Xapian::termcount first_wdf
;
557 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
558 : HoneyCursor(in
), offset(offset_
), firstdid(0)
565 if (!HoneyCursor::next()) return false;
566 // We put all chunks into the non-initial chunk form here, then fix up
567 // the first chunk for each term in the merged database as we merge.
572 switch (key_type(key
)) {
573 case KEY_USER_METADATA
:
574 case KEY_VALUE_STATS
:
576 case KEY_VALUE_CHUNK
: {
577 const char * p
= key
.data();
578 const char * end
= p
+ key
.length();
580 Xapian::valueno slot
;
581 if (!unpack_uint(&p
, end
, &slot
))
582 throw Xapian::DatabaseCorruptError("bad value key");
584 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
585 throw Xapian::DatabaseCorruptError("bad value key");
588 key
.assign("\0\xd8", 2);
589 pack_uint(key
, slot
);
590 pack_uint_preserving_sort(key
, did
);
593 case KEY_DOCLEN_CHUNK
: {
594 const char * p
= key
.data();
595 const char * end
= p
+ key
.length();
597 Xapian::docid did
= 1;
599 (!unpack_uint_preserving_sort(&p
, end
, &did
) || p
!= end
)) {
600 throw Xapian::DatabaseCorruptError("Bad doclen key");
606 pack_uint_preserving_sort(key
, did
);
610 case KEY_POSTING_CHUNK
:
613 throw Xapian::DatabaseCorruptError("Bad postlist table key "
617 // Adjust key if this is *NOT* an initial chunk.
618 // key is: pack_string_preserving_sort(key, term)
619 // plus optionally: pack_uint_preserving_sort(key, did)
620 const char * d
= key
.data();
621 const char * e
= d
+ key
.size();
623 if (!unpack_string_preserving_sort(&d
, e
, term
))
624 throw Xapian::DatabaseCorruptError("Bad postlist key");
627 // This is an initial chunk for a term, so remove tag header.
631 Xapian::docid lastdid
;
632 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
633 firstdid
, lastdid
, chunk_lastdid
,
635 throw Xapian::DatabaseCorruptError("Bad postlist initial "
638 // Ignore lastdid - we'll need to recalculate it (at least when
639 // merging, and for simplicity we always do).
641 tag
.erase(0, d
- tag
.data());
643 // Not an initial chunk, so adjust key and remove tag header.
644 size_t tmp
= d
- key
.data();
645 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
647 throw Xapian::DatabaseCorruptError("Bad postlist key");
649 // -1 to remove the terminating zero byte too.
656 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
658 throw Xapian::DatabaseCorruptError("Bad postlist delta "
661 tag
.erase(0, d
- tag
.data());
663 if (!decode_delta_chunk_header_bool(&d
, e
, chunk_lastdid
,
665 throw Xapian::DatabaseCorruptError("Bad postlist delta "
669 // Splice in a 0 wdf value after each docid delta.
671 for (size_t i
= d
- tag
.data(); i
< tag
.size(); i
+= 4) {
672 newtag
.append(tag
, i
, 4);
673 newtag
.append(4, '\0');
675 tag
= std::move(newtag
);
679 chunk_lastdid
+= offset
;
685 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
687 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
688 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
692 class PostlistCursorGt
{
694 /** Return true if and only if a's key is strictly greater than b's key.
696 bool operator()(const T
* a
, const T
* b
) const {
697 if (a
->key
> b
->key
) return true;
698 if (a
->key
!= b
->key
) return false;
699 return (a
->firstdid
> b
->firstdid
);
704 encode_valuestats(Xapian::doccount freq
,
705 const string
& lbound
, const string
& ubound
)
708 pack_uint(value
, freq
);
709 pack_string(value
, lbound
);
710 // We don't store or count empty values, so neither of the bounds
711 // can be empty. So we can safely store an empty upper bound when
712 // the bounds are equal.
713 if (lbound
!= ubound
) value
+= ubound
;
717 // U : vector<HoneyTable*>::const_iterator
718 template<typename T
, typename U
> void
719 merge_postlists(Xapian::Compactor
* compactor
,
720 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
723 typedef decltype(**b
) table_type
; // E.g. HoneyTable
724 typedef PostlistCursor
<table_type
> cursor_type
;
725 typedef PostlistCursorGt
<cursor_type
> gt_type
;
726 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
727 for ( ; b
!= e
; ++b
, ++offset
) {
730 // Skip empty tables.
734 pq
.push(new cursor_type(in
, *offset
));
739 // Merge user metadata.
741 while (!pq
.empty()) {
742 cursor_type
* cur
= pq
.top();
743 const string
& key
= cur
->key
;
744 if (key_type(key
) != KEY_USER_METADATA
) break;
746 if (key
!= last_key
) {
748 if (tags
.size() > 1 && compactor
) {
749 Assert(!last_key
.empty());
750 // FIXME: It would be better to merge all duplicates
751 // for a key in one call, but currently we don't in
753 const string
& resolved_tag
=
754 compactor
->resolve_duplicate_metadata(last_key
,
757 out
->add(last_key
, resolved_tag
);
759 Assert(!last_key
.empty());
760 out
->add(last_key
, tags
[0]);
766 tags
.push_back(cur
->tag
);
776 if (tags
.size() > 1 && compactor
) {
777 Assert(!last_key
.empty());
778 const string
& resolved_tag
=
779 compactor
->resolve_duplicate_metadata(last_key
,
782 out
->add(last_key
, resolved_tag
);
784 Assert(!last_key
.empty());
785 out
->add(last_key
, tags
[0]);
792 Xapian::doccount freq
= 0;
793 string lbound
, ubound
;
795 while (!pq
.empty()) {
796 cursor_type
* cur
= pq
.top();
797 const string
& key
= cur
->key
;
798 if (key_type(key
) != KEY_VALUE_STATS
) break;
799 if (key
!= last_key
) {
800 // For the first valuestats key, last_key will be the previous
801 // key we wrote, which we don't want to overwrite. This is the
802 // only time that freq will be 0, so check that.
804 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
810 const string
& tag
= cur
->tag
;
812 const char * pos
= tag
.data();
813 const char * end
= pos
+ tag
.size();
817 if (!unpack_uint(&pos
, end
, &f
)) {
819 throw Xapian::DatabaseCorruptError("Incomplete stats item "
821 throw Xapian::RangeError("Frequency statistic in value table "
824 if (!unpack_string(&pos
, end
, l
)) {
826 throw Xapian::DatabaseCorruptError("Incomplete stats item "
828 throw Xapian::RangeError("Lower bound in value table is too "
831 size_t len
= end
- pos
;
843 if (l
< lbound
) lbound
= l
;
844 if (u
> ubound
) ubound
= u
;
856 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
860 // Merge valuestream chunks.
861 while (!pq
.empty()) {
862 cursor_type
* cur
= pq
.top();
863 const string
& key
= cur
->key
;
864 if (key_type(key
) != KEY_VALUE_CHUNK
) break;
865 out
->add(key
, cur
->tag
);
874 // Merge doclen chunks.
875 while (!pq
.empty()) {
876 cursor_type
* cur
= pq
.top();
877 string
& key
= cur
->key
;
878 if (key_type(key
) != KEY_DOCLEN_CHUNK
) break;
879 pack_uint_preserving_sort(key
, cur
->chunk_lastdid
);
880 out
->add(key
, cur
->tag
);
889 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
891 struct HoneyPostListChunk
{
892 Xapian::docid first
, last
;
893 Xapian::termcount first_wdf
;
896 HoneyPostListChunk(Xapian::docid first_
,
898 Xapian::termcount first_wdf_
,
902 first_wdf(first_wdf_
),
905 vector
<HoneyPostListChunk
> tags
;
907 cursor_type
* cur
= NULL
;
913 AssertEq(key_type(cur
->key
), KEY_POSTING_CHUNK
);
915 if (cur
== NULL
|| cur
->key
!= last_key
) {
917 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
918 Xapian::docid chunk_lastdid
= tags
[0].last
;
919 Xapian::docid last_did
= tags
.back().last
;
922 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
924 first_wdf
, first_tag
);
926 first_tag
+= tags
[0].data
;
928 out
->add(last_key
, first_tag
);
930 // If tf == 2, the data could be split over two tags when
931 // merging, but we only output an initial tag in this case.
932 if (tf
> 2 && tags
.size() > 1) {
934 const char* p
= last_key
.data();
935 const char* end
= p
+ last_key
.size();
936 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
938 throw Xapian::DatabaseCorruptError("Bad postlist "
942 auto i
= tags
.begin();
943 while (++i
!= tags
.end()) {
945 const string
& key
= pack_honey_postlist_key(term
,
949 encode_delta_chunk_header(i
->first
,
955 encode_delta_chunk_header_bool(i
->first
,
958 const char* pos
= i
->data
.data();
959 const char* pos_end
= pos
+ i
->data
.size();
960 while (pos
!= pos_end
) {
962 if (!unpack_uint(&pos
, pos_end
, &delta
))
963 throw_database_corrupt("Decoding docid "
965 pack_uint(tag
, delta
);
966 Xapian::termcount wdf
;
967 if (!unpack_uint(&pos
, pos_end
, &wdf
))
968 throw_database_corrupt("Decoding wdf",
970 // Only copy over the docid deltas, not the
981 if (cur
== NULL
) break;
989 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
992 std::move(cur
->tag
)));
1001 template<typename T
> struct MergeCursor
;
1003 #ifndef DISABLE_GPL_LIBXAPIAN
1005 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
1006 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
1014 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
1015 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
1021 template<typename T
>
1023 /// Return true if and only if a's key is strictly greater than b's key.
1024 bool operator()(const T
* a
, const T
* b
) const {
1025 if (b
->after_end()) return false;
1026 if (a
->after_end()) return true;
1027 return (a
->current_key
> b
->current_key
);
1031 #ifndef DISABLE_GPL_LIBXAPIAN
1032 // Convert glass to honey.
1034 merge_spellings(HoneyTable
* out
,
1035 vector
<const GlassTable
*>::const_iterator b
,
1036 vector
<const GlassTable
*>::const_iterator e
)
1038 typedef MergeCursor
<const GlassTable
&> cursor_type
;
1039 typedef CursorGt
<cursor_type
> gt_type
;
1040 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1041 for ( ; b
!= e
; ++b
) {
1044 pq
.push(new cursor_type(in
));
1048 while (!pq
.empty()) {
1049 cursor_type
* cur
= pq
.top();
1052 // Glass vs honey spelling key prefix map:
1056 // bookend Bbd \x00bd
1058 // middle Mddl \x02ddl
1062 // In honey, if the word's first byte is <= '\x04', we add a prefix
1063 // of '\x04' (which is unlikely in real world use but ensures that
1064 // we can handle arbitrary strings).
1066 // The prefixes for honey are chosen such that we save a byte for the
1067 // key of all real-world words, and so that more first bytes are used
1068 // so that a top-level array index is more effective, especially for
1069 // random-access lookup of word entries (which we do to check the
1070 // frequency of potential spelling candidates).
1072 // The other prefixes are chosen such that we can do look up in key
1073 // order at search time, which is more efficient for a cursor which can
1074 // only efficiently move forwards.
1076 // Note that the key ordering is the same for glass and honey, which
1077 // makes translating during compaction simpler.
1078 string key
= cur
->current_key
;
1093 if (static_cast<unsigned char>(key
[1]) > 0x04)
1099 string m
= "Bad spelling key prefix: ";
1100 m
+= static_cast<unsigned char>(key
[0]);
1101 throw Xapian::DatabaseCorruptError(m
);
1105 if (pq
.empty() || pq
.top()->current_key
> key
) {
1106 // No need to merge the tags, just copy the (possibly compressed)
1108 bool compressed
= cur
->read_tag(true);
1109 out
->add(key
, cur
->current_tag
, compressed
);
1118 // Merge tag values with the same key:
1120 if (static_cast<unsigned char>(key
[0]) < 0x04) {
1121 // We just want the union of words, so copy over the first instance
1122 // and skip any identical ones.
1123 priority_queue
<PrefixCompressedStringItor
*,
1124 vector
<PrefixCompressedStringItor
*>,
1125 PrefixCompressedStringItorGt
> pqtag
;
1126 // Stick all the cursor_type pointers in a vector because their
1127 // current_tag members must remain valid while we're merging their
1128 // tags, but we need to call next() on them all afterwards.
1129 vector
<cursor_type
*> vec
;
1130 vec
.reserve(pq
.size());
1134 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1136 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1141 PrefixCompressedStringWriter
wr(tag
);
1143 while (!pqtag
.empty()) {
1144 PrefixCompressedStringItor
* it
= pqtag
.top();
1147 if (word
!= lastword
) {
1149 wr
.append(lastword
);
1152 if (!it
->at_end()) {
1159 for (auto i
: vec
) {
1168 // We want to sum the frequencies from tags for the same key.
1169 Xapian::termcount tot_freq
= 0;
1172 Xapian::termcount freq
;
1173 const char * p
= cur
->current_tag
.data();
1174 const char * end
= p
+ cur
->current_tag
.size();
1175 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1176 throw_database_corrupt("Bad spelling word freq", p
);
1184 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1189 pack_uint_last(tag
, tot_freq
);
1197 merge_spellings(HoneyTable
* out
,
1198 vector
<const HoneyTable
*>::const_iterator b
,
1199 vector
<const HoneyTable
*>::const_iterator e
)
1201 typedef MergeCursor
<const HoneyTable
&> cursor_type
;
1202 typedef CursorGt
<cursor_type
> gt_type
;
1203 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1204 for ( ; b
!= e
; ++b
) {
1207 pq
.push(new cursor_type(in
));
1211 while (!pq
.empty()) {
1212 cursor_type
* cur
= pq
.top();
1215 string key
= cur
->current_key
;
1216 if (pq
.empty() || pq
.top()->current_key
> key
) {
1217 // No need to merge the tags, just copy the (possibly compressed)
1219 bool compressed
= cur
->read_tag(true);
1220 out
->add(key
, cur
->current_tag
, compressed
);
1229 // Merge tag values with the same key:
1231 if (static_cast<unsigned char>(key
[0]) < 0x04) {
1232 // We just want the union of words, so copy over the first instance
1233 // and skip any identical ones.
1234 priority_queue
<PrefixCompressedStringItor
*,
1235 vector
<PrefixCompressedStringItor
*>,
1236 PrefixCompressedStringItorGt
> pqtag
;
1237 // Stick all the cursor_type pointers in a vector because their
1238 // current_tag members must remain valid while we're merging their
1239 // tags, but we need to call next() on them all afterwards.
1240 vector
<cursor_type
*> vec
;
1241 vec
.reserve(pq
.size());
1245 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1247 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1252 PrefixCompressedStringWriter
wr(tag
);
1254 while (!pqtag
.empty()) {
1255 PrefixCompressedStringItor
* it
= pqtag
.top();
1258 if (word
!= lastword
) {
1260 wr
.append(lastword
);
1263 if (!it
->at_end()) {
1270 for (auto i
: vec
) {
1279 // We want to sum the frequencies from tags for the same key.
1280 Xapian::termcount tot_freq
= 0;
1283 Xapian::termcount freq
;
1284 const char * p
= cur
->current_tag
.data();
1285 const char * end
= p
+ cur
->current_tag
.size();
1286 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1287 throw_database_corrupt("Bad spelling word freq", p
);
1295 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1300 pack_uint_last(tag
, tot_freq
);
1306 // U : vector<HoneyTable*>::const_iterator
1307 template<typename T
, typename U
> void
1308 merge_synonyms(T
* out
, U b
, U e
)
1310 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1311 typedef MergeCursor
<table_type
> cursor_type
;
1312 typedef CursorGt
<cursor_type
> gt_type
;
1313 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1314 for ( ; b
!= e
; ++b
) {
1317 pq
.push(new cursor_type(in
));
1321 while (!pq
.empty()) {
1322 cursor_type
* cur
= pq
.top();
1325 string key
= cur
->current_key
;
1326 if (pq
.empty() || pq
.top()->current_key
> key
) {
1327 // No need to merge the tags, just copy the (possibly compressed)
1329 bool compressed
= cur
->read_tag(true);
1330 out
->add(key
, cur
->current_tag
, compressed
);
1339 // Merge tag values with the same key:
1342 // We just want the union of words, so copy over the first instance
1343 // and skip any identical ones.
1344 priority_queue
<ByteLengthPrefixedStringItor
*,
1345 vector
<ByteLengthPrefixedStringItor
*>,
1346 ByteLengthPrefixedStringItorGt
> pqtag
;
1347 vector
<cursor_type
*> vec
;
1351 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1353 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1359 while (!pqtag
.empty()) {
1360 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1362 if (**it
!= lastword
) {
1364 tag
+= byte(lastword
.size() ^ MAGIC_XOR_VALUE
);
1368 if (!it
->at_end()) {
1375 for (auto i
: vec
) {
1388 template<typename T
, typename U
> void
1389 multimerge_postlists(Xapian::Compactor
* compactor
,
1390 T
* out
, const char * tmpdir
,
1391 const vector
<U
*>& in
,
1392 vector
<Xapian::docid
> off
)
1394 if (in
.size() <= 3) {
1395 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1399 vector
<HoneyTable
*> tmp
;
1400 tmp
.reserve(in
.size() / 2);
1402 vector
<Xapian::docid
> newoff
;
1403 newoff
.resize(in
.size() / 2);
1404 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1406 if (j
== in
.size() - 1) ++j
;
1408 string dest
= tmpdir
;
1410 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1413 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1415 // Use maximum blocksize for temporary tables. And don't compress
1416 // entries in temporary tables, even if the final table would do
1417 // so. Any already compressed entries will get copied in
1418 // compressed form. (FIXME: HoneyTable has no blocksize)
1419 Honey::RootInfo root_info
;
1420 root_info
.init(65536, 0);
1421 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1422 tmptab
->create_and_open(flags
, root_info
);
1424 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1425 in
.begin() + i
, in
.begin() + j
);
1426 tmp
.push_back(tmptab
);
1428 tmptab
->commit(1, &root_info
);
1429 AssertRel(root_info
.get_blocksize(),==,65536);
1435 while (tmp
.size() > 3) {
1436 vector
<HoneyTable
*> tmpout
;
1437 tmpout
.reserve(tmp
.size() / 2);
1438 vector
<Xapian::docid
> newoff
;
1439 newoff
.resize(tmp
.size() / 2);
1440 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1442 if (j
== tmp
.size() - 1) ++j
;
1444 string dest
= tmpdir
;
1446 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1449 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1451 // Use maximum blocksize for temporary tables. And don't compress
1452 // entries in temporary tables, even if the final table would do
1453 // so. Any already compressed entries will get copied in
1454 // compressed form. (FIXME: HoneyTable has no blocksize)
1455 Honey::RootInfo root_info
;
1456 root_info
.init(65536, 0);
1457 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1458 tmptab
->create_and_open(flags
, root_info
);
1460 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1461 tmp
.begin() + i
, tmp
.begin() + j
);
1463 for (unsigned int k
= i
; k
< j
; ++k
) {
1464 // FIXME: unlink(tmp[k]->get_path().c_str());
1469 tmpout
.push_back(tmptab
);
1471 tmptab
->commit(1, &root_info
);
1472 AssertRel(root_info
.get_blocksize(),==,65536);
1478 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1480 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1481 // FIXME: unlink(tmp[k]->get_path().c_str());
1488 template<typename T
> class PositionCursor
;
1490 #ifndef DISABLE_GPL_LIBXAPIAN
1492 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
1493 Xapian::docid offset
;
1497 Xapian::docid firstdid
;
1499 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1500 : GlassCursor(in
), offset(offset_
), firstdid(0) {
1506 if (!GlassCursor::next()) return false;
1508 const char * d
= current_key
.data();
1509 const char * e
= d
+ current_key
.size();
1512 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1513 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1515 throw Xapian::DatabaseCorruptError("Bad position key");
1519 pack_string_preserving_sort(key
, term
);
1520 pack_uint_preserving_sort(key
, did
+ offset
);
1524 const string
& get_tag() const {
// NOTE(review): mangled extraction (statements split across lines, original
// line numbers embedded, some lines missing); code left byte-identical and
// only comments added.
// Honey counterpart of the Glass specialisation above: a cursor over one
// Honey position table which remaps the docid part of each key by a fixed
// offset so multiple inputs can be merged.
1531 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
// Docid offset applied to every key read from this input.
1532 Xapian::docid offset
;
// Initialised to 0 in the constructor; its full role is not visible in this
// view (surrounding lines are missing) — TODO confirm against upstream.
1536 Xapian::docid firstdid
;
// Construct over input table `in`, remapping docids by offset_.
1538 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1539 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
// Advance the underlying cursor; report exhaustion to the caller.
1545 if (!HoneyCursor::next()) return false;
1547 const char * d
= current_key
.data();
1548 const char * e
= d
+ current_key
.size();
// Keys are (term, docid) packed with sort-preserving encodings; failure to
// unpack indicates database corruption.
1551 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1552 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1554 throw Xapian::DatabaseCorruptError("Bad position key");
// Rebuild the key with the docid shifted by this input's offset.
1558 pack_string_preserving_sort(key
, term
);
1559 pack_uint_preserving_sort(key
, did
+ offset
);
// Accessor for the current entry's tag (body continues beyond this view).
1563 const string
& get_tag() const {
// NOTE(review): mangled extraction; code left byte-identical, comments only.
// Comparator over cursor pointers ordering by key.  It implements
// greater-than so that the std::priority_queue in merge_positions() (a
// max-heap by default) pops the *smallest* key first.
1568 template<typename T
>
1569 class PositionCursorGt
{
1571 /** Return true if and only if a's key is strictly greater than b's key.
1573 bool operator()(const T
* a
, const T
* b
) const {
1574 return a
->key
> b
->key
;
// NOTE(review): mangled extraction (statements split across lines, original
// line numbers embedded, some lines missing); code left byte-identical and
// only comments added.
// K-way merge of position tables: each input is wrapped in a PositionCursor
// (which remaps its docids by the matching offset) and the cursors are
// drained through a min-heap so entries reach `out` in ascending key order.
1578 template<typename T
, typename U
> void
1579 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1580 const vector
<Xapian::docid
> & offset
)
// Deduce the concrete cursor and comparator types from the input table type.
1582 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1583 typedef PositionCursor
<table_type
> cursor_type
;
1584 typedef PositionCursorGt
<cursor_type
> gt_type
;
// gt_type makes this a min-heap on cursor key (see PositionCursorGt above).
1585 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
// Seed the heap with one cursor per non-empty input table.
1586 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1587 auto in
= inputs
[i
];
1589 // Skip empty tables.
1593 pq
.push(new cursor_type(in
, offset
[i
]));
// Repeatedly pop the cursor with the smallest key and copy its entry to the
// output (loop continuation is beyond this view).
1596 while (!pq
.empty()) {
1597 cursor_type
* cur
= pq
.top();
1599 out
->add(cur
->key
, cur
->get_tag());
// NOTE(review): mangled extraction (statements split across lines, original
// line numbers embedded, some lines missing); code left byte-identical and
// only comments added.
// Merge docid-keyed tables (e.g. docdata, termlist) from Honey inputs by
// copying every entry of each input in turn, remapping the docid part of
// the key by that input's offset.
1608 template<typename T
, typename U
> void
1609 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1610 const vector
<Xapian::docid
> & offset
,
1613 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1614 Xapian::docid off
= offset
[i
];
1616 auto in
= inputs
[i
];
1617 if (in
->empty()) continue;
1619 HoneyCursor
cur(in
);
1623 while (cur
.next()) {
1624 // Adjust the key if this isn't the first database.
1627 const char * d
= cur
.current_key
.data();
1628 const char * e
= d
+ cur
.current_key
.size();
// Keys start with a sort-preserving-packed docid; failure to unpack means
// the input table is corrupt.
1629 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1630 string msg
= "Bad key in ";
1631 msg
+= inputs
[i
]->get_path();
1632 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the docid (the lines applying `off` to `did` are not visible in
// this view — presumably `did += off`; TODO confirm against upstream).
1636 pack_uint_preserving_sort(key
, did
);
1638 // Copy over anything extra in the key (e.g. the zero byte
1639 // at the end of "used value slots" in the termlist table).
1640 key
.append(d
, e
- d
);
// Presumably the else-branch: offset is zero so the key is used unchanged.
1643 key
= cur
.current_key
;
// Copy the tag through, preserving its compressed form when read_tag(true)
// reports it is already compressed.
1645 bool compressed
= cur
.read_tag(true);
1646 out
->add(key
, cur
.current_tag
, compressed
);
// NOTE(review): mangled extraction (statements split across lines, original
// line numbers embedded, many lines missing); code left byte-identical and
// only comments added.
// Glass-input overload: copies docid-keyed entries into the Honey output,
// remapping docids by each input's offset.  For termlist tables it also
// converts the Glass on-disk format: the separate "value slots used" entry
// is folded into the termlist tag, and each term's (reuse, append, wdf)
// encoding is rewritten into the Honey layout.
1651 #ifndef DISABLE_GPL_LIBXAPIAN
1652 template<typename T
> void
1653 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1654 const vector
<Xapian::docid
> & offset
,
// Process each input table in turn, skipping empty ones.
1657 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1658 Xapian::docid off
= offset
[i
];
1660 auto in
= inputs
[i
];
1661 if (in
->empty()) continue;
1663 GlassCursor
cur(in
);
1667 while (cur
.next()) {
1669 // Adjust the key if this isn't the first database.
1672 const char * d
= cur
.current_key
.data();
1673 const char * e
= d
+ cur
.current_key
.size();
// Keys start with a sort-preserving-packed docid; unpack failure means the
// input is corrupt.
1674 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1675 string msg
= "Bad key in ";
1676 msg
+= inputs
[i
]->get_path();
1677 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the docid (the lines applying `off` to `did` are not visible in
// this view — TODO confirm against upstream).
1681 pack_uint_preserving_sort(key
, did
);
1683 // Copy over anything extra in the key (e.g. the zero byte
1684 // at the end of "used value slots" in the termlist table).
1685 key
.append(d
, e
- d
);
// Presumably the else-branch: offset is zero so the key is used unchanged.
1688 key
= cur
.current_key
;
// Termlist tables need format conversion rather than a raw copy.
1690 if (table_type
== Honey::TERMLIST
) {
1691 if (termlist_key_is_values_used(key
)) {
1692 throw Xapian::DatabaseCorruptError("value slots used "
// Take a copy of the termlist tag, then peek at the next entry to see
// whether a "value slots used" entry follows for the same document.
1698 string tag
= cur
.current_tag
;
1700 bool next_result
= cur
.next();
1701 bool next_already_done
= true;
1702 unsigned bitmap_slots_used
= 0;
1703 string encoded_slots_used
;
// (Enclosing `if (...)` condition partly missing from this view.)
1705 termlist_key_is_values_used(cur
.current_key
)) {
1706 next_already_done
= false;
1708 const string
& valtag
= cur
.current_tag
;
1710 const char* p
= valtag
.data();
1711 const char* end
= p
+ valtag
.size();
// Decode the Glass slots-used list: first slot, then delta-coded gaps.
1713 Xapian::VecCOW
<Xapian::termpos
> slots
;
1715 Xapian::valueno first_slot
;
1716 if (!unpack_uint(&p
, end
, &first_slot
)) {
1717 throw_database_corrupt("Value slot encoding corrupt",
1720 slots
.push_back(first_slot
);
1721 Xapian::valueno last_slot
= first_slot
;
1723 Xapian::valueno slot
;
1724 if (!unpack_uint(&p
, end
, &slot
)) {
1725 throw_database_corrupt("Value slot encoding "
// Gaps are stored minus one, so reconstruct the absolute slot number.
1728 slot
+= last_slot
+ 1;
1731 slots
.push_back(slot
);
// If every slot fits in the range 0-6, a single bitmap byte suffices.
1734 if (slots
.back() <= 6) {
1735 // Encode as a bitmap if only slots in the range 0-6
1737 for (auto slot
: slots
) {
1738 bitmap_slots_used
|= 1 << slot
;
// Otherwise pack the last slot, and for multiple slots interpolatively
// encode the full list with a BitWriter.
1742 pack_uint(enc
, last_slot
);
1743 if (slots
.size() > 1) {
1744 BitWriter
slots_used(enc
);
1745 slots_used
.encode(first_slot
, last_slot
);
1746 slots_used
.encode(slots
.size() - 2,
1747 last_slot
- first_slot
);
1748 slots_used
.encode_interpolative(slots
, 0,
1750 encoded_slots_used
= slots_used
.freeze();
1752 encoded_slots_used
= std::move(enc
);
// Now rebuild the termlist tag in the Honey layout, starting with the
// slots-used header, then doclen and termlist size.
1757 const char* pos
= tag
.data();
1758 const char* end
= pos
+ tag
.size();
// Header byte: plain bitmap if the encoded form is empty, otherwise
// 0x80|size (or a packed size for large encodings) followed by the data.
1761 if (encoded_slots_used
.empty()) {
1762 newtag
+= char(bitmap_slots_used
);
1764 auto size
= encoded_slots_used
.size();
1766 newtag
+= char(0x80 | size
);
1769 pack_uint(newtag
, size
);
1771 newtag
+= encoded_slots_used
;
// Decode the Glass tag header: document length and term count.
1775 Xapian::termcount doclen
;
1776 if (!unpack_uint(&pos
, end
, &doclen
)) {
1777 throw_database_corrupt("doclen", pos
);
1779 Xapian::termcount termlist_size
;
1780 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1781 throw_database_corrupt("termlist length", pos
);
// Honey stores the term count minus one, then the document length.
1783 pack_uint(newtag
, termlist_size
- 1);
1784 pack_uint(newtag
, doclen
);
// Re-encode each term entry: a prefix-reuse byte (which may also fold in
// a small wdf), appended suffix bytes, and the wdf.
1786 string current_term
;
1787 while (pos
!= end
) {
1788 Xapian::termcount current_wdf
= 0;
1790 if (!current_term
.empty()) {
1791 size_t reuse
= static_cast<unsigned char>(*pos
++);
1792 newtag
+= char(reuse
);
// Reuse values beyond the term length encode wdf * (len + 1) + reuse.
1794 if (reuse
> current_term
.size()) {
1795 current_wdf
= reuse
/ (current_term
.size() + 1);
1796 reuse
= reuse
% (current_term
.size() + 1);
1798 current_term
.resize(reuse
);
1801 throw_database_corrupt("term", NULL
);
1804 size_t append
= static_cast<unsigned char>(*pos
++);
1805 if (size_t(end
- pos
) < append
)
1806 throw_database_corrupt("term", NULL
);
1808 current_term
.append(pos
, append
);
// NOTE(review): "¤t_wdf" below is mojibake — the byte sequence
// "&current_wdf" was eaten by the HTML entity "&curren;" during
// extraction; restore to "&current_wdf" when recovering the source.
1814 if (!unpack_uint(&pos
, end
, ¤t_wdf
)) {
1815 throw_database_corrupt("wdf", pos
);
1817 pack_uint(newtag
, current_wdf
);
// Emit the appended suffix length and bytes for this term.
1820 newtag
+= char(append
);
1821 newtag
.append(current_term
.end() - append
,
1822 current_term
.end());
// Write the converted termlist entry (skip if it came out empty).
1825 if (!newtag
.empty())
1826 out
->add(key
, newtag
);
// Loop control: stop at end of input; if we already advanced past the
// slots-used entry, skip the next cursor advance.
1827 if (!next_result
) break;
1828 if (next_already_done
) goto next_without_next
;
// Non-termlist path: copy the tag through, preserving compression.
1830 bool compressed
= cur
.read_tag(true);
1831 out
->add(key
, cur
.current_tag
, compressed
);
// Bring the HoneyCompact helper namespace into scope for the
// HoneyDatabase::compact() definition which follows.
1840 using namespace HoneyCompact
;
1843 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1844 const char* destdir
,
1847 const vector
<const Xapian::Database::Internal
*>& sources
,
1848 const vector
<Xapian::docid
>& offset
,
1850 Xapian::Compactor::compaction_level compaction
,
1852 Xapian::docid last_docid
)
1855 // The "base name" of the table.
1858 Honey::table_type type
;
1859 // Create tables after position lazily.
1863 static const table_list tables
[] = {
1865 { "postlist", Honey::POSTLIST
, false },
1866 { "docdata", Honey::DOCDATA
, true },
1867 { "termlist", Honey::TERMLIST
, false },
1868 { "position", Honey::POSITION
, true },
1869 { "spelling", Honey::SPELLING
, true },
1870 { "synonym", Honey::SYNONYM
, true }
1872 const table_list
* tables_end
= tables
+
1873 (sizeof(tables
) / sizeof(tables
[0]));
1875 const int FLAGS
= Xapian::DB_DANGEROUS
;
1877 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1878 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1880 // FIXME: Support this combination - we need to put temporary files
1886 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1887 bool has_uncommitted_changes
;
1888 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1889 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1890 has_uncommitted_changes
= db
->has_uncommitted_changes();
1892 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1893 has_uncommitted_changes
= db
->has_uncommitted_changes();
1895 if (has_uncommitted_changes
) {
1897 "Can't compact from a WritableDatabase with uncommitted "
1898 "changes - either call commit() first, or create a new "
1899 "Database object from the filename on disk";
1900 throw Xapian::InvalidOperationError(m
);
1905 if (block_size
< HONEY_MIN_BLOCKSIZE
||
1906 block_size
> HONEY_MAX_BLOCKSIZE
||
1907 (block_size
& (block_size
- 1)) != 0) {
1908 block_size
= HONEY_DEFAULT_BLOCKSIZE
;
1911 FlintLock
lock(destdir
? destdir
: "");
1914 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1915 if (why
!= FlintLock::SUCCESS
) {
1916 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1920 unique_ptr
<HoneyVersion
> version_file_out
;
1923 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
1925 throw Xapian::DatabaseCreateError("open() failed", errno
);
1928 version_file_out
.reset(new HoneyVersion(fd
));
1931 version_file_out
.reset(new HoneyVersion(destdir
));
1934 version_file_out
->create(block_size
);
1935 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1936 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1937 #ifdef DISABLE_GPL_LIBXAPIAN
1940 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1941 auto& v_in
= db
->version_file
;
1942 auto& v_out
= version_file_out
;
1943 v_out
->merge_stats(v_in
.get_doccount(),
1944 v_in
.get_doclength_lower_bound(),
1945 v_in
.get_doclength_upper_bound(),
1946 v_in
.get_wdf_upper_bound(),
1947 v_in
.get_total_doclen(),
1948 v_in
.get_spelling_wordfreq_upper_bound());
1951 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1952 version_file_out
->merge_stats(db
->version_file
);
1956 string fl_serialised
;
1960 fl
.set_first_unused_block(1); // FIXME: Assumption?
1961 fl
.pack(fl_serialised
);
1965 // Set to true if stat() failed (which can happen if the files are > 2GB
1966 // and off_t is 32 bit) or one of the totals overflowed.
1967 bool bad_totals
= false;
1968 off_t in_total
= 0, out_total
= 0;
1970 // FIXME: sort out indentation.
1971 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1972 #ifdef DISABLE_GPL_LIBXAPIAN
1973 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1975 vector
<HoneyTable
*> tabs
;
1976 tabs
.reserve(tables_end
- tables
);
1977 off_t prev_size
= block_size
;
1978 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
1979 // The postlist table requires an N-way merge, adjusting the
1980 // headers of various blocks. The spelling and synonym tables also
1981 // need special handling. The other tables have keys sorted in
1982 // docid order, so we can merge them by simply copying all the keys
1983 // from each source table in turn.
1985 compactor
->set_status(t
->name
, string());
1995 bool output_will_exist
= !t
->lazy
;
1997 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
1998 // on certain systems).
1999 bool bad_stat
= false;
2001 // We can't currently report input sizes if there's a single file DB
2002 // amongst the inputs.
2003 bool single_file_in
= false;
2007 vector
<const GlassTable
*> inputs
;
2008 inputs
.reserve(sources
.size());
2009 size_t inputs_present
= 0;
2010 for (auto src
: sources
) {
2011 auto db
= static_cast<const GlassDatabase
*>(src
);
2012 const GlassTable
* table
;
2014 case Honey::POSTLIST
:
2015 table
= &(db
->postlist_table
);
2017 case Honey::DOCDATA
:
2018 table
= &(db
->docdata_table
);
2020 case Honey::TERMLIST
:
2021 table
= &(db
->termlist_table
);
2023 case Honey::POSITION
:
2024 table
= &(db
->position_table
);
2026 case Honey::SPELLING
:
2027 table
= &(db
->spelling_table
);
2029 case Honey::SYNONYM
:
2030 table
= &(db
->synonym_table
);
2037 if (db
->single_file()) {
2038 if (t
->lazy
&& table
->empty()) {
2039 // Essentially doesn't exist.
2041 // FIXME: Find actual size somehow?
2042 // in_size += table->size() / 1024;
2043 single_file_in
= true;
2045 output_will_exist
= true;
2049 off_t db_size
= file_size(table
->get_path());
2051 // FIXME: check overflow and set bad_totals
2052 in_total
+= db_size
;
2053 in_size
+= db_size
/ 1024;
2054 output_will_exist
= true;
2056 } else if (errno
!= ENOENT
) {
2057 // We get ENOENT for an optional table.
2058 bad_totals
= bad_stat
= true;
2059 output_will_exist
= true;
2063 inputs
.push_back(table
);
2066 // If any inputs lack a termlist table, suppress it in the output.
2067 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2068 if (inputs_present
!= 0) {
2070 string m
= str(inputs_present
);
2072 m
+= str(sources
.size());
2073 m
+= " inputs present, so suppressing output";
2074 compactor
->set_status(t
->name
, m
);
2078 output_will_exist
= false;
2081 if (!output_will_exist
) {
2083 compactor
->set_status(t
->name
, "doesn't exist");
2088 off_t table_start_offset
= -1;
2091 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2092 // space for version file. It's tricky to exactly know the
2093 // size of the version file beforehand.
2094 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2095 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2096 throw Xapian::DatabaseError("lseek() failed", errno
);
2098 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2100 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2103 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2105 tabs
.push_back(out
);
2106 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2108 root_info
->set_free_list(fl_serialised
);
2109 root_info
->set_offset(table_start_offset
);
2111 version_file_out
->get_root(t
->type
),
2112 version_file_out
->get_revision());
2114 out
->create_and_open(FLAGS
, *root_info
);
2117 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2118 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2121 case Honey::POSTLIST
: {
2122 if (multipass
&& inputs
.size() > 3) {
2123 multimerge_postlists(compactor
, out
, destdir
,
2126 merge_postlists(compactor
, out
, offset
.begin(),
2127 inputs
.begin(), inputs
.end());
2131 case Honey::SPELLING
:
2132 merge_spellings(out
, inputs
.cbegin(), inputs
.cend());
2134 case Honey::SYNONYM
:
2135 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2137 case Honey::POSITION
:
2138 merge_positions(out
, inputs
, offset
);
2141 // DocData, Termlist
2142 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
2146 // Commit as revision 1.
2148 out
->commit(1, root_info
);
2150 if (single_file
) fl_serialised
= root_info
->get_free_list();
2153 if (!bad_stat
&& !single_file_in
) {
2156 db_size
= file_size(fd
);
2158 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2162 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2163 prev_size
= db_size
;
2164 db_size
-= old_prev_size
;
2166 // FIXME: check overflow and set bad_totals
2167 out_total
+= db_size
;
2168 out_size
= db_size
/ 1024;
2169 } else if (errno
!= ENOENT
) {
2170 bad_totals
= bad_stat
= true;
2175 compactor
->set_status(t
->name
,
2176 "Done (couldn't stat all the DB files)");
2177 } else if (single_file_in
) {
2179 compactor
->set_status(t
->name
,
2180 "Done (table sizes unknown for single "
2184 if (out_size
== in_size
) {
2185 status
= "Size unchanged (";
2188 if (out_size
< in_size
) {
2189 delta
= in_size
- out_size
;
2190 status
= "Reduced by ";
2192 delta
= out_size
- in_size
;
2193 status
= "INCREASED by ";
2196 status
+= str(100 * delta
/ in_size
);
2199 status
+= str(delta
);
2201 status
+= str(in_size
);
2204 status
+= str(out_size
);
2207 compactor
->set_status(t
->name
, status
);
2211 // If compacting to a single file output and all the tables are empty, pad
2212 // the output so that it isn't mistaken for a stub database when we try to
2213 // open it. For this it needs to be a multiple of 2KB in size.
2214 if (single_file
&& prev_size
< off_t(block_size
)) {
2215 #ifdef HAVE_FTRUNCATE
2216 if (ftruncate(fd
, block_size
) < 0) {
2217 throw Xapian::DatabaseError("Failed to set size of output "
2221 const off_t off
= block_size
- 1;
2222 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2223 throw Xapian::DatabaseError("Failed to set size of output "
2230 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2231 throw Xapian::DatabaseError("lseek() failed", errno
);
2234 version_file_out
->set_last_docid(last_docid
);
2235 string tmpfile
= version_file_out
->write(1, FLAGS
);
2237 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2238 if (version_file_size
< 0) {
2239 throw Xapian::DatabaseError("lseek() failed", errno
);
2241 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2242 throw Xapian::DatabaseError("Didn't allow enough space for "
2243 "version file data");
2246 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2249 // Commit with revision 1.
2250 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2251 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2256 vector
<HoneyTable
*> tabs
;
2257 tabs
.reserve(tables_end
- tables
);
2258 off_t prev_size
= block_size
;
2259 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2260 // The postlist table requires an N-way merge, adjusting the
2261 // headers of various blocks. The spelling and synonym tables also
2262 // need special handling. The other tables have keys sorted in
2263 // docid order, so we can merge them by simply copying all the keys
2264 // from each source table in turn.
2266 compactor
->set_status(t
->name
, string());
2276 bool output_will_exist
= !t
->lazy
;
2278 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2279 // on certain systems).
2280 bool bad_stat
= false;
2282 // We can't currently report input sizes if there's a single file DB
2283 // amongst the inputs.
2284 bool single_file_in
= false;
2288 vector
<const HoneyTable
*> inputs
;
2289 inputs
.reserve(sources
.size());
2290 size_t inputs_present
= 0;
2291 for (auto src
: sources
) {
2292 auto db
= static_cast<const HoneyDatabase
*>(src
);
2293 const HoneyTable
* table
;
2295 case Honey::POSTLIST
:
2296 table
= &(db
->postlist_table
);
2298 case Honey::DOCDATA
:
2299 table
= &(db
->docdata_table
);
2301 case Honey::TERMLIST
:
2302 table
= &(db
->termlist_table
);
2304 case Honey::POSITION
:
2305 table
= &(db
->position_table
);
2307 case Honey::SPELLING
:
2308 table
= &(db
->spelling_table
);
2310 case Honey::SYNONYM
:
2311 table
= &(db
->synonym_table
);
2318 if (db
->single_file()) {
2319 if (t
->lazy
&& table
->empty()) {
2320 // Essentially doesn't exist.
2322 // FIXME: Find actual size somehow?
2323 // in_size += table->size() / 1024;
2324 single_file_in
= true;
2326 output_will_exist
= true;
2330 off_t db_size
= file_size(table
->get_path());
2332 // FIXME: check overflow and set bad_totals
2333 in_total
+= db_size
;
2334 in_size
+= db_size
/ 1024;
2335 output_will_exist
= true;
2337 } else if (errno
!= ENOENT
) {
2338 // We get ENOENT for an optional table.
2339 bad_totals
= bad_stat
= true;
2340 output_will_exist
= true;
2344 inputs
.push_back(table
);
2347 // If any inputs lack a termlist table, suppress it in the output.
2348 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2349 if (inputs_present
!= 0) {
2351 string m
= str(inputs_present
);
2353 m
+= str(sources
.size());
2354 m
+= " inputs present, so suppressing output";
2355 compactor
->set_status(t
->name
, m
);
2359 output_will_exist
= false;
2362 if (!output_will_exist
) {
2364 compactor
->set_status(t
->name
, "doesn't exist");
2369 off_t table_start_offset
= -1;
2372 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2373 // space for version file. It's tricky to exactly know the
2374 // size of the version file beforehand.
2375 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2376 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2377 throw Xapian::DatabaseError("lseek() failed", errno
);
2379 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2381 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2384 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2386 tabs
.push_back(out
);
2387 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2389 root_info
->set_free_list(fl_serialised
);
2390 root_info
->set_offset(table_start_offset
);
2392 version_file_out
->get_root(t
->type
),
2393 version_file_out
->get_revision());
2395 out
->create_and_open(FLAGS
, *root_info
);
2398 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2399 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2402 case Honey::POSTLIST
: {
2403 if (multipass
&& inputs
.size() > 3) {
2404 multimerge_postlists(compactor
, out
, destdir
,
2407 merge_postlists(compactor
, out
, offset
.begin(),
2408 inputs
.begin(), inputs
.end());
2412 case Honey::SPELLING
:
2413 merge_spellings(out
, inputs
.begin(), inputs
.end());
2415 case Honey::SYNONYM
:
2416 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2418 case Honey::POSITION
:
2419 merge_positions(out
, inputs
, offset
);
2422 // DocData, Termlist
2423 merge_docid_keyed(out
, inputs
, offset
);
2427 // Commit as revision 1.
2429 out
->commit(1, root_info
);
2431 if (single_file
) fl_serialised
= root_info
->get_free_list();
2434 if (!bad_stat
&& !single_file_in
) {
2437 db_size
= file_size(fd
);
2439 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2443 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2444 prev_size
= db_size
;
2445 db_size
-= old_prev_size
;
2447 // FIXME: check overflow and set bad_totals
2448 out_total
+= db_size
;
2449 out_size
= db_size
/ 1024;
2450 } else if (errno
!= ENOENT
) {
2451 bad_totals
= bad_stat
= true;
2456 compactor
->set_status(t
->name
,
2457 "Done (couldn't stat all the DB files)");
2458 } else if (single_file_in
) {
2460 compactor
->set_status(t
->name
,
2461 "Done (table sizes unknown for single "
2465 if (out_size
== in_size
) {
2466 status
= "Size unchanged (";
2469 if (out_size
< in_size
) {
2470 delta
= in_size
- out_size
;
2471 status
= "Reduced by ";
2473 delta
= out_size
- in_size
;
2474 status
= "INCREASED by ";
2477 status
+= str(100 * delta
/ in_size
);
2480 status
+= str(delta
);
2482 status
+= str(in_size
);
2485 status
+= str(out_size
);
2488 compactor
->set_status(t
->name
, status
);
2492 // If compacting to a single file output and all the tables are empty, pad
2493 // the output so that it isn't mistaken for a stub database when we try to
2494 // open it. For this it needs to be a multiple of 2KB in size.
2495 if (single_file
&& prev_size
< off_t(block_size
)) {
2496 #ifdef HAVE_FTRUNCATE
2497 if (ftruncate(fd
, block_size
) < 0) {
2498 throw Xapian::DatabaseError("Failed to set size of output "
2502 const off_t off
= block_size
- 1;
2503 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2504 throw Xapian::DatabaseError("Failed to set size of output "
2511 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2512 throw Xapian::DatabaseError("lseek() failed", errno
);
2515 version_file_out
->set_last_docid(last_docid
);
2516 string tmpfile
= version_file_out
->write(1, FLAGS
);
2517 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2520 // Commit with revision 1.
2521 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2522 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2527 if (!single_file
) lock
.release();
2529 if (!bad_totals
&& compactor
) {
2533 if (out_total
== in_total
) {
2534 status
= "Size unchanged (";
2537 if (out_total
< in_total
) {
2538 delta
= in_total
- out_total
;
2539 status
= "Reduced by ";
2541 delta
= out_total
- in_total
;
2542 status
= "INCREASED by ";
2545 status
+= str(100 * delta
/ in_total
);
2548 status
+= str(delta
);
2550 status
+= str(in_total
);
2553 status
+= str(out_total
);
2555 compactor
->set_status("Total", status
);