1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
33 #include <type_traits>
38 #include "safeerrno.h"
40 #include "backends/flint_lock.h"
41 #include "compression_stream.h"
42 #include "honey_cursor.h"
43 #include "honey_database.h"
44 #include "honey_defs.h"
45 #include "honey_postlist_encodings.h"
46 #include "honey_table.h"
47 #include "honey_version.h"
48 #include "filetests.h"
49 #include "internaltypes.h"
51 #include "backends/valuestats.h"
52 #include "wordaccess.h"
54 #include "../byte_length_strings.h"
55 #include "../prefix_compressed_strings.h"
57 #ifndef DISABLE_GPL_LIBXAPIAN
58 # include "../glass/glass_database.h"
59 # include "../glass/glass_table.h"
64 #ifndef DISABLE_GPL_LIBXAPIAN
// Throw a DatabaseCorruptError describing a failure while unpacking a
// glass termlist entry.
//
// NOTE(review): this extract is missing interior source lines (the
// embedded numbering jumps 67 -> 71 -> 73 -> 76), so the return type,
// braces and the condition choosing between the two messages are not
// visible.  From what is shown: `message` is set to either
// "Value overflow unpacking termlist: " or
// "Out of data unpacking termlist: " — presumably depending on the
// state of `pos` (TODO: confirm against full source) — and the error is
// then thrown; `item` is presumably appended to `message` on a missing
// line.
67 throw_database_corrupt(const char* item
, const char* pos
)
71 message
= "Value overflow unpacking termlist: ";
73 message
= "Out of data unpacking termlist: ";
76 throw Xapian::DatabaseCorruptError(message
);
79 namespace GlassCompact
{
// Helpers for recognising the different kinds of key stored in a glass
// postlist table.  Each special stream is keyed by a leading '\0' byte
// followed by a distinguishing marker byte.
//
// NOTE(review): interior source lines are missing from this extract
// (the embedded numbering is sparse), so the function introducers
// (return types, braces) are not visible below.
// Key is a user-metadata entry: "\0\xc0...".
82 is_user_metadata_key(const string
& key
)
84 return key
.size() > 1 && key
[0] == '\0' && key
[1] == '\xc0';
// Key is a value-statistics entry: "\0\xd0...".
88 is_valuestats_key(const string
& key
)
90 return key
.size() > 1 && key
[0] == '\0' && key
[1] == '\xd0';
// Key is a value-stream chunk: "\0\xd8...".
94 is_valuechunk_key(const string
& key
)
96 return key
.size() > 1 && key
[0] == '\0' && key
[1] == '\xd8';
// Key is a document-length chunk: "\0\xe0...".
100 is_doclenchunk_key(const string
& key
)
102 return key
.size() > 1 && key
[0] == '\0' && key
[1] == '\xe0';
// Test whether a termlist-table key is the auxiliary "values used"
// entry for a document: the key decodes as a preserving-sort docid
// followed by exactly one trailing '\0' byte.  A key which doesn't
// match the expected format throws DatabaseCorruptError.
109 termlist_key_is_values_used(const string
& key
)
111 const char* p
= key
.data();
112 const char* e
= p
+ key
.size();
// Decode the docid; the declaration of `did` is on a line missing
// from this extract.
114 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
// A single '\0' byte after the docid marks a "values used" key.
117 if (e
- p
== 1 && *p
== '\0')
120 throw Xapian::DatabaseCorruptError("termlist key format");
123 // Put all the helpers in a namespace to avoid symbols colliding with those of
124 // the same name in other flint-derived backends.
125 namespace HoneyCompact
{
// Marker byte values for the key streams in a honey postlist table.
// NOTE(review): the enum introducer line is missing from this extract.
128 KEY_USER_METADATA
= 0x00,
129 KEY_VALUE_STATS
= 0x01,
130 KEY_VALUE_CHUNK
= 0xd8,
131 KEY_DOCLEN_CHUNK
= 0xe0,
132 KEY_POSTING_CHUNK
= 0xff
135 /// Return one of the KEY_* constants, or a different value for an invalid key.
// NOTE(review): the guard before this early return is on a missing
// line; presumably it fires for keys which don't start with '\0'
// (ordinary posting-chunk keys) — TODO confirm against full source.
137 key_type(const string
& key
)
140 return KEY_POSTING_CHUNK
;
// Dispatch on the marker byte; marker bytes 0x01..0x04 all map to
// KEY_VALUE_STATS.
145 switch (static_cast<unsigned char>(key
[1])) {
146 case 0x01: case 0x02: case 0x03: case 0x04:
147 return KEY_VALUE_STATS
;
150 // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK.
151 return static_cast<unsigned char>(key
[1]);
156 bool read_only
= true;
157 mutable size_t buf_end
= 0;
158 mutable char buf
[4096];
164 if (fd
>= 0) close(fd
);
167 bool open(const std::string
& path
, bool read_only_
) {
168 if (fd
>= 0) close(fd
);
169 read_only
= read_only_
;
171 // FIXME: add new io_open_stream_rd() etc?
172 fd
= io_open_block_rd(path
);
174 fd
= io_open_block_wr(path
, true); // FIXME: Always create anew for now...
179 off_t
get_pos() const {
181 lseek(fd
, 0, SEEK_CUR
) - buf_end
:
182 lseek(fd
, 0, SEEK_CUR
) + buf_end
;
186 if (buf_end
) return false;
188 if (fd
== -1 || fstat(fd
, &sbuf
) < 0) return true;
189 return (sbuf
.st_size
== 0);
192 void write(unsigned char ch
) {
193 if (buf_end
== sizeof(buf
)) {
195 if (::write(fd
, buf
, buf_end
)) {
196 // FIXME: retry short write
203 void write(const char* p
, size_t len
) {
204 if (buf_end
+ len
<= sizeof(buf
)) {
205 memcpy(buf
+ buf_end
, p
, len
);
212 iov
[0].iov_base
= buf
;
213 iov
[0].iov_len
= buf_end
;
214 iov
[1].iov_base
= const_cast<char*>(p
);
215 iov
[1].iov_len
= len
;
216 ssize_t n_
= writev(fd
, iov
, 2);
219 if (n
== buf_end
+ len
) {
229 if (::write(fd
, p
, len
)) {
230 // FIXME: retry short write
236 memmove(buf
, buf
+ n
, buf_end
);
242 ssize_t r
= ::read(fd
, buf
, sizeof(buf
));
243 if (r
== 0) return EOF
;
244 // FIXME: retry on EINTR
248 return static_cast<unsigned char>(buf
[sizeof(buf
) - buf_end
--]);
251 bool read(char* p
, size_t len
) const {
252 if (len
<= buf_end
) {
253 memcpy(p
, buf
+ sizeof(buf
) - buf_end
, len
);
257 memcpy(p
, buf
+ sizeof(buf
) - buf_end
, buf_end
);
261 // FIXME: refill buffer if len < sizeof(buf)
262 return ::read(fd
, p
, len
) == ssize_t(len
);
266 if (!read_only
&& buf_end
) {
267 if (::write(fd
, buf
, buf_end
)) {
268 // FIXME: retry short write
280 lseek(fd
, 0, SEEK_SET
);
285 template<typename T
> class PostlistCursor
;
287 #ifndef DISABLE_GPL_LIBXAPIAN
288 // Convert glass to honey.
290 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
291 Xapian::docid offset
;
295 Xapian::docid firstdid
;
296 Xapian::docid chunk_lastdid
;
297 Xapian::termcount tf
, cf
;
298 Xapian::termcount first_wdf
;
300 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
301 : GlassCursor(in
), offset(offset_
), firstdid(0)
303 find_entry(string());
308 if (!GlassCursor::next()) return false;
309 // We put all chunks into the non-initial chunk form here, then fix up
310 // the first chunk for each term in the merged database as we merge.
315 if (GlassCompact::is_user_metadata_key(key
)) {
316 key
[1] = KEY_USER_METADATA
;
319 if (GlassCompact::is_valuestats_key(key
)) {
321 const char * p
= key
.data();
322 const char * end
= p
+ key
.length();
324 Xapian::valueno slot
;
325 if (!unpack_uint_last(&p
, end
, &slot
))
326 throw Xapian::DatabaseCorruptError("bad value stats key");
327 key
= pack_honey_valuestats_key(slot
);
330 if (GlassCompact::is_valuechunk_key(key
)) {
331 const char * p
= key
.data();
332 const char * end
= p
+ key
.length();
334 Xapian::valueno slot
;
335 if (!unpack_uint(&p
, end
, &slot
))
336 throw Xapian::DatabaseCorruptError("bad value key");
338 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
339 throw Xapian::DatabaseCorruptError("bad value key");
342 key
.assign("\0\xd8", 2);
343 pack_uint(key
, slot
);
344 pack_uint_preserving_sort(key
, did
);
348 if (GlassCompact::is_doclenchunk_key(key
)) {
349 const char * d
= key
.data();
350 const char * e
= d
+ key
.size();
354 // This is an initial chunk, so adjust tag header.
357 if (!unpack_uint(&d
, e
, &tf
) ||
358 !unpack_uint(&d
, e
, &cf
) ||
359 !unpack_uint(&d
, e
, &firstdid
)) {
360 throw Xapian::DatabaseCorruptError("Bad postlist key");
363 tag
.erase(0, d
- tag
.data());
365 // Not an initial chunk, so adjust key.
366 size_t tmp
= d
- key
.data();
367 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
368 throw Xapian::DatabaseCorruptError("Bad postlist key");
376 // Convert doclen chunk to honey format.
379 // Skip the "last chunk" flag and increase_to_last.
381 throw Xapian::DatabaseCorruptError("No last chunk flag in glass docdata chunk");
383 Xapian::docid increase_to_last
;
384 if (!unpack_uint(&d
, e
, &increase_to_last
))
385 throw Xapian::DatabaseCorruptError("Decoding last docid delta in glass docdata chunk");
388 Xapian::termcount doclen
;
389 if (!unpack_uint(&d
, e
, &doclen
))
390 throw Xapian::DatabaseCorruptError("Decoding doclen in glass docdata chunk");
391 Assert(doclen
!= 0xffffffff);
392 unsigned char buf
[4];
393 unaligned_write4(buf
, doclen
);
394 newtag
.append(reinterpret_cast<char*>(buf
), 4);
397 Xapian::docid gap_size
;
398 if (!unpack_uint(&d
, e
, &gap_size
))
399 throw Xapian::DatabaseCorruptError("Decoding docid gap_size in glass docdata chunk");
400 // FIXME: Split chunk if the gap_size is at all large.
401 newtag
.append(4 * gap_size
, '\xff');
404 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
405 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
412 // Adjust key if this is *NOT* an initial chunk.
413 // key is: pack_string_preserving_sort(key, tname)
414 // plus optionally: pack_uint_preserving_sort(key, did)
415 const char * d
= key
.data();
416 const char * e
= d
+ key
.size();
418 if (!unpack_string_preserving_sort(&d
, e
, tname
))
419 throw Xapian::DatabaseCorruptError("Bad postlist key");
422 // This is an initial chunk for a term, so adjust tag header.
425 if (!unpack_uint(&d
, e
, &tf
) ||
426 !unpack_uint(&d
, e
, &cf
) ||
427 !unpack_uint(&d
, e
, &firstdid
)) {
428 throw Xapian::DatabaseCorruptError("Bad postlist key");
431 tag
.erase(0, d
- tag
.data());
433 // Not an initial chunk, so adjust key.
434 size_t tmp
= d
- key
.data();
435 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
436 throw Xapian::DatabaseCorruptError("Bad postlist key");
444 // Convert posting chunk to honey format, but without any header.
447 // Skip the "last chunk" flag; decode increase_to_last.
449 throw Xapian::DatabaseCorruptError("No last chunk flag in glass posting chunk");
451 Xapian::docid increase_to_last
;
452 if (!unpack_uint(&d
, e
, &increase_to_last
))
453 throw Xapian::DatabaseCorruptError("Decoding last docid delta in glass posting chunk");
454 chunk_lastdid
= firstdid
+ increase_to_last
;
455 if (!unpack_uint(&d
, e
, &first_wdf
))
456 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass posting chunk");
460 if (!unpack_uint(&d
, e
, &delta
))
461 throw Xapian::DatabaseCorruptError("Decoding docid delta in glass posting chunk");
462 pack_uint(newtag
, delta
);
463 Xapian::termcount wdf
;
464 if (!unpack_uint(&d
, e
, &wdf
))
465 throw Xapian::DatabaseCorruptError("Decoding wdf in glass posting chunk");
466 pack_uint(newtag
, wdf
);
477 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
478 Xapian::docid offset
;
482 Xapian::docid firstdid
;
483 Xapian::docid chunk_lastdid
;
484 Xapian::termcount tf
, cf
;
485 Xapian::termcount first_wdf
;
487 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
488 : HoneyCursor(in
), offset(offset_
), firstdid(0)
490 find_entry(string());
495 if (!HoneyCursor::next()) return false;
496 // We put all chunks into the non-initial chunk form here, then fix up
497 // the first chunk for each term in the merged database as we merge.
502 switch (key_type(key
)) {
503 case KEY_USER_METADATA
:
504 case KEY_VALUE_STATS
:
506 case KEY_VALUE_CHUNK
: {
507 const char * p
= key
.data();
508 const char * end
= p
+ key
.length();
510 Xapian::valueno slot
;
511 if (!unpack_uint(&p
, end
, &slot
))
512 throw Xapian::DatabaseCorruptError("bad value key");
514 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
515 throw Xapian::DatabaseCorruptError("bad value key");
518 key
.assign("\0\xd8", 2);
519 pack_uint(key
, slot
);
520 pack_uint_preserving_sort(key
, did
);
523 case KEY_DOCLEN_CHUNK
: {
524 const char * p
= key
.data();
525 const char * end
= p
+ key
.length();
527 Xapian::docid did
= 1;
529 (!unpack_uint_preserving_sort(&p
, end
, &did
) || p
!= end
)) {
530 throw Xapian::DatabaseCorruptError("Bad doclen key");
536 pack_uint_preserving_sort(key
, did
);
540 case KEY_POSTING_CHUNK
:
543 throw Xapian::DatabaseCorruptError("Bad postlist table key type");
546 // Adjust key if this is *NOT* an initial chunk.
547 // key is: pack_string_preserving_sort(key, term)
548 // plus optionally: pack_uint_preserving_sort(key, did)
549 const char * d
= key
.data();
550 const char * e
= d
+ key
.size();
552 if (!unpack_string_preserving_sort(&d
, e
, term
))
553 throw Xapian::DatabaseCorruptError("Bad postlist key");
556 // This is an initial chunk for a term, so remove tag header.
560 Xapian::docid lastdid
;
561 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
562 firstdid
, lastdid
, chunk_lastdid
,
564 throw Xapian::DatabaseCorruptError("Bad postlist initial chunk header");
566 // Ignore lastdid - we'll need to recalculate it (at least when
567 // merging, and for simplicity we always do).
569 tag
.erase(0, d
- tag
.data());
571 // Not an initial chunk, so adjust key and remove tag header.
572 size_t tmp
= d
- key
.data();
573 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
575 throw Xapian::DatabaseCorruptError("Bad postlist key");
577 // -1 to remove the terminating zero byte too.
584 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
586 throw Xapian::DatabaseCorruptError("Bad postlist delta chunk header");
588 tag
.erase(0, d
- tag
.data());
590 if (!decode_delta_chunk_header_bool(&d
, e
, chunk_lastdid
, firstdid
)) {
591 throw Xapian::DatabaseCorruptError("Bad postlist delta chunk header");
594 // Splice in a 0 wdf value after each docid delta.
596 for (size_t i
= d
- tag
.data(); i
< tag
.size(); i
+= 4) {
597 newtag
.append(tag
, i
, 4);
598 newtag
.append(4, '\0');
600 tag
= std::move(newtag
);
604 chunk_lastdid
+= offset
;
610 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
612 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
613 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
// Comparator making priority_queue<PostlistCursor*> a min-heap on
// (key, firstdid): operator() is "strictly greater", so top() is the
// cursor with the smallest key, ties broken by smaller firstdid.
617 class PostlistCursorGt
{
619 /** Return true if and only if a's key is strictly greater than b's key.
621 bool operator()(const T
* a
, const T
* b
) const {
622 if (a
->key
> b
->key
) return true;
623 if (a
->key
!= b
->key
) return false;
// Equal keys: order by the first docid in the chunk.
624 return (a
->firstdid
> b
->firstdid
);
// Serialise value statistics for one slot: packed freq, then the
// length-prefixed lower bound, then the upper bound's raw bytes
// (omitted when it equals the lower bound — see comment below).
// NOTE(review): the return-type line and the declaration of `value`
// are on lines missing from this extract.
629 encode_valuestats(Xapian::doccount freq
,
630 const string
& lbound
, const string
& ubound
)
633 pack_uint(value
, freq
);
634 pack_string(value
, lbound
);
635 // We don't store or count empty values, so neither of the bounds
636 // can be empty. So we can safely store an empty upper bound when
637 // the bounds are equal.
638 if (lbound
!= ubound
) value
+= ubound
;
642 // U : vector<HoneyTable*>::const_iterator
643 template<typename T
, typename U
> void
644 merge_postlists(Xapian::Compactor
* compactor
,
645 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
648 typedef decltype(**b
) table_type
; // E.g. HoneyTable
649 typedef PostlistCursor
<table_type
> cursor_type
;
650 typedef PostlistCursorGt
<cursor_type
> gt_type
;
651 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
652 for ( ; b
!= e
; ++b
, ++offset
) {
655 // Skip empty tables.
659 pq
.push(new cursor_type(in
, *offset
));
664 // Merge user metadata.
666 while (!pq
.empty()) {
667 cursor_type
* cur
= pq
.top();
668 const string
& key
= cur
->key
;
669 if (key_type(key
) != KEY_USER_METADATA
) break;
671 if (key
!= last_key
) {
673 if (tags
.size() > 1 && compactor
) {
674 Assert(!last_key
.empty());
675 // FIXME: It would be better to merge all duplicates
676 // for a key in one call, but currently we don't in
678 const string
& resolved_tag
=
679 compactor
->resolve_duplicate_metadata(last_key
,
682 out
->add(last_key
, resolved_tag
);
684 Assert(!last_key
.empty());
685 out
->add(last_key
, tags
[0]);
691 tags
.push_back(cur
->tag
);
701 if (tags
.size() > 1 && compactor
) {
702 Assert(!last_key
.empty());
703 const string
& resolved_tag
=
704 compactor
->resolve_duplicate_metadata(last_key
,
707 out
->add(last_key
, resolved_tag
);
709 Assert(!last_key
.empty());
710 out
->add(last_key
, tags
[0]);
717 Xapian::doccount freq
= 0;
718 string lbound
, ubound
;
720 while (!pq
.empty()) {
721 cursor_type
* cur
= pq
.top();
722 const string
& key
= cur
->key
;
723 if (key_type(key
) != KEY_VALUE_STATS
) break;
724 if (key
!= last_key
) {
725 // For the first valuestats key, last_key will be the previous
726 // key we wrote, which we don't want to overwrite. This is the
727 // only time that freq will be 0, so check that.
729 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
735 const string
& tag
= cur
->tag
;
737 const char * pos
= tag
.data();
738 const char * end
= pos
+ tag
.size();
742 if (!unpack_uint(&pos
, end
, &f
)) {
743 if (*pos
== 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
744 throw Xapian::RangeError("Frequency statistic in value table is too large");
746 if (!unpack_string(&pos
, end
, l
)) {
747 if (*pos
== 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
748 throw Xapian::RangeError("Lower bound in value table is too large");
750 size_t len
= end
- pos
;
762 if (l
< lbound
) lbound
= l
;
763 if (u
> ubound
) ubound
= u
;
775 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
779 // Merge valuestream chunks.
780 while (!pq
.empty()) {
781 cursor_type
* cur
= pq
.top();
782 const string
& key
= cur
->key
;
783 if (key_type(key
) != KEY_VALUE_CHUNK
) break;
784 out
->add(key
, cur
->tag
);
793 // Merge doclen chunks.
794 while (!pq
.empty()) {
795 cursor_type
* cur
= pq
.top();
796 string
& key
= cur
->key
;
797 if (key_type(key
) != KEY_DOCLEN_CHUNK
) break;
798 if (cur
->firstdid
!= 1)
799 pack_uint_preserving_sort(key
, cur
->firstdid
);
800 out
->add(key
, cur
->tag
);
809 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
811 struct HoneyPostListChunk
{
812 Xapian::docid first
, last
;
813 Xapian::termcount first_wdf
;
816 HoneyPostListChunk(Xapian::docid first_
,
818 Xapian::termcount first_wdf_
,
822 first_wdf(first_wdf_
),
825 vector
<HoneyPostListChunk
> tags
;
827 cursor_type
* cur
= NULL
;
832 Assert(cur
== NULL
|| key_type(cur
->key
) == KEY_POSTING_CHUNK
);
833 if (cur
== NULL
|| cur
->key
!= last_key
) {
835 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
836 Xapian::docid chunk_lastdid
= tags
[0].last
;
837 Xapian::docid last_did
= tags
.back().last
;
840 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
842 first_wdf
, first_tag
);
844 first_tag
+= tags
[0].data
;
846 out
->add(last_key
, first_tag
);
848 // If tf == 2, the data could be split over two tags when
849 // merging, but we only output an initial tag in this case.
850 if (tf
> 2 && tags
.size() > 1) {
852 const char* p
= last_key
.data();
853 const char* end
= p
+ last_key
.size();
854 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
856 throw Xapian::DatabaseCorruptError("Bad postlist "
860 auto i
= tags
.begin();
861 while (++i
!= tags
.end()) {
863 const string
& key
= pack_honey_postlist_key(term
,
867 encode_delta_chunk_header(i
->first
,
873 encode_delta_chunk_header_bool(i
->first
,
876 const char* pos
= i
->data
.data();
877 const char* pos_end
= pos
+ i
->data
.size();
878 while (pos
!= pos_end
) {
880 if (!unpack_uint(&pos
, pos_end
, &delta
))
881 throw Xapian::DatabaseCorruptError("Decoding docid delta");
882 pack_uint(tag
, delta
);
883 Xapian::termcount wdf
;
884 if (!unpack_uint(&pos
, pos_end
, &wdf
))
885 throw Xapian::DatabaseCorruptError("Decoding wdf");
886 // Only copy over the docid deltas, not the wdfs.
896 if (cur
== NULL
) break;
904 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
907 std::move(cur
->tag
)));
916 template<typename T
> struct MergeCursor
;
918 #ifndef DISABLE_GPL_LIBXAPIAN
920 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
921 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
922 find_entry(string());
929 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
930 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
931 find_entry(string());
// MergeCursor specialisation reading items directly from a (non-const)
// HoneyTable via read_item() rather than via HoneyCursor.
// NOTE(review): interior lines (e.g. the `table` member declaration,
// the declaration of `new_tag`, and the next()/after_end() wrappers)
// are missing from this extract.
937 struct MergeCursor
<HoneyTable
&> {
// Key/tag of the item the cursor is currently positioned on.
939 string current_key
, current_tag
;
// Whether current_tag still holds compressed bytes.
940 bool current_compressed
;
// mutable so read_tag() can decompress from const contexts.
941 mutable CompressionStream comp_stream
;
943 explicit MergeCursor(HoneyTable
*in
) : table(in
), comp_stream(Z_DEFAULT_STRATEGY
) {
948 return table
->read_item(current_key
, current_tag
, current_compressed
);
// Ensure current_tag is usable: if the caller doesn't want compressed
// data and the tag is compressed, inflate it in place.  Returns
// whether current_tag is (still) compressed.
951 bool read_tag(bool keep_compressed
) {
952 if (!keep_compressed
&& current_compressed
) {
953 // Need to decompress.
954 comp_stream
.decompress_start();
956 if (!comp_stream
.decompress_chunk(current_tag
.data(), current_tag
.size(), new_tag
)) {
957 // Decompression didn't complete.
960 swap(current_tag
, new_tag
);
961 current_compressed
= false;
963 return current_compressed
;
969 /// Return true if and only if a's key is strictly greater than b's key.
// Comparator for the spelling/synonym merge priority queues: cursors
// which are after_end() compare greater than any live cursor (so they
// sink in the min-heap); otherwise order by current_key.
// NOTE(review): the enclosing struct's introducer line is missing from
// this extract.
970 bool operator()(const T
* a
, const T
* b
) const {
971 if (b
->after_end()) return false;
972 if (a
->after_end()) return true;
973 return (a
->current_key
> b
->current_key
);
977 // U : vector<HoneyTable*>::const_iterator
978 template<typename T
, typename U
> void
979 merge_spellings(T
* out
, U b
, U e
)
981 typedef decltype(**b
) table_type
; // E.g. HoneyTable
982 typedef MergeCursor
<table_type
> cursor_type
;
983 typedef CursorGt
<cursor_type
> gt_type
;
984 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
985 for ( ; b
!= e
; ++b
) {
988 pq
.push(new cursor_type(in
));
992 while (!pq
.empty()) {
993 cursor_type
* cur
= pq
.top();
996 string key
= cur
->current_key
;
997 if (pq
.empty() || pq
.top()->current_key
> key
) {
998 // No need to merge the tags, just copy the (possibly compressed)
1000 bool compressed
= cur
->read_tag(true);
1001 out
->add(key
, cur
->current_tag
, compressed
);
1010 // Merge tag values with the same key:
1012 if (key
[0] != 'W') {
1013 // We just want the union of words, so copy over the first instance
1014 // and skip any identical ones.
1015 priority_queue
<PrefixCompressedStringItor
*,
1016 vector
<PrefixCompressedStringItor
*>,
1017 PrefixCompressedStringItorGt
> pqtag
;
1018 // Stick all the cursor_type pointers in a vector because their
1019 // current_tag members must remain valid while we're merging their
1020 // tags, but we need to call next() on them all afterwards.
1021 vector
<cursor_type
*> vec
;
1022 vec
.reserve(pq
.size());
1026 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1028 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1033 PrefixCompressedStringWriter
wr(tag
);
1035 while (!pqtag
.empty()) {
1036 PrefixCompressedStringItor
* it
= pqtag
.top();
1039 if (word
!= lastword
) {
1041 wr
.append(lastword
);
1044 if (!it
->at_end()) {
1051 for (auto i
: vec
) {
1060 // We want to sum the frequencies from tags for the same key.
1061 Xapian::termcount tot_freq
= 0;
1064 Xapian::termcount freq
;
1065 const char * p
= cur
->current_tag
.data();
1066 const char * end
= p
+ cur
->current_tag
.size();
1067 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1068 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
1076 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1081 pack_uint_last(tag
, tot_freq
);
1087 // U : vector<HoneyTable*>::const_iterator
1088 template<typename T
, typename U
> void
1089 merge_synonyms(T
* out
, U b
, U e
)
1091 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1092 typedef MergeCursor
<table_type
> cursor_type
;
1093 typedef CursorGt
<cursor_type
> gt_type
;
1094 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1095 for ( ; b
!= e
; ++b
) {
1098 pq
.push(new cursor_type(in
));
1102 while (!pq
.empty()) {
1103 cursor_type
* cur
= pq
.top();
1106 string key
= cur
->current_key
;
1107 if (pq
.empty() || pq
.top()->current_key
> key
) {
1108 // No need to merge the tags, just copy the (possibly compressed)
1110 bool compressed
= cur
->read_tag(true);
1111 out
->add(key
, cur
->current_tag
, compressed
);
1120 // Merge tag values with the same key:
1123 // We just want the union of words, so copy over the first instance
1124 // and skip any identical ones.
1125 priority_queue
<ByteLengthPrefixedStringItor
*,
1126 vector
<ByteLengthPrefixedStringItor
*>,
1127 ByteLengthPrefixedStringItorGt
> pqtag
;
1128 vector
<cursor_type
*> vec
;
1132 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1134 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1140 while (!pqtag
.empty()) {
1141 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1143 if (**it
!= lastword
) {
1145 tag
+= byte(lastword
.size() ^ MAGIC_XOR_VALUE
);
1149 if (!it
->at_end()) {
1156 for (auto i
: vec
) {
1169 template<typename T
, typename U
> void
1170 multimerge_postlists(Xapian::Compactor
* compactor
,
1171 T
* out
, const char * tmpdir
,
1172 const vector
<U
*>& in
,
1173 vector
<Xapian::docid
> off
)
1175 if (in
.size() <= 3) {
1176 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1180 vector
<HoneyTable
*> tmp
;
1181 tmp
.reserve(in
.size() / 2);
1183 vector
<Xapian::docid
> newoff
;
1184 newoff
.resize(in
.size() / 2);
1185 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1187 if (j
== in
.size() - 1) ++j
;
1189 string dest
= tmpdir
;
1191 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1194 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1196 // Use maximum blocksize for temporary tables. And don't compress
1197 // entries in temporary tables, even if the final table would do
1198 // so. Any already compressed entries will get copied in
1199 // compressed form. (FIXME: HoneyTable has no blocksize)
1200 Honey::RootInfo root_info
;
1201 root_info
.init(65536, 0);
1202 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1203 tmptab
->create_and_open(flags
, root_info
);
1205 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1206 in
.begin() + i
, in
.begin() + j
);
1207 tmp
.push_back(tmptab
);
1209 tmptab
->commit(1, &root_info
);
1210 AssertRel(root_info
.get_blocksize(),==,65536);
1216 while (tmp
.size() > 3) {
1217 vector
<HoneyTable
*> tmpout
;
1218 tmpout
.reserve(tmp
.size() / 2);
1219 vector
<Xapian::docid
> newoff
;
1220 newoff
.resize(tmp
.size() / 2);
1221 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1223 if (j
== tmp
.size() - 1) ++j
;
1225 string dest
= tmpdir
;
1227 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1230 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1232 // Use maximum blocksize for temporary tables. And don't compress
1233 // entries in temporary tables, even if the final table would do
1234 // so. Any already compressed entries will get copied in
1235 // compressed form. (FIXME: HoneyTable has no blocksize)
1236 Honey::RootInfo root_info
;
1237 root_info
.init(65536, 0);
1238 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1239 tmptab
->create_and_open(flags
, root_info
);
1241 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1242 tmp
.begin() + i
, tmp
.begin() + j
);
1244 for (unsigned int k
= i
; k
< j
; ++k
) {
1245 // FIXME: unlink(tmp[k]->get_path().c_str());
1250 tmpout
.push_back(tmptab
);
1252 tmptab
->commit(1, &root_info
);
1253 AssertRel(root_info
.get_blocksize(),==,65536);
1259 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1261 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1262 // FIXME: unlink(tmp[k]->get_path().c_str());
1269 template<typename T
> class PositionCursor
;
1271 #ifndef DISABLE_GPL_LIBXAPIAN
1273 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
1274 Xapian::docid offset
;
1278 Xapian::docid firstdid
;
1280 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1281 : GlassCursor(in
), offset(offset_
), firstdid(0) {
1282 find_entry(string());
1287 if (!GlassCursor::next()) return false;
1289 const char * d
= current_key
.data();
1290 const char * e
= d
+ current_key
.size();
1293 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1294 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1296 throw Xapian::DatabaseCorruptError("Bad position key");
1300 pack_string_preserving_sort(key
, term
);
1301 pack_uint_preserving_sort(key
, did
+ offset
);
1305 const string
& get_tag() const {
1312 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
1313 Xapian::docid offset
;
1317 Xapian::docid firstdid
;
1319 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1320 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
1321 find_entry(string());
1326 if (!HoneyCursor::next()) return false;
1328 const char * d
= current_key
.data();
1329 const char * e
= d
+ current_key
.size();
1332 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1333 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1335 throw Xapian::DatabaseCorruptError("Bad position key");
1339 pack_string_preserving_sort(key
, term
);
1340 pack_uint_preserving_sort(key
, did
+ offset
);
1344 const string
& get_tag() const {
1350 class PositionCursor
<HoneyTable
&> {
1352 Xapian::docid offset
;
1356 string current_key
, current_tag
;
1357 Xapian::docid firstdid
;
1359 PositionCursor(HoneyTable
*in
, Xapian::docid offset_
)
1360 : table(in
), offset(offset_
), firstdid(0) {
1366 if (!table
->read_item(current_key
, current_tag
, compressed
))
1368 if (compressed
) abort();
1369 const char * d
= current_key
.data();
1370 const char * e
= d
+ current_key
.size();
1373 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1374 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1376 throw Xapian::DatabaseCorruptError("Bad position key");
1380 pack_string_preserving_sort(key
, term
);
1381 pack_uint_preserving_sort(key
, did
+ offset
);
1385 const string
& get_tag() const {
// Comparator making priority_queue<PositionCursor*> a min-heap on key:
// operator() is "strictly greater", so top() is the smallest key.
1390 template<typename T
>
1391 class PositionCursorGt
{
1393 /** Return true if and only if a's key is strictly greater than b's key.
1395 bool operator()(const T
* a
, const T
* b
) const {
1396 return a
->key
> b
->key
;
1400 template<typename T
, typename U
> void
1401 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1402 const vector
<Xapian::docid
> & offset
)
1404 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1405 typedef PositionCursor
<table_type
> cursor_type
;
1406 typedef PositionCursorGt
<cursor_type
> gt_type
;
1407 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1408 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1409 auto in
= inputs
[i
];
1411 // Skip empty tables.
1415 pq
.push(new cursor_type(in
, offset
[i
]));
1418 while (!pq
.empty()) {
1419 cursor_type
* cur
= pq
.top();
1421 out
->add(cur
->key
, cur
->get_tag());
1430 template<typename T
, typename U
> void
1431 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1432 const vector
<Xapian::docid
> & offset
,
1435 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1436 Xapian::docid off
= offset
[i
];
1438 auto in
= inputs
[i
];
1439 if (in
->empty()) continue;
1441 HoneyCursor
cur(in
);
1442 cur
.find_entry(string());
1445 while (cur
.next()) {
1446 // Adjust the key if this isn't the first database.
1449 const char * d
= cur
.current_key
.data();
1450 const char * e
= d
+ cur
.current_key
.size();
1451 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1452 string msg
= "Bad key in ";
1453 msg
+= inputs
[i
]->get_path();
1454 throw Xapian::DatabaseCorruptError(msg
);
1458 pack_uint_preserving_sort(key
, did
);
1460 // Copy over anything extra in the key (e.g. the zero byte
1461 // at the end of "used value slots" in the termlist table).
1462 key
.append(d
, e
- d
);
1465 key
= cur
.current_key
;
1467 bool compressed
= cur
.read_tag(true);
1468 out
->add(key
, cur
.current_tag
, compressed
);
// NOTE(review): This region of the file is extraction-mangled: each original
// statement is split across several physical lines, the original line
// numbers are fused into the text, and many lines (braces, the
// "next_without_next:" label target, several guards and declarations such as
// "string enc;" / "string newtag;" / "pos += append;") are missing entirely.
// The code below is kept byte-identical; only comments are added.
//
// Purpose (from the visible fragments): merge docid-keyed tables from
// glass-format inputs into the honey output table T, rebasing docids by
// each input's offset.  When table_type is Honey::TERMLIST, each glass
// termlist tag is re-encoded into the honey termlist format, folding the
// following "value slots used" entry (if any) into the same output entry.
1473 #ifndef DISABLE_GPL_LIBXAPIAN
1474 template<typename T
> void
1475 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1476 const vector
<Xapian::docid
> & offset
,
// Loop over each input table, applying that input's docid offset.
1479 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1480 Xapian::docid off
= offset
[i
];
1482 auto in
= inputs
[i
];
1483 if (in
->empty()) continue;
1485 GlassCursor
cur(in
);
1486 cur
.find_entry(string());
1489 while (cur
.next()) {
1491 // Adjust the key if this isn't the first database.
// Rebase the packed docid at the start of the key by `off`.
1494 const char * d
= cur
.current_key
.data();
1495 const char * e
= d
+ cur
.current_key
.size();
1496 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1497 string msg
= "Bad key in ";
1498 msg
+= inputs
[i
]->get_path();
1499 throw Xapian::DatabaseCorruptError(msg
);
1503 pack_uint_preserving_sort(key
, did
);
1505 // Copy over anything extra in the key (e.g. the zero byte
1506 // at the end of "used value slots" in the termlist table).
1507 key
.append(d
, e
- d
);
// else-branch (its scaffolding is among the missing lines): no offset,
// so the key is used unchanged.
1510 key
= cur
.current_key
;
// Termlist entries need re-encoding to the honey format; other tables
// are copied through unchanged (see the tail of this loop).
1512 if (table_type
== Honey::TERMLIST
) {
1513 if (termlist_key_is_values_used(key
)) {
1514 throw Xapian::DatabaseCorruptError("value slots used entry without a corresponding termlist entry");
1517 string tag
= cur
.current_tag
;
// Peek at the next entry: if it is the "value slots used" entry for
// the same document, fold it into this termlist entry.
1519 bool next_result
= cur
.next();
1520 bool next_already_done
= true;
1521 string encoded_slots_used
;
1523 termlist_key_is_values_used(cur
.current_key
)) {
1524 next_already_done
= false;
1526 const string
& valtag
= cur
.current_tag
;
1528 const char* p
= valtag
.data();
1529 const char* end
= p
+ valtag
.size();
// Decode the glass list of used value slots (delta encoded).
1531 Xapian::VecCOW
<Xapian::termpos
> slots
;
1533 Xapian::valueno first_slot
;
1534 if (!unpack_uint(&p
, end
, &first_slot
)) {
1535 throw Xapian::DatabaseCorruptError("Value slot encoding corrupt");
1537 slots
.push_back(first_slot
);
1538 Xapian::valueno last_slot
= first_slot
;
1540 Xapian::valueno slot
;
1541 if (!unpack_uint(&p
, end
, &slot
)) {
1542 throw Xapian::DatabaseCorruptError("Value slot encoding corrupt");
// Each subsequent slot is stored as (delta - 1) from the previous one.
1544 slot
+= last_slot
+ 1;
1547 slots
.push_back(slot
);
// Re-encode for honey: the last slot as a plain uint; when more than
// one slot is used, append an interpolative-coded list of the rest.
1551 pack_uint(enc
, last_slot
);
1552 if (slots
.size() > 1) {
1553 BitWriter
slots_used(enc
);
1554 slots_used
.encode(first_slot
, last_slot
);
1555 slots_used
.encode(slots
.size() - 2, last_slot
- first_slot
);
1556 slots_used
.encode_interpolative(slots
, 0, slots
.size() - 1);
1557 encoded_slots_used
= slots_used
.freeze();
1559 encoded_slots_used
= std::move(enc
);
// Now re-encode the termlist tag itself.
1563 const char* pos
= tag
.data();
1564 const char* end
= pos
+ tag
.size();
// Prefix the new tag with the encoded slots-used data and its length.
1567 pack_uint(newtag
, encoded_slots_used
.size());
1568 newtag
+= encoded_slots_used
;
1571 Xapian::termcount doclen
;
1572 if (!unpack_uint(&pos
, end
, &doclen
)) {
1573 throw_database_corrupt("doclen", pos
);
1575 Xapian::termcount termlist_size
;
1576 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1577 throw_database_corrupt("termlist length", pos
);
1579 pack_uint(newtag
, termlist_size
- 1);
1580 pack_uint(newtag
, doclen
);
// Walk the glass prefix-compressed term entries.
1582 string current_term
;
1583 while (pos
!= end
) {
1584 Xapian::termcount current_wdf
= 0;
// After the first term, one byte gives how much of the previous term
// to reuse; values beyond the term length also encode a small wdf
// (see the div/mod below).
1586 if (!current_term
.empty()) {
1587 size_t reuse
= static_cast<unsigned char>(*pos
++);
1588 newtag
+= char(reuse
);
1590 if (reuse
> current_term
.size()) {
1591 current_wdf
= reuse
/ (current_term
.size() + 1);
1592 reuse
= reuse
% (current_term
.size() + 1);
1594 current_term
.resize(reuse
);
1597 throw_database_corrupt("term", NULL
);
1600 size_t append
= static_cast<unsigned char>(*pos
++);
1601 if (size_t(end
- pos
) < append
)
1602 throw_database_corrupt("term", NULL
);
1604 current_term
.append(pos
, append
);
// NOTE(review): "¤t_wdf" below is mojibake for "&current_wdf" — the
// byte sequence "&curren" was eaten as the HTML entity for '¤'.
// Restore the original spelling when un-mangling this file.
1610 if (!unpack_uint(&pos
, end
, ¤t_wdf
)) {
1611 throw_database_corrupt("wdf", pos
);
1613 pack_uint(newtag
, current_wdf
);
1616 newtag
+= char(append
);
1617 newtag
.append(current_term
.end() - append
, current_term
.end());
// Only emit an output entry when the re-encoded tag is non-empty.
1620 if (!newtag
.empty())
1621 out
->add(key
, newtag
);
1622 if (!next_result
) break;
// If the lookahead already consumed the next entry (it was not a
// value-slots entry), process it without another cur.next(); the
// "next_without_next:" label target is among the missing lines.
1623 if (next_already_done
) goto next_without_next
;
// Non-termlist tables: plain copy, preserving tag compression state.
1625 bool compressed
= cur
.read_tag(true);
1626 out
->add(key
, cur
.current_tag
, compressed
);
1635 using namespace HoneyCompact
;
1638 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1639 const char* destdir
,
1642 const vector
<const Xapian::Database::Internal
*>& sources
,
1643 const vector
<Xapian::docid
>& offset
,
1645 Xapian::Compactor::compaction_level compaction
,
1647 Xapian::docid last_docid
)
1650 // The "base name" of the table.
1653 Honey::table_type type
;
1654 // Create tables after position lazily.
1658 static const table_list tables
[] = {
1660 { "postlist", Honey::POSTLIST
, false },
1661 { "docdata", Honey::DOCDATA
, true },
1662 { "termlist", Honey::TERMLIST
, false },
1663 { "position", Honey::POSITION
, true },
1664 { "spelling", Honey::SPELLING
, true },
1665 { "synonym", Honey::SYNONYM
, true }
1667 const table_list
* tables_end
= tables
+
1668 (sizeof(tables
) / sizeof(tables
[0]));
1670 const int FLAGS
= Xapian::DB_DANGEROUS
;
1672 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1673 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1675 // FIXME: Support this combination - we need to put temporary files
1681 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1682 bool has_uncommitted_changes
;
1683 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1684 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1685 has_uncommitted_changes
= db
->has_uncommitted_changes();
1687 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1688 has_uncommitted_changes
= db
->has_uncommitted_changes();
1690 if (has_uncommitted_changes
) {
1692 "Can't compact from a WritableDatabase with uncommitted "
1693 "changes - either call commit() first, or create a new "
1694 "Database object from the filename on disk";
1695 throw Xapian::InvalidOperationError(m
);
1700 if (block_size
< HONEY_MIN_BLOCKSIZE
||
1701 block_size
> HONEY_MAX_BLOCKSIZE
||
1702 (block_size
& (block_size
- 1)) != 0) {
1703 block_size
= HONEY_DEFAULT_BLOCKSIZE
;
1706 FlintLock
lock(destdir
? destdir
: "");
1709 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1710 if (why
!= FlintLock::SUCCESS
) {
1711 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1715 unique_ptr
<HoneyVersion
> version_file_out
;
1718 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
1720 throw Xapian::DatabaseCreateError("open() failed", errno
);
1723 version_file_out
.reset(new HoneyVersion(fd
));
1726 version_file_out
.reset(new HoneyVersion(destdir
));
1729 version_file_out
->create(block_size
);
1730 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1731 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1732 #ifdef DISABLE_GPL_LIBXAPIAN
1735 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1736 version_file_out
->merge_stats(db
->version_file
.get_doccount(),
1737 db
->version_file
.get_doclength_lower_bound(),
1738 db
->version_file
.get_doclength_upper_bound(),
1739 db
->version_file
.get_wdf_upper_bound(),
1740 db
->version_file
.get_total_doclen(),
1741 db
->version_file
.get_spelling_wordfreq_upper_bound());
1744 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1745 version_file_out
->merge_stats(db
->version_file
);
1749 string fl_serialised
;
1753 fl
.set_first_unused_block(1); // FIXME: Assumption?
1754 fl
.pack(fl_serialised
);
1758 // Set to true if stat() failed (which can happen if the files are > 2GB
1759 // and off_t is 32 bit) or one of the totals overflowed.
1760 bool bad_totals
= false;
1761 off_t in_total
= 0, out_total
= 0;
1763 // FIXME: sort out indentation.
1764 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1765 #ifdef DISABLE_GPL_LIBXAPIAN
1766 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1768 vector
<HoneyTable
*> tabs
;
1769 tabs
.reserve(tables_end
- tables
);
1770 off_t prev_size
= block_size
;
1771 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
1772 // The postlist table requires an N-way merge, adjusting the
1773 // headers of various blocks. The spelling and synonym tables also
1774 // need special handling. The other tables have keys sorted in
1775 // docid order, so we can merge them by simply copying all the keys
1776 // from each source table in turn.
1778 compactor
->set_status(t
->name
, string());
1788 bool output_will_exist
= !t
->lazy
;
1790 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
1791 // on certain systems).
1792 bool bad_stat
= false;
1794 // We can't currently report input sizes if there's a single file DB
1795 // amongst the inputs.
1796 bool single_file_in
= false;
1800 vector
<const GlassTable
*> inputs
;
1801 inputs
.reserve(sources
.size());
1802 size_t inputs_present
= 0;
1803 for (auto src
: sources
) {
1804 auto db
= static_cast<const GlassDatabase
*>(src
);
1805 const GlassTable
* table
;
1807 case Honey::POSTLIST
:
1808 table
= &(db
->postlist_table
);
1810 case Honey::DOCDATA
:
1811 table
= &(db
->docdata_table
);
1813 case Honey::TERMLIST
:
1814 table
= &(db
->termlist_table
);
1816 case Honey::POSITION
:
1817 table
= &(db
->position_table
);
1819 case Honey::SPELLING
:
1820 table
= &(db
->spelling_table
);
1822 case Honey::SYNONYM
:
1823 table
= &(db
->synonym_table
);
1830 if (db
->single_file()) {
1831 if (t
->lazy
&& table
->empty()) {
1832 // Essentially doesn't exist.
1834 // FIXME: Find actual size somehow?
1835 // in_size += table->size() / 1024;
1836 single_file_in
= true;
1838 output_will_exist
= true;
1842 off_t db_size
= file_size(table
->get_path());
1844 // FIXME: check overflow and set bad_totals
1845 in_total
+= db_size
;
1846 in_size
+= db_size
/ 1024;
1847 output_will_exist
= true;
1849 } else if (errno
!= ENOENT
) {
1850 // We get ENOENT for an optional table.
1851 bad_totals
= bad_stat
= true;
1852 output_will_exist
= true;
1856 inputs
.push_back(table
);
1859 // If any inputs lack a termlist table, suppress it in the output.
1860 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
1861 if (inputs_present
!= 0) {
1863 string m
= str(inputs_present
);
1865 m
+= str(sources
.size());
1866 m
+= " inputs present, so suppressing output";
1867 compactor
->set_status(t
->name
, m
);
1871 output_will_exist
= false;
1874 if (!output_will_exist
) {
1876 compactor
->set_status(t
->name
, "doesn't exist");
1881 off_t table_start_offset
= -1;
1884 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
1885 // space for version file. It's tricky to exactly know the
1886 // size of the version file beforehand.
1887 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
1888 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
1889 throw Xapian::DatabaseError("lseek() failed", errno
);
1891 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
1893 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
1896 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
1898 tabs
.push_back(out
);
1899 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
1901 root_info
->set_free_list(fl_serialised
);
1902 root_info
->set_offset(table_start_offset
);
1903 out
->open(FLAGS
, version_file_out
->get_root(t
->type
), version_file_out
->get_revision());
1905 out
->create_and_open(FLAGS
, *root_info
);
1908 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
1909 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
1912 case Honey::POSTLIST
: {
1913 if (multipass
&& inputs
.size() > 3) {
1914 multimerge_postlists(compactor
, out
, destdir
,
1917 merge_postlists(compactor
, out
, offset
.begin(),
1918 inputs
.begin(), inputs
.end());
1922 case Honey::SPELLING
:
1923 merge_spellings(out
, inputs
.begin(), inputs
.end());
1925 case Honey::SYNONYM
:
1926 merge_synonyms(out
, inputs
.begin(), inputs
.end());
1928 case Honey::POSITION
:
1929 merge_positions(out
, inputs
, offset
);
1932 // DocData, Termlist
1933 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
1937 // Commit as revision 1.
1939 out
->commit(1, root_info
);
1941 if (single_file
) fl_serialised
= root_info
->get_free_list();
1944 if (!bad_stat
&& !single_file_in
) {
1947 db_size
= file_size(fd
);
1949 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
1953 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
1954 prev_size
= db_size
;
1955 db_size
-= old_prev_size
;
1957 // FIXME: check overflow and set bad_totals
1958 out_total
+= db_size
;
1959 out_size
= db_size
/ 1024;
1960 } else if (errno
!= ENOENT
) {
1961 bad_totals
= bad_stat
= true;
1966 compactor
->set_status(t
->name
, "Done (couldn't stat all the DB files)");
1967 } else if (single_file_in
) {
1969 compactor
->set_status(t
->name
, "Done (table sizes unknown for single file DB input)");
1972 if (out_size
== in_size
) {
1973 status
= "Size unchanged (";
1976 if (out_size
< in_size
) {
1977 delta
= in_size
- out_size
;
1978 status
= "Reduced by ";
1980 delta
= out_size
- in_size
;
1981 status
= "INCREASED by ";
1984 status
+= str(100 * delta
/ in_size
);
1987 status
+= str(delta
);
1989 status
+= str(in_size
);
1992 status
+= str(out_size
);
1995 compactor
->set_status(t
->name
, status
);
1999 // If compacting to a single file output and all the tables are empty, pad
2000 // the output so that it isn't mistaken for a stub database when we try to
2001 // open it. For this it needs to be a multiple of 2KB in size.
2002 if (single_file
&& prev_size
< off_t(block_size
)) {
2003 #ifdef HAVE_FTRUNCATE
2004 if (ftruncate(fd
, block_size
) < 0) {
2005 throw Xapian::DatabaseError("Failed to set size of output database", errno
);
2008 const off_t off
= block_size
- 1;
2009 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2010 throw Xapian::DatabaseError("Failed to set size of output database", errno
);
2016 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2017 throw Xapian::DatabaseError("lseek() failed", errno
);
2020 version_file_out
->set_last_docid(last_docid
);
2021 string tmpfile
= version_file_out
->write(1, FLAGS
);
2023 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2024 if (version_file_size
< 0) {
2025 throw Xapian::DatabaseError("lseek() failed", errno
);
2027 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2028 throw Xapian::DatabaseError("Didn't allow enough space for version file data");
2031 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2034 // Commit with revision 1.
2035 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2036 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2041 vector
<HoneyTable
*> tabs
;
2042 tabs
.reserve(tables_end
- tables
);
2043 off_t prev_size
= block_size
;
2044 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2045 // The postlist table requires an N-way merge, adjusting the
2046 // headers of various blocks. The spelling and synonym tables also
2047 // need special handling. The other tables have keys sorted in
2048 // docid order, so we can merge them by simply copying all the keys
2049 // from each source table in turn.
2051 compactor
->set_status(t
->name
, string());
2061 bool output_will_exist
= !t
->lazy
;
2063 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2064 // on certain systems).
2065 bool bad_stat
= false;
2067 // We can't currently report input sizes if there's a single file DB
2068 // amongst the inputs.
2069 bool single_file_in
= false;
2073 vector
<const HoneyTable
*> inputs
;
2074 inputs
.reserve(sources
.size());
2075 size_t inputs_present
= 0;
2076 for (auto src
: sources
) {
2077 auto db
= static_cast<const HoneyDatabase
*>(src
);
2078 const HoneyTable
* table
;
2080 case Honey::POSTLIST
:
2081 table
= &(db
->postlist_table
);
2083 case Honey::DOCDATA
:
2084 table
= &(db
->docdata_table
);
2086 case Honey::TERMLIST
:
2087 table
= &(db
->termlist_table
);
2089 case Honey::POSITION
:
2090 table
= &(db
->position_table
);
2092 case Honey::SPELLING
:
2093 table
= &(db
->spelling_table
);
2095 case Honey::SYNONYM
:
2096 table
= &(db
->synonym_table
);
2103 if (db
->single_file()) {
2104 if (t
->lazy
&& table
->empty()) {
2105 // Essentially doesn't exist.
2107 // FIXME: Find actual size somehow?
2108 // in_size += table->size() / 1024;
2109 single_file_in
= true;
2111 output_will_exist
= true;
2115 off_t db_size
= file_size(table
->get_path());
2117 // FIXME: check overflow and set bad_totals
2118 in_total
+= db_size
;
2119 in_size
+= db_size
/ 1024;
2120 output_will_exist
= true;
2122 } else if (errno
!= ENOENT
) {
2123 // We get ENOENT for an optional table.
2124 bad_totals
= bad_stat
= true;
2125 output_will_exist
= true;
2129 inputs
.push_back(table
);
2132 // If any inputs lack a termlist table, suppress it in the output.
2133 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2134 if (inputs_present
!= 0) {
2136 string m
= str(inputs_present
);
2138 m
+= str(sources
.size());
2139 m
+= " inputs present, so suppressing output";
2140 compactor
->set_status(t
->name
, m
);
2144 output_will_exist
= false;
2147 if (!output_will_exist
) {
2149 compactor
->set_status(t
->name
, "doesn't exist");
2154 off_t table_start_offset
= -1;
2157 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2158 // space for version file. It's tricky to exactly know the
2159 // size of the version file beforehand.
2160 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2161 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2162 throw Xapian::DatabaseError("lseek() failed", errno
);
2164 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2166 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2169 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2171 tabs
.push_back(out
);
2172 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2174 root_info
->set_free_list(fl_serialised
);
2175 root_info
->set_offset(table_start_offset
);
2176 out
->open(FLAGS
, version_file_out
->get_root(t
->type
), version_file_out
->get_revision());
2178 out
->create_and_open(FLAGS
, *root_info
);
2181 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2182 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2185 case Honey::POSTLIST
: {
2186 if (multipass
&& inputs
.size() > 3) {
2187 multimerge_postlists(compactor
, out
, destdir
,
2190 merge_postlists(compactor
, out
, offset
.begin(),
2191 inputs
.begin(), inputs
.end());
2195 case Honey::SPELLING
:
2196 merge_spellings(out
, inputs
.begin(), inputs
.end());
2198 case Honey::SYNONYM
:
2199 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2201 case Honey::POSITION
:
2202 merge_positions(out
, inputs
, offset
);
2205 // DocData, Termlist
2206 merge_docid_keyed(out
, inputs
, offset
);
2210 // Commit as revision 1.
2212 out
->commit(1, root_info
);
2214 if (single_file
) fl_serialised
= root_info
->get_free_list();
2217 if (!bad_stat
&& !single_file_in
) {
2220 db_size
= file_size(fd
);
2222 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2226 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2227 prev_size
= db_size
;
2228 db_size
-= old_prev_size
;
2230 // FIXME: check overflow and set bad_totals
2231 out_total
+= db_size
;
2232 out_size
= db_size
/ 1024;
2233 } else if (errno
!= ENOENT
) {
2234 bad_totals
= bad_stat
= true;
2239 compactor
->set_status(t
->name
, "Done (couldn't stat all the DB files)");
2240 } else if (single_file_in
) {
2242 compactor
->set_status(t
->name
, "Done (table sizes unknown for single file DB input)");
2245 if (out_size
== in_size
) {
2246 status
= "Size unchanged (";
2249 if (out_size
< in_size
) {
2250 delta
= in_size
- out_size
;
2251 status
= "Reduced by ";
2253 delta
= out_size
- in_size
;
2254 status
= "INCREASED by ";
2257 status
+= str(100 * delta
/ in_size
);
2260 status
+= str(delta
);
2262 status
+= str(in_size
);
2265 status
+= str(out_size
);
2268 compactor
->set_status(t
->name
, status
);
2272 // If compacting to a single file output and all the tables are empty, pad
2273 // the output so that it isn't mistaken for a stub database when we try to
2274 // open it. For this it needs to be a multiple of 2KB in size.
2275 if (single_file
&& prev_size
< off_t(block_size
)) {
2276 #ifdef HAVE_FTRUNCATE
2277 if (ftruncate(fd
, block_size
) < 0) {
2278 throw Xapian::DatabaseError("Failed to set size of output database", errno
);
2281 const off_t off
= block_size
- 1;
2282 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2283 throw Xapian::DatabaseError("Failed to set size of output database", errno
);
2289 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2290 throw Xapian::DatabaseError("lseek() failed", errno
);
2293 version_file_out
->set_last_docid(last_docid
);
2294 string tmpfile
= version_file_out
->write(1, FLAGS
);
2295 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2298 // Commit with revision 1.
2299 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2300 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2305 if (!single_file
) lock
.release();
2307 if (!bad_totals
&& compactor
) {
2311 if (out_total
== in_total
) {
2312 status
= "Size unchanged (";
2315 if (out_total
< in_total
) {
2316 delta
= in_total
- out_total
;
2317 status
= "Reduced by ";
2319 delta
= out_total
- in_total
;
2320 status
= "INCREASED by ";
2323 status
+= str(100 * delta
/ in_total
);
2326 status
+= str(delta
);
2328 status
+= str(in_total
);
2331 status
+= str(out_total
);
2333 compactor
->set_status("Total", status
);