/** @file honey_compact.cc
 * @brief Compact a honey database, or merge and compact several.
 */
/* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
32 #include <type_traits>
36 #include "safeerrno.h"
38 #include "backends/flint_lock.h"
39 #include "compression_stream.h"
40 #include "honey_cursor.h"
41 #include "honey_database.h"
42 #include "honey_defs.h"
43 #include "honey_postlist_encodings.h"
44 #include "honey_table.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifndef DISABLE_GPL_LIBXAPIAN
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
63 #ifndef DISABLE_GPL_LIBXAPIAN
66 throw_database_corrupt(const char* item
, const char* pos
)
70 message
= "Value overflow unpacking termlist: ";
72 message
= "Out of data unpacking termlist: ";
75 throw Xapian::DatabaseCorruptError(message
);
78 namespace GlassCompact
{
/// Test whether a glass postlist-table key holds user metadata
/// (key prefix "\0\xc0").
static inline bool
is_user_metadata_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/// Test whether a glass postlist-table key holds value statistics
/// (key prefix "\0\xd0").
static inline bool
is_valuestats_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/// Test whether a glass postlist-table key holds a value stream chunk
/// (key prefix "\0\xd8").
static inline bool
is_valuechunk_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/// Test whether a glass postlist-table key holds a document-length chunk
/// (key prefix "\0\xe0").
static inline bool
is_doclenchunk_key(const std::string& key)
{
    if (key.size() <= 1)
	return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
107 termlist_key_is_values_used(const string
& key
)
109 const char* p
= key
.data();
110 const char* e
= p
+ key
.size();
112 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
115 if (e
- p
== 1 && *p
== '\0')
118 throw Xapian::DatabaseCorruptError("termlist key format");
122 // Put all the helpers in a namespace to avoid symbols colliding with those of
123 // the same name in other flint-derived backends.
124 namespace HoneyCompact
{
/// First-byte / second-byte tags identifying each category of entry in the
/// honey postlist table (posting chunks use the encoded term instead).
enum {
    KEY_USER_METADATA = 0x00,
    KEY_VALUE_STATS = 0x01,
    KEY_VALUE_CHUNK = 0xd8,
    KEY_DOCLEN_CHUNK = 0xe0,
    KEY_POSTING_CHUNK = 0xff
};
134 /// Return one of the KEY_* constants, or a different value for an invalid key.
136 key_type(const string
& key
)
139 return KEY_POSTING_CHUNK
;
144 switch (static_cast<unsigned char>(key
[1])) {
145 case 0x01: case 0x02: case 0x03: case 0x04:
146 return KEY_VALUE_STATS
;
149 // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK.
150 return static_cast<unsigned char>(key
[1]);
153 template<typename T
> class PostlistCursor
;
155 #ifndef DISABLE_GPL_LIBXAPIAN
156 // Convert glass to honey.
158 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
159 Xapian::docid offset
;
163 Xapian::docid firstdid
;
164 Xapian::docid chunk_lastdid
;
165 Xapian::termcount tf
, cf
;
166 Xapian::termcount first_wdf
;
167 Xapian::termcount wdf_max
;
169 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
170 : GlassCursor(in
), offset(offset_
), firstdid(0)
177 if (!GlassCursor::next()) return false;
178 // We put all chunks into the non-initial chunk form here, then fix up
179 // the first chunk for each term in the merged database as we merge.
184 if (GlassCompact::is_user_metadata_key(key
)) {
185 key
[1] = KEY_USER_METADATA
;
188 if (GlassCompact::is_valuestats_key(key
)) {
190 const char * p
= key
.data();
191 const char * end
= p
+ key
.length();
193 Xapian::valueno slot
;
194 if (!unpack_uint_last(&p
, end
, &slot
))
195 throw Xapian::DatabaseCorruptError("bad value stats key");
196 key
= pack_honey_valuestats_key(slot
);
199 if (GlassCompact::is_valuechunk_key(key
)) {
200 const char * p
= key
.data();
201 const char * end
= p
+ key
.length();
203 Xapian::valueno slot
;
204 if (!unpack_uint(&p
, end
, &slot
))
205 throw Xapian::DatabaseCorruptError("bad value key");
206 Xapian::docid first_did
;
207 if (!unpack_uint_preserving_sort(&p
, end
, &first_did
))
208 throw Xapian::DatabaseCorruptError("bad value key");
211 Glass::ValueChunkReader
reader(tag
.data(), tag
.size(), first_did
);
212 Xapian::docid last_did
= first_did
;
213 while (reader
.next(), !reader
.at_end()) {
214 last_did
= reader
.get_docid();
217 key
.assign("\0\xd8", 2);
218 pack_uint(key
, slot
);
219 pack_uint_preserving_sort(key
, last_did
);
221 // Add the docid delta across the chunk to the start of the tag.
223 pack_uint(newtag
, last_did
- first_did
);
224 tag
.insert(0, newtag
);
229 if (GlassCompact::is_doclenchunk_key(key
)) {
230 const char * d
= key
.data();
231 const char * e
= d
+ key
.size();
235 // This is an initial chunk, so adjust tag header.
238 if (!unpack_uint(&d
, e
, &tf
) ||
239 !unpack_uint(&d
, e
, &cf
) ||
240 !unpack_uint(&d
, e
, &firstdid
)) {
241 throw Xapian::DatabaseCorruptError("Bad postlist key");
244 tag
.erase(0, d
- tag
.data());
246 // Not an initial chunk, so adjust key.
247 size_t tmp
= d
- key
.data();
248 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
249 throw Xapian::DatabaseCorruptError("Bad postlist key");
257 // Convert doclen chunk to honey format.
260 // Skip the "last chunk" flag and increase_to_last.
262 throw Xapian::DatabaseCorruptError("No last chunk flag in "
263 "glass docdata chunk");
265 Xapian::docid increase_to_last
;
266 if (!unpack_uint(&d
, e
, &increase_to_last
))
267 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
268 "in glass docdata chunk");
270 Xapian::termcount doclen_max
= 0;
272 Xapian::termcount doclen
;
273 if (!unpack_uint(&d
, e
, &doclen
))
274 throw Xapian::DatabaseCorruptError("Decoding doclen in "
275 "glass docdata chunk");
276 if (doclen
> doclen_max
)
278 unsigned char buf
[4];
279 unaligned_write4(buf
, doclen
);
280 newtag
.append(reinterpret_cast<char*>(buf
), 4);
283 Xapian::docid gap_size
;
284 if (!unpack_uint(&d
, e
, &gap_size
))
285 throw Xapian::DatabaseCorruptError("Decoding docid "
288 // FIXME: Split chunk if the gap_size is at all large.
289 newtag
.append(4 * gap_size
, '\xff');
292 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
293 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
295 chunk_lastdid
= firstdid
- 1 + newtag
.size() / 4;
297 // Only encode document lengths using a whole number of bytes for
298 // now. We could allow arbitrary bit widths, but it complicates
299 // encoding and decoding so we should consider if the fairly small
300 // additional saving is worth it.
301 if (doclen_max
>= 0xffff) {
302 if (doclen_max
>= 0xffffff) {
303 newtag
.insert(0, 1, char(32));
305 } else if (doclen_max
>= 0xffffffff) {
306 // FIXME: Handle these.
307 const char* m
= "Document length values >= 0xffffffff not "
309 throw Xapian::FeatureUnavailableError(m
);
311 tag
.assign(1, char(24));
312 for (size_t i
= 1; i
< newtag
.size(); i
+= 4)
313 tag
.append(newtag
, i
, 3);
316 if (doclen_max
>= 0xff) {
317 tag
.assign(1, char(16));
318 for (size_t i
= 2; i
< newtag
.size(); i
+= 4)
319 tag
.append(newtag
, i
, 2);
321 tag
.assign(1, char(8));
322 for (size_t i
= 3; i
< newtag
.size(); i
+= 4)
323 tag
.append(newtag
, i
, 1);
330 // Adjust key if this is *NOT* an initial chunk.
331 // key is: pack_string_preserving_sort(key, tname)
332 // plus optionally: pack_uint_preserving_sort(key, did)
333 const char * d
= key
.data();
334 const char * e
= d
+ key
.size();
336 if (!unpack_string_preserving_sort(&d
, e
, tname
))
337 throw Xapian::DatabaseCorruptError("Bad postlist key");
340 // This is an initial chunk for a term, so adjust tag header.
343 if (!unpack_uint(&d
, e
, &tf
) ||
344 !unpack_uint(&d
, e
, &cf
) ||
345 !unpack_uint(&d
, e
, &firstdid
)) {
346 throw Xapian::DatabaseCorruptError("Bad postlist key");
349 tag
.erase(0, d
- tag
.data());
352 // Not an initial chunk, so adjust key.
353 size_t tmp
= d
- key
.data();
354 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
355 throw Xapian::DatabaseCorruptError("Bad postlist key");
363 // Convert posting chunk to honey format, but without any header.
366 // Skip the "last chunk" flag; decode increase_to_last.
368 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
371 Xapian::docid increase_to_last
;
372 if (!unpack_uint(&d
, e
, &increase_to_last
))
373 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
374 "glass posting chunk");
375 chunk_lastdid
= firstdid
+ increase_to_last
;
376 if (!unpack_uint(&d
, e
, &first_wdf
))
377 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
379 wdf_max
= max(wdf_max
, first_wdf
);
383 if (!unpack_uint(&d
, e
, &delta
))
384 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
385 "glass posting chunk");
386 pack_uint(newtag
, delta
);
387 Xapian::termcount wdf
;
388 if (!unpack_uint(&d
, e
, &wdf
))
389 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
391 pack_uint(newtag
, wdf
);
392 wdf_max
= max(wdf_max
, wdf
);
403 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
404 Xapian::docid offset
;
408 Xapian::docid firstdid
;
409 Xapian::docid chunk_lastdid
;
410 Xapian::termcount tf
, cf
;
411 Xapian::termcount first_wdf
;
412 Xapian::termcount wdf_max
;
414 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
415 : HoneyCursor(in
), offset(offset_
), firstdid(0)
422 if (!HoneyCursor::next()) return false;
423 // We put all chunks into the non-initial chunk form here, then fix up
424 // the first chunk for each term in the merged database as we merge.
429 switch (key_type(key
)) {
430 case KEY_USER_METADATA
:
431 case KEY_VALUE_STATS
:
433 case KEY_VALUE_CHUNK
: {
434 const char * p
= key
.data();
435 const char * end
= p
+ key
.length();
437 Xapian::valueno slot
;
438 if (!unpack_uint(&p
, end
, &slot
))
439 throw Xapian::DatabaseCorruptError("bad value key");
441 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
442 throw Xapian::DatabaseCorruptError("bad value key");
445 key
.assign("\0\xd8", 2);
446 pack_uint(key
, slot
);
447 pack_uint_preserving_sort(key
, did
);
450 case KEY_DOCLEN_CHUNK
: {
451 const char * p
= key
.data();
452 const char * end
= p
+ key
.length();
454 Xapian::docid did
= 1;
456 (!unpack_uint_preserving_sort(&p
, end
, &did
) || p
!= end
)) {
457 throw Xapian::DatabaseCorruptError("Bad doclen key");
463 pack_uint_preserving_sort(key
, did
);
467 case KEY_POSTING_CHUNK
:
470 throw Xapian::DatabaseCorruptError("Bad postlist table key "
474 // Adjust key if this is *NOT* an initial chunk.
475 // key is: pack_string_preserving_sort(key, term)
476 // plus optionally: pack_uint_preserving_sort(key, did)
477 const char * d
= key
.data();
478 const char * e
= d
+ key
.size();
480 if (!unpack_string_preserving_sort(&d
, e
, term
))
481 throw Xapian::DatabaseCorruptError("Bad postlist key");
484 // This is an initial chunk for a term, so remove tag header.
488 Xapian::docid lastdid
;
489 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
490 firstdid
, lastdid
, chunk_lastdid
,
491 first_wdf
, wdf_max
)) {
492 throw Xapian::DatabaseCorruptError("Bad postlist initial "
495 // Ignore lastdid - we'll need to recalculate it (at least when
496 // merging, and for simplicity we always do).
498 tag
.erase(0, d
- tag
.data());
500 // Not an initial chunk, so adjust key and remove tag header.
501 size_t tmp
= d
- key
.data();
502 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
504 throw Xapian::DatabaseCorruptError("Bad postlist key");
506 // -1 to remove the terminating zero byte too.
513 bool have_wdfs
= (cf
!= 0);
514 if (have_wdfs
&& tf
> 2) {
515 Xapian::termcount remaining_cf_for_flat_wdf
=
517 // Check this matches and that it isn't a false match due
518 // to overflow of the multiplication above.
519 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
520 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
521 // The wdf is flat so we don't need to store it.
527 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
529 throw Xapian::DatabaseCorruptError("Bad postlist delta "
532 tag
.erase(0, d
- tag
.data());
534 if (!decode_delta_chunk_header_no_wdf(&d
, e
, chunk_lastdid
,
536 throw Xapian::DatabaseCorruptError("Bad postlist delta "
540 // Splice in a 0 wdf value after each docid delta.
542 for (size_t i
= d
- tag
.data(); i
< tag
.size(); i
+= 4) {
543 newtag
.append(tag
, i
, 4);
544 newtag
.append(4, '\0');
546 tag
= std::move(newtag
);
550 chunk_lastdid
+= offset
;
556 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
558 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
559 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
template<typename T>
class PostlistCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key.
     *
     *  Equal keys are ordered by firstdid so the cursor covering the
     *  earlier docid range sorts first.
     */
    bool operator()(const T* a, const T* b) const {
	if (a->key != b->key)
	    return a->key > b->key;
	return a->firstdid > b->firstdid;
    }
};
575 encode_valuestats(Xapian::doccount freq
,
576 const string
& lbound
, const string
& ubound
)
579 pack_uint(value
, freq
);
580 pack_string(value
, lbound
);
581 // We don't store or count empty values, so neither of the bounds
582 // can be empty. So we can safely store an empty upper bound when
583 // the bounds are equal.
584 if (lbound
!= ubound
) value
+= ubound
;
588 // U : vector<HoneyTable*>::const_iterator
589 template<typename T
, typename U
> void
590 merge_postlists(Xapian::Compactor
* compactor
,
591 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
594 typedef decltype(**b
) table_type
; // E.g. HoneyTable
595 typedef PostlistCursor
<table_type
> cursor_type
;
596 typedef PostlistCursorGt
<cursor_type
> gt_type
;
597 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
598 for ( ; b
!= e
; ++b
, ++offset
) {
601 // Skip empty tables.
605 pq
.push(new cursor_type(in
, *offset
));
610 // Merge user metadata.
612 while (!pq
.empty()) {
613 cursor_type
* cur
= pq
.top();
614 const string
& key
= cur
->key
;
615 if (key_type(key
) != KEY_USER_METADATA
) break;
617 if (key
!= last_key
) {
619 if (tags
.size() > 1 && compactor
) {
620 Assert(!last_key
.empty());
621 // FIXME: It would be better to merge all duplicates
622 // for a key in one call, but currently we don't in
624 const string
& resolved_tag
=
625 compactor
->resolve_duplicate_metadata(last_key
,
628 out
->add(last_key
, resolved_tag
);
630 Assert(!last_key
.empty());
631 out
->add(last_key
, tags
[0]);
637 tags
.push_back(cur
->tag
);
647 if (tags
.size() > 1 && compactor
) {
648 Assert(!last_key
.empty());
649 const string
& resolved_tag
=
650 compactor
->resolve_duplicate_metadata(last_key
,
653 out
->add(last_key
, resolved_tag
);
655 Assert(!last_key
.empty());
656 out
->add(last_key
, tags
[0]);
663 Xapian::doccount freq
= 0;
664 string lbound
, ubound
;
666 while (!pq
.empty()) {
667 cursor_type
* cur
= pq
.top();
668 const string
& key
= cur
->key
;
669 if (key_type(key
) != KEY_VALUE_STATS
) break;
670 if (key
!= last_key
) {
671 // For the first valuestats key, last_key will be the previous
672 // key we wrote, which we don't want to overwrite. This is the
673 // only time that freq will be 0, so check that.
675 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
681 const string
& tag
= cur
->tag
;
683 const char * pos
= tag
.data();
684 const char * end
= pos
+ tag
.size();
688 if (!unpack_uint(&pos
, end
, &f
)) {
690 throw Xapian::DatabaseCorruptError("Incomplete stats item "
692 throw Xapian::RangeError("Frequency statistic in value table "
695 if (!unpack_string(&pos
, end
, l
)) {
697 throw Xapian::DatabaseCorruptError("Incomplete stats item "
699 throw Xapian::RangeError("Lower bound in value table is too "
702 size_t len
= end
- pos
;
714 if (l
< lbound
) lbound
= l
;
715 if (u
> ubound
) ubound
= u
;
727 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
731 // Merge valuestream chunks.
732 while (!pq
.empty()) {
733 cursor_type
* cur
= pq
.top();
734 const string
& key
= cur
->key
;
735 if (key_type(key
) != KEY_VALUE_CHUNK
) break;
736 out
->add(key
, cur
->tag
);
745 // Merge doclen chunks.
746 while (!pq
.empty()) {
747 cursor_type
* cur
= pq
.top();
748 string
& key
= cur
->key
;
749 if (key_type(key
) != KEY_DOCLEN_CHUNK
) break;
750 pack_uint_preserving_sort(key
, cur
->chunk_lastdid
);
751 out
->add(key
, cur
->tag
);
760 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
762 struct HoneyPostListChunk
{
763 Xapian::docid first
, last
;
764 Xapian::termcount first_wdf
;
765 Xapian::termcount wdf_max
;
768 HoneyPostListChunk(Xapian::docid first_
,
770 Xapian::termcount first_wdf_
,
771 Xapian::termcount wdf_max_
,
775 first_wdf(first_wdf_
),
779 vector
<HoneyPostListChunk
> tags
;
781 cursor_type
* cur
= NULL
;
787 AssertEq(key_type(cur
->key
), KEY_POSTING_CHUNK
);
789 if (cur
== NULL
|| cur
->key
!= last_key
) {
791 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
792 Xapian::docid chunk_lastdid
= tags
[0].last
;
793 Xapian::docid last_did
= tags
.back().last
;
794 Xapian::termcount wdf_max
= tags
.back().wdf_max
;
797 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
799 first_wdf
, wdf_max
, first_tag
);
800 bool have_wdfs
= (cf
!= 0);
801 if (have_wdfs
&& tf
> 2) {
802 Xapian::termcount remaining_cf_for_flat_wdf
=
804 // Check this matches and that it isn't a false match due
805 // to overflow of the multiplication above.
806 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
807 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
808 // The wdf is flat so we don't need to store it.
815 first_tag
+= tags
[0].data
;
817 const char* pos
= tags
[0].data
.data();
818 const char* pos_end
= pos
+ tags
[0].data
.size();
819 while (pos
!= pos_end
) {
821 if (!unpack_uint(&pos
, pos_end
, &delta
))
822 throw_database_corrupt("Decoding docid "
824 pack_uint(first_tag
, delta
);
825 Xapian::termcount wdf
;
826 if (!unpack_uint(&pos
, pos_end
, &wdf
))
827 throw_database_corrupt("Decoding wdf",
829 // Only copy over the docid deltas, not the
835 out
->add(last_key
, first_tag
);
837 // If tf == 2, the data could be split over two tags when
838 // merging, but we only output an initial tag in this case.
839 if (tf
> 2 && tags
.size() > 1) {
841 const char* p
= last_key
.data();
842 const char* end
= p
+ last_key
.size();
843 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
845 throw Xapian::DatabaseCorruptError("Bad postlist "
849 auto i
= tags
.begin();
850 while (++i
!= tags
.end()) {
852 const string
& key
= pack_honey_postlist_key(term
,
856 encode_delta_chunk_header(i
->first
,
862 encode_delta_chunk_header_no_wdf(i
->first
,
865 const char* pos
= i
->data
.data();
866 const char* pos_end
= pos
+ i
->data
.size();
867 while (pos
!= pos_end
) {
869 if (!unpack_uint(&pos
, pos_end
, &delta
))
870 throw_database_corrupt("Decoding docid "
872 pack_uint(tag
, delta
);
873 Xapian::termcount wdf
;
874 if (!unpack_uint(&pos
, pos_end
, &wdf
))
875 throw_database_corrupt("Decoding wdf",
877 // Only copy over the docid deltas, not the
888 if (cur
== NULL
) break;
896 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
900 std::move(cur
->tag
)));
909 template<typename T
> struct MergeCursor
;
911 #ifndef DISABLE_GPL_LIBXAPIAN
913 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
914 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
922 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
923 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
template<typename T>
class CursorGt {
  public:
    /// Return true if and only if a's key is strictly greater than b's key.
    ///
    /// A cursor which has run off the end sorts after every live cursor.
    bool operator()(const T* a, const T* b) const {
	if (a->after_end())
	    return !b->after_end();
	if (b->after_end())
	    return false;
	return a->current_key > b->current_key;
    }
};
939 #ifndef DISABLE_GPL_LIBXAPIAN
940 // Convert glass to honey.
942 merge_spellings(HoneyTable
* out
,
943 vector
<const GlassTable
*>::const_iterator b
,
944 vector
<const GlassTable
*>::const_iterator e
)
946 typedef MergeCursor
<const GlassTable
&> cursor_type
;
947 typedef CursorGt
<cursor_type
> gt_type
;
948 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
949 for ( ; b
!= e
; ++b
) {
952 pq
.push(new cursor_type(in
));
956 while (!pq
.empty()) {
957 cursor_type
* cur
= pq
.top();
960 // Glass vs honey spelling key prefix map:
964 // bookend Bbd \x00bd
966 // middle Mddl \x02ddl
970 // In honey, if the word's first byte is <= '\x04', we add a prefix
971 // of '\x04' (which is unlikely in real world use but ensures that
972 // we can handle arbitrary strings).
974 // The prefixes for honey are chosen such that we save a byte for the
975 // key of all real-world words, and so that more first bytes are used
976 // so that a top-level array index is more effective, especially for
977 // random-access lookup of word entries (which we do to check the
978 // frequency of potential spelling candidates).
980 // The other prefixes are chosen such that we can do look up in key
981 // order at search time, which is more efficient for a cursor which can
982 // only efficiently move forwards.
984 // Note that the key ordering is the same for glass and honey, which
985 // makes translating during compaction simpler.
986 string key
= cur
->current_key
;
989 key
[0] = Honey::KEY_PREFIX_BOOKEND
;
992 key
[0] = Honey::KEY_PREFIX_HEAD
;
995 key
[0] = Honey::KEY_PREFIX_MIDDLE
;
998 key
[0] = Honey::KEY_PREFIX_TAIL
;
1001 if (static_cast<unsigned char>(key
[1]) > Honey::KEY_PREFIX_WORD
)
1004 key
[0] = Honey::KEY_PREFIX_WORD
;
1007 string m
= "Bad spelling key prefix: ";
1008 m
+= static_cast<unsigned char>(key
[0]);
1009 throw Xapian::DatabaseCorruptError(m
);
1013 if (pq
.empty() || pq
.top()->current_key
> key
) {
1014 // No merging to do for this key so just copy the tag value,
1015 // adjusting if necessary. If we don't need to adjust it, just
1016 // copy the compressed value.
1019 case Honey::KEY_PREFIX_HEAD
: {
1020 compressed
= cur
->read_tag(false);
1021 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1022 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1023 AssertEq(cur
->current_tag
[1], key
[1]);
1024 AssertEq(cur
->current_tag
[2], key
[2]);
1025 cur
->current_tag
.erase(1, 2);
1028 case Honey::KEY_PREFIX_TAIL
: {
1029 compressed
= cur
->read_tag(false);
1030 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1031 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1032 AssertEq(cur
->current_tag
[len
- 1], key
[1]);
1033 AssertEq(cur
->current_tag
[len
], key
[2]);
1034 cur
->current_tag
.erase(len
- 1, 2);
1037 case Honey::KEY_PREFIX_BOOKEND
: {
1038 compressed
= cur
->read_tag(false);
1039 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1040 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1041 AssertEq(cur
->current_tag
[1], key
[1]);
1042 AssertEq(cur
->current_tag
[len
], key
[2]);
1043 cur
->current_tag
.erase(len
, 1);
1044 cur
->current_tag
.erase(1, 1);
1048 compressed
= cur
->read_tag(true);
1051 out
->add(key
, cur
->current_tag
, compressed
);
1060 // Merge tag values with the same key:
1062 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1063 // We just want the union of words, so copy over the first instance
1064 // and skip any identical ones.
1065 priority_queue
<PrefixCompressedStringItor
*,
1066 vector
<PrefixCompressedStringItor
*>,
1067 PrefixCompressedStringItorGt
> pqtag
;
1068 // Stick all the cursor_type pointers in a vector because their
1069 // current_tag members must remain valid while we're merging their
1070 // tags, but we need to call next() on them all afterwards.
1071 vector
<cursor_type
*> vec
;
1072 vec
.reserve(pq
.size());
1076 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1078 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1083 PrefixCompressedStringWriter
wr(tag
, key
);
1085 while (!pqtag
.empty()) {
1086 PrefixCompressedStringItor
* it
= pqtag
.top();
1089 if (word
!= lastword
) {
1091 wr
.append(lastword
);
1094 if (!it
->at_end()) {
1101 for (auto i
: vec
) {
1110 // We want to sum the frequencies from tags for the same key.
1111 Xapian::termcount tot_freq
= 0;
1114 Xapian::termcount freq
;
1115 const char * p
= cur
->current_tag
.data();
1116 const char * end
= p
+ cur
->current_tag
.size();
1117 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1118 throw_database_corrupt("Bad spelling word freq", p
);
1126 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1131 pack_uint_last(tag
, tot_freq
);
1139 merge_spellings(HoneyTable
* out
,
1140 vector
<const HoneyTable
*>::const_iterator b
,
1141 vector
<const HoneyTable
*>::const_iterator e
)
1143 typedef MergeCursor
<const HoneyTable
&> cursor_type
;
1144 typedef CursorGt
<cursor_type
> gt_type
;
1145 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1146 for ( ; b
!= e
; ++b
) {
1149 pq
.push(new cursor_type(in
));
1153 while (!pq
.empty()) {
1154 cursor_type
* cur
= pq
.top();
1157 string key
= cur
->current_key
;
1158 if (pq
.empty() || pq
.top()->current_key
> key
) {
1159 // No need to merge the tags, just copy the (possibly compressed)
1161 bool compressed
= cur
->read_tag(true);
1162 out
->add(key
, cur
->current_tag
, compressed
);
1171 // Merge tag values with the same key:
1173 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1174 // We just want the union of words, so copy over the first instance
1175 // and skip any identical ones.
1176 priority_queue
<PrefixCompressedStringItor
*,
1177 vector
<PrefixCompressedStringItor
*>,
1178 PrefixCompressedStringItorGt
> pqtag
;
1179 // Stick all the cursor_type pointers in a vector because their
1180 // current_tag members must remain valid while we're merging their
1181 // tags, but we need to call next() on them all afterwards.
1182 vector
<cursor_type
*> vec
;
1183 vec
.reserve(pq
.size());
1187 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
,
1190 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1195 PrefixCompressedStringWriter
wr(tag
, key
);
1197 while (!pqtag
.empty()) {
1198 PrefixCompressedStringItor
* it
= pqtag
.top();
1201 if (word
!= lastword
) {
1203 wr
.append(lastword
);
1206 if (!it
->at_end()) {
1213 for (auto i
: vec
) {
1222 // We want to sum the frequencies from tags for the same key.
1223 Xapian::termcount tot_freq
= 0;
1226 Xapian::termcount freq
;
1227 const char * p
= cur
->current_tag
.data();
1228 const char * end
= p
+ cur
->current_tag
.size();
1229 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1230 throw_database_corrupt("Bad spelling word freq", p
);
1238 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1243 pack_uint_last(tag
, tot_freq
);
1249 // U : vector<HoneyTable*>::const_iterator
1250 template<typename T
, typename U
> void
1251 merge_synonyms(T
* out
, U b
, U e
)
1253 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1254 typedef MergeCursor
<table_type
> cursor_type
;
1255 typedef CursorGt
<cursor_type
> gt_type
;
1256 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1257 for ( ; b
!= e
; ++b
) {
1260 pq
.push(new cursor_type(in
));
1264 while (!pq
.empty()) {
1265 cursor_type
* cur
= pq
.top();
1268 string key
= cur
->current_key
;
1269 if (pq
.empty() || pq
.top()->current_key
> key
) {
1270 // No need to merge the tags, just copy the (possibly compressed)
1272 bool compressed
= cur
->read_tag(true);
1273 out
->add(key
, cur
->current_tag
, compressed
);
1282 // Merge tag values with the same key:
1285 // We just want the union of words, so copy over the first instance
1286 // and skip any identical ones.
1287 priority_queue
<ByteLengthPrefixedStringItor
*,
1288 vector
<ByteLengthPrefixedStringItor
*>,
1289 ByteLengthPrefixedStringItorGt
> pqtag
;
1290 vector
<cursor_type
*> vec
;
1294 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1296 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1302 while (!pqtag
.empty()) {
1303 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1305 if (**it
!= lastword
) {
1307 tag
+= byte(lastword
.size() ^ MAGIC_XOR_VALUE
);
1311 if (!it
->at_end()) {
1318 for (auto i
: vec
) {
1331 template<typename T
, typename U
> void
1332 multimerge_postlists(Xapian::Compactor
* compactor
,
1333 T
* out
, const char * tmpdir
,
1334 const vector
<U
*>& in
,
1335 vector
<Xapian::docid
> off
)
1337 if (in
.size() <= 3) {
1338 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1342 vector
<HoneyTable
*> tmp
;
1343 tmp
.reserve(in
.size() / 2);
1345 vector
<Xapian::docid
> newoff
;
1346 newoff
.resize(in
.size() / 2);
1347 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1349 if (j
== in
.size() - 1) ++j
;
1351 string dest
= tmpdir
;
1353 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1356 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1358 // Use maximum blocksize for temporary tables. And don't compress
1359 // entries in temporary tables, even if the final table would do
1360 // so. Any already compressed entries will get copied in
1361 // compressed form. (FIXME: HoneyTable has no blocksize)
1362 Honey::RootInfo root_info
;
1363 root_info
.init(65536, 0);
1364 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1365 tmptab
->create_and_open(flags
, root_info
);
1367 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1368 in
.begin() + i
, in
.begin() + j
);
1369 tmp
.push_back(tmptab
);
1371 tmptab
->commit(1, &root_info
);
1372 AssertRel(root_info
.get_blocksize(),==,65536);
1378 while (tmp
.size() > 3) {
1379 vector
<HoneyTable
*> tmpout
;
1380 tmpout
.reserve(tmp
.size() / 2);
1381 vector
<Xapian::docid
> newoff
;
1382 newoff
.resize(tmp
.size() / 2);
1383 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1385 if (j
== tmp
.size() - 1) ++j
;
1387 string dest
= tmpdir
;
1389 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1392 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1394 // Use maximum blocksize for temporary tables. And don't compress
1395 // entries in temporary tables, even if the final table would do
1396 // so. Any already compressed entries will get copied in
1397 // compressed form. (FIXME: HoneyTable has no blocksize)
1398 Honey::RootInfo root_info
;
1399 root_info
.init(65536, 0);
1400 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1401 tmptab
->create_and_open(flags
, root_info
);
1403 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1404 tmp
.begin() + i
, tmp
.begin() + j
);
1406 for (unsigned int k
= i
; k
< j
; ++k
) {
1407 // FIXME: unlink(tmp[k]->get_path().c_str());
1412 tmpout
.push_back(tmptab
);
1414 tmptab
->commit(1, &root_info
);
1415 AssertRel(root_info
.get_blocksize(),==,65536);
1421 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1423 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1424 // FIXME: unlink(tmp[k]->get_path().c_str());
1431 template<typename T
> class PositionCursor
;
1433 #ifndef DISABLE_GPL_LIBXAPIAN
// Cursor over a glass position table which rewrites each key so that the
// document id is rebased by this input's offset (see the
// pack_uint_preserving_sort(key, did + offset) call below).  Used when
// merging position data from several source databases into one output.
1435 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
// Amount added to each docid read from this input when forming output keys.
1436 Xapian::docid offset
;
// Initialised to 0 by the constructor.
1440 Xapian::docid firstdid
;
// Construct a cursor over table `in`; offset_ is this input's docid offset.
1442 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1443 : GlassCursor(in
), offset(offset_
), firstdid(0) {
// Stop when the underlying glass cursor is exhausted.
// NOTE(review): the enclosing method's header is elided in this view —
// presumably next(); confirm against the full file.
1449 if (!GlassCursor::next()) return false;
// Unpack <term, docid> from the current key; reject malformed keys.
1451 const char * d
= current_key
.data();
1452 const char * e
= d
+ current_key
.size();
1455 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1456 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1458 throw Xapian::DatabaseCorruptError("Bad position key");
// Re-pack the key with the docid shifted by this input's offset so keys
// from different inputs don't collide in the merged output.
1462 pack_string_preserving_sort(key
, term
);
1463 pack_uint_preserving_sort(key
, did
+ offset
);
// Positional data payload for the current entry.
1467 const string
& get_tag() const {
// Honey-input counterpart of the glass PositionCursor specialisation:
// rewrites each position key so the docid is rebased by this input's offset.
1474 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
// Amount added to each docid read from this input when forming output keys.
1475 Xapian::docid offset
;
// Initialised to 0 by the constructor.
1479 Xapian::docid firstdid
;
// Construct a cursor over table `in`; offset_ is this input's docid offset.
1481 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1482 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
// Stop when the underlying honey cursor is exhausted.
// NOTE(review): the enclosing method's header is elided in this view —
// presumably next(); confirm against the full file.
1488 if (!HoneyCursor::next()) return false;
// Unpack <term, docid> from the current key; reject malformed keys.
1490 const char * d
= current_key
.data();
1491 const char * e
= d
+ current_key
.size();
1494 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1495 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1497 throw Xapian::DatabaseCorruptError("Bad position key");
// Re-pack the key with the docid shifted by this input's offset.
1501 pack_string_preserving_sort(key
, term
);
1502 pack_uint_preserving_sort(key
, did
+ offset
);
// Positional data payload for the current entry.
1506 const string
& get_tag() const {
// Ordering functor for the merge priority queue in merge_positions():
// comparing with > makes std::priority_queue yield the cursor with the
// smallest key first (a min-heap on key).
1511 template<typename T
>
1512 class PositionCursorGt
{
1514 /** Return true if and only if a's key is strictly greater than b's key.
1516 bool operator()(const T
* a
, const T
* b
) const {
1517 return a
->key
> b
->key
;
// K-way merge of the position tables `inputs` into `out`, rebasing docids
// by the per-input `offset` values.  One PositionCursor per non-empty input
// is pushed into a priority queue ordered so the smallest key is on top,
// and entries are emitted in key order.
1521 template<typename T
, typename U
> void
1522 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1523 const vector
<Xapian::docid
> & offset
)
1525 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1526 typedef PositionCursor
<table_type
> cursor_type
;
1527 typedef PositionCursorGt
<cursor_type
> gt_type
;
1528 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1529 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1530 auto in
= inputs
[i
];
1532 // Skip empty tables.
// NOTE(review): cursors are allocated with new here; the matching delete
// is in lines elided from this view — confirm they are freed when a
// cursor is exhausted and popped.
1536 pq
.push(new cursor_type(in
, offset
[i
]));
// Repeatedly take the cursor with the smallest key and copy its entry.
1539 while (!pq
.empty()) {
1540 cursor_type
* cur
= pq
.top();
1542 out
->add(cur
->key
, cur
->get_tag());
// Merge docid-keyed tables (e.g. docdata/termlist) from honey-format inputs
// into `out`.  Keys are already sorted by docid within each input, so each
// source table is simply copied in turn, rebasing the docid at the start of
// each key by that input's offset.
1551 template<typename T
, typename U
> void
1552 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1553 const vector
<Xapian::docid
> & offset
,
1556 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1557 Xapian::docid off
= offset
[i
];
1559 auto in
= inputs
[i
];
1560 if (in
->empty()) continue;
1562 HoneyCursor
cur(in
);
1566 while (cur
.next()) {
1567 // Adjust the key if this isn't the first database.
// Unpack the docid prefix of the key; a failure here means the input
// table is corrupt, so report which file it came from.
1570 const char * d
= cur
.current_key
.data();
1571 const char * e
= d
+ cur
.current_key
.size();
1572 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1573 string msg
= "Bad key in ";
1574 msg
+= inputs
[i
]->get_path();
1575 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the key with the rebased docid.
1579 pack_uint_preserving_sort(key
, did
);
1581 // Copy over anything extra in the key (e.g. the zero byte
1582 // at the end of "used value slots" in the termlist table).
1583 key
.append(d
, e
- d
);
// First database: the key can be used unchanged.
1586 key
= cur
.current_key
;
// read_tag(true) fetches the tag without uncompressing it; the returned
// flag is forwarded to add() so compressed entries are copied as-is.
1588 bool compressed
= cur
.read_tag(true);
1589 out
->add(key
, cur
.current_tag
, compressed
);
1594 #ifndef DISABLE_GPL_LIBXAPIAN
// Glass-input variant of merge_docid_keyed.  As well as rebasing the docid
// at the start of each key, termlist entries are re-encoded into the honey
// format: each docid's separate "used value slots" entry is folded into the
// start of that docid's termlist tag, and the term/wdf stream is rewritten.
1595 template<typename T
> void
1596 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1597 const vector
<Xapian::docid
> & offset
,
1600 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1601 Xapian::docid off
= offset
[i
];
1603 auto in
= inputs
[i
];
1604 if (in
->empty()) continue;
1606 GlassCursor
cur(in
);
1610 while (cur
.next()) {
1612 // Adjust the key if this isn't the first database.
// Unpack the docid prefix of the key; a failure means the input table
// is corrupt, so report which file it came from.
1615 const char * d
= cur
.current_key
.data();
1616 const char * e
= d
+ cur
.current_key
.size();
1617 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1618 string msg
= "Bad key in ";
1619 msg
+= inputs
[i
]->get_path();
1620 throw Xapian::DatabaseCorruptError(msg
);
// Re-pack the key with the rebased docid.
1624 pack_uint_preserving_sort(key
, did
);
1626 // Copy over anything extra in the key (e.g. the zero byte
1627 // at the end of "used value slots" in the termlist table).
1628 key
.append(d
, e
- d
);
1631 key
= cur
.current_key
;
// Termlist table needs conversion from glass to honey encoding.
1633 if (table_type
== Honey::TERMLIST
) {
// A "value slots used" entry with no preceding termlist entry for the
// same docid indicates corruption.
1634 if (termlist_key_is_values_used(key
)) {
1635 throw Xapian::DatabaseCorruptError("value slots used "
1641 string tag
= cur
.current_tag
;
// Peek at the next entry: in glass the "value slots used" entry for a
// docid immediately follows its termlist entry.
1643 bool next_result
= cur
.next();
1644 bool next_already_done
= true;
1645 unsigned bitmap_slots_used
= 0;
1646 string encoded_slots_used
;
1648 termlist_key_is_values_used(cur
.current_key
)) {
1649 next_already_done
= false;
// Decode the glass "value slots used" tag: first_slot then a sequence
// of (delta - 1) encoded gaps between ascending slot numbers.
1651 const string
& valtag
= cur
.current_tag
;
1653 const char* p
= valtag
.data();
1654 const char* end
= p
+ valtag
.size();
1656 Xapian::VecCOW
<Xapian::termpos
> slots
;
1658 Xapian::valueno first_slot
;
1659 if (!unpack_uint(&p
, end
, &first_slot
)) {
1660 throw_database_corrupt("Value slot encoding corrupt",
1663 slots
.push_back(first_slot
);
1664 Xapian::valueno last_slot
= first_slot
;
1666 Xapian::valueno slot
;
1667 if (!unpack_uint(&p
, end
, &slot
)) {
1668 throw_database_corrupt("Value slot encoding "
1671 slot
+= last_slot
+ 1;
1674 slots
.push_back(slot
);
1677 if (slots
.back() <= 6) {
1678 // Encode as a bitmap if only slots in the range 0-6
// One bit per used slot, stored in a single header byte below.
1680 for (auto slot
: slots
) {
1681 bitmap_slots_used
|= 1 << slot
;
// Otherwise encode as last_slot plus (for more than one slot) an
// interpolatively-coded list of the used slots.
1685 pack_uint(enc
, last_slot
);
1686 if (slots
.size() > 1) {
1687 BitWriter
slots_used(enc
);
1688 slots_used
.encode(first_slot
, last_slot
);
1689 slots_used
.encode(slots
.size() - 2,
1690 last_slot
- first_slot
);
1691 slots_used
.encode_interpolative(slots
, 0,
1693 encoded_slots_used
= slots_used
.freeze();
1695 encoded_slots_used
= std::move(enc
);
// Now rewrite the termlist tag itself into the honey layout.
1700 const char* pos
= tag
.data();
1701 const char* end
= pos
+ tag
.size();
// Header byte: either the 0-6 slot bitmap, or 0x80|size followed by the
// encoded slot data (branch conditions partially elided in this view —
// confirm the exact header format against the full file).
1704 if (encoded_slots_used
.empty()) {
1705 newtag
+= char(bitmap_slots_used
);
1707 auto size
= encoded_slots_used
.size();
1709 newtag
+= char(0x80 | size
);
1712 pack_uint(newtag
, size
);
1714 newtag
+= encoded_slots_used
;
// Glass stores doclen then termlist_size; honey stores termlist_size - 1
// then doclen (see the pack_uint calls below).
1718 Xapian::termcount doclen
;
1719 if (!unpack_uint(&pos
, end
, &doclen
)) {
1720 throw_database_corrupt("doclen", pos
);
1722 Xapian::termcount termlist_size
;
1723 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1724 throw_database_corrupt("termlist length", pos
);
1726 pack_uint(newtag
, termlist_size
- 1);
1727 pack_uint(newtag
, doclen
);
// Re-encode the prefix-compressed term stream.  Each entry carries a
// "reuse" byte (shared-prefix length, with wdf folded in when it exceeds
// the previous term's length) and an "append" byte (new suffix length).
1729 string current_term
;
1730 while (pos
!= end
) {
1731 Xapian::termcount current_wdf
= 0;
1733 if (!current_term
.empty()) {
1734 size_t reuse
= static_cast<unsigned char>(*pos
++);
1735 newtag
+= char(reuse
);
1737 if (reuse
> current_term
.size()) {
// reuse byte also encodes the wdf: value = wdf * (len + 1) + prefix.
1738 current_wdf
= reuse
/ (current_term
.size() + 1);
1739 reuse
= reuse
% (current_term
.size() + 1);
1741 current_term
.resize(reuse
);
1744 throw_database_corrupt("term", NULL
);
1747 size_t append
= static_cast<unsigned char>(*pos
++);
1748 if (size_t(end
- pos
) < append
)
1749 throw_database_corrupt("term", NULL
);
1751 current_term
.append(pos
, append
);
// NOTE(review): "¤t_wdf" on the next unpack_uint call is mojibake of
// "&current_wdf" (HTML entity "&curren;" corruption) — fix the encoding
// in the master copy of this file.
1757 if (!unpack_uint(&pos
, end
, &current_wdf
)) {
1758 throw_database_corrupt("wdf", pos
);
1760 pack_uint(newtag
, current_wdf
);
1763 newtag
+= char(append
);
1764 newtag
.append(current_term
.end() - append
,
1765 current_term
.end());
// Only emit the entry if the rewritten tag is non-empty.
1768 if (!newtag
.empty())
1769 out
->add(key
, newtag
);
1770 if (!next_result
) break;
1771 if (next_already_done
) goto next_without_next
;
// Non-termlist tables: copy the tag through, preserving compression.
1773 bool compressed
= cur
.read_tag(true);
1774 out
->add(key
, cur
.current_tag
, compressed
);
1783 using namespace HoneyCompact
;
1786 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1787 const char* destdir
,
1790 const vector
<const Xapian::Database::Internal
*>& sources
,
1791 const vector
<Xapian::docid
>& offset
,
1793 Xapian::Compactor::compaction_level compaction
,
1795 Xapian::docid last_docid
)
1798 // The "base name" of the table.
1801 Honey::table_type type
;
1802 // Create tables after position lazily.
1806 static const table_list tables
[] = {
1808 { "postlist", Honey::POSTLIST
, false },
1809 { "docdata", Honey::DOCDATA
, true },
1810 { "termlist", Honey::TERMLIST
, false },
1811 { "position", Honey::POSITION
, true },
1812 { "spelling", Honey::SPELLING
, true },
1813 { "synonym", Honey::SYNONYM
, true }
1815 const table_list
* tables_end
= tables
+
1816 (sizeof(tables
) / sizeof(tables
[0]));
1818 const int FLAGS
= Xapian::DB_DANGEROUS
;
1820 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1821 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1823 // FIXME: Support this combination - we need to put temporary files
1829 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1830 bool has_uncommitted_changes
;
1831 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1832 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1833 has_uncommitted_changes
= db
->has_uncommitted_changes();
1835 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1836 has_uncommitted_changes
= db
->has_uncommitted_changes();
1838 if (has_uncommitted_changes
) {
1840 "Can't compact from a WritableDatabase with uncommitted "
1841 "changes - either call commit() first, or create a new "
1842 "Database object from the filename on disk";
1843 throw Xapian::InvalidOperationError(m
);
1848 if (block_size
< HONEY_MIN_BLOCKSIZE
||
1849 block_size
> HONEY_MAX_BLOCKSIZE
||
1850 (block_size
& (block_size
- 1)) != 0) {
1851 block_size
= HONEY_DEFAULT_BLOCKSIZE
;
1854 FlintLock
lock(destdir
? destdir
: "");
1857 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1858 if (why
!= FlintLock::SUCCESS
) {
1859 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1863 unique_ptr
<HoneyVersion
> version_file_out
;
1866 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
1868 throw Xapian::DatabaseCreateError("open() failed", errno
);
1871 version_file_out
.reset(new HoneyVersion(fd
));
1874 version_file_out
.reset(new HoneyVersion(destdir
));
1877 version_file_out
->create(block_size
);
1878 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1879 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1880 #ifdef DISABLE_GPL_LIBXAPIAN
1883 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1884 auto& v_in
= db
->version_file
;
1885 auto& v_out
= version_file_out
;
1886 v_out
->merge_stats(v_in
.get_doccount(),
1887 v_in
.get_doclength_lower_bound(),
1888 v_in
.get_doclength_upper_bound(),
1889 v_in
.get_wdf_upper_bound(),
1890 v_in
.get_total_doclen(),
1891 v_in
.get_spelling_wordfreq_upper_bound());
1894 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1895 version_file_out
->merge_stats(db
->version_file
);
1899 string fl_serialised
;
1903 fl
.set_first_unused_block(1); // FIXME: Assumption?
1904 fl
.pack(fl_serialised
);
1908 // Set to true if stat() failed (which can happen if the files are > 2GB
1909 // and off_t is 32 bit) or one of the totals overflowed.
1910 bool bad_totals
= false;
1911 off_t in_total
= 0, out_total
= 0;
1913 // FIXME: sort out indentation.
1914 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1915 #ifdef DISABLE_GPL_LIBXAPIAN
1916 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1918 vector
<HoneyTable
*> tabs
;
1919 tabs
.reserve(tables_end
- tables
);
1920 off_t prev_size
= block_size
;
1921 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
1922 // The postlist table requires an N-way merge, adjusting the
1923 // headers of various blocks. The spelling and synonym tables also
1924 // need special handling. The other tables have keys sorted in
1925 // docid order, so we can merge them by simply copying all the keys
1926 // from each source table in turn.
1928 compactor
->set_status(t
->name
, string());
1938 bool output_will_exist
= !t
->lazy
;
1940 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
1941 // on certain systems).
1942 bool bad_stat
= false;
1944 // We can't currently report input sizes if there's a single file DB
1945 // amongst the inputs.
1946 bool single_file_in
= false;
1950 vector
<const GlassTable
*> inputs
;
1951 inputs
.reserve(sources
.size());
1952 size_t inputs_present
= 0;
1953 for (auto src
: sources
) {
1954 auto db
= static_cast<const GlassDatabase
*>(src
);
1955 const GlassTable
* table
;
1957 case Honey::POSTLIST
:
1958 table
= &(db
->postlist_table
);
1960 case Honey::DOCDATA
:
1961 table
= &(db
->docdata_table
);
1963 case Honey::TERMLIST
:
1964 table
= &(db
->termlist_table
);
1966 case Honey::POSITION
:
1967 table
= &(db
->position_table
);
1969 case Honey::SPELLING
:
1970 table
= &(db
->spelling_table
);
1972 case Honey::SYNONYM
:
1973 table
= &(db
->synonym_table
);
1980 if (db
->single_file()) {
1981 if (t
->lazy
&& table
->empty()) {
1982 // Essentially doesn't exist.
1984 // FIXME: Find actual size somehow?
1985 // in_size += table->size() / 1024;
1986 single_file_in
= true;
1988 output_will_exist
= true;
1992 off_t db_size
= file_size(table
->get_path());
1994 // FIXME: check overflow and set bad_totals
1995 in_total
+= db_size
;
1996 in_size
+= db_size
/ 1024;
1997 output_will_exist
= true;
1999 } else if (errno
!= ENOENT
) {
2000 // We get ENOENT for an optional table.
2001 bad_totals
= bad_stat
= true;
2002 output_will_exist
= true;
2006 inputs
.push_back(table
);
2009 // If any inputs lack a termlist table, suppress it in the output.
2010 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2011 if (inputs_present
!= 0) {
2013 string m
= str(inputs_present
);
2015 m
+= str(sources
.size());
2016 m
+= " inputs present, so suppressing output";
2017 compactor
->set_status(t
->name
, m
);
2021 output_will_exist
= false;
2024 if (!output_will_exist
) {
2026 compactor
->set_status(t
->name
, "doesn't exist");
2031 off_t table_start_offset
= -1;
2034 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2035 // space for version file. It's tricky to exactly know the
2036 // size of the version file beforehand.
2037 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2038 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2039 throw Xapian::DatabaseError("lseek() failed", errno
);
2041 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2043 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2046 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2048 tabs
.push_back(out
);
2049 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2051 root_info
->set_free_list(fl_serialised
);
2052 root_info
->set_offset(table_start_offset
);
2054 version_file_out
->get_root(t
->type
),
2055 version_file_out
->get_revision());
2057 out
->create_and_open(FLAGS
, *root_info
);
2060 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2061 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2064 case Honey::POSTLIST
: {
2065 if (multipass
&& inputs
.size() > 3) {
2066 multimerge_postlists(compactor
, out
, destdir
,
2069 merge_postlists(compactor
, out
, offset
.begin(),
2070 inputs
.begin(), inputs
.end());
2074 case Honey::SPELLING
:
2075 merge_spellings(out
, inputs
.cbegin(), inputs
.cend());
2077 case Honey::SYNONYM
:
2078 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2080 case Honey::POSITION
:
2081 merge_positions(out
, inputs
, offset
);
2084 // DocData, Termlist
2085 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
2089 // Commit as revision 1.
2091 out
->commit(1, root_info
);
2093 if (single_file
) fl_serialised
= root_info
->get_free_list();
2096 if (!bad_stat
&& !single_file_in
) {
2099 db_size
= file_size(fd
);
2101 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2105 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2106 prev_size
= db_size
;
2107 db_size
-= old_prev_size
;
2109 // FIXME: check overflow and set bad_totals
2110 out_total
+= db_size
;
2111 out_size
= db_size
/ 1024;
2112 } else if (errno
!= ENOENT
) {
2113 bad_totals
= bad_stat
= true;
2118 compactor
->set_status(t
->name
,
2119 "Done (couldn't stat all the DB files)");
2120 } else if (single_file_in
) {
2122 compactor
->set_status(t
->name
,
2123 "Done (table sizes unknown for single "
2127 if (out_size
== in_size
) {
2128 status
= "Size unchanged (";
2131 if (out_size
< in_size
) {
2132 delta
= in_size
- out_size
;
2133 status
= "Reduced by ";
2135 delta
= out_size
- in_size
;
2136 status
= "INCREASED by ";
2139 status
+= str(100 * delta
/ in_size
);
2142 status
+= str(delta
);
2144 status
+= str(in_size
);
2147 status
+= str(out_size
);
2150 compactor
->set_status(t
->name
, status
);
2154 // If compacting to a single file output and all the tables are empty, pad
2155 // the output so that it isn't mistaken for a stub database when we try to
2156 // open it. For this it needs to be a multiple of 2KB in size.
2157 if (single_file
&& prev_size
< off_t(block_size
)) {
2158 #ifdef HAVE_FTRUNCATE
2159 if (ftruncate(fd
, block_size
) < 0) {
2160 throw Xapian::DatabaseError("Failed to set size of output "
2164 const off_t off
= block_size
- 1;
2165 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2166 throw Xapian::DatabaseError("Failed to set size of output "
2173 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2174 throw Xapian::DatabaseError("lseek() failed", errno
);
2177 version_file_out
->set_last_docid(last_docid
);
2178 string tmpfile
= version_file_out
->write(1, FLAGS
);
2180 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2181 if (version_file_size
< 0) {
2182 throw Xapian::DatabaseError("lseek() failed", errno
);
2184 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2185 throw Xapian::DatabaseError("Didn't allow enough space for "
2186 "version file data");
2189 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2192 // Commit with revision 1.
2193 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2194 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2199 vector
<HoneyTable
*> tabs
;
2200 tabs
.reserve(tables_end
- tables
);
2201 off_t prev_size
= block_size
;
2202 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2203 // The postlist table requires an N-way merge, adjusting the
2204 // headers of various blocks. The spelling and synonym tables also
2205 // need special handling. The other tables have keys sorted in
2206 // docid order, so we can merge them by simply copying all the keys
2207 // from each source table in turn.
2209 compactor
->set_status(t
->name
, string());
2219 bool output_will_exist
= !t
->lazy
;
2221 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2222 // on certain systems).
2223 bool bad_stat
= false;
2225 // We can't currently report input sizes if there's a single file DB
2226 // amongst the inputs.
2227 bool single_file_in
= false;
2231 vector
<const HoneyTable
*> inputs
;
2232 inputs
.reserve(sources
.size());
2233 size_t inputs_present
= 0;
2234 for (auto src
: sources
) {
2235 auto db
= static_cast<const HoneyDatabase
*>(src
);
2236 const HoneyTable
* table
;
2238 case Honey::POSTLIST
:
2239 table
= &(db
->postlist_table
);
2241 case Honey::DOCDATA
:
2242 table
= &(db
->docdata_table
);
2244 case Honey::TERMLIST
:
2245 table
= &(db
->termlist_table
);
2247 case Honey::POSITION
:
2248 table
= &(db
->position_table
);
2250 case Honey::SPELLING
:
2251 table
= &(db
->spelling_table
);
2253 case Honey::SYNONYM
:
2254 table
= &(db
->synonym_table
);
2261 if (db
->single_file()) {
2262 if (t
->lazy
&& table
->empty()) {
2263 // Essentially doesn't exist.
2265 // FIXME: Find actual size somehow?
2266 // in_size += table->size() / 1024;
2267 single_file_in
= true;
2269 output_will_exist
= true;
2273 off_t db_size
= file_size(table
->get_path());
2275 // FIXME: check overflow and set bad_totals
2276 in_total
+= db_size
;
2277 in_size
+= db_size
/ 1024;
2278 output_will_exist
= true;
2280 } else if (errno
!= ENOENT
) {
2281 // We get ENOENT for an optional table.
2282 bad_totals
= bad_stat
= true;
2283 output_will_exist
= true;
2287 inputs
.push_back(table
);
2290 // If any inputs lack a termlist table, suppress it in the output.
2291 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2292 if (inputs_present
!= 0) {
2294 string m
= str(inputs_present
);
2296 m
+= str(sources
.size());
2297 m
+= " inputs present, so suppressing output";
2298 compactor
->set_status(t
->name
, m
);
2302 output_will_exist
= false;
2305 if (!output_will_exist
) {
2307 compactor
->set_status(t
->name
, "doesn't exist");
2312 off_t table_start_offset
= -1;
2315 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2316 // space for version file. It's tricky to exactly know the
2317 // size of the version file beforehand.
2318 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2319 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2320 throw Xapian::DatabaseError("lseek() failed", errno
);
2322 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2324 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2327 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2329 tabs
.push_back(out
);
2330 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2332 root_info
->set_free_list(fl_serialised
);
2333 root_info
->set_offset(table_start_offset
);
2335 version_file_out
->get_root(t
->type
),
2336 version_file_out
->get_revision());
2338 out
->create_and_open(FLAGS
, *root_info
);
2341 out
->set_full_compaction(compaction
!= compactor
->STANDARD
);
2342 if (compaction
== compactor
->FULLER
) out
->set_max_item_size(1);
2345 case Honey::POSTLIST
: {
2346 if (multipass
&& inputs
.size() > 3) {
2347 multimerge_postlists(compactor
, out
, destdir
,
2350 merge_postlists(compactor
, out
, offset
.begin(),
2351 inputs
.begin(), inputs
.end());
2355 case Honey::SPELLING
:
2356 merge_spellings(out
, inputs
.begin(), inputs
.end());
2358 case Honey::SYNONYM
:
2359 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2361 case Honey::POSITION
:
2362 merge_positions(out
, inputs
, offset
);
2365 // DocData, Termlist
2366 merge_docid_keyed(out
, inputs
, offset
);
2370 // Commit as revision 1.
2372 out
->commit(1, root_info
);
2374 if (single_file
) fl_serialised
= root_info
->get_free_list();
2377 if (!bad_stat
&& !single_file_in
) {
2380 db_size
= file_size(fd
);
2382 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2386 off_t old_prev_size
= max(prev_size
, off_t(block_size
));
2387 prev_size
= db_size
;
2388 db_size
-= old_prev_size
;
2390 // FIXME: check overflow and set bad_totals
2391 out_total
+= db_size
;
2392 out_size
= db_size
/ 1024;
2393 } else if (errno
!= ENOENT
) {
2394 bad_totals
= bad_stat
= true;
2399 compactor
->set_status(t
->name
,
2400 "Done (couldn't stat all the DB files)");
2401 } else if (single_file_in
) {
2403 compactor
->set_status(t
->name
,
2404 "Done (table sizes unknown for single "
2408 if (out_size
== in_size
) {
2409 status
= "Size unchanged (";
2412 if (out_size
< in_size
) {
2413 delta
= in_size
- out_size
;
2414 status
= "Reduced by ";
2416 delta
= out_size
- in_size
;
2417 status
= "INCREASED by ";
2420 status
+= str(100 * delta
/ in_size
);
2423 status
+= str(delta
);
2425 status
+= str(in_size
);
2428 status
+= str(out_size
);
2431 compactor
->set_status(t
->name
, status
);
2435 // If compacting to a single file output and all the tables are empty, pad
2436 // the output so that it isn't mistaken for a stub database when we try to
2437 // open it. For this it needs to be a multiple of 2KB in size.
2438 if (single_file
&& prev_size
< off_t(block_size
)) {
2439 #ifdef HAVE_FTRUNCATE
2440 if (ftruncate(fd
, block_size
) < 0) {
2441 throw Xapian::DatabaseError("Failed to set size of output "
2445 const off_t off
= block_size
- 1;
2446 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2447 throw Xapian::DatabaseError("Failed to set size of output "
2454 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2455 throw Xapian::DatabaseError("lseek() failed", errno
);
2458 version_file_out
->set_last_docid(last_docid
);
2459 string tmpfile
= version_file_out
->write(1, FLAGS
);
2460 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2463 // Commit with revision 1.
2464 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2465 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2470 if (!single_file
) lock
.release();
2472 if (!bad_totals
&& compactor
) {
2476 if (out_total
== in_total
) {
2477 status
= "Size unchanged (";
2480 if (out_total
< in_total
) {
2481 delta
= in_total
- out_total
;
2482 status
= "Reduced by ";
2484 delta
= out_total
- in_total
;
2485 status
= "INCREASED by ";
2488 status
+= str(100 * delta
/ in_total
);
2491 status
+= str(delta
);
2493 status
+= str(in_total
);
2496 status
+= str(out_total
);
2498 compactor
->set_status("Total", status
);