1 /** @file honey_compact.cc
2 * @brief Compact a honey database, or merge and compact several.
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/compactor.h"
25 #include "xapian/constants.h"
26 #include "xapian/error.h"
27 #include "xapian/types.h"
32 #include <type_traits>
37 #include "backends/flint_lock.h"
38 #include "compression_stream.h"
39 #include "honey_cursor.h"
40 #include "honey_database.h"
41 #include "honey_defs.h"
42 #include "honey_postlist_encodings.h"
43 #include "honey_table.h"
44 #include "honey_values.h"
45 #include "honey_version.h"
46 #include "filetests.h"
47 #include "internaltypes.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifdef XAPIAN_HAS_GLASS_BACKEND
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
62 using Honey::encode_valuestats
;
66 throw_database_corrupt(const char* item
, const char* pos
)
70 message
= "Value overflow unpacking termlist: ";
72 message
= "Out of data unpacking termlist: ";
75 throw Xapian::DatabaseCorruptError(message
);
78 #ifdef XAPIAN_HAS_GLASS_BACKEND
79 namespace GlassCompact
{
/** Return true if @a key is a glass user-metadata key.
 *
 *  Glass stores user metadata under keys beginning with the two bytes
 *  0x00 0xc0.
 */
static inline bool
is_user_metadata_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
/** Return true if @a key is a glass value-statistics key.
 *
 *  Glass stores value statistics under keys beginning with the two bytes
 *  0x00 0xd0.
 */
static inline bool
is_valuestats_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xd0';
}
/** Return true if @a key is a glass value-chunk key.
 *
 *  Glass stores valuestream chunks under keys beginning with the two bytes
 *  0x00 0xd8.
 */
static inline bool
is_valuechunk_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xd8';
}
/** Return true if @a key is a glass document-length chunk key.
 *
 *  Glass stores document-length chunks under keys beginning with the two
 *  bytes 0x00 0xe0.
 */
static inline bool
is_doclenchunk_key(const std::string& key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xe0';
}
108 termlist_key_is_values_used(const string
& key
)
110 const char* p
= key
.data();
111 const char* e
= p
+ key
.size();
113 if (unpack_uint_preserving_sort(&p
, e
, &did
)) {
116 if (e
- p
== 1 && *p
== '\0')
119 throw Xapian::DatabaseCorruptError("termlist key format");
123 // Put all the helpers in a namespace to avoid symbols colliding with those of
124 // the same name in other flint-derived backends.
125 namespace HoneyCompact
{
127 /// Return a Honey::KEY_* constant, or a different value for an invalid key.
129 key_type(const string
& key
)
132 return Honey::KEY_POSTING_CHUNK
;
137 unsigned char ch
= key
[1];
138 if (ch
>= Honey::KEY_VALUE_STATS
&& ch
<= Honey::KEY_VALUE_STATS_HI
)
139 return Honey::KEY_VALUE_STATS
;
140 if (ch
>= Honey::KEY_VALUE_CHUNK
&& ch
<= Honey::KEY_VALUE_CHUNK_HI
)
141 return Honey::KEY_VALUE_CHUNK
;
142 if (ch
>= Honey::KEY_DOCLEN_CHUNK
&& ch
<= Honey::KEY_DOCLEN_CHUNK_HI
)
143 return Honey::KEY_DOCLEN_CHUNK
;
145 // Handle Honey::KEY_USER_METADATA, Honey::KEY_POSTING_CHUNK, and currently
150 template<typename T
> class PostlistCursor
;
152 #ifdef XAPIAN_HAS_GLASS_BACKEND
153 // Convert glass to honey.
155 class PostlistCursor
<const GlassTable
&> : private GlassCursor
{
156 Xapian::docid offset
;
160 Xapian::docid firstdid
;
161 Xapian::docid chunk_lastdid
;
162 Xapian::termcount tf
, cf
;
163 Xapian::termcount first_wdf
;
164 Xapian::termcount wdf_max
;
167 PostlistCursor(const GlassTable
*in
, Xapian::docid offset_
)
168 : GlassCursor(in
), offset(offset_
), firstdid(0)
174 if (!GlassCursor::next()) return false;
175 // We put all chunks into the non-initial chunk form here, then fix up
176 // the first chunk for each term in the merged database as we merge.
181 if (GlassCompact::is_user_metadata_key(key
)) {
182 key
[1] = Honey::KEY_USER_METADATA
;
185 if (GlassCompact::is_valuestats_key(key
)) {
187 const char * p
= key
.data();
188 const char * end
= p
+ key
.length();
190 Xapian::valueno slot
;
191 if (!unpack_uint_last(&p
, end
, &slot
))
192 throw Xapian::DatabaseCorruptError("bad value stats key");
193 // FIXME: pack_uint_last() is not a good encoding for keys, as the
194 // encoded values do not in general sort the same way as the
195 // numeric values. The first 256 values will be in order at least.
197 // We could just buffer up the stats for all the slots - it's
198 // unlikely there are going to be very many. Another option is
199 // to use multiple cursors or seek a single cursor around.
200 AssertRel(slot
, <=, 256);
201 key
= Honey::make_valuestats_key(slot
);
204 if (GlassCompact::is_valuechunk_key(key
)) {
205 const char * p
= key
.data();
206 const char * end
= p
+ key
.length();
208 Xapian::valueno slot
;
209 if (!unpack_uint(&p
, end
, &slot
))
210 throw Xapian::DatabaseCorruptError("bad value key");
211 // FIXME: pack_uint() is not a good encoding for keys, as the
212 // encoded values do not in general sort the same way as the
213 // numeric values. The first 128 values will be in order at least.
215 // Buffering up this data for all slots is potentially prohibitively
216 // costly. We probably need to use multiple cursors or seek a single
218 AssertRel(slot
, <=, 128);
219 Xapian::docid first_did
;
220 if (!unpack_uint_preserving_sort(&p
, end
, &first_did
))
221 throw Xapian::DatabaseCorruptError("bad value key");
224 Glass::ValueChunkReader
reader(tag
.data(), tag
.size(), first_did
);
225 Xapian::docid last_did
= first_did
;
226 while (reader
.next(), !reader
.at_end()) {
227 last_did
= reader
.get_docid();
230 key
= Honey::make_valuechunk_key(slot
, last_did
);
232 // Add the docid delta across the chunk to the start of the tag.
234 pack_uint(newtag
, last_did
- first_did
);
235 tag
.insert(0, newtag
);
240 if (GlassCompact::is_doclenchunk_key(key
)) {
241 const char * d
= key
.data();
242 const char * e
= d
+ key
.size();
246 // This is an initial chunk, so adjust tag header.
249 if (!unpack_uint(&d
, e
, &tf
) ||
250 !unpack_uint(&d
, e
, &cf
) ||
251 !unpack_uint(&d
, e
, &firstdid
)) {
252 throw Xapian::DatabaseCorruptError("Bad postlist key");
255 tag
.erase(0, d
- tag
.data());
257 // Not an initial chunk, just unpack firstdid.
258 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
259 throw Xapian::DatabaseCorruptError("Bad postlist key");
263 // Set key to placeholder value which just indicates that this is a
265 static const char doclen_key_prefix
[2] = {
266 0, char(Honey::KEY_DOCLEN_CHUNK
)
268 key
.assign(doclen_key_prefix
, 2);
273 // Convert doclen chunk to honey format.
276 // Skip the "last chunk" flag and increase_to_last.
278 throw Xapian::DatabaseCorruptError("No last chunk flag in "
279 "glass docdata chunk");
281 Xapian::docid increase_to_last
;
282 if (!unpack_uint(&d
, e
, &increase_to_last
))
283 throw Xapian::DatabaseCorruptError("Decoding last docid delta "
284 "in glass docdata chunk");
286 Xapian::termcount doclen_max
= 0;
288 Xapian::termcount doclen
;
289 if (!unpack_uint(&d
, e
, &doclen
))
290 throw Xapian::DatabaseCorruptError("Decoding doclen in "
291 "glass docdata chunk");
292 if (doclen
> doclen_max
)
294 unsigned char buf
[4];
295 unaligned_write4(buf
, doclen
);
296 newtag
.append(reinterpret_cast<char*>(buf
), 4);
299 Xapian::docid gap_size
;
300 if (!unpack_uint(&d
, e
, &gap_size
))
301 throw Xapian::DatabaseCorruptError("Decoding docid "
304 // FIXME: Split chunk if the gap_size is at all large.
305 newtag
.append(4 * gap_size
, '\xff');
308 Assert(!startswith(newtag
, "\xff\xff\xff\xff"));
309 Assert(!endswith(newtag
, "\xff\xff\xff\xff"));
311 AssertEq(newtag
.size() % 4, 0);
312 chunk_lastdid
= firstdid
- 1 + newtag
.size() / 4;
314 // Only encode document lengths using a whole number of bytes for
315 // now. We could allow arbitrary bit widths, but it complicates
316 // encoding and decoding so we should consider if the fairly small
317 // additional saving is worth it.
318 if (doclen_max
>= 0xffff) {
319 if (doclen_max
>= 0xffffff) {
320 newtag
.insert(0, 1, char(32));
322 } else if (doclen_max
>= 0xffffffff) {
323 // FIXME: Handle these.
324 const char* m
= "Document length values >= 0xffffffff not "
326 throw Xapian::FeatureUnavailableError(m
);
328 tag
.assign(1, char(24));
329 for (size_t i
= 1; i
< newtag
.size(); i
+= 4)
330 tag
.append(newtag
, i
, 3);
333 if (doclen_max
>= 0xff) {
334 tag
.assign(1, char(16));
335 for (size_t i
= 2; i
< newtag
.size(); i
+= 4)
336 tag
.append(newtag
, i
, 2);
338 tag
.assign(1, char(8));
339 for (size_t i
= 3; i
< newtag
.size(); i
+= 4)
340 tag
.append(newtag
, i
, 1);
347 // Adjust key if this is *NOT* an initial chunk.
348 // key is: pack_string_preserving_sort(key, tname)
349 // plus optionally: pack_uint_preserving_sort(key, did)
350 const char * d
= key
.data();
351 const char * e
= d
+ key
.size();
353 if (!unpack_string_preserving_sort(&d
, e
, tname
))
354 throw Xapian::DatabaseCorruptError("Bad postlist key");
357 // This is an initial chunk for a term, so adjust tag header.
360 if (!unpack_uint(&d
, e
, &tf
) ||
361 !unpack_uint(&d
, e
, &cf
) ||
362 !unpack_uint(&d
, e
, &firstdid
)) {
363 throw Xapian::DatabaseCorruptError("Bad postlist key");
366 have_wdfs
= (cf
!= 0);
367 tag
.erase(0, d
- tag
.data());
370 // Not an initial chunk, so adjust key.
371 size_t tmp
= d
- key
.data();
372 if (!unpack_uint_preserving_sort(&d
, e
, &firstdid
) || d
!= e
)
373 throw Xapian::DatabaseCorruptError("Bad postlist key");
381 // Convert posting chunk to honey format, but without any header.
384 // Skip the "last chunk" flag; decode increase_to_last.
386 throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
389 Xapian::docid increase_to_last
;
390 if (!unpack_uint(&d
, e
, &increase_to_last
))
391 throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
392 "glass posting chunk");
393 chunk_lastdid
= firstdid
+ increase_to_last
;
394 if (!unpack_uint(&d
, e
, &first_wdf
))
395 throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
397 if ((first_wdf
!= 0) != have_wdfs
) {
398 // FIXME: Also need to adjust document length, total document length.
399 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
402 throw Xapian::DatabaseError("Honey does not support a term having "
403 "both zero and non-zero wdf");
405 wdf_max
= max(wdf_max
, first_wdf
);
409 if (!unpack_uint(&d
, e
, &delta
))
410 throw Xapian::DatabaseCorruptError("Decoding docid delta in "
411 "glass posting chunk");
412 pack_uint(newtag
, delta
);
413 Xapian::termcount wdf
;
414 if (!unpack_uint(&d
, e
, &wdf
))
415 throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
417 if ((wdf
!= 0) != have_wdfs
) {
418 // FIXME: Also need to adjust document length, total document length.
419 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
422 throw Xapian::DatabaseError("Honey does not support a term "
423 "having both zero and non-zero "
427 pack_uint(newtag
, wdf
);
428 wdf_max
= max(wdf_max
, wdf
);
440 class PostlistCursor
<const HoneyTable
&> : private HoneyCursor
{
441 Xapian::docid offset
;
445 Xapian::docid firstdid
;
446 Xapian::docid chunk_lastdid
;
447 Xapian::termcount tf
, cf
;
448 Xapian::termcount first_wdf
;
449 Xapian::termcount wdf_max
;
452 PostlistCursor(const HoneyTable
*in
, Xapian::docid offset_
)
453 : HoneyCursor(in
), offset(offset_
), firstdid(0)
459 if (!HoneyCursor::next()) return false;
460 // We put all chunks into the non-initial chunk form here, then fix up
461 // the first chunk for each term in the merged database as we merge.
466 switch (key_type(key
)) {
467 case Honey::KEY_USER_METADATA
:
468 case Honey::KEY_VALUE_STATS
:
470 case Honey::KEY_VALUE_CHUNK
: {
471 const char * p
= key
.data();
472 const char * end
= p
+ key
.length();
474 Xapian::valueno slot
;
475 if (p
[-1] != char(Honey::KEY_VALUE_CHUNK_HI
)) {
476 slot
= p
[-1] - Honey::KEY_VALUE_CHUNK
;
478 if (!unpack_uint_preserving_sort(&p
, end
, &slot
))
479 throw Xapian::DatabaseCorruptError("bad value key");
482 if (!unpack_uint_preserving_sort(&p
, end
, &did
))
483 throw Xapian::DatabaseCorruptError("bad value key");
484 key
= Honey::make_valuechunk_key(slot
, did
+ offset
);
487 case Honey::KEY_DOCLEN_CHUNK
: {
488 Xapian::docid did
= Honey::docid_from_key(key
);
490 throw Xapian::DatabaseCorruptError("Bad doclen key");
491 chunk_lastdid
= did
+ offset
;
495 case Honey::KEY_POSTING_CHUNK
:
498 throw Xapian::DatabaseCorruptError("Bad postlist table key "
502 // Adjust key if this is *NOT* an initial chunk.
503 // key is: pack_string_preserving_sort(key, term)
504 // plus optionally: pack_uint_preserving_sort(key, did)
505 const char * d
= key
.data();
506 const char * e
= d
+ key
.size();
508 if (!unpack_string_preserving_sort(&d
, e
, term
))
509 throw Xapian::DatabaseCorruptError("Bad postlist key");
512 // This is an initial chunk for a term, so remove tag header.
516 Xapian::docid lastdid
;
517 if (!decode_initial_chunk_header(&d
, e
, tf
, cf
,
518 firstdid
, lastdid
, chunk_lastdid
,
519 first_wdf
, wdf_max
)) {
520 throw Xapian::DatabaseCorruptError("Bad postlist initial "
523 // Ignore lastdid - we'll need to recalculate it (at least when
524 // merging, and for simplicity we always do).
526 tag
.erase(0, d
- tag
.data());
528 have_wdfs
= (cf
!= 0) && (cf
!= tf
- 1 + first_wdf
);
529 if (have_wdfs
&& tf
> 2) {
530 Xapian::termcount remaining_cf_for_flat_wdf
=
532 // Check this matches and that it isn't a false match due
533 // to overflow of the multiplication above.
534 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
535 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
536 // The wdf is flat so we don't need to store it.
542 // The cf we report should only be non-zero for initial chunks
543 // (of which there may be several for the same term from
544 // different instances of this class when merging databases)
545 // and gets summed to give the total cf for the term, so we
546 // need to zero it here, after potentially using it to
547 // calculate first_wdf.
549 // If cf is zero for a term, first_wdf will be zero for all
550 // chunks so doesn't need setting specially here.
552 first_wdf
= (cf
- first_wdf
) / (tf
- 1);
556 // Not an initial chunk, so adjust key and remove tag header.
557 size_t tmp
= d
- key
.data();
558 if (!unpack_uint_preserving_sort(&d
, e
, &chunk_lastdid
) ||
560 throw Xapian::DatabaseCorruptError("Bad postlist key");
562 // -1 to remove the terminating zero byte too.
569 if (!decode_delta_chunk_header(&d
, e
, chunk_lastdid
, firstdid
,
571 throw Xapian::DatabaseCorruptError("Bad postlist delta "
575 if (!decode_delta_chunk_header_no_wdf(&d
, e
, chunk_lastdid
,
577 throw Xapian::DatabaseCorruptError("Bad postlist delta "
581 tag
.erase(0, d
- tag
.data());
584 chunk_lastdid
+= offset
;
590 class PostlistCursor
<HoneyTable
&> : public PostlistCursor
<const HoneyTable
&> {
592 PostlistCursor(HoneyTable
*in
, Xapian::docid offset_
)
593 : PostlistCursor
<const HoneyTable
&>(in
, offset_
) {}
/// Compare postlist cursors for use in a priority_queue.
template<typename T>
class PostlistCursorGt {
  public:
    /** Return true if and only if a's key is strictly greater than b's key.
     *
     *  Ties on the key are broken by comparing firstdid, so with this as
     *  the comparator of a std::priority_queue (a max-heap) the cursor with
     *  the smallest (key, firstdid) pair is at the top.
     */
    bool operator()(const T* a, const T* b) const {
        if (a->key > b->key) return true;
        if (a->key != b->key) return false;
        return (a->firstdid > b->firstdid);
    }
};
608 // U : vector<HoneyTable*>::const_iterator
609 template<typename T
, typename U
> void
610 merge_postlists(Xapian::Compactor
* compactor
,
611 T
* out
, vector
<Xapian::docid
>::const_iterator offset
,
614 typedef decltype(**b
) table_type
; // E.g. HoneyTable
615 typedef PostlistCursor
<table_type
> cursor_type
;
616 typedef PostlistCursorGt
<cursor_type
> gt_type
;
617 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
618 for ( ; b
!= e
; ++b
, ++offset
) {
620 auto cursor
= new cursor_type(in
, *offset
);
621 if (cursor
->next()) {
624 // Skip empty tables.
631 // Merge user metadata.
633 while (!pq
.empty()) {
634 cursor_type
* cur
= pq
.top();
635 const string
& key
= cur
->key
;
636 if (key_type(key
) != Honey::KEY_USER_METADATA
) break;
638 if (key
!= last_key
) {
640 if (tags
.size() > 1 && compactor
) {
641 Assert(!last_key
.empty());
642 // FIXME: It would be better to merge all duplicates
643 // for a key in one call, but currently we don't in
645 const string
& resolved_tag
=
646 compactor
->resolve_duplicate_metadata(last_key
,
649 if (!resolved_tag
.empty())
650 out
->add(last_key
, resolved_tag
);
652 Assert(!last_key
.empty());
653 out
->add(last_key
, tags
[0]);
659 tags
.push_back(cur
->tag
);
669 if (tags
.size() > 1 && compactor
) {
670 Assert(!last_key
.empty());
671 const string
& resolved_tag
=
672 compactor
->resolve_duplicate_metadata(last_key
,
675 if (!resolved_tag
.empty())
676 out
->add(last_key
, resolved_tag
);
678 Assert(!last_key
.empty());
679 out
->add(last_key
, tags
[0]);
686 Xapian::doccount freq
= 0;
687 string lbound
, ubound
;
689 while (!pq
.empty()) {
690 cursor_type
* cur
= pq
.top();
691 const string
& key
= cur
->key
;
692 if (key_type(key
) != Honey::KEY_VALUE_STATS
) break;
693 if (key
!= last_key
) {
694 // For the first valuestats key, last_key will be the previous
695 // key we wrote, which we don't want to overwrite. This is the
696 // only time that freq will be 0, so check that.
698 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
704 const string
& tag
= cur
->tag
;
706 const char * pos
= tag
.data();
707 const char * end
= pos
+ tag
.size();
711 if (!unpack_uint(&pos
, end
, &f
)) {
713 throw Xapian::DatabaseCorruptError("Incomplete stats item "
715 throw Xapian::RangeError("Frequency statistic in value table "
718 if (!unpack_string(&pos
, end
, l
)) {
720 throw Xapian::DatabaseCorruptError("Incomplete stats item "
722 throw Xapian::RangeError("Lower bound in value table is too "
725 size_t len
= end
- pos
;
737 if (l
< lbound
) lbound
= l
;
738 if (u
> ubound
) ubound
= u
;
750 out
->add(last_key
, encode_valuestats(freq
, lbound
, ubound
));
754 // Merge valuestream chunks.
755 while (!pq
.empty()) {
756 cursor_type
* cur
= pq
.top();
757 const string
& key
= cur
->key
;
758 if (key_type(key
) != Honey::KEY_VALUE_CHUNK
) break;
759 out
->add(key
, cur
->tag
);
768 // Merge doclen chunks.
769 while (!pq
.empty()) {
770 cursor_type
* cur
= pq
.top();
771 if (key_type(cur
->key
) != Honey::KEY_DOCLEN_CHUNK
) break;
772 string tag
= std::move(cur
->tag
);
773 auto chunk_lastdid
= cur
->chunk_lastdid
;
780 while (!pq
.empty()) {
782 if (key_type(cur
->key
) != Honey::KEY_DOCLEN_CHUNK
)
784 if (tag
[0] != cur
->tag
[0]) {
785 // Different width values in the two tags, so punt for now.
786 // FIXME: We would ideally optimise the total size here.
789 size_t byte_width
= tag
[0] / 8;
790 auto new_size
= tag
.size();
791 Xapian::docid gap_size
= cur
->firstdid
- chunk_lastdid
- 1;
792 new_size
+= gap_size
* byte_width
;
793 if (new_size
>= HONEY_DOCLEN_CHUNK_MAX
) {
794 // The gap spans beyond HONEY_DOCLEN_CHUNK_MAX.
797 new_size
+= cur
->tag
.size() - 1;
798 auto full_new_size
= new_size
;
799 if (new_size
> HONEY_DOCLEN_CHUNK_MAX
) {
800 if (byte_width
> 1) {
801 // HONEY_DOCLEN_CHUNK_MAX should be one more than a
802 // multiple of 12 so for widths 1,2,3,4 we can fix the
803 // initial byte which indicates the width for the chunk
804 // plus an exact number of entries.
805 auto m
= (new_size
- HONEY_DOCLEN_CHUNK_MAX
) % byte_width
;
809 new_size
= HONEY_DOCLEN_CHUNK_MAX
;
811 tag
.reserve(new_size
);
812 tag
.append(byte_width
* gap_size
, '\xff');
813 if (new_size
!= full_new_size
) {
815 auto copy_size
= new_size
- tag
.size();
816 tag
.append(cur
->tag
, 1, copy_size
);
817 cur
->tag
.erase(1, copy_size
);
818 copy_size
/= byte_width
;
819 cur
->firstdid
+= copy_size
;
820 chunk_lastdid
+= gap_size
;
821 chunk_lastdid
+= copy_size
;
825 tag
.append(cur
->tag
, 1, string::npos
);
826 chunk_lastdid
= cur
->chunk_lastdid
;
835 out
->add(Honey::make_doclenchunk_key(chunk_lastdid
), tag
);
838 Xapian::termcount tf
= 0, cf
= 0; // Initialise to avoid warnings.
840 struct HoneyPostListChunk
{
841 Xapian::docid first
, last
;
842 Xapian::termcount first_wdf
;
843 Xapian::termcount wdf_max
;
845 Xapian::termcount cf
;
849 HoneyPostListChunk(Xapian::docid first_
,
851 Xapian::termcount first_wdf_
,
852 Xapian::termcount wdf_max_
,
853 Xapian::doccount tf_
,
854 Xapian::termcount cf_
,
859 first_wdf(first_wdf_
),
863 have_wdfs(have_wdfs_
),
866 void append_postings_to(string
& tag
, bool want_wdfs
) {
867 if (want_wdfs
&& !have_wdfs
) {
868 // Need to add wdfs which were implicit.
869 auto wdf
= (cf
- first_wdf
) / (tf
- 1);
870 const char* pos
= data
.data();
871 const char* pos_end
= pos
+ data
.size();
872 while (pos
!= pos_end
) {
874 if (!unpack_uint(&pos
, pos_end
, &delta
))
875 throw_database_corrupt("Decoding docid delta", pos
);
876 pack_uint(tag
, delta
);
879 } else if (have_wdfs
&& !want_wdfs
) {
880 // Need to remove wdfs which are now implicit.
881 const char* pos
= data
.data();
882 const char* pos_end
= pos
+ data
.size();
883 while (pos
!= pos_end
) {
885 if (!unpack_uint(&pos
, pos_end
, &delta
))
886 throw_database_corrupt("Decoding docid delta", pos
);
887 pack_uint(tag
, delta
);
888 Xapian::termcount wdf
;
889 if (!unpack_uint(&pos
, pos_end
, &wdf
))
890 throw_database_corrupt("Decoding wdf", pos
);
898 vector
<HoneyPostListChunk
> tags
;
900 cursor_type
* cur
= NULL
;
906 AssertEq(key_type(cur
->key
), Honey::KEY_POSTING_CHUNK
);
908 if (cur
== NULL
|| cur
->key
!= last_key
) {
910 Xapian::termcount first_wdf
= tags
[0].first_wdf
;
911 Xapian::docid chunk_lastdid
= tags
[0].last
;
912 Xapian::docid last_did
= tags
.back().last
;
913 Xapian::termcount wdf_max
= tags
.back().wdf_max
;
915 bool have_wdfs
= true;
917 // wdf must always be zero.
919 } else if (tf
<= 2) {
920 // We only need to store cf and first_wdf to know all wdfs.
922 } else if (cf
== tf
- 1 + first_wdf
) {
923 // wdf must be 1 for second and subsequent entries.
926 Xapian::termcount remaining_cf_for_flat_wdf
=
928 // Check this matches and that it isn't a false match due
929 // to overflow of the multiplication above.
930 if (cf
- first_wdf
== remaining_cf_for_flat_wdf
&&
931 usual(remaining_cf_for_flat_wdf
/ wdf_max
== tf
- 1)) {
932 // The wdf is flat so we don't need to store it.
937 Xapian::docid splice_last
= 0;
938 if (!have_wdfs
&& tags
[0].have_wdfs
&& tf
> 2 &&
940 // Stripping wdfs will approximately halve the size so
941 // double up the chunks.
942 splice_last
= chunk_lastdid
;
943 chunk_lastdid
= tags
[1].last
;
947 encode_initial_chunk_header(tf
, cf
, tags
[0].first
, last_did
,
949 first_wdf
, wdf_max
, first_tag
);
952 tags
[0].append_postings_to(first_tag
, have_wdfs
);
953 if (!have_wdfs
&& splice_last
) {
954 pack_uint(first_tag
, tags
[1].first
- splice_last
);
955 tags
[1].append_postings_to(first_tag
, have_wdfs
);
958 out
->add(last_key
, first_tag
);
960 // If tf == 2, the data could be split over two tags when
961 // merging, but we only output an initial tag in this case.
962 if (tf
> 2 && tags
.size() > 1) {
964 const char* p
= last_key
.data();
965 const char* end
= p
+ last_key
.size();
966 if (!unpack_string_preserving_sort(&p
, end
, term
) ||
968 throw Xapian::DatabaseCorruptError("Bad postlist "
972 auto i
= tags
.begin();
977 while (++i
!= tags
.end()) {
981 encode_delta_chunk_header(i
->first
,
987 if (i
->have_wdfs
&& i
+ 1 != tags
.end()) {
988 splice_last
= last_did
;
989 last_did
= (i
+ 1)->last
;
992 encode_delta_chunk_header_no_wdf(i
->first
,
998 pack_uint(tag
, i
->first
- splice_last
);
1004 out
->add(pack_honey_postlist_key(term
, last_did
), tag
);
1009 if (cur
== NULL
) break;
1011 last_key
= cur
->key
;
1014 if (tf
&& cur
->tf
&& (cf
== 0) != (cur
->cf
== 0)) {
1015 // FIXME: Also need to adjust document length, total document
1017 // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
1018 throw Xapian::DatabaseError("Honey does not support a term having "
1019 "both zero and non-zero wdf");
1024 tags
.push_back(HoneyPostListChunk(cur
->firstdid
,
1031 std::move(cur
->tag
)));
1040 template<typename T
> struct MergeCursor
;
1042 #ifdef XAPIAN_HAS_GLASS_BACKEND
1044 struct MergeCursor
<const GlassTable
&> : public GlassCursor
{
1045 explicit MergeCursor(const GlassTable
*in
) : GlassCursor(in
) {
1052 struct MergeCursor
<const HoneyTable
&> : public HoneyCursor
{
1053 explicit MergeCursor(const HoneyTable
*in
) : HoneyCursor(in
) {
/// Compare table cursors by current key for use in a priority_queue.
template<typename T>
struct CursorGt {
    /// Return true if and only if a's key is strictly greater than b's key.
    bool operator()(const T* a, const T* b) const {
        // An exhausted (after_end()) cursor compares greater than any live
        // cursor, so it never reaches the top of the min-ordered queue.
        if (b->after_end()) return false;
        if (a->after_end()) return true;
        return (a->current_key > b->current_key);
    }
};
1068 #ifdef XAPIAN_HAS_GLASS_BACKEND
1069 // Convert glass to honey.
1071 merge_spellings(HoneyTable
* out
,
1072 vector
<const GlassTable
*>::const_iterator b
,
1073 vector
<const GlassTable
*>::const_iterator e
)
1075 typedef MergeCursor
<const GlassTable
&> cursor_type
;
1076 typedef CursorGt
<cursor_type
> gt_type
;
1077 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1078 for ( ; b
!= e
; ++b
) {
1080 auto cursor
= new cursor_type(in
);
1081 if (cursor
->next()) {
1084 // Skip empty tables.
1089 while (!pq
.empty()) {
1090 cursor_type
* cur
= pq
.top();
1093 // Glass vs honey spelling key prefix map:
1097 // bookend Bbd \x00bd
1099 // middle Mddl \x02ddl
1103 // In honey, if the word's first byte is <= '\x04', we add a prefix
1104 // of '\x04' (which is unlikely in real world use but ensures that
1105 // we can handle arbitrary strings).
1107 // The prefixes for honey are chosen such that we save a byte for the
1108 // key of all real-world words, and so that more first bytes are used
1109 // so that a top-level array index is more effective, especially for
1110 // random-access lookup of word entries (which we do to check the
1111 // frequency of potential spelling candidates).
1113 // The other prefixes are chosen such that we can do look up in key
1114 // order at search time, which is more efficient for a cursor which can
1115 // only efficiently move forwards.
1117 // Note that the key ordering is the same for glass and honey, which
1118 // makes translating during compaction simpler.
1119 string key
= cur
->current_key
;
1122 key
[0] = Honey::KEY_PREFIX_BOOKEND
;
1125 key
[0] = Honey::KEY_PREFIX_HEAD
;
1128 key
[0] = Honey::KEY_PREFIX_MIDDLE
;
1131 key
[0] = Honey::KEY_PREFIX_TAIL
;
1134 if (static_cast<unsigned char>(key
[1]) > Honey::KEY_PREFIX_WORD
)
1137 key
[0] = Honey::KEY_PREFIX_WORD
;
1140 string m
= "Bad spelling key prefix: ";
1141 m
+= static_cast<unsigned char>(key
[0]);
1142 throw Xapian::DatabaseCorruptError(m
);
1146 if (pq
.empty() || pq
.top()->current_key
> key
) {
1147 // No merging to do for this key so just copy the tag value,
1148 // adjusting if necessary. If we don't need to adjust it, just
1149 // copy the compressed value.
1152 case Honey::KEY_PREFIX_HEAD
: {
1153 // Honey omits the first two bytes from the first entry
1154 // since we know what those most be from the key
1155 // (subsequent entries won't include those anyway, since
1156 // they'll be part of the reused prefix).
1157 compressed
= cur
->read_tag(false);
1158 unsigned char len
= cur
->current_tag
[0] ^ MAGIC_XOR_VALUE
;
1159 cur
->current_tag
[0] = (len
- 2) ^ MAGIC_XOR_VALUE
;
1160 AssertEq(cur
->current_tag
[1], key
[1]);
1161 AssertEq(cur
->current_tag
[2], key
[2]);
1162 cur
->current_tag
.erase(1, 2);
1165 case Honey::KEY_PREFIX_TAIL
:
1166 case Honey::KEY_PREFIX_BOOKEND
: {
1167 // Need to repack because honey omits the last 2 or 1 bytes
1168 // from each entry (2 for tail, 1 for bookend) since we
1169 // know what those must be from the key.
1170 compressed
= cur
->read_tag(false);
1171 PrefixCompressedStringItor
spell_in(cur
->current_tag
);
1173 PrefixCompressedStringWriter
spell_out(new_tag
, key
);
1174 while (!spell_in
.at_end()) {
1175 spell_out
.append(*spell_in
);
1178 cur
->current_tag
= std::move(new_tag
);
1182 compressed
= cur
->read_tag(true);
1185 out
->add(key
, cur
->current_tag
, compressed
);
1194 // Merge tag values with the same key:
1196 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1197 // We just want the union of words, so copy over the first instance
1198 // and skip any identical ones.
1199 priority_queue
<PrefixCompressedStringItor
*,
1200 vector
<PrefixCompressedStringItor
*>,
1201 PrefixCompressedStringItorGt
> pqtag
;
1202 // Stick all the cursor_type pointers in a vector because their
1203 // current_tag members must remain valid while we're merging their
1204 // tags, but we need to call next() on them all afterwards.
1205 vector
<cursor_type
*> vec
;
1206 vec
.reserve(pq
.size());
1210 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
));
1212 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1217 PrefixCompressedStringWriter
wr(tag
, key
);
1219 while (!pqtag
.empty()) {
1220 PrefixCompressedStringItor
* it
= pqtag
.top();
1223 if (word
!= lastword
) {
1225 wr
.append(lastword
);
1228 if (!it
->at_end()) {
1235 for (auto i
: vec
) {
1244 // We want to sum the frequencies from tags for the same key.
1245 Xapian::termcount tot_freq
= 0;
1248 Xapian::termcount freq
;
1249 const char * p
= cur
->current_tag
.data();
1250 const char * end
= p
+ cur
->current_tag
.size();
1251 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1252 throw_database_corrupt("Bad spelling word freq", p
);
1260 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1265 pack_uint_last(tag
, tot_freq
);
1273 merge_spellings(HoneyTable
* out
,
1274 vector
<const HoneyTable
*>::const_iterator b
,
1275 vector
<const HoneyTable
*>::const_iterator e
)
1277 typedef MergeCursor
<const HoneyTable
&> cursor_type
;
1278 typedef CursorGt
<cursor_type
> gt_type
;
1279 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1280 for ( ; b
!= e
; ++b
) {
1282 auto cursor
= new cursor_type(in
);
1283 if (cursor
->next()) {
1286 // Skip empty tables.
1291 while (!pq
.empty()) {
1292 cursor_type
* cur
= pq
.top();
1295 string key
= cur
->current_key
;
1296 if (pq
.empty() || pq
.top()->current_key
> key
) {
1297 // No need to merge the tags, just copy the (possibly compressed)
1299 bool compressed
= cur
->read_tag(true);
1300 out
->add(key
, cur
->current_tag
, compressed
);
1309 // Merge tag values with the same key:
1311 if (static_cast<unsigned char>(key
[0]) < Honey::KEY_PREFIX_WORD
) {
1312 // We just want the union of words, so copy over the first instance
1313 // and skip any identical ones.
1314 priority_queue
<PrefixCompressedStringItor
*,
1315 vector
<PrefixCompressedStringItor
*>,
1316 PrefixCompressedStringItorGt
> pqtag
;
1317 // Stick all the cursor_type pointers in a vector because their
1318 // current_tag members must remain valid while we're merging their
1319 // tags, but we need to call next() on them all afterwards.
1320 vector
<cursor_type
*> vec
;
1321 vec
.reserve(pq
.size());
1325 pqtag
.push(new PrefixCompressedStringItor(cur
->current_tag
,
1328 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1333 PrefixCompressedStringWriter
wr(tag
, key
);
1335 while (!pqtag
.empty()) {
1336 PrefixCompressedStringItor
* it
= pqtag
.top();
1339 if (word
!= lastword
) {
1341 wr
.append(lastword
);
1344 if (!it
->at_end()) {
1351 for (auto i
: vec
) {
1360 // We want to sum the frequencies from tags for the same key.
1361 Xapian::termcount tot_freq
= 0;
1364 Xapian::termcount freq
;
1365 const char * p
= cur
->current_tag
.data();
1366 const char * end
= p
+ cur
->current_tag
.size();
1367 if (!unpack_uint_last(&p
, end
, &freq
) || freq
== 0) {
1368 throw_database_corrupt("Bad spelling word freq", p
);
1376 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1381 pack_uint_last(tag
, tot_freq
);
1387 // U : vector<HoneyTable*>::const_iterator
1388 template<typename T
, typename U
> void
1389 merge_synonyms(T
* out
, U b
, U e
)
1391 typedef decltype(**b
) table_type
; // E.g. HoneyTable
1392 typedef MergeCursor
<table_type
> cursor_type
;
1393 typedef CursorGt
<cursor_type
> gt_type
;
1394 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1395 for ( ; b
!= e
; ++b
) {
1397 auto cursor
= new cursor_type(in
);
1398 if (cursor
->next()) {
1401 // Skip empty tables.
1406 while (!pq
.empty()) {
1407 cursor_type
* cur
= pq
.top();
1410 string key
= cur
->current_key
;
1411 if (pq
.empty() || pq
.top()->current_key
> key
) {
1412 // No need to merge the tags, just copy the (possibly compressed)
1414 bool compressed
= cur
->read_tag(true);
1415 out
->add(key
, cur
->current_tag
, compressed
);
1424 // Merge tag values with the same key:
1427 // We just want the union of words, so copy over the first instance
1428 // and skip any identical ones.
1429 priority_queue
<ByteLengthPrefixedStringItor
*,
1430 vector
<ByteLengthPrefixedStringItor
*>,
1431 ByteLengthPrefixedStringItorGt
> pqtag
;
1432 vector
<cursor_type
*> vec
;
1436 pqtag
.push(new ByteLengthPrefixedStringItor(cur
->current_tag
));
1438 if (pq
.empty() || pq
.top()->current_key
!= key
) break;
1444 while (!pqtag
.empty()) {
1445 ByteLengthPrefixedStringItor
* it
= pqtag
.top();
1447 if (**it
!= lastword
) {
1449 tag
+= uint8_t(lastword
.size() ^ MAGIC_XOR_VALUE
);
1453 if (!it
->at_end()) {
1460 for (auto i
: vec
) {
1473 template<typename T
, typename U
> void
1474 multimerge_postlists(Xapian::Compactor
* compactor
,
1475 T
* out
, const char * tmpdir
,
1476 const vector
<U
*>& in
,
1477 vector
<Xapian::docid
> off
)
1479 if (in
.size() <= 3) {
1480 merge_postlists(compactor
, out
, off
.begin(), in
.begin(), in
.end());
1484 vector
<HoneyTable
*> tmp
;
1485 tmp
.reserve(in
.size() / 2);
1487 vector
<Xapian::docid
> newoff
;
1488 newoff
.resize(in
.size() / 2);
1489 for (unsigned int i
= 0, j
; i
< in
.size(); i
= j
) {
1491 if (j
== in
.size() - 1) ++j
;
1493 string dest
= tmpdir
;
1495 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1498 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1500 // Don't compress entries in temporary tables, even if the final
1501 // table would do so. Any already compressed entries will get
1502 // copied in compressed form.
1503 Honey::RootInfo root_info
;
1505 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1506 tmptab
->create_and_open(flags
, root_info
);
1508 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1509 in
.begin() + i
, in
.begin() + j
);
1510 tmp
.push_back(tmptab
);
1512 tmptab
->commit(1, &root_info
);
1518 while (tmp
.size() > 3) {
1519 vector
<HoneyTable
*> tmpout
;
1520 tmpout
.reserve(tmp
.size() / 2);
1521 vector
<Xapian::docid
> newoff
;
1522 newoff
.resize(tmp
.size() / 2);
1523 for (unsigned int i
= 0, j
; i
< tmp
.size(); i
= j
) {
1525 if (j
== tmp
.size() - 1) ++j
;
1527 string dest
= tmpdir
;
1529 sprintf(buf
, "/tmp%u_%u.", c
, i
/ 2);
1532 HoneyTable
* tmptab
= new HoneyTable("postlist", dest
, false);
1534 // Don't compress entries in temporary tables, even if the final
1535 // table would do so. Any already compressed entries will get
1536 // copied in compressed form.
1537 Honey::RootInfo root_info
;
1539 const int flags
= Xapian::DB_DANGEROUS
|Xapian::DB_NO_SYNC
;
1540 tmptab
->create_and_open(flags
, root_info
);
1542 merge_postlists(compactor
, tmptab
, off
.begin() + i
,
1543 tmp
.begin() + i
, tmp
.begin() + j
);
1545 for (unsigned int k
= i
; k
< j
; ++k
) {
1546 // FIXME: unlink(tmp[k]->get_path().c_str());
1551 tmpout
.push_back(tmptab
);
1553 tmptab
->commit(1, &root_info
);
1559 merge_postlists(compactor
, out
, off
.begin(), tmp
.begin(), tmp
.end());
1561 for (size_t k
= 0; k
< tmp
.size(); ++k
) {
1562 // FIXME: unlink(tmp[k]->get_path().c_str());
1569 template<typename T
> class PositionCursor
;
1571 #ifdef XAPIAN_HAS_GLASS_BACKEND
1573 class PositionCursor
<const GlassTable
&> : private GlassCursor
{
1574 Xapian::docid offset
;
1578 Xapian::docid firstdid
;
1580 PositionCursor(const GlassTable
*in
, Xapian::docid offset_
)
1581 : GlassCursor(in
), offset(offset_
), firstdid(0) {
1586 if (!GlassCursor::next()) return false;
1588 const char * d
= current_key
.data();
1589 const char * e
= d
+ current_key
.size();
1592 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1593 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1595 throw Xapian::DatabaseCorruptError("Bad position key");
1599 pack_string_preserving_sort(key
, term
);
1600 pack_uint_preserving_sort(key
, did
+ offset
);
1604 const string
& get_tag() const {
1611 class PositionCursor
<const HoneyTable
&> : private HoneyCursor
{
1612 Xapian::docid offset
;
1616 Xapian::docid firstdid
;
1618 PositionCursor(const HoneyTable
*in
, Xapian::docid offset_
)
1619 : HoneyCursor(in
), offset(offset_
), firstdid(0) {
1624 if (!HoneyCursor::next()) return false;
1626 const char * d
= current_key
.data();
1627 const char * e
= d
+ current_key
.size();
1630 if (!unpack_string_preserving_sort(&d
, e
, term
) ||
1631 !unpack_uint_preserving_sort(&d
, e
, &did
) ||
1633 throw Xapian::DatabaseCorruptError("Bad position key");
1637 pack_string_preserving_sort(key
, term
);
1638 pack_uint_preserving_sort(key
, did
+ offset
);
1642 const string
& get_tag() const {
1647 template<typename T
>
1648 class PositionCursorGt
{
1650 /** Return true if and only if a's key is strictly greater than b's key.
1652 bool operator()(const T
* a
, const T
* b
) const {
1653 return a
->key
> b
->key
;
1657 template<typename T
, typename U
> void
1658 merge_positions(T
* out
, const vector
<U
*> & inputs
,
1659 const vector
<Xapian::docid
> & offset
)
1661 typedef decltype(*inputs
[0]) table_type
; // E.g. HoneyTable
1662 typedef PositionCursor
<table_type
> cursor_type
;
1663 typedef PositionCursorGt
<cursor_type
> gt_type
;
1664 priority_queue
<cursor_type
*, vector
<cursor_type
*>, gt_type
> pq
;
1665 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1666 auto in
= inputs
[i
];
1667 auto cursor
= new cursor_type(in
, offset
[i
]);
1668 if (cursor
->next()) {
1671 // Skip empty tables.
1676 while (!pq
.empty()) {
1677 cursor_type
* cur
= pq
.top();
1679 out
->add(cur
->key
, cur
->get_tag());
1688 template<typename T
, typename U
> void
1689 merge_docid_keyed(T
*out
, const vector
<U
*> & inputs
,
1690 const vector
<Xapian::docid
> & offset
,
1693 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1694 Xapian::docid off
= offset
[i
];
1696 auto in
= inputs
[i
];
1697 HoneyCursor
cur(in
);
1701 while (cur
.next()) {
1702 // Adjust the key if this isn't the first database.
1705 const char * d
= cur
.current_key
.data();
1706 const char * e
= d
+ cur
.current_key
.size();
1707 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1708 string msg
= "Bad key in ";
1709 msg
+= inputs
[i
]->get_path();
1710 throw Xapian::DatabaseCorruptError(msg
);
1714 pack_uint_preserving_sort(key
, did
);
1716 // Copy over anything extra in the key (e.g. the zero byte
1717 // at the end of "used value slots" in the termlist table).
1718 key
.append(d
, e
- d
);
1721 key
= cur
.current_key
;
1723 bool compressed
= cur
.read_tag(true);
1724 out
->add(key
, cur
.current_tag
, compressed
);
1729 #ifdef XAPIAN_HAS_GLASS_BACKEND
1730 template<typename T
> void
1731 merge_docid_keyed(T
*out
, const vector
<const GlassTable
*> & inputs
,
1732 const vector
<Xapian::docid
> & offset
,
1735 for (size_t i
= 0; i
< inputs
.size(); ++i
) {
1736 Xapian::docid off
= offset
[i
];
1738 auto in
= inputs
[i
];
1739 if (in
->empty()) continue;
1741 GlassCursor
cur(in
);
1745 while (cur
.next()) {
1747 // Adjust the key if this isn't the first database.
1750 const char * d
= cur
.current_key
.data();
1751 const char * e
= d
+ cur
.current_key
.size();
1752 if (!unpack_uint_preserving_sort(&d
, e
, &did
)) {
1753 string msg
= "Bad key in ";
1754 msg
+= inputs
[i
]->get_path();
1755 throw Xapian::DatabaseCorruptError(msg
);
1759 pack_uint_preserving_sort(key
, did
);
1761 // Copy over anything extra in the key (e.g. the zero byte
1762 // at the end of "used value slots" in the termlist table).
1763 key
.append(d
, e
- d
);
1766 key
= cur
.current_key
;
1768 if (table_type
== Honey::TERMLIST
) {
1769 if (termlist_key_is_values_used(key
)) {
1770 throw Xapian::DatabaseCorruptError("value slots used "
1776 string tag
= cur
.current_tag
;
1778 bool next_result
= cur
.next();
1779 bool next_already_done
= true;
1780 unsigned bitmap_slots_used
= 0;
1781 string encoded_slots_used
;
1783 termlist_key_is_values_used(cur
.current_key
)) {
1784 next_already_done
= false;
1786 const string
& valtag
= cur
.current_tag
;
1788 const char* p
= valtag
.data();
1789 const char* end
= p
+ valtag
.size();
1791 Xapian::VecCOW
<Xapian::termpos
> slots
;
1793 Xapian::valueno first_slot
;
1794 if (!unpack_uint(&p
, end
, &first_slot
)) {
1795 throw_database_corrupt("Value slot encoding corrupt",
1798 slots
.push_back(first_slot
);
1799 Xapian::valueno last_slot
= first_slot
;
1801 Xapian::valueno slot
;
1802 if (!unpack_uint(&p
, end
, &slot
)) {
1803 throw_database_corrupt("Value slot encoding "
1806 slot
+= last_slot
+ 1;
1809 slots
.push_back(slot
);
1812 if (slots
.back() <= 6) {
1813 // Encode as a bitmap if only slots in the range 0-6
1815 for (auto slot
: slots
) {
1816 bitmap_slots_used
|= 1 << slot
;
1820 pack_uint(enc
, last_slot
);
1821 if (slots
.size() > 1) {
1822 BitWriter
slots_used(enc
);
1823 slots_used
.encode(first_slot
, last_slot
);
1824 slots_used
.encode(slots
.size() - 2,
1825 last_slot
- first_slot
);
1826 slots_used
.encode_interpolative(slots
, 0,
1828 encoded_slots_used
= slots_used
.freeze();
1830 encoded_slots_used
= std::move(enc
);
1835 const char* pos
= tag
.data();
1836 const char* end
= pos
+ tag
.size();
1839 if (encoded_slots_used
.empty()) {
1840 newtag
+= char(bitmap_slots_used
);
1842 auto size
= encoded_slots_used
.size();
1844 newtag
+= char(0x80 | size
);
1847 pack_uint(newtag
, size
);
1849 newtag
+= encoded_slots_used
;
1853 Xapian::termcount doclen
;
1854 if (!unpack_uint(&pos
, end
, &doclen
)) {
1855 throw_database_corrupt("doclen", pos
);
1857 Xapian::termcount termlist_size
;
1858 if (!unpack_uint(&pos
, end
, &termlist_size
)) {
1859 throw_database_corrupt("termlist length", pos
);
1861 pack_uint(newtag
, termlist_size
- 1);
1862 pack_uint(newtag
, doclen
);
1864 string current_term
;
1865 while (pos
!= end
) {
1866 Xapian::termcount current_wdf
= 0;
1868 if (!current_term
.empty()) {
1869 size_t reuse
= static_cast<unsigned char>(*pos
++);
1870 newtag
+= char(reuse
);
1872 if (reuse
> current_term
.size()) {
1873 current_wdf
= reuse
/ (current_term
.size() + 1);
1874 reuse
= reuse
% (current_term
.size() + 1);
1876 current_term
.resize(reuse
);
1879 throw_database_corrupt("term", NULL
);
1882 size_t append
= static_cast<unsigned char>(*pos
++);
1883 if (size_t(end
- pos
) < append
)
1884 throw_database_corrupt("term", NULL
);
1886 current_term
.append(pos
, append
);
1892 if (!unpack_uint(&pos
, end
, ¤t_wdf
)) {
1893 throw_database_corrupt("wdf", pos
);
1895 pack_uint(newtag
, current_wdf
);
1898 newtag
+= char(append
);
1899 newtag
.append(current_term
.end() - append
,
1900 current_term
.end());
1903 if (!newtag
.empty())
1904 out
->add(key
, newtag
);
1905 if (!next_result
) break;
1906 if (next_already_done
) goto next_without_next
;
1908 bool compressed
= cur
.read_tag(true);
1909 out
->add(key
, cur
.current_tag
, compressed
);
1918 using namespace HoneyCompact
;
1921 HoneyDatabase::compact(Xapian::Compactor
* compactor
,
1922 const char* destdir
,
1925 const vector
<const Xapian::Database::Internal
*>& sources
,
1926 const vector
<Xapian::docid
>& offset
,
1927 Xapian::Compactor::compaction_level compaction
,
1929 Xapian::docid last_docid
)
1931 // Currently unused for honey.
1935 // The "base name" of the table.
1938 Honey::table_type type
;
1939 // Create tables after position lazily.
1943 static const table_list tables
[] = {
1945 { "postlist", Honey::POSTLIST
, false },
1946 { "docdata", Honey::DOCDATA
, true },
1947 { "termlist", Honey::TERMLIST
, false },
1948 { "position", Honey::POSITION
, true },
1949 { "spelling", Honey::SPELLING
, true },
1950 { "synonym", Honey::SYNONYM
, true }
1952 const table_list
* tables_end
= tables
+
1953 (sizeof(tables
) / sizeof(tables
[0]));
1955 const int FLAGS
= Xapian::DB_DANGEROUS
;
1957 bool single_file
= (flags
& Xapian::DBCOMPACT_SINGLE_FILE
);
1958 bool multipass
= (flags
& Xapian::DBCOMPACT_MULTIPASS
);
1960 // FIXME: Support this combination - we need to put temporary files
1966 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
1967 bool has_uncommitted_changes
;
1968 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
1969 #ifdef XAPIAN_HAS_GLASS_BACKEND
1970 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
1971 has_uncommitted_changes
= db
->has_uncommitted_changes();
1973 throw Xapian::FeatureUnavailableError("Glass backend disabled");
1976 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
1977 has_uncommitted_changes
= db
->has_uncommitted_changes();
1979 if (has_uncommitted_changes
) {
1981 "Can't compact from a WritableDatabase with uncommitted "
1982 "changes - either call commit() first, or create a new "
1983 "Database object from the filename on disk";
1984 throw Xapian::InvalidOperationError(m
);
1989 FlintLock
lock(destdir
? destdir
: "");
1992 FlintLock::reason why
= lock
.lock(true, false, explanation
);
1993 if (why
!= FlintLock::SUCCESS
) {
1994 lock
.throw_databaselockerror(why
, destdir
, explanation
);
1998 unique_ptr
<HoneyVersion
> version_file_out
;
2001 fd
= open(destdir
, O_RDWR
|O_CREAT
|O_TRUNC
|O_BINARY
|O_CLOEXEC
, 0666);
2003 throw Xapian::DatabaseCreateError("open() failed", errno
);
2006 version_file_out
.reset(new HoneyVersion(fd
));
2009 version_file_out
.reset(new HoneyVersion(destdir
));
2012 // Set to true if stat() failed (which can happen if the files are > 2GB
2013 // and off_t is 32 bit) or one of the totals overflowed.
2014 bool bad_totals
= false;
2015 off_t in_total
= 0, out_total
= 0;
2017 version_file_out
->create();
2018 for (size_t i
= 0; i
!= sources
.size(); ++i
) {
2019 bool source_single_file
= false;
2020 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
2021 #ifdef XAPIAN_HAS_GLASS_BACKEND
2022 auto db
= static_cast<const GlassDatabase
*>(sources
[i
]);
2023 auto& v_in
= db
->version_file
;
2024 auto& v_out
= version_file_out
;
2025 v_out
->merge_stats(v_in
.get_doccount(),
2026 v_in
.get_doclength_lower_bound(),
2027 v_in
.get_doclength_upper_bound(),
2028 v_in
.get_wdf_upper_bound(),
2029 v_in
.get_total_doclen(),
2030 v_in
.get_spelling_wordfreq_upper_bound());
2031 source_single_file
= db
->single_file();
2036 auto db
= static_cast<const HoneyDatabase
*>(sources
[i
]);
2037 version_file_out
->merge_stats(db
->version_file
);
2038 source_single_file
= db
->single_file();
2040 if (source_single_file
) {
2041 // Add single file input DB sizes to in_total here. For other
2042 // databases, we sum per-table below.
2044 sources
[i
]->get_backend_info(&path
);
2045 off_t db_size
= file_size(path
);
2047 // FIXME: check overflow and set bad_totals
2048 in_total
+= db_size
;
2055 string fl_serialised
;
2059 fl
.set_first_unused_block(1); // FIXME: Assumption?
2060 fl
.pack(fl_serialised
);
2064 // FIXME: sort out indentation.
2065 if (source_backend
== Xapian::DB_BACKEND_GLASS
) {
2066 #ifndef XAPIAN_HAS_GLASS_BACKEND
2067 throw Xapian::FeatureUnavailableError("Glass backend disabled");
2069 vector
<HoneyTable
*> tabs
;
2070 tabs
.reserve(tables_end
- tables
);
2071 off_t prev_size
= 0;
2072 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2073 // The postlist table requires an N-way merge, adjusting the
2074 // headers of various blocks. The spelling and synonym tables also
2075 // need special handling. The other tables have keys sorted in
2076 // docid order, so we can merge them by simply copying all the keys
2077 // from each source table in turn.
2079 compactor
->set_status(t
->name
, string());
2089 bool output_will_exist
= !t
->lazy
;
2091 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2092 // on certain systems).
2093 bool bad_stat
= false;
2095 // We can't currently report input sizes if there's a single file DB
2096 // amongst the inputs.
2097 bool single_file_in
= false;
2101 vector
<const GlassTable
*> inputs
;
2102 inputs
.reserve(sources
.size());
2103 size_t inputs_present
= 0;
2104 for (auto src
: sources
) {
2105 auto db
= static_cast<const GlassDatabase
*>(src
);
2106 const GlassTable
* table
;
2108 case Honey::POSTLIST
:
2109 table
= &(db
->postlist_table
);
2111 case Honey::DOCDATA
:
2112 table
= &(db
->docdata_table
);
2114 case Honey::TERMLIST
:
2115 table
= &(db
->termlist_table
);
2117 case Honey::POSITION
:
2118 table
= &(db
->position_table
);
2120 case Honey::SPELLING
:
2121 table
= &(db
->spelling_table
);
2123 case Honey::SYNONYM
:
2124 table
= &(db
->synonym_table
);
2131 if (db
->single_file()) {
2132 if (t
->lazy
&& !GlassCursor(table
).next()) {
2133 // Lazy table with no entries, so essentially doesn't
2136 // FIXME: Find actual size somehow?
2137 // in_size += table->size() / 1024;
2138 single_file_in
= true;
2139 output_will_exist
= true;
2143 off_t db_size
= file_size(table
->get_path());
2145 // FIXME: check overflow and set bad_totals
2146 in_total
+= db_size
;
2147 in_size
+= db_size
/ 1024;
2148 output_will_exist
= true;
2150 } else if (errno
!= ENOENT
) {
2151 // We get ENOENT for an optional table.
2152 bad_totals
= bad_stat
= true;
2153 output_will_exist
= true;
2157 inputs
.push_back(table
);
2160 // If any inputs lack a termlist table, suppress it in the output.
2161 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2162 if (inputs_present
!= 0) {
2164 string m
= str(inputs_present
);
2166 m
+= str(sources
.size());
2167 m
+= " inputs present, so suppressing output";
2168 compactor
->set_status(t
->name
, m
);
2172 output_will_exist
= false;
2175 if (!output_will_exist
) {
2177 compactor
->set_status(t
->name
, "doesn't exist");
2182 off_t table_start_offset
= -1;
2185 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2186 // space for version file. It's tricky to exactly know the
2187 // size of the version file beforehand.
2188 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2189 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2190 throw Xapian::DatabaseError("lseek() failed", errno
);
2192 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2194 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2197 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2199 tabs
.push_back(out
);
2200 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2202 root_info
->set_free_list(fl_serialised
);
2203 root_info
->set_offset(table_start_offset
);
2205 version_file_out
->get_root(t
->type
),
2206 version_file_out
->get_revision());
2208 out
->create_and_open(FLAGS
, *root_info
);
2212 case Honey::POSTLIST
: {
2213 if (multipass
&& inputs
.size() > 3) {
2214 multimerge_postlists(compactor
, out
, destdir
,
2217 merge_postlists(compactor
, out
, offset
.begin(),
2218 inputs
.begin(), inputs
.end());
2222 case Honey::SPELLING
:
2223 merge_spellings(out
, inputs
.cbegin(), inputs
.cend());
2225 case Honey::SYNONYM
:
2226 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2228 case Honey::POSITION
:
2229 merge_positions(out
, inputs
, offset
);
2232 // DocData, Termlist
2233 merge_docid_keyed(out
, inputs
, offset
, t
->type
);
2237 // Commit as revision 1.
2239 out
->commit(1, root_info
);
2241 if (single_file
) fl_serialised
= root_info
->get_free_list();
2244 if (!bad_stat
&& !single_file_in
) {
2247 db_size
= file_size(fd
);
2249 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2253 off_t old_prev_size
= prev_size
;
2254 prev_size
= db_size
;
2255 db_size
-= old_prev_size
;
2257 // FIXME: check overflow and set bad_totals
2258 out_total
+= db_size
;
2259 out_size
= db_size
/ 1024;
2260 } else if (errno
!= ENOENT
) {
2261 bad_totals
= bad_stat
= true;
2266 compactor
->set_status(t
->name
,
2267 "Done (couldn't stat all the DB files)");
2268 } else if (single_file_in
) {
2270 compactor
->set_status(t
->name
,
2271 "Done (table sizes unknown for single "
2275 if (out_size
== in_size
) {
2276 status
= "Size unchanged (";
2279 if (out_size
< in_size
) {
2280 delta
= in_size
- out_size
;
2281 status
= "Reduced by ";
2283 delta
= out_size
- in_size
;
2284 status
= "INCREASED by ";
2287 status
+= str(100 * delta
/ in_size
);
2290 status
+= str(delta
);
2292 status
+= str(in_size
);
2295 status
+= str(out_size
);
2298 compactor
->set_status(t
->name
, status
);
2302 // If compacting to a single file output and all the tables are empty, pad
2303 // the output so that it isn't mistaken for a stub database when we try to
2304 // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2305 if (single_file
&& prev_size
< HONEY_MIN_DB_SIZE
) {
2306 out_total
= HONEY_MIN_DB_SIZE
;
2307 #ifdef HAVE_FTRUNCATE
2308 if (ftruncate(fd
, HONEY_MIN_DB_SIZE
) < 0) {
2309 throw Xapian::DatabaseError("Failed to set size of output "
2313 const off_t off
= HONEY_MIN_DB_SIZE
- 1;
2314 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2315 throw Xapian::DatabaseError("Failed to set size of output "
2322 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2323 throw Xapian::DatabaseError("lseek() failed", errno
);
2326 version_file_out
->set_last_docid(last_docid
);
2327 string tmpfile
= version_file_out
->write(1, FLAGS
);
2329 off_t version_file_size
= lseek(fd
, 0, SEEK_CUR
);
2330 if (version_file_size
< 0) {
2331 throw Xapian::DatabaseError("lseek() failed", errno
);
2333 if (version_file_size
> HONEY_VERSION_MAX_SIZE
) {
2334 throw Xapian::DatabaseError("Didn't allow enough space for "
2335 "version file data");
2338 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2341 // Commit with revision 1.
2342 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2343 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2348 vector
<HoneyTable
*> tabs
;
2349 tabs
.reserve(tables_end
- tables
);
2350 off_t prev_size
= HONEY_MIN_DB_SIZE
;
2351 for (const table_list
* t
= tables
; t
< tables_end
; ++t
) {
2352 // The postlist table requires an N-way merge, adjusting the
2353 // headers of various blocks. The spelling and synonym tables also
2354 // need special handling. The other tables have keys sorted in
2355 // docid order, so we can merge them by simply copying all the keys
2356 // from each source table in turn.
2358 compactor
->set_status(t
->name
, string());
2368 bool output_will_exist
= !t
->lazy
;
2370 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2371 // on certain systems).
2372 bool bad_stat
= false;
2374 // We can't currently report input sizes if there's a single file DB
2375 // amongst the inputs.
2376 bool single_file_in
= false;
2380 vector
<const HoneyTable
*> inputs
;
2381 inputs
.reserve(sources
.size());
2382 size_t inputs_present
= 0;
2383 for (auto src
: sources
) {
2384 auto db
= static_cast<const HoneyDatabase
*>(src
);
2385 const HoneyTable
* table
;
2387 case Honey::POSTLIST
:
2388 table
= &(db
->postlist_table
);
2390 case Honey::DOCDATA
:
2391 table
= &(db
->docdata_table
);
2393 case Honey::TERMLIST
:
2394 table
= &(db
->termlist_table
);
2396 case Honey::POSITION
:
2397 table
= &(db
->position_table
);
2399 case Honey::SPELLING
:
2400 table
= &(db
->spelling_table
);
2402 case Honey::SYNONYM
:
2403 table
= &(db
->synonym_table
);
2410 if (db
->single_file()) {
2411 if (t
->lazy
&& !HoneyCursor(table
).next()) {
2412 // Lazy table with no entries, so essentially doesn't
2415 // FIXME: Find actual size somehow?
2416 // in_size += table->size() / 1024;
2417 single_file_in
= true;
2418 output_will_exist
= true;
2422 off_t db_size
= file_size(table
->get_path());
2424 // FIXME: check overflow and set bad_totals
2425 in_total
+= db_size
;
2426 in_size
+= db_size
/ 1024;
2427 output_will_exist
= true;
2429 } else if (errno
!= ENOENT
) {
2430 // We get ENOENT for an optional table.
2431 bad_totals
= bad_stat
= true;
2432 output_will_exist
= true;
2436 inputs
.push_back(table
);
2439 // If any inputs lack a termlist table, suppress it in the output.
2440 if (t
->type
== Honey::TERMLIST
&& inputs_present
!= sources
.size()) {
2441 if (inputs_present
!= 0) {
2443 string m
= str(inputs_present
);
2445 m
+= str(sources
.size());
2446 m
+= " inputs present, so suppressing output";
2447 compactor
->set_status(t
->name
, m
);
2451 output_will_exist
= false;
2454 if (!output_will_exist
) {
2456 compactor
->set_status(t
->name
, "doesn't exist");
2461 off_t table_start_offset
= -1;
2464 // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2465 // space for version file. It's tricky to exactly know the
2466 // size of the version file beforehand.
2467 table_start_offset
= HONEY_VERSION_MAX_SIZE
;
2468 if (lseek(fd
, table_start_offset
, SEEK_SET
) < 0)
2469 throw Xapian::DatabaseError("lseek() failed", errno
);
2471 table_start_offset
= lseek(fd
, 0, SEEK_CUR
);
2473 out
= new HoneyTable(t
->name
, fd
, version_file_out
->get_offset(),
2476 out
= new HoneyTable(t
->name
, dest
, false, t
->lazy
);
2478 tabs
.push_back(out
);
2479 Honey::RootInfo
* root_info
= version_file_out
->root_to_set(t
->type
);
2481 root_info
->set_free_list(fl_serialised
);
2482 root_info
->set_offset(table_start_offset
);
2484 version_file_out
->get_root(t
->type
),
2485 version_file_out
->get_revision());
2487 out
->create_and_open(FLAGS
, *root_info
);
2491 case Honey::POSTLIST
: {
2492 if (multipass
&& inputs
.size() > 3) {
2493 multimerge_postlists(compactor
, out
, destdir
,
2496 merge_postlists(compactor
, out
, offset
.begin(),
2497 inputs
.begin(), inputs
.end());
2501 case Honey::SPELLING
:
2502 merge_spellings(out
, inputs
.begin(), inputs
.end());
2504 case Honey::SYNONYM
:
2505 merge_synonyms(out
, inputs
.begin(), inputs
.end());
2507 case Honey::POSITION
:
2508 merge_positions(out
, inputs
, offset
);
2511 // DocData, Termlist
2512 merge_docid_keyed(out
, inputs
, offset
);
2516 // Commit as revision 1.
2518 out
->commit(1, root_info
);
2520 if (single_file
) fl_serialised
= root_info
->get_free_list();
2523 if (!bad_stat
&& !single_file_in
) {
2526 db_size
= file_size(fd
);
2528 db_size
= file_size(dest
+ HONEY_TABLE_EXTENSION
);
2532 off_t old_prev_size
= prev_size
;
2533 prev_size
= db_size
;
2534 db_size
-= old_prev_size
;
2536 // FIXME: check overflow and set bad_totals
2537 out_total
+= db_size
;
2538 out_size
= db_size
/ 1024;
2539 } else if (errno
!= ENOENT
) {
2540 bad_totals
= bad_stat
= true;
2545 compactor
->set_status(t
->name
,
2546 "Done (couldn't stat all the DB files)");
2547 } else if (single_file_in
) {
2549 compactor
->set_status(t
->name
,
2550 "Done (table sizes unknown for single "
2554 if (out_size
== in_size
) {
2555 status
= "Size unchanged (";
2558 if (out_size
< in_size
) {
2559 delta
= in_size
- out_size
;
2560 status
= "Reduced by ";
2562 delta
= out_size
- in_size
;
2563 status
= "INCREASED by ";
2566 status
+= str(100 * delta
/ in_size
);
2569 status
+= str(delta
);
2571 status
+= str(in_size
);
2574 status
+= str(out_size
);
2577 compactor
->set_status(t
->name
, status
);
2581 // If compacting to a single file output and all the tables are empty, pad
2582 // the output so that it isn't mistaken for a stub database when we try to
2583 // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2584 if (single_file
&& prev_size
< HONEY_MIN_DB_SIZE
) {
2585 out_total
= HONEY_MIN_DB_SIZE
;
2586 #ifdef HAVE_FTRUNCATE
2587 if (ftruncate(fd
, HONEY_MIN_DB_SIZE
) < 0) {
2588 throw Xapian::DatabaseError("Failed to set size of output "
2592 const off_t off
= HONEY_MIN_DB_SIZE
- 1;
2593 if (lseek(fd
, off
, SEEK_SET
) != off
|| write(fd
, "", 1) != 1) {
2594 throw Xapian::DatabaseError("Failed to set size of output "
2601 if (lseek(fd
, version_file_out
->get_offset(), SEEK_SET
) == -1) {
2602 throw Xapian::DatabaseError("lseek() failed", errno
);
2605 version_file_out
->set_last_docid(last_docid
);
2606 string tmpfile
= version_file_out
->write(1, FLAGS
);
2607 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2610 // Commit with revision 1.
2611 version_file_out
->sync(tmpfile
, 1, FLAGS
);
2612 for (unsigned j
= 0; j
!= tabs
.size(); ++j
) {
2617 if (!single_file
) lock
.release();
2619 if (!bad_totals
&& compactor
) {
2623 if (out_total
== in_total
) {
2624 status
= "Size unchanged (";
2627 if (out_total
< in_total
) {
2628 delta
= in_total
- out_total
;
2629 status
= "Reduced by ";
2631 delta
= out_total
- in_total
;
2632 status
= "INCREASED by ";
2635 status
+= str(100 * delta
/ in_total
);
2638 status
+= str(delta
);
2640 status
+= str(in_total
);
2643 status
+= str(out_total
);
2645 compactor
->set_status("Total", status
);