From 47be2f45b0b3d778873dc1f840b1f944ccbc8063 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 27 Mar 2018 17:07:11 +1300 Subject: [PATCH] [honey] New format for more keys in postlist table Change the format of keys for valuestats, value chunks and doclen chunks to be more compact. There's now key-space reserved for unique terms chunks and max doc wdf chunks in anticipation of these being added at some point. --- .../backends/honey/honey_alldocspostlist.cc | 6 +- xapian-core/backends/honey/honey_alldocspostlist.h | 53 ++++++++--- xapian-core/backends/honey/honey_compact.cc | 102 +++++++++++---------- xapian-core/backends/honey/honey_defs.h | 42 +++++++-- xapian-core/backends/honey/honey_postlisttable.cc | 6 +- xapian-core/backends/honey/honey_values.cc | 4 +- xapian-core/backends/honey/honey_values.h | 52 ++++++++--- xapian-core/backends/honey/honey_version.cc | 5 +- xapian-core/common/pack.h | 22 ----- 9 files changed, 182 insertions(+), 110 deletions(-) diff --git a/xapian-core/backends/honey/honey_alldocspostlist.cc b/xapian-core/backends/honey/honey_alldocspostlist.cc index ed6ff730f..4b4d0d23b 100644 --- a/xapian-core/backends/honey/honey_alldocspostlist.cc +++ b/xapian-core/backends/honey/honey_alldocspostlist.cc @@ -23,6 +23,7 @@ #include "honey_alldocspostlist.h" #include "honey_database.h" +#include "honey_defs.h" #include "debuglog.h" #include "str.h" @@ -40,7 +41,10 @@ HoneyAllDocsPostList::HoneyAllDocsPostList(const HoneyDatabase* db, doccount(doccount_) { LOGCALL_CTOR(DB, "HoneyAllDocsPostList", db | doccount_); - cursor->find_entry_ge(string("\0\xe0", 2)); + static const char doclen_key_prefix[2] = { + 0, char(Honey::KEY_DOCLEN_CHUNK) + }; + cursor->find_entry_ge(string(doclen_key_prefix, 2)); } HoneyAllDocsPostList::~HoneyAllDocsPostList() diff --git a/xapian-core/backends/honey/honey_alldocspostlist.h b/xapian-core/backends/honey/honey_alldocspostlist.h index 77211cc66..fe4c344c5 100644 --- a/xapian-core/backends/honey/honey_alldocspostlist.h +++ b/xapian-core/backends/honey/honey_alldocspostlist.h @@ -23,7 +23,9 @@ #define XAPIAN_INCLUDED_HONEY_ALLDOCSPOSTLIST_H #include "api/leafpostlist.h" +#include "honey_defs.h" #include "pack.h" +#include "wordaccess.h" #include @@ -34,26 +36,51 @@ namespace Honey { /** Generate a key for a doclen chunk. */ inline std::string -make_doclenchunk_key(Xapian::docid did) +make_doclenchunk_key(Xapian::docid last_did) { - std::string key("\0\xe0", 2); - pack_uint_preserving_sort(key, did); + std::string key(1, '\0'); + Assert(last_did != 0); +#ifdef DO_CLZ + int width = (do_clz(last_did) >> 3) + 1; +#else + int width = 0; + for (auto v = last_did; v; v >>= 8) { + ++width; + } +#endif + key += char(Honey::KEY_DOCLEN_CHUNK + width - 1); + Xapian::docid v = last_did; +#ifndef WORDS_BIGENDIAN + v = do_bswap(v); +#endif + key.append(reinterpret_cast(&v) + (sizeof(v) - width), width); return key; } inline Xapian::docid docid_from_key(const std::string& key) { - const char * p = key.data(); - const char * end = p + key.length(); - // Fail if not a doclen chunk key. - if (end - p <= 2 || *p++ != '\0' || *p++ != '\xe0') return 0; - Xapian::docid did; - if (!unpack_uint_preserving_sort(&p, end, &did)) - throw Xapian::DatabaseCorruptError("bad doclen key"); - AssertEq(end - p, 0); - Assert(did != 0); - return did; + const char* p = key.data(); + const char* end = p + key.length(); + if (end - p < 3 || *p++ != '\0') { + // Not a doclen chunk key. + return 0; + } + unsigned char code = *p++; + if (code < Honey::KEY_DOCLEN_CHUNK || code > Honey::KEY_DOCLEN_CHUNK_HI) { + // Also not a doclen chunk key. + return 0; + } + + size_t width = (code - Honey::KEY_DOCLEN_CHUNK) + 1; + AssertEq(width, size_t(end - p)); + Xapian::docid v = 0; + memcpy(reinterpret_cast(&v) + (sizeof(v) - width), p, width); +#ifndef WORDS_BIGENDIAN + v = do_bswap(v); +#endif + Assert(v != 0); + return v; } class DocLenChunkReader { diff --git a/xapian-core/backends/honey/honey_compact.cc b/xapian-core/backends/honey/honey_compact.cc index 9f70fa910..c7d7433b4 100644 --- a/xapian-core/backends/honey/honey_compact.cc +++ b/xapian-core/backends/honey/honey_compact.cc @@ -42,6 +42,7 @@ #include "honey_defs.h" #include "honey_postlist_encodings.h" #include "honey_table.h" +#include "honey_values.h" #include "honey_version.h" #include "filetests.h" #include "internaltypes.h" @@ -123,31 +124,27 @@ termlist_key_is_values_used(const string& key) // the same name in other flint-derived backends. namespace HoneyCompact { -enum { - KEY_USER_METADATA = 0x00, - KEY_VALUE_STATS = 0x01, - KEY_VALUE_CHUNK = 0xd8, - KEY_DOCLEN_CHUNK = 0xe0, - KEY_POSTING_CHUNK = 0xff -}; - -/// Return one of the KEY_* constants, or a different value for an invalid key. +/// Return a Honey::KEY_* constant, or a different value for an invalid key. static inline int key_type(const string& key) { if (key[0] != '\0') - return KEY_POSTING_CHUNK; + return Honey::KEY_POSTING_CHUNK; if (key.size() <= 1) return -1; - switch (static_cast(key[1])) { - case 0x01: case 0x02: case 0x03: case 0x04: - return KEY_VALUE_STATS; - } - - // If key[1] is \xff then this correctly returns KEY_POSTING_CHUNK. - return static_cast(key[1]); + unsigned char ch = key[1]; + if (ch >= Honey::KEY_VALUE_STATS && ch <= Honey::KEY_VALUE_STATS_HI) + return Honey::KEY_VALUE_STATS; + if (ch >= Honey::KEY_VALUE_CHUNK && ch <= Honey::KEY_VALUE_CHUNK_HI) + return Honey::KEY_VALUE_CHUNK; + if (ch >= Honey::KEY_DOCLEN_CHUNK && ch <= Honey::KEY_DOCLEN_CHUNK_HI) + return Honey::KEY_DOCLEN_CHUNK; + + // Handle Honey::KEY_USER_METADATA, Honey::KEY_POSTING_CHUNK, and currently + // unused values. + return ch; } template class PostlistCursor; @@ -182,7 +179,7 @@ class PostlistCursor : private GlassCursor { tag = current_tag; tf = cf = 0; if (GlassCompact::is_user_metadata_key(key)) { - key[1] = KEY_USER_METADATA; + key[1] = Honey::KEY_USER_METADATA; return true; } if (GlassCompact::is_valuestats_key(key)) { @@ -193,7 +190,15 @@ class PostlistCursor : private GlassCursor { Xapian::valueno slot; if (!unpack_uint_last(&p, end, &slot)) throw Xapian::DatabaseCorruptError("bad value stats key"); - key = pack_honey_valuestats_key(slot); + // FIXME: pack_uint_last() is not a good encoding for keys, as the + // encoded values do not in general sort the same way as the + // numeric values. The first 256 values will be in order at least. + // + // We could just buffer up the stats for all the slots - it's + // unlikely there are going to be very many. Another option is + // to use multiple cursors or seek a single cursor around. + AssertRel(slot, <=, 256); + key = Honey::make_valuestats_key(slot); return true; } if (GlassCompact::is_valuechunk_key(key)) { @@ -203,6 +208,14 @@ class PostlistCursor : private GlassCursor { Xapian::valueno slot; if (!unpack_uint(&p, end, &slot)) throw Xapian::DatabaseCorruptError("bad value key"); + // FIXME: pack_uint() is not a good encoding for keys, as the + // encoded values do not in general sort the same way as the + // numeric values. The first 128 values will be in order at least. + // + // Buffering up this data for all slots is potentially prohibitively + // costly. We probably need to use multiple cursors or seek a single + // cursor around. + AssertRel(slot, <=, 128); Xapian::docid first_did; if (!unpack_uint_preserving_sort(&p, end, &first_did)) throw Xapian::DatabaseCorruptError("bad value key"); @@ -214,9 +227,7 @@ class PostlistCursor : private GlassCursor { last_did = reader.get_docid(); } - key.assign("\0\xd8", 2); - pack_uint(key, slot); - pack_uint_preserving_sort(key, last_did); + key = Honey::make_valuechunk_key(slot, last_did); // Add the docid delta across the chunk to the start of the tag. string newtag; @@ -243,14 +254,19 @@ class PostlistCursor : private GlassCursor { ++firstdid; tag.erase(0, d - tag.data()); } else { - // Not an initial chunk, so adjust key. - size_t tmp = d - key.data(); + // Not an initial chunk, just unpack firstdid. if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e) throw Xapian::DatabaseCorruptError("Bad postlist key"); - key.erase(tmp); } firstdid += offset; + // Set key to placeholder value which just indicates that this is a + // doclen chunk. + static const char doclen_key_prefix[2] = { + 0, char(Honey::KEY_DOCLEN_CHUNK) + }; + key.assign(doclen_key_prefix, 2); + d = tag.data(); e = d + tag.size(); @@ -427,10 +443,10 @@ class PostlistCursor : private HoneyCursor { tag = current_tag; tf = 0; switch (key_type(key)) { - case KEY_USER_METADATA: - case KEY_VALUE_STATS: + case Honey::KEY_USER_METADATA: + case Honey::KEY_VALUE_STATS: return true; - case KEY_VALUE_CHUNK: { + case Honey::KEY_VALUE_CHUNK: { const char * p = key.data(); const char * end = p + key.length(); p += 2; @@ -440,14 +456,10 @@ class PostlistCursor : private HoneyCursor { Xapian::docid did; if (!unpack_uint_preserving_sort(&p, end, &did)) throw Xapian::DatabaseCorruptError("bad value key"); - did += offset; - - key.assign("\0\xd8", 2); - pack_uint(key, slot); - pack_uint_preserving_sort(key, did); + key = Honey::make_valuechunk_key(slot, did + offset); return true; } - case KEY_DOCLEN_CHUNK: { + case Honey::KEY_DOCLEN_CHUNK: { const char * p = key.data(); const char * end = p + key.length(); p += 2; @@ -456,15 +468,10 @@ class PostlistCursor : private HoneyCursor { (!unpack_uint_preserving_sort(&p, end, &did) || p != end)) { throw Xapian::DatabaseCorruptError("Bad doclen key"); } - did += offset; - - key.erase(2); - if (did != 1) { - pack_uint_preserving_sort(key, did); - } + key = Honey::make_doclenchunk_key(did + offset); return true; } - case KEY_POSTING_CHUNK: + case Honey::KEY_POSTING_CHUNK: break; default: throw Xapian::DatabaseCorruptError("Bad postlist table key " @@ -509,7 +516,6 @@ class PostlistCursor : private HoneyCursor { d = tag.data(); e = d + tag.size(); - bool have_wdfs = (cf != 0); if (have_wdfs && tf > 2) { Xapian::termcount remaining_cf_for_flat_wdf = @@ -612,7 +618,7 @@ merge_postlists(Xapian::Compactor * compactor, while (!pq.empty()) { cursor_type * cur = pq.top(); const string& key = cur->key; - if (key_type(key) != KEY_USER_METADATA) break; + if (key_type(key) != Honey::KEY_USER_METADATA) break; if (key != last_key) { if (!tags.empty()) { @@ -666,7 +672,7 @@ merge_postlists(Xapian::Compactor * compactor, while (!pq.empty()) { cursor_type * cur = pq.top(); const string& key = cur->key; - if (key_type(key) != KEY_VALUE_STATS) break; + if (key_type(key) != Honey::KEY_VALUE_STATS) break; if (key != last_key) { // For the first valuestats key, last_key will be the previous // key we wrote, which we don't want to overwrite. This is the @@ -732,7 +738,7 @@ merge_postlists(Xapian::Compactor * compactor, while (!pq.empty()) { cursor_type * cur = pq.top(); const string & key = cur->key; - if (key_type(key) != KEY_VALUE_CHUNK) break; + if (key_type(key) != Honey::KEY_VALUE_CHUNK) break; out->add(key, cur->tag); pq.pop(); if (cur->next()) { @@ -746,8 +752,8 @@ merge_postlists(Xapian::Compactor * compactor, while (!pq.empty()) { cursor_type * cur = pq.top(); string & key = cur->key; - if (key_type(key) != KEY_DOCLEN_CHUNK) break; - pack_uint_preserving_sort(key, cur->chunk_lastdid); + if (key_type(key) != Honey::KEY_DOCLEN_CHUNK) break; + key = Honey::make_doclenchunk_key(cur->chunk_lastdid); out->add(key, cur->tag); pq.pop(); if (cur->next()) { @@ -784,7 +790,7 @@ merge_postlists(Xapian::Compactor * compactor, pq.pop(); } if (cur) { - AssertEq(key_type(cur->key), KEY_POSTING_CHUNK); + AssertEq(key_type(cur->key), Honey::KEY_POSTING_CHUNK); } if (cur == NULL || cur->key != last_key) { if (!tags.empty()) { diff --git a/xapian-core/backends/honey/honey_defs.h b/xapian-core/backends/honey/honey_defs.h index ac326e153..55418c7dc 100644 --- a/xapian-core/backends/honey/honey_defs.h +++ b/xapian-core/backends/honey/honey_defs.h @@ -1,7 +1,7 @@ /** @file honey_defs.h * @brief Definitions, types, etc for use inside honey. */ -/* Copyright (C) 2010,2014,2015,2017 Olly Betts +/* Copyright (C) 2010,2014,2015,2017,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -48,15 +48,37 @@ #define HONEY_MAX_DOCID Xapian::docid(0xffffffffffffffff) namespace Honey { - enum table_type { - POSTLIST, - DOCDATA, - TERMLIST, - POSITION, - SPELLING, - SYNONYM, - MAX_ - }; + +enum table_type { + POSTLIST, + DOCDATA, + TERMLIST, + POSITION, + SPELLING, + SYNONYM, + MAX_ +}; + +/// Postlist key second bytes when first byte is zero. +enum { + KEY_USER_METADATA = 0x00, + KEY_VALUE_STATS = 0x01, + KEY_VALUE_STATS_HI = 0x08, + KEY_VALUE_CHUNK = 0x09, + KEY_VALUE_CHUNK_HI = 0xe1, // (0xe1 for slots > 26) + /* 0xe2-0xe6 inclusive unused currently. */ + /* 0xe7-0xee inclusive reserved for doc max wdf chunks. */ + /* 0xef-0xf6 inclusive reserved for unique terms chunks. */ + KEY_DOCLEN_CHUNK = 0xf7, + KEY_DOCLEN_CHUNK_HI = 0xfe, + KEY_POSTING_CHUNK = 0xff +}; + +#define KEY_DOCLEN_PREFIX "\0\xf7" + +static_assert(((KEY_VALUE_CHUNK_HI - KEY_VALUE_CHUNK) & 0x07) == 0, + "No wasted values"); + } /// A block number in a honey Btree file. diff --git a/xapian-core/backends/honey/honey_postlisttable.cc b/xapian-core/backends/honey/honey_postlisttable.cc index 1626f0ad7..88076dc1d 100644 --- a/xapian-core/backends/honey/honey_postlisttable.cc +++ b/xapian-core/backends/honey/honey_postlisttable.cc @@ -25,6 +25,7 @@ #include "honey_alldocspostlist.h" #include "honey_cursor.h" #include "honey_database.h" +#include "honey_defs.h" #include "honey_postlist.h" #include "honey_postlist_encodings.h" @@ -82,7 +83,10 @@ HoneyPostListTable::get_used_docid_range(Xapian::doccount doccount, Xapian::docid& last) const { unique_ptr cursor; - if (cursor->find_entry_ge(string("\0\xe0", 2))) { + static const char doclen_key_prefix[2] = { + 0, char(Honey::KEY_DOCLEN_CHUNK) + }; + if (cursor->find_entry_ge(string(doclen_key_prefix, 2))) { first = 1; } else { // doccount == 0 should be handled by our caller. diff --git a/xapian-core/backends/honey/honey_values.cc b/xapian-core/backends/honey/honey_values.cc index 4c8369b3b..d6569715e 100644 --- a/xapian-core/backends/honey/honey_values.cc +++ b/xapian-core/backends/honey/honey_values.cc @@ -604,7 +604,7 @@ HoneyValueManager::get_value_stats(Xapian::valueno slot, LOGCALL_VOID(DB, "HoneyValueManager::get_value_stats", slot | Literal("[stats]")); string tag; - if (postlist_table.get_exact_entry(pack_honey_valuestats_key(slot), tag)) { + if (postlist_table.get_exact_entry(Honey::make_valuestats_key(slot), tag)) { const char * pos = tag.data(); const char * end = pos + tag.size(); @@ -641,7 +641,7 @@ HoneyValueManager::set_value_stats(map& val_stats) LOGCALL_VOID(DB, "HoneyValueManager::set_value_stats", val_stats); map::const_iterator i; for (i = val_stats.begin(); i != val_stats.end(); ++i) { - string key = pack_honey_valuestats_key(i->first); + string key = Honey::make_valuestats_key(i->first); const ValueStats & stats = i->second; if (stats.freq != 0) { string new_value; diff --git a/xapian-core/backends/honey/honey_values.h b/xapian-core/backends/honey/honey_values.h index d437a98af..9f61852fd 100644 --- a/xapian-core/backends/honey/honey_values.h +++ b/xapian-core/backends/honey/honey_values.h @@ -1,7 +1,7 @@ /** @file honey_values.h * @brief HoneyValueManager class */ -/* Copyright (C) 2008,2009,2011 Olly Betts +/* Copyright (C) 2008,2009,2011,2018 Olly Betts * Copyright (C) 2008 Lemur Consulting Ltd * * This program is free software; you can redistribute it and/or modify @@ -36,24 +36,41 @@ namespace Honey { /** Generate a key for a value stream chunk. */ inline std::string -make_valuechunk_key(Xapian::valueno slot, Xapian::docid did) +make_valuechunk_key(Xapian::valueno slot, Xapian::docid last_did) { - std::string key("\0\xd8", 2); - pack_uint(key, slot); - pack_uint_preserving_sort(key, did); + std::string key(1, '\0'); + if (slot < (Honey::KEY_VALUE_CHUNK_HI - Honey::KEY_VALUE_CHUNK) >> 3) { + key += char(Honey::KEY_VALUE_CHUNK + slot); + } else { + key += char(Honey::KEY_VALUE_CHUNK_HI); + pack_uint_preserving_sort(key, slot); + } + pack_uint_preserving_sort(key, last_did); return key; } inline Xapian::docid docid_from_key(Xapian::valueno required_slot, const std::string & key) { - const char * p = key.data(); - const char * end = p + key.length(); - // Fail if not a value chunk key. - if (end - p < 2 || *p++ != '\0' || *p++ != '\xd8') return 0; + const char* p = key.data(); + const char* end = p + key.length(); + if (end - p < 3 || *p++ != '\0') { + // Not a value chunk key. + return 0; + } + unsigned char code = *p++; + if (code < Honey::KEY_VALUE_CHUNK || code > Honey::KEY_VALUE_CHUNK_HI) { + // Also not a value chunk key. + return 0; + } + Xapian::valueno slot; - if (!unpack_uint(&p, end, &slot)) - throw Xapian::DatabaseCorruptError("Bad value key"); + if (code < Honey::KEY_VALUE_CHUNK_HI) { + slot = code - Honey::KEY_VALUE_CHUNK; + } else { + if (!unpack_uint_preserving_sort(&p, end, &slot)) + throw Xapian::DatabaseCorruptError("Bad value key"); + } // Fail if for a different slot. if (slot != required_slot) return 0; Xapian::docid did; @@ -62,6 +79,19 @@ docid_from_key(Xapian::valueno required_slot, const std::string & key) return did; } +inline std::string +make_valuestats_key(Xapian::valueno slot) +{ + std::string key(1, '\0'); + if (slot <= 7) { + key += char(Honey::KEY_VALUE_STATS + slot); + } else { + key += char(Honey::KEY_VALUE_STATS + 7); + pack_uint_preserving_sort(key, slot); + } + return key; +} + } namespace Xapian { diff --git a/xapian-core/backends/honey/honey_version.cc b/xapian-core/backends/honey/honey_version.cc index bf88fee18..882fe8b57 100644 --- a/xapian-core/backends/honey/honey_version.cc +++ b/xapian-core/backends/honey/honey_version.cc @@ -50,8 +50,9 @@ using namespace std; /// Honey format version (date of change): -#define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,3,26) -// 2018,3,26 1.5.0 use known suffix from spelling B and T keys +#define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,3,27) +// 2018,3,27 1.5.0 new key format for value stats, value chunks, doclen chunks +// 2018,3,26 use known suffix from spelling B and T keys // 2018,3,25 use known prefix from spelling B and H keys // 2018,3,15 avoid storing flat wdf // 2018,3,14 store per term wdf_max diff --git a/xapian-core/common/pack.h b/xapian-core/common/pack.h index b9cfff9d7..c5c9667b2 100644 --- a/xapian-core/common/pack.h +++ b/xapian-core/common/pack.h @@ -544,26 +544,4 @@ pack_honey_postlist_key(const std::string& term, Xapian::docid did) return key; } -inline std::string -pack_honey_valuestats_key(Xapian::valueno slot) -{ - char data[6]; - data[0] = '\0'; - int l = 1; - if (slot >= 0x100) { - if (slot >= 0x10000) { - if (slot >= 0x1000000) { - data[++l] = char(slot >> 24); - } - data[++l] = char(slot >> 16); - } - data[++l] = char(slot >> 8); - } - data[++l] = char(slot); - data[1] = l - 1; - // Prune trailing zero bytes. - while (data[l] == 0) --l; - return std::string(data, l + 1); -} - #endif // XAPIAN_INCLUDED_PACK_H -- 2.11.4.GIT