From 13672b2b812e20d47e78c477d7a76814920f7286 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Fri, 2 Feb 2018 09:47:54 +1300 Subject: [PATCH] [honey] Special case tf=2; first_wdf = floor(collfreq/2) This is the case for most tf=2 terms in the database I'm testing with. --- .../backends/honey/honey_postlist_encodings.h | 24 +++++++++++++++++----- xapian-core/backends/honey/honey_version.cc | 5 +++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/xapian-core/backends/honey/honey_postlist_encodings.h b/xapian-core/backends/honey/honey_postlist_encodings.h index 747f1f065..0a6c5351d 100644 --- a/xapian-core/backends/honey/honey_postlist_encodings.h +++ b/xapian-core/backends/honey/honey_postlist_encodings.h @@ -56,14 +56,30 @@ encode_initial_chunk_header(Xapian::doccount termfreq, // A term which only occurs in two documents. By Zipf's Law these // are also fairly common (typically 10-15% of words in a large // corpus: https://en.wikipedia.org/wiki/Hapax_legomenon ) + // // We have to encode collfreq == 0 explicitly or else the decoder can't // distinguish between the cases: + // // tf = 1; collfreq = x // tf = 2; last = first + x + 1; collfreq = 0 + // + // We turn this to our advantage to encode tf = 2 lists with constant + // wdf or where the second wdf is one more than the first (which are + // common cases) more compactly, so instead of: + // + // + // + // We encode these as: + // + // + // + // And then when its omitted: first_wdf = collfreq >> 1 + // + // The collfreq = 0 case is then a particular example of this. pack_uint(out, collfreq); AssertRel(last, >, first); pack_uint(out, last - first - 1); - if (collfreq) { + if (first_wdf != (collfreq / 2)) { pack_uint(out, first_wdf); } } else { @@ -109,12 +125,10 @@ decode_initial_chunk_header(const char ** p, const char * end, return false; } if (*p == end) { - // Double occurrence boolean term. - // FIXME: Use non-zero value here to encode equal wdf? - AssertEq(collfreq, 0); + // Double occurrence boolean term with first_wdf = floor(collfreq / 2). chunk_last = last = first + termfreq + 1; termfreq = 2; - first_wdf = 0; + first_wdf = collfreq / 2; return true; } diff --git a/xapian-core/backends/honey/honey_version.cc b/xapian-core/backends/honey/honey_version.cc index faf85ec16..5324dcdd0 100644 --- a/xapian-core/backends/honey/honey_version.cc +++ b/xapian-core/backends/honey/honey_version.cc @@ -50,8 +50,9 @@ using namespace std; /// Honey format version (date of change): -#define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,2,1) -// 2018,2,1 1.5.0 pack_uint for postlist data +#define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,2,2) +// 2018,2,2 1.5.0 special case tf=2; first_wdf = floor(collfreq/2) +// 2018,2,1 pack_uint for postlist data // 2018,1,31 Special case postlist when termfreq==2 // 2018,1,30 More compact postlist chunk headers // 2018,1,23 Elide last-first for single occurrence terms -- 2.11.4.GIT