From caca4e6999620c207584f061647a3ec8a5c96aaf Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 13 Mar 2018 15:46:25 +1300 Subject: [PATCH] [honey] Better per-term wdf upper bound Previously we used min(cf(term), wdf_upper_bound(db)) for the per term upper bound - that's tight for any terms which attain that upper bound, and for terms with termfreq == 1, which are common in the database (e.g. 66% for a database of wikipedia), but probably much less common in searches. We now use max(first_wdf(term), cf(term) - first_wdf(term)) when termfreq > 1, which means terms with termfreq == 2 will also attain their bound (another 11% for the same database) while terms with higher termfreq but below the global bound will get a tighter bound. --- xapian-core/backends/honey/honey_database.cc | 19 ++++++++--------- xapian-core/backends/honey/honey_postlisttable.cc | 25 ++++++++++++++++++++++- xapian-core/backends/honey/honey_postlisttable.h | 4 +++- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/xapian-core/backends/honey/honey_database.cc b/xapian-core/backends/honey/honey_database.cc index df6a642d6..bc24dad69 100644 --- a/xapian-core/backends/honey/honey_database.cc +++ b/xapian-core/backends/honey/honey_database.cc @@ -205,19 +205,16 @@ HoneyDatabase::get_doclength_upper_bound() const Xapian::termcount HoneyDatabase::get_wdf_upper_bound(const string& term) const { - // We don't store per-term wdf upper bounds currently, only a per-database - // wdf bound. However, the collection frequency of the term provides a - // second upper bound (since collection frequency is the sum of the wdf and - // wdf >= 0), so pick the tighter of these bounds. Xapian::termcount wdf_bound = version_file.get_wdf_upper_bound(); - // It's unlikely wdf is always 0, but when it is there's no need to check - // the collection frequency. + // It's unlikely wdf is always 0, but when it is there's no need to do any + // further work. if (usual(wdf_bound != 0)) { - Xapian::termcount coll_freq; - get_freqs(term, NULL, &coll_freq); - if (coll_freq < wdf_bound) { - wdf_bound = coll_freq; - } + // We don't store per-term wdf upper bounds currently, but + // HoneyPostListTable can provide an upper bound based on termfreq, + // coll_freq, and the first wdf value, which more often than not is + // actually the exact bound (in 77% of cases in an example database of + // wikipedia data). + wdf_bound = min(wdf_bound, postlist_table.get_wdf_upper_bound(term)); } return wdf_bound; } diff --git a/xapian-core/backends/honey/honey_postlisttable.cc b/xapian-core/backends/honey/honey_postlisttable.cc index e09748e88..a63766bdd 100644 --- a/xapian-core/backends/honey/honey_postlisttable.cc +++ b/xapian-core/backends/honey/honey_postlisttable.cc @@ -1,7 +1,7 @@ /** @file honey_postlisttable.cc * @brief Subclass of HoneyTable which holds postlists. */ -/* Copyright (C) 2007,2008,2009,2010,2013,2014,2015,2016,2017 Olly Betts +/* Copyright (C) 2007,2008,2009,2010,2013,2014,2015,2016,2017,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -126,3 +126,26 @@ HoneyPostListTable::get_used_docid_range(Xapian::doccount doccount, // We've reached the end of the table (only possible if there are no terms // at all!) } + +Xapian::termcount +HoneyPostListTable::get_wdf_upper_bound(const std::string& term) const +{ + string chunk; + if (!get_exact_entry(Honey::make_postingchunk_key(term), chunk)) { + // Term not present. + return 0; + } + + const char* p = chunk.data(); + const char* pend = p + chunk.size(); + Xapian::doccount tf; + Xapian::termcount cf; + Xapian::docid first; + Xapian::docid last; + Xapian::docid chunk_last; + Xapian::termcount first_wdf; + if (!decode_initial_chunk_header(&p, pend, tf, cf, first, last, chunk_last, + first_wdf)) + throw Xapian::DatabaseCorruptError("Postlist initial chunk header"); + return (cf > 0 && tf > 1) ? max(cf - first_wdf, first_wdf) : cf; +} diff --git a/xapian-core/backends/honey/honey_postlisttable.h b/xapian-core/backends/honey/honey_postlisttable.h index b46f55a57..29378ab7d 100644 --- a/xapian-core/backends/honey/honey_postlisttable.h +++ b/xapian-core/backends/honey/honey_postlisttable.h @@ -1,7 +1,7 @@ /** @file honey_postlisttable.h * @brief Subclass of HoneyTable which holds postlists. */ -/* Copyright (C) 2007,2008,2009,2010,2013,2014,2015,2016 Olly Betts +/* Copyright (C) 2007,2008,2009,2010,2013,2014,2015,2016,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -66,6 +66,8 @@ class HoneyPostListTable : public HoneyTable { Xapian::docid& first, Xapian::docid& last) const; + Xapian::termcount get_wdf_upper_bound(const std::string& term) const; + std::string get_metadata(const std::string& key) const { std::string value; (void)get_exact_entry(std::string("\0", 2) + key, value); -- 2.11.4.GIT