xapian-core/weight/bm25weight.cc

   1 /** @file bm25weight.cc
   2  * @brief Xapian::BM25Weight class - the BM25 probabilistic formula
   3  */
   4 /* Copyright (C) 2009,2010,2011,2012,2014,2015 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation; either version 2 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "xapian/weight.h"
  24
  25 #include "debuglog.h"
  26 #include "omassert.h"
  27 #include "serialise-double.h"
  28
  29 #include "xapian/error.h"
  30
  31 #include <algorithm>
  32 #include <cmath>
  33
  34 using namespace std;
  35
  36 namespace Xapian {
  37
  38 BM25Weight *
  39 BM25Weight::clone() const
  40 {
  41     return new BM25Weight(param_k1, param_k2, param_k3, param_b,
  42                           param_min_normlen);
  43 }
  44
  45 void
  46 BM25Weight::init(double factor)
  47 {
  48     Xapian::doccount tf = get_termfreq();
  49
  50     double tw = 0;
  51     if (get_rset_size() != 0) {
  52         Xapian::doccount reltermfreq = get_reltermfreq();
  53
  54         // There can't be more relevant documents indexed by a term than there
  55         // are documents indexed by that term.
  56         AssertRel(reltermfreq,<=,tf);
  57
  58         // There can't be more relevant documents indexed by a term than there
  59         // are relevant documents.
  60         AssertRel(reltermfreq,<=,get_rset_size());
  61
  62         Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
  63
  64         // There can't be more relevant documents not indexed by a term than
  65         // there are documents not indexed by that term.
  66         AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
  67
  68         Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
  69
  70         Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
  71         double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
  72         double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
  73         tw = numerator / denom;
  74     } else {
  75         tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
  76     }
  77
  78     AssertRel(tw,>,0);
  79
  80     // The "official" formula can give a negative termweight in unusual cases
  81     // (without an RSet, when a term indexes more than half the documents in
  82     // the database).  These negative weights aren't actually helpful, and it
  83     // is common for implementations to replace them with a small positive
  84     // weight or similar.
  85     //
  86     // Truncating to zero doesn't seem a great approach in practice as it
  87     // means that some terms in the query can have no effect at all on the
  88     // ranking, and that some results can have zero weight, both of which
  89     // are seem surprising.
  90     //
  91     // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
  92     // more than a third of documents, which seems rather "intrusive".  That's
  93     // what the code currently enabled does, but perhaps it would be better to
  94     // do something else. (FIXME)
  95 #if 0
  96     if (rare(tw <= 1.0)) {
  97         termweight = 0;
  98     } else {
  99         termweight = log(tw) * factor;
 100         if (param_k3 != 0) {
 101             double wqf_double = get_wqf();
 102             termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
 103         }
 104     }
 105 #else
 106     if (tw < 2) tw = tw * 0.5 + 1;
 107     termweight = log(tw) * factor;
 108     if (param_k3 != 0) {
 109         double wqf_double = get_wqf();
 110         termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
 111     }
 112 #endif
 113     termweight *= (param_k1 + 1);
 114
 115     LOGVALUE(WTCALC, termweight);
 116
 117     if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
 118         // If k2 is 0, and either param_b or param_k1 is 0 then the document
 119         // length doesn't affect the weight.
 120         len_factor = 0;
 121     } else {
 122         len_factor = get_average_length();
 123         // len_factor can be zero if all documents are empty (or the database
 124         // is empty!)
 125         if (len_factor != 0) len_factor = 1 / len_factor;
 126     }
 127
 128     LOGVALUE(WTCALC, len_factor);
 129 }
 130
 131 string
 132 BM25Weight::name() const
 133 {
 134     return "Xapian::BM25Weight";
 135 }
 136
 137 string
 138 BM25Weight::serialise() const
 139 {
 140     string result = serialise_double(param_k1);
 141     result += serialise_double(param_k2);
 142     result += serialise_double(param_k3);
 143     result += serialise_double(param_b);
 144     result += serialise_double(param_min_normlen);
 145     return result;
 146 }
 147
 148 BM25Weight *
 149 BM25Weight::unserialise(const string & s) const
 150 {
 151     const char *ptr = s.data();
 152     const char *end = ptr + s.size();
 153     double k1 = unserialise_double(&ptr, end);
 154     double k2 = unserialise_double(&ptr, end);
 155     double k3 = unserialise_double(&ptr, end);
 156     double b = unserialise_double(&ptr, end);
 157     double min_normlen = unserialise_double(&ptr, end);
 158     if (rare(ptr != end))
 159         throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
 160     return new BM25Weight(k1, k2, k3, b, min_normlen);
 161 }
 162
 163 double
 164 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
 165                         Xapian::termcount) const
 166 {
 167     LOGCALL(WTCALC, double, "BM25Weight::get_sumpart", wdf | len);
 168     Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
 169
 170     double wdf_double = wdf;
 171     double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
 172     AssertRel(denom,>,0);
 173     RETURN(termweight * (wdf_double / denom));
 174 }
 175
 176 double
 177 BM25Weight::get_maxpart() const
 178 {
 179     LOGCALL(WTCALC, double, "BM25Weight::get_maxpart", NO_ARGS);
 180     double denom = param_k1;
 181     if (param_k1 != 0.0) {
 182         if (param_b != 0.0) {
 183             // "Upper-bound Approximations for Dynamic Pruning" Craig
 184             // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
 185             // Information Systems. 29(4), 2011 shows that evaluating at
 186             // doclen=wdf_max is a good bound.
 187             //
 188             // However, we can do better if doclen_min > wdf_max since then a
 189             // better bound can be found by simply evaluating at
 190             // doclen=doclen_min and wdf=wdf_max.
 191             Xapian::doclength normlen_lb =
 192                  max(max(get_wdf_upper_bound(), get_doclength_lower_bound()) * len_factor, param_min_normlen);
 193             denom *= (normlen_lb * param_b + (1 - param_b));
 194         }
 195     }
 196     double wdf_max = get_wdf_upper_bound();
 197     denom += wdf_max;
 198     AssertRel(denom,>,0);
 199     RETURN(termweight * (wdf_max / denom));
 200 }
 201
 202 /* The BM25 formula gives:
 203  *
 204  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
 205  *
 206  * To avoid negative sumextra we add the constant (param_k2 * query_length)
 207  * to give:
 208  *
 209  * 2 * param_k2 * query_length / (1 + normlen)
 210  */
 211 double
 212 BM25Weight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
 213 {
 214     LOGCALL(WTCALC, double, "BM25Weight::get_sumextra", len);
 215     double num = (2.0 * param_k2 * get_query_length());
 216     RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
 217 }
 218
 219 double
 220 BM25Weight::get_maxextra() const
 221 {
 222     LOGCALL(WTCALC, double, "BM25Weight::get_maxextra", NO_ARGS);
 223     if (param_k2 == 0.0)
 224         RETURN(0.0);
 225     double num = (2.0 * param_k2 * get_query_length());
 226     RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
 227                             param_min_normlen)));
 228 }
 229
 230 }