xapian-core/weight/lmweight.cc

   1 /** @file lmweight.cc
   2  * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
   3  */
   4 /* Copyright (C) 2012 Gaurav Arora
   5  * Copyright (C) 2016 Olly Betts
   6  * Copyright (C) 2016 Vivek Pal
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "xapian/weight.h"
  26 #include "weightinternal.h"
  27
  28 #include "debuglog.h"
  29 #include "omassert.h"
  30 #include "serialise-double.h"
  31
  32 #include "xapian/error.h"
  33
  34 #include <cerrno>
  35 #include <cmath>
  36 #include <cstdlib>
  37
  38 using namespace std;
  39
  40 namespace Xapian {
  41
  42 LMWeight *
  43 LMWeight::clone() const {
  44     return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
  45 }
  46
  47 void
  48 LMWeight::init(double factor_)
  49 {
  50     factor = factor_;
  51
  52     // Storing collection frequency of current term in collection_freq to be
  53     // accessed while smoothing of weights for the term, for term not present
  54     // in the document.
  55     double collection_freq = get_collection_freq();
  56
  57     // Collection_freq of a term in collection should be always greater than or
  58     // equal to zero (Non Negative).
  59     AssertRel(collection_freq,>=,0);
  60     LOGVALUE(WTCALC, collection_freq);
  61
  62     // calculating approximate number of total terms in the collection to be
  63     // accessed for smoothing of the document.
  64     double total_collection_term = get_collection_size() * get_average_length();
  65
  66     /* In case the within document frequency of term is zero smoothing will
  67      * be required and should be return instead of returning zero, as returning
  68      * LM score are multiplication of contribution of all terms, due to absence
  69      * of single term whole document is scored zero, hence apply collection
  70      * frequency smoothing.
  71      */
  72     weight_collection = double(collection_freq) / total_collection_term;
  73
  74     // Total term should be greater than zero as there would be at least one
  75     // document in collection.
  76     AssertRel(total_collection_term,>,0);
  77     LOGVALUE(WTCALC, total_collection_term);
  78
  79     // There can't be more relevant term in collection than total number of
  80     // term.
  81     AssertRel(collection_freq,<=,total_collection_term);
  82
  83     /* Setting default values of the param_log to handle negative value of log.
  84      * It is considered to be upperbound of document length.
  85      * initializing param_log to upperbound of document_length.
  86      */
  87
  88     if (param_log == 0.0) {
  89         param_log = get_doclength_upper_bound();
  90     }
  91
  92     /* Since the optimal parameter for Jelinek mercer smoothing
  93      * is based on query length, so if query is title query changing
  94      * default value of smoothing parameter.
  95      */
  96
  97     if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
  98         select_smoothing == TWO_STAGE_SMOOTHING) {
  99         if (param_smoothing1 == 0.7) {
 100             if (get_query_length() <= 2) {
 101                 param_smoothing1 = 0.1;
 102             }
 103         }
 104     }
 105
 106     /* param_smoothing1 default value should be 2000 in case
 107      * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
 108      * if user supply his own value for param_smoothing1 value will not be set
 109      * to 2000(default value)
 110      */
 111     if (select_smoothing == DIRICHLET_SMOOTHING) {
 112         if (param_smoothing1 == 0.7) {
 113             param_smoothing1 = 2000;
 114         }
 115     }
 116
 117     /* Setting param_smoothing1 and param_smoothing2 default value to used when
 118      * DIRICHLET_PLUS_SMOOTHING is selected.*/
 119     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
 120         if (param_smoothing1 == 0.7) {
 121             param_smoothing1 = 2000;
 122         }
 123     }
 124 }
 125
 126 string
 127 LMWeight::name() const
 128 {
 129     return "Xapian::LMWeight";
 130 }
 131
 132 string
 133 LMWeight::short_name() const
 134 {
 135     return "lm";
 136 }
 137
 138 string
 139 LMWeight::serialise() const
 140 {
 141     string result = serialise_double(param_log);
 142     result += static_cast<unsigned char>(select_smoothing);
 143     result += serialise_double(param_smoothing1);
 144     result += serialise_double(param_smoothing2);
 145     return result;
 146 }
 147
 148 LMWeight *
 149 LMWeight::unserialise(const string & s) const
 150 {
 151     const char *ptr = s.data();
 152     const char *end = ptr + s.size();
 153     double param_log_ = unserialise_double(&ptr, end);
 154     type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
 155     double param_smoothing1_ = unserialise_double(&ptr, end);
 156     double param_smoothing2_ = unserialise_double(&ptr, end);
 157     if (rare(ptr != end))
 158         throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
 159     return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
 160 }
 161
 162 double
 163 LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
 164                       Xapian::termcount uniqterm) const
 165 {
 166     // Within Document Frequency of the term in document being considered.
 167     double wdf_double = wdf;
 168     // Length of the Document in terms of number of terms.
 169     double len_double = len;
 170     // variable to store weight contribution of term in the document scoring for LM.
 171     double weight_sum;
 172
 173     // Calculating weights considering different smoothing option available to user.
 174     if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
 175         /* Maximum likelihood of current term, weight contribution of term in
 176          * case query term is present in the document.
 177          */
 178         double weight_document = wdf_double / len_double;
 179         weight_sum = (param_smoothing1 * weight_collection) +
 180                      ((1 - param_smoothing1) * weight_document);
 181     } else if (select_smoothing == DIRICHLET_SMOOTHING) {
 182         weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) /
 183                      (len_double + param_smoothing1);
 184     } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
 185         /* In the Dir+ weighting formula, sumpart weight contribution is :-
 186          *
 187          * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and
 188          * log of (1 + (delta/param_smoothing1 * weight_collection))).
 189          * Since, sum of logs is log of product so weight_sum is calculated as product
 190          * of terms in log in the Dir+ formula.
 191          */
 192         weight_sum = (1 + (wdf_double / (param_smoothing1 * weight_collection))) *
 193                      (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
 194     } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
 195         double uniqterm_double = uniqterm;
 196         weight_sum = ((((wdf_double - param_smoothing1) > 0) ? (wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * weight_collection * uniqterm_double) / len_double);
 197     } else {
 198         weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * weight_collection)) / (len_double + param_smoothing2)) + (param_smoothing1 * weight_collection));
 199     }
 200
 201     /* Since LM score is calculated with multiplication, instead of changing
 202      * the current implementation log trick have been used to calculate the
 203      * product since (sum of log is log of product and since aim is ranking
 204      * ranking document by product or log of product won't make a large
 205      * difference hence log(product) will be used for ranking.
 206      */
 207     double product = weight_sum * param_log;
 208     return (product > 1.0) ? factor * log(product) : 0;
 209 }
 210
 211 double
 212 LMWeight::get_maxpart() const
 213 {
 214     // Variable to store the collection frequency
 215     double upper_bound;
 216     // Store upper bound on wdf in variable wdf_max
 217     double wdf_max = get_wdf_upper_bound();
 218
 219     // Calculating upper bound considering different smoothing option available to user.
 220     if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
 221         upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1);
 222     } else if (select_smoothing == DIRICHLET_SMOOTHING) {
 223         upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing1);
 224     } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
 225         upper_bound = (1 + (wdf_max / (param_smoothing1 * weight_collection))) *
 226                       (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
 227     } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
 228         upper_bound = param_smoothing1 * weight_collection + 1;
 229     } else {
 230         upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * weight_collection));
 231     }
 232
 233     /* Since weight are calculated using log trick, using same with the bounds. Refer
 234      * comment in get_sumpart for the details.
 235      */
 236     double product = upper_bound * param_log;
 237     return (product > 1.0) ? factor * log(product) : 1.0;
 238 }
 239
 240
 241 /* The extra weight component in the Dir+ formula is :-
 242  *
 243  * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
 244  *
 245  * where, |Q| is total query length.
 246  *        |D| is total document length.
 247  */
 248 double
 249 LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
 250 {
 251     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
 252         double extra_weight = param_smoothing1 / (len + param_smoothing1);
 253         return get_query_length() * log(extra_weight);
 254     }
 255     return 0;
 256 }
 257
 258 double
 259 LMWeight::get_maxextra() const
 260 {
 261     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
 262         double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1);
 263         return get_query_length() * log(extra_weight);
 264     }
 265     return 0;
 266 }
 267
 268 static bool
 269 type_smoothing_param(const char ** p, Xapian::Weight::type_smoothing * ptr_val)
 270 {
 271     char *end;
 272     errno = 0;
 273     int v = strtol(*p, &end, 10);
 274     if (*p == end || errno || v < 1 || v > 5)
 275         return false;
 276     *p = end;
 277     static const Xapian::Weight::type_smoothing smooth_tab[5] = {
 278         Xapian::Weight::TWO_STAGE_SMOOTHING,
 279         Xapian::Weight::DIRICHLET_SMOOTHING,
 280         Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING,
 281         Xapian::Weight::JELINEK_MERCER_SMOOTHING,
 282         Xapian::Weight::DIRICHLET_PLUS_SMOOTHING
 283     };
 284     *ptr_val = smooth_tab[v - 1];
 285     return true;
 286 }
 287
 288 LMWeight *
 289 LMWeight::create_from_parameters(const char * p) const
 290 {
 291     if (*p == '\0')
 292         return new Xapian::LMWeight();
 293     double param_log_ = 0;
 294     Xapian::Weight::type_smoothing type = Xapian::Weight::TWO_STAGE_SMOOTHING;
 295     double smoothing1 = 0.7;
 296     double smoothing2 = 2000;
 297     if (!Xapian::Weight::Internal::double_param(&p, &param_log_))
 298         Xapian::Weight::Internal::parameter_error("Parameter 1 (log) is invalid", "lm");
 299     if (*p && !type_smoothing_param(&p, &type))
 300         Xapian::Weight::Internal::parameter_error("Parameter 2 (smoothing_type) is invalid", "lm");
 301     if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing1))
 302         Xapian::Weight::Internal::parameter_error("Parameter 3 (smoothing1) is invalid", "lm");
 303     if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing2))
 304         Xapian::Weight::Internal::parameter_error("Parameter 4 (smoothing2) is invalid", "lm");
 305     if (*p)
 306         Xapian::Weight::Internal::parameter_error("Extra data after parameter 4", "lm");
 307     return new Xapian::LMWeight(param_log_, type, smoothing1, smoothing2);
 308 }
 309
 310 }