/** @file lmweight.cc
 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
 */
/* Copyright (C) 2012 Gaurav Arora
 * Copyright (C) 2016 Olly Betts
 * Copyright (C) 2016 Vivek Pal
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <config.h>

#include "xapian/weight.h"

#include "debuglog.h"
#include "omassert.h"
#include "serialise-double.h"

#include "xapian/error.h"

#include <cmath>

using namespace std;

namespace Xapian {
LMWeight *
LMWeight::clone() const {
    return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
}
void
LMWeight::init(double factor_)
{
    // weight_collection is really the factor.
    weight_collection = factor_;

    /* Set a default value for param_log to avoid negative values of the log.
     * It is taken to be the upper bound on document length.
     */
    if (param_log == 0.0) {
        param_log = get_doclength_upper_bound();
    }

    /* The optimal parameter for Jelinek-Mercer smoothing depends on the
     * query length, so for a short (title-like) query lower the default
     * value of the smoothing parameter.
     */
    if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
        select_smoothing == TWO_STAGE_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            if (get_query_length() <= 2) {
                param_smoothing1 = 0.1;
            }
        }
    }

    /* The default value of param_smoothing1 should be 2000 when
     * DIRICHLET_SMOOTHING is selected.  Only adjust param_smoothing1 if it
     * still holds the generic default (0.7), so a value supplied by the
     * user is never overwritten.
     */
    if (select_smoothing == DIRICHLET_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }

    /* Set the default value of param_smoothing1 to be used when
     * DIRICHLET_PLUS_SMOOTHING is selected. */
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }
}
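/* Illustrative usage sketch (not part of the original file): how the
 * defaults above play out when constructing the scheme.  Assumes a
 * Xapian::Enquire object "enquire" set up elsewhere.
 *
 *     // param_smoothing1 is left at the generic default 0.7, so init()
 *     // will reset it to 2000 for Dirichlet smoothing.
 *     Xapian::LMWeight wt(0.0, Xapian::Weight::DIRICHLET_SMOOTHING,
 *                         0.7, 0.7);
 *     enquire.set_weighting_scheme(wt);
 */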
string
LMWeight::name() const
{
    return "Xapian::LMWeight";
}
string
LMWeight::serialise() const
{
    string result = serialise_double(param_log);
    result += static_cast<unsigned char>(select_smoothing);
    result += serialise_double(param_smoothing1);
    result += serialise_double(param_smoothing2);
    return result;
}
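/* Sketch of the expected round-trip (an illustration, not from the original
 * sources).  The serialised form is param_log as a double, one byte for the
 * smoothing selector, then the two smoothing parameters as doubles;
 * unserialise() below reads them back in the same order:
 *
 *     Xapian::LMWeight wt;
 *     std::string s = wt.serialise();
 *     Xapian::LMWeight * wt2 = wt.unserialise(s);
 *     // wt2 now carries the same parameters as wt.
 *     delete wt2;
 */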
LMWeight *
LMWeight::unserialise(const string & s) const
{
    const char *ptr = s.data();
    const char *end = ptr + s.size();
    double param_log_ = unserialise_double(&ptr, end);
    type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
    double param_smoothing1_ = unserialise_double(&ptr, end);
    double param_smoothing2_ = unserialise_double(&ptr, end);
    if (rare(ptr != end))
        throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
    return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
}
double
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
                      Xapian::termcount uniqterm) const
{
    // Within-document frequency of the term in the document being considered.
    double wdf_double = wdf;
    // Length of the document as a number of terms.
    double len_double = len;
    // Weight contribution of this term to the document's LM score.
    double weight_sum;

    /* If the within-document frequency of a term is zero, a smoothed
     * estimate must be returned instead of zero: the LM score of a document
     * is the product of the contributions of all query terms, so a single
     * absent term would otherwise zero out the whole document.  Hence apply
     * collection frequency smoothing.
     */
    double wt_coll =
        get_collection_freq() / (get_collection_size() * get_average_length());

    // Calculate the weight according to the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        /* Maximum likelihood estimate for the current term - the weight
         * contribution when the query term is present in the document.
         */
        double weight_document = wdf_double / len_double;
        weight_sum = (param_smoothing1 * wt_coll) +
                     ((1 - param_smoothing1) * weight_document);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        weight_sum = (wdf_double + (param_smoothing1 * wt_coll)) /
                     (len_double + param_smoothing1);
    } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        /* In the Dir+ weighting formula, the sumpart weight contribution is:
         *
         * log(1 + (wdf / (param_smoothing1 * wt_coll))) +
         * log(1 + (param_smoothing2 / (param_smoothing1 * wt_coll)))
         *
         * Since a sum of logs is the log of a product, weight_sum is
         * calculated as the product of the terms inside the logs in the
         * Dir+ formula.
         */
        weight_sum = (1 + (wdf_double / (param_smoothing1 * wt_coll))) *
                     (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        double uniqterm_double = uniqterm;
        weight_sum = ((((wdf_double - param_smoothing1) > 0) ?
                       (wdf_double - param_smoothing1) : 0) / len_double) +
                     ((param_smoothing1 * wt_coll * uniqterm_double) / len_double);
    } else {
        // TWO_STAGE_SMOOTHING.
        weight_sum = ((1 - param_smoothing1) *
                      (wdf_double + (param_smoothing2 * wt_coll)) /
                      (len_double + param_smoothing2)) +
                     (param_smoothing1 * wt_coll);
    }

    /* The LM score is naturally a product over query terms.  Rather than
     * changing the current additive implementation, the log trick is used:
     * a sum of logs is the log of the product, and since the aim is ranking,
     * ranking documents by the product or by its log makes no difference.
     * Hence log(product) is used for ranking.
     */
    double product = weight_sum * param_log;
    // weight_collection is really the factor.
    return (product > 1.0) ? weight_collection * log(product) : 0;
}
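/* Worked example with made-up numbers (not from the sources): for
 * Jelinek-Mercer smoothing with param_smoothing1 = 0.1, wdf = 2, len = 100
 * and wt_coll = 0.001:
 *
 *     weight_sum = 0.1 * 0.001 + 0.9 * (2.0 / 100) = 0.0181
 *
 * If param_log defaulted to a doclength upper bound of 500, the term would
 * contribute weight_collection * log(0.0181 * 500), i.e. about
 * weight_collection * 2.2.
 */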
double
LMWeight::get_maxpart() const
{
    // Variable to store the upper bound.
    double upper_bound;
    // Upper bound on the wdf.
    double wdf_max = get_wdf_upper_bound();

    /* As in get_sumpart(), a term with zero within-document frequency must
     * contribute a smoothed estimate rather than zero (the LM score is a
     * product over all query terms), so apply collection frequency
     * smoothing.
     */
    double wt_coll =
        get_collection_freq() / (get_collection_size() * get_average_length());

    // Calculate the upper bound according to the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        upper_bound = (param_smoothing1 * wt_coll) + (1 - param_smoothing1);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * wt_coll)) /
                      (get_doclength_upper_bound() + param_smoothing1);
    } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        upper_bound = (1 + (wdf_max / (param_smoothing1 * wt_coll))) *
                      (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        upper_bound = param_smoothing1 * wt_coll + 1;
    } else {
        // TWO_STAGE_SMOOTHING.
        upper_bound = ((1 - param_smoothing1) *
                       (get_doclength_upper_bound() + (param_smoothing2 * wt_coll)) /
                       (get_doclength_upper_bound() + param_smoothing2)) +
                      (param_smoothing1 * wt_coll);
    }

    /* The weights are calculated using the log trick, so apply the same to
     * the bound.  See the comment in get_sumpart() for the details.
     */
    double product = upper_bound * param_log;
    // weight_collection is really the factor.
    return (product > 1.0) ? weight_collection * log(product) : 1.0;
}
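/* Sanity check of the Jelinek-Mercer bound above (a sketch of the reasoning,
 * not from the sources): in get_sumpart(), weight_document = wdf / len <= 1,
 * so weight_sum <= (param_smoothing1 * wt_coll) + (1 - param_smoothing1),
 * which is exactly the expression used for upper_bound here.
 */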
/* The extra weight component in the Dir+ formula is:
 *
 * |Q| * log(param_smoothing1 / (|D| + param_smoothing1))
 *
 * where |Q| is the total query length and |D| is the total document length.
 */
double
LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
{
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        double extra_weight = param_smoothing1 / (len + param_smoothing1);
        return get_query_length() * log(extra_weight);
    }
    return 0;
}
double
LMWeight::get_maxextra() const
{
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        double extra_weight = param_smoothing1 /
                              (get_doclength_lower_bound() + param_smoothing1);
        return get_query_length() * log(extra_weight);
    }
    return 0;
}

}
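/* End-to-end usage sketch (illustrative only; the database path and the
 * Dir+ parameter values here are assumptions, not defaults from this file):
 *
 *     Xapian::Database db("index");
 *     Xapian::Enquire enquire(db);
 *     enquire.set_query(Xapian::Query("example"));
 *     enquire.set_weighting_scheme(
 *         Xapian::LMWeight(0.0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING,
 *                          2000, 0.05));
 *     Xapian::MSet mset = enquire.get_mset(0, 10);
 */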