/** @file lmweight.cc
 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
 */
/* Copyright (C) 2012 Gaurav Arora
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <config.h>

#include "xapian/weight.h"

#include "debuglog.h"
#include "omassert.h"
#include "serialise-double.h"

#include "xapian/error.h"

#include <cmath>

using namespace std;

namespace Xapian {

LMWeight *
LMWeight::clone() const {
    return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
}

void
LMWeight::init(double)
{
    // Store the collection frequency of the current term in collection_freq,
    // to be used when smoothing the weight of a term which isn't present in
    // the document.
    double collection_freq = get_collection_freq();

    // The collection frequency of a term should always be non-negative.
    AssertRel(collection_freq,>=,0);
    LOGVALUE(WTCALC, collection_freq);

    // Calculate the approximate total number of terms in the collection,
    // needed when smoothing the document score.
    double total_collection_term = get_collection_size() * get_average_length();
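    // For instance, a collection of 10000 documents with an average length of
    // 500 terms gives total_collection_term = 10000 * 500 = 5000000 (figures
    // here are purely illustrative, not from the source).
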
    /* If the within-document frequency of a term is zero, smoothing is
     * required, and a smoothed estimate should be returned instead of zero:
     * the LM score of a document is the product of the contributions of all
     * query terms, so a single absent term would otherwise make the whole
     * document score zero.  Hence apply collection frequency smoothing.
     */
    weight_collection = double(collection_freq) / total_collection_term;
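    // In effect this estimates the background (collection) language model
    //
    //     P(t|C) = cf(t) / (collection_size * average_length)
    //
    // so with the illustrative figures above, a term with collection
    // frequency 100 gets weight_collection = 100 / 5000000 = 2e-5.
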
    // The total number of terms should be greater than zero, as there is at
    // least one document in the collection.
    AssertRel(total_collection_term,>,0);
    LOGVALUE(WTCALC, total_collection_term);

    // A term can't occur in the collection more often than the total number
    // of terms in the collection.
    AssertRel(collection_freq,<=,total_collection_term);

    /* Set the default value of param_log, which is used to stop the log of
     * the weight going negative.  The upper bound on document length is used
     * as the default.
     */
    if (param_log == 0.0) {
        param_log = get_doclength_upper_bound();
    }

    /* The optimal parameter for Jelinek-Mercer smoothing depends on the
     * length of the query, so for a short (title-like) query the default
     * value of the smoothing parameter is changed.
     */
    if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
        select_smoothing == TWO_STAGE_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            if (get_query_length() <= 2) {
                param_smoothing1 = 0.1;
            }
        }
    }

    /* The default value of param_smoothing1 should be 2000 when
     * DIRICHLET_SMOOTHING is selected.  param_smoothing1 is only adjusted
     * here if it still holds the generic default (0.7), so a value the user
     * supplied won't be overwritten with 2000.
     */
    if (select_smoothing == DIRICHLET_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }
}

string
LMWeight::name() const
{
    return "Xapian::LMWeight";
}

string
LMWeight::serialise() const
{
    string result = serialise_double(param_log);
    result += static_cast<unsigned char>(select_smoothing);
    result += serialise_double(param_smoothing1);
    result += serialise_double(param_smoothing2);

    return result;
}

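// The serialised form is therefore: param_log encoded by serialise_double(),
// then a single byte holding the smoothing selector, then the two smoothing
// parameters, each encoded by serialise_double().  unserialise() below reads
// the fields back in exactly that order.
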
LMWeight *
LMWeight::unserialise(const string & s) const
{
    const char *ptr = s.data();
    const char *end = ptr + s.size();
    double param_log_ = unserialise_double(&ptr, end);
    type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
    double param_smoothing1_ = unserialise_double(&ptr, end);
    double param_smoothing2_ = unserialise_double(&ptr, end);
    if (rare(ptr != end))
        throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
    return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
}

double
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
                      Xapian::termcount uniqterm) const
{
    // Within-document frequency of the term in the document being considered.
    double wdf_double = wdf;
    // Length of the document, measured as a number of terms.
    double len_double = len;
    // Weight contribution of the term to the document's LM score.
    double weight_sum;

    // Calculate the weight using the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        /* Maximum likelihood estimate for the current term: its weight
         * contribution when the query term is present in the document.
         */
        double weight_document = wdf_double / len_double;
        weight_sum = (param_smoothing1 * weight_collection) +
                     ((1 - param_smoothing1) * weight_document);
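        // i.e. the Jelinek-Mercer smoothed estimate
        //
        //     P(t|d) = lambda * P(t|C) + (1 - lambda) * wdf / len
        //
        // with lambda = param_smoothing1 and P(t|C) = weight_collection.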
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) /
                     (len_double + param_smoothing1);
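        // i.e. the Dirichlet prior smoothed estimate
        //
        //     P(t|d) = (wdf + mu * P(t|C)) / (len + mu)
        //
        // with mu = param_smoothing1.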
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        double uniqterm_double = uniqterm;
        weight_sum = ((((wdf_double - param_smoothing1) > 0) ?
                       (wdf_double - param_smoothing1) : 0) / len_double) +
                     ((param_smoothing1 * weight_collection * uniqterm_double) /
                      len_double);
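        // i.e. the absolute discounting estimate
        //
        //     P(t|d) = max(wdf - delta, 0) / len + delta * u * P(t|C) / len
        //
        // with delta = param_smoothing1 and u = uniqterm, the number of
        // unique terms in the document.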
    } else {
        // TWO_STAGE_SMOOTHING.
        weight_sum = ((1 - param_smoothing1) *
                      (wdf_double + (param_smoothing2 * weight_collection)) /
                      (len_double + param_smoothing2)) +
                     (param_smoothing1 * weight_collection);
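        // i.e. the two-stage smoothed estimate
        //
        //     P(t|d) = (1 - lambda) * (wdf + mu * P(t|C)) / (len + mu) +
        //              lambda * P(t|C)
        //
        // with lambda = param_smoothing1 and mu = param_smoothing2.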
    }

    /* The LM score of a document is the product of the per-term scores.
     * Rather than changing the current implementation, a log trick is used
     * to calculate the product: the sum of logs is the log of the product,
     * and since the aim is to rank documents, ranking by the product or by
     * the log of the product makes no difference.
     */
    return (weight_sum * param_log > 1.0) ? log(weight_sum * param_log) : 0;
}

double
LMWeight::get_maxpart() const
{
    // Upper bound on the weight contribution of a single term.
    double upper_bound;

    // Calculate the upper bound using the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        upper_bound = (get_doclength_upper_bound() +
                       (param_smoothing1 * weight_collection)) /
                      (get_doclength_upper_bound() + param_smoothing1);
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        upper_bound = param_smoothing1 * weight_collection + 1;
    } else {
        upper_bound = ((1 - param_smoothing1) *
                       (get_doclength_upper_bound() +
                        (param_smoothing2 * weight_collection)) /
                       (get_doclength_upper_bound() + param_smoothing2)) +
                      (param_smoothing1 * weight_collection);
    }

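    /* A sketch of why these are bounds: wdf <= len, so wdf / len <= 1,
     * which maximises the Jelinek-Mercer estimate; the Dirichlet and
     * two-stage estimates increase with document length, so substituting
     * the document length upper bound maximises them; and for absolute
     * discounting, max(wdf - delta, 0) / len <= 1 and uniqterm <= len
     * together give the bound delta * P(t|C) + 1.
     */
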
    /* Since the weights are calculated using the log trick, the same is
     * applied to the bounds.  See the comment in get_sumpart() for details.
     */
    return (upper_bound * param_log > 1.0) ? log(upper_bound * param_log) : 1.0;
}

double
LMWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
{
    return 0;
}

double
LMWeight::get_maxextra() const
{
    return 0;
}

}

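/* Example of selecting this scheme for a search - a sketch only; the
 * constructor argument order matches the clone() call above, but check the
 * matching xapian.h for the exact signature and defaults:
 *
 *     Xapian::Database db("/path/to/db");
 *     Xapian::Enquire enquire(db);
 *     enquire.set_query(Xapian::Query("example"));
 *     // Dirichlet smoothing with mu = 2000; param_log = 0.0 selects the
 *     // default (the document length upper bound), and param_smoothing2
 *     // is unused by this smoothing type.
 *     enquire.set_weighting_scheme(
 *         Xapian::LMWeight(0.0, Xapian::Weight::DIRICHLET_SMOOTHING,
 *                          2000.0, 0.0));
 *     Xapian::MSet mset = enquire.get_mset(0, 10);
 */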