/** @file lmweight.cc
 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
 */
/* Copyright (C) 2012 Gaurav Arora
 * Copyright (C) 2016 Olly Betts
 * Copyright (C) 2016 Vivek Pal
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <config.h>

#include "xapian/weight.h"

#include "debuglog.h"
#include "omassert.h"
#include "serialise-double.h"

#include "xapian/error.h"

#include <cmath>

using namespace std;

namespace Xapian {
LMWeight *
LMWeight::clone() const {
    return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
}
void
LMWeight::init(double factor_)
{
    // weight_collection is really the factor.
    weight_collection = factor_;

    /* Set a default value for param_log to avoid negative values of the log.
     * It is taken to be the upper bound on document length.
     */
    if (param_log == 0.0) {
        param_log = get_doclength_upper_bound();
    }

    /* The optimal parameter for Jelinek-Mercer smoothing depends on the
     * query length, so for a short (title-like) query lower the default
     * value of the smoothing parameter.
     */
    if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
        select_smoothing == TWO_STAGE_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            if (get_query_length() <= 2) {
                param_smoothing1 = 0.1;
            }
        }
    }

    /* The default value of param_smoothing1 should be 2000 when
     * DIRICHLET_SMOOTHING is selected.  Only adjust param_smoothing1 if it
     * still holds the generic default (0.7), so a value supplied by the
     * user is never overwritten.
     */
    if (select_smoothing == DIRICHLET_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }

    /* Set the default value of param_smoothing1 to be used when
     * DIRICHLET_PLUS_SMOOTHING is selected. */
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }
}
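/* Illustrative usage sketch (not part of the original file): how the
 * defaults above play out when constructing the scheme.  Assumes a
 * Xapian::Enquire object "enquire" set up elsewhere.
 *
 *     // param_smoothing1 is left at the generic default 0.7, so init()
 *     // will reset it to 2000 for Dirichlet smoothing.
 *     Xapian::LMWeight wt(0.0, Xapian::Weight::DIRICHLET_SMOOTHING,
 *                         0.7, 0.7);
 *     enquire.set_weighting_scheme(wt);
 */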
string
LMWeight::name() const
{
    return "Xapian::LMWeight";
}
string
LMWeight::serialise() const
{
    string result = serialise_double(param_log);
    result += static_cast<unsigned char>(select_smoothing);
    result += serialise_double(param_smoothing1);
    result += serialise_double(param_smoothing2);
    return result;
}
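/* Sketch of the expected round-trip (an illustration, not from the original
 * sources).  The serialised form is param_log as a double, one byte for the
 * smoothing selector, then the two smoothing parameters as doubles;
 * unserialise() below reads them back in the same order:
 *
 *     Xapian::LMWeight wt;
 *     std::string s = wt.serialise();
 *     Xapian::LMWeight * wt2 = wt.unserialise(s);
 *     // wt2 now carries the same parameters as wt.
 *     delete wt2;
 */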
LMWeight *
LMWeight::unserialise(const string & s) const
{
    const char *ptr = s.data();
    const char *end = ptr + s.size();
    double param_log_ = unserialise_double(&ptr, end);
    type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
    double param_smoothing1_ = unserialise_double(&ptr, end);
    double param_smoothing2_ = unserialise_double(&ptr, end);
    if (rare(ptr != end))
        throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
    return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
}
double
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
                      Xapian::termcount uniqterm) const
{
    // Within-document frequency of the term in the document being considered.
    double wdf_double = wdf;
    // Length of the document as a number of terms.
    double len_double = len;
    // Weight contribution of this term to the document's LM score.
    double weight_sum;

    /* If the within-document frequency of a term is zero, a smoothed
     * estimate must be returned instead of zero: the LM score of a document
     * is the product of the contributions of all query terms, so a single
     * absent term would otherwise zero out the whole document.  Hence apply
     * collection frequency smoothing.
     */
    double wt_coll =
        get_collection_freq() / (get_collection_size() * get_average_length());

    // Calculate the weight according to the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        /* Maximum likelihood estimate for the current term - the weight
         * contribution when the query term is present in the document.
         */
        double weight_document = wdf_double / len_double;
        weight_sum = (param_smoothing1 * wt_coll) +
                     ((1 - param_smoothing1) * weight_document);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        weight_sum = (wdf_double + (param_smoothing1 * wt_coll)) /
                     (len_double + param_smoothing1);
    } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        /* In the Dir+ weighting formula, the sumpart weight contribution is:
         *
         * log(1 + (wdf / (param_smoothing1 * wt_coll))) +
         * log(1 + (param_smoothing2 / (param_smoothing1 * wt_coll)))
         *
         * Since a sum of logs is the log of a product, weight_sum is
         * calculated as the product of the terms inside the logs in the
         * Dir+ formula.
         */
        weight_sum = (1 + (wdf_double / (param_smoothing1 * wt_coll))) *
                     (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        double uniqterm_double = uniqterm;
        weight_sum = ((((wdf_double - param_smoothing1) > 0) ?
                       (wdf_double - param_smoothing1) : 0) / len_double) +
                     ((param_smoothing1 * wt_coll * uniqterm_double) / len_double);
    } else {
        // TWO_STAGE_SMOOTHING.
        weight_sum = ((1 - param_smoothing1) *
                      (wdf_double + (param_smoothing2 * wt_coll)) /
                      (len_double + param_smoothing2)) +
                     (param_smoothing1 * wt_coll);
    }

    /* The LM score is naturally a product over query terms.  Rather than
     * changing the current additive implementation, the log trick is used:
     * a sum of logs is the log of the product, and since the aim is ranking,
     * ranking documents by the product or by its log makes no difference.
     * Hence log(product) is used for ranking.
     */
    double product = weight_sum * param_log;
    // weight_collection is really the factor.
    return (product > 1.0) ? weight_collection * log(product) : 0;
}
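/* Worked example with made-up numbers (not from the sources): for
 * Jelinek-Mercer smoothing with param_smoothing1 = 0.1, wdf = 2, len = 100
 * and wt_coll = 0.001:
 *
 *     weight_sum = 0.1 * 0.001 + 0.9 * (2.0 / 100) = 0.0181
 *
 * If param_log defaulted to a doclength upper bound of 500, the term would
 * contribute weight_collection * log(0.0181 * 500), i.e. about
 * weight_collection * 2.2.
 */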
double
LMWeight::get_maxpart() const
{
    // Variable to store the upper bound.
    double upper_bound;
    // Upper bound on the wdf.
    double wdf_max = get_wdf_upper_bound();

    /* As in get_sumpart(), a term with zero within-document frequency must
     * contribute a smoothed estimate rather than zero (the LM score is a
     * product over all query terms), so apply collection frequency
     * smoothing.
     */
    double wt_coll =
        get_collection_freq() / (get_collection_size() * get_average_length());

    // Calculate the upper bound according to the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        upper_bound = (param_smoothing1 * wt_coll) + (1 - param_smoothing1);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * wt_coll)) /
                      (get_doclength_upper_bound() + param_smoothing1);
    } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        upper_bound = (1 + (wdf_max / (param_smoothing1 * wt_coll))) *
                      (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        upper_bound = param_smoothing1 * wt_coll + 1;
    } else {
        // TWO_STAGE_SMOOTHING.
        upper_bound = ((1 - param_smoothing1) *
                       (get_doclength_upper_bound() + (param_smoothing2 * wt_coll)) /
                       (get_doclength_upper_bound() + param_smoothing2)) +
                      (param_smoothing1 * wt_coll);
    }

    /* The weights are calculated using the log trick, so apply the same to
     * the bound.  See the comment in get_sumpart() for the details.
     */
    double product = upper_bound * param_log;
    // weight_collection is really the factor.
    return (product > 1.0) ? weight_collection * log(product) : 1.0;
}
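/* Sanity check of the Jelinek-Mercer bound above (a sketch of the reasoning,
 * not from the sources): in get_sumpart(), weight_document = wdf / len <= 1,
 * so weight_sum <= (param_smoothing1 * wt_coll) + (1 - param_smoothing1),
 * which is exactly the expression used for upper_bound here.
 */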
/* The extra weight component in the Dir+ formula is:
 *
 * |Q| * log(param_smoothing1 / (|D| + param_smoothing1))
 *
 * where |Q| is the total query length and |D| is the total document length.
 */
double
LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
{
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        double extra_weight = param_smoothing1 / (len + param_smoothing1);
        return get_query_length() * log(extra_weight);
    }
    return 0;
}
double
LMWeight::get_maxextra() const
{
    if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
        double extra_weight = param_smoothing1 /
                              (get_doclength_lower_bound() + param_smoothing1);
        return get_query_length() * log(extra_weight);
    }
    return 0;
}

}
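/* End-to-end usage sketch (illustrative only; the database path and the
 * Dir+ parameter values here are assumptions, not defaults from this file):
 *
 *     Xapian::Database db("index");
 *     Xapian::Enquire enquire(db);
 *     enquire.set_query(Xapian::Query("example"));
 *     enquire.set_weighting_scheme(
 *         Xapian::LMWeight(0.0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING,
 *                          2000, 0.05));
 *     Xapian::MSet mset = enquire.get_mset(0, 10);
 */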