Add colon after prefix when term starts with a colon
[xapian.git] / xapian-core / weight / lmweight.cc
blobdb803067aef1a45ee029c2d090766434c205a942
1 /** @file lmweight.cc
2 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
3 */
4 /* Copyright (C) 2012 Gaurav Arora
5 * Copyright (C) 2016 Olly Betts
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "xapian/weight.h"
26 #include "weightinternal.h"
28 #include "debuglog.h"
29 #include "omassert.h"
30 #include "serialise-double.h"
32 #include "xapian/error.h"
34 #include <cerrno>
35 #include <cmath>
36 #include <cstdlib>
38 using namespace std;
40 namespace Xapian {
42 LMWeight *
43 LMWeight::clone() const {
44 return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
47 void
48 LMWeight::init(double factor_)
50 factor = factor_;
52 // Storing collection frequency of current term in collection_freq to be
53 // accessed while smoothing of weights for the term, for term not present
54 // in the document.
55 double collection_freq = get_collection_freq();
57 // Collection_freq of a term in collection should be always greater than or
58 // equal to zero (Non Negative).
59 AssertRel(collection_freq,>=,0);
60 LOGVALUE(WTCALC, collection_freq);
62 // calculating approximate number of total terms in the collection to be
63 // accessed for smoothing of the document.
64 double total_collection_term = get_collection_size() * get_average_length();
66 /* In case the within document frequency of term is zero smoothing will
67 * be required and should be return instead of returning zero, as returning
68 * LM score are multiplication of contribution of all terms, due to absence
69 * of single term whole document is scored zero, hence apply collection
70 * frequency smoothing.
72 weight_collection = double(collection_freq) / total_collection_term;
74 // Total term should be greater than zero as there would be at least one
75 // document in collection.
76 AssertRel(total_collection_term,>,0);
77 LOGVALUE(WTCALC, total_collection_term);
79 // There can't be more relevant term in collection than total number of
80 // term.
81 AssertRel(collection_freq,<=,total_collection_term);
83 /* Setting default values of the param_log to handle negative value of log.
84 * It is considered to be upperbound of document length.
85 * initializing param_log to upperbound of document_length.
88 if (param_log == 0.0) {
89 param_log = get_doclength_upper_bound();
92 /* Since the optimal parameter for Jelinek mercer smoothing
93 * is based on query length, so if query is title query changing
94 * default value of smoothing parameter.
97 if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
98 select_smoothing == TWO_STAGE_SMOOTHING) {
99 if (param_smoothing1 == 0.7) {
100 if (get_query_length() <= 2) {
101 param_smoothing1 = 0.1;
106 /* param_smoothing1 default value should be 2000 in case
107 * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
108 * if user supply his own value for param_smoothing1 value will not be set
109 * to 2000(default value)
111 if (select_smoothing == DIRICHLET_SMOOTHING) {
112 if (param_smoothing1 == 0.7) {
113 param_smoothing1 = 2000;
117 /* Setting param_smoothing1 and param_smoothing2 default value to used when
118 * DIRICHLET_PLUS_SMOOTHING is selected.*/
119 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
120 if (param_smoothing1 == 0.7) {
121 param_smoothing1 = 2000;
126 string
127 LMWeight::name() const
129 return "Xapian::LMWeight";
132 string
133 LMWeight::short_name() const
135 return "lm";
138 string
139 LMWeight::serialise() const
141 string result = serialise_double(param_log);
142 result += static_cast<unsigned char>(select_smoothing);
143 result += serialise_double(param_smoothing1);
144 result += serialise_double(param_smoothing2);
145 return result;
148 LMWeight *
149 LMWeight::unserialise(const string & s) const
151 const char *ptr = s.data();
152 const char *end = ptr + s.size();
153 double param_log_ = unserialise_double(&ptr, end);
154 type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
155 double param_smoothing1_ = unserialise_double(&ptr, end);
156 double param_smoothing2_ = unserialise_double(&ptr, end);
157 if (rare(ptr != end))
158 throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
159 return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
162 double
163 LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
164 Xapian::termcount uniqterm) const
166 // Within Document Frequency of the term in document being considered.
167 double wdf_double = wdf;
168 // Length of the Document in terms of number of terms.
169 double len_double = len;
170 // variable to store weight contribution of term in the document scoring for LM.
171 double weight_sum;
173 // Calculating weights considering different smoothing option available to user.
174 if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
175 /* Maximum likelihood of current term, weight contribution of term in
176 * case query term is present in the document.
178 double weight_document = wdf_double / len_double;
179 weight_sum = (param_smoothing1 * weight_collection) +
180 ((1 - param_smoothing1) * weight_document);
181 } else if (select_smoothing == DIRICHLET_SMOOTHING) {
182 weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) /
183 (len_double + param_smoothing1);
184 } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
185 /* In the Dir+ weighting formula, sumpart weight contribution is :-
187 * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and
188 * log of (1 + (delta/param_smoothing1 * weight_collection))).
189 * Since, sum of logs is log of product so weight_sum is calculated as product
190 * of terms in log in the Dir+ formula.
192 weight_sum = (1 + (wdf_double / (param_smoothing1 * weight_collection))) *
193 (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
194 } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
195 double uniqterm_double = uniqterm;
196 weight_sum = ((((wdf_double - param_smoothing1) > 0) ? (wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * weight_collection * uniqterm_double) / len_double);
197 } else {
198 weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * weight_collection)) / (len_double + param_smoothing2)) + (param_smoothing1 * weight_collection));
201 /* Since LM score is calculated with multiplication, instead of changing
202 * the current implementation log trick have been used to calculate the
203 * product since (sum of log is log of product and since aim is ranking
204 * ranking document by product or log of product won't make a large
205 * difference hence log(product) will be used for ranking.
207 double product = weight_sum * param_log;
208 return (product > 1.0) ? factor * log(product) : 0;
211 double
212 LMWeight::get_maxpart() const
214 // Variable to store the collection frequency
215 double upper_bound;
216 // Store upper bound on wdf in variable wdf_max
217 double wdf_max = get_wdf_upper_bound();
219 // Calculating upper bound considering different smoothing option available to user.
220 if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
221 upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1);
222 } else if (select_smoothing == DIRICHLET_SMOOTHING) {
223 upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing1);
224 } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
225 upper_bound = (1 + (wdf_max / (param_smoothing1 * weight_collection))) *
226 (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
227 } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
228 upper_bound = param_smoothing1 * weight_collection + 1;
229 } else {
230 upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * weight_collection));
233 /* Since weight are calculated using log trick, using same with the bounds. Refer
234 * comment in get_sumpart for the details.
236 double product = upper_bound * param_log;
237 return (product > 1.0) ? factor * log(product) : 1.0;
241 /* The extra weight component in the Dir+ formula is :-
243 * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
245 * where, |Q| is total query length.
246 * |D| is total document length.
248 double
249 LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
251 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
252 double extra_weight = param_smoothing1 / (len + param_smoothing1);
253 return get_query_length() * log(extra_weight);
255 return 0;
258 double
259 LMWeight::get_maxextra() const
261 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
262 double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1);
263 return get_query_length() * log(extra_weight);
265 return 0;
268 static bool
269 type_smoothing_param(const char ** p, Xapian::Weight::type_smoothing * ptr_val)
271 char *end;
272 errno = 0;
273 int v = strtol(*p, &end, 10);
274 if (*p == end || errno || v < 1 || v > 5)
275 return false;
276 *p = end;
277 static const Xapian::Weight::type_smoothing smooth_tab[5] = {
278 Xapian::Weight::TWO_STAGE_SMOOTHING,
279 Xapian::Weight::DIRICHLET_SMOOTHING,
280 Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING,
281 Xapian::Weight::JELINEK_MERCER_SMOOTHING,
282 Xapian::Weight::DIRICHLET_PLUS_SMOOTHING
284 *ptr_val = smooth_tab[v - 1];
285 return true;
288 LMWeight *
289 LMWeight::create_from_parameters(const char * p) const
291 if (*p == '\0')
292 return new Xapian::LMWeight();
293 double param_log_ = 0;
294 Xapian::Weight::type_smoothing type = Xapian::Weight::TWO_STAGE_SMOOTHING;
295 double smoothing1 = 0.7;
296 double smoothing2 = 2000;
297 if (!Xapian::Weight::Internal::double_param(&p, &param_log_))
298 Xapian::Weight::Internal::parameter_error("Parameter 1 (log) is invalid", "lm");
299 if (*p && !type_smoothing_param(&p, &type))
300 Xapian::Weight::Internal::parameter_error("Parameter 2 (smoothing_type) is invalid", "lm");
301 if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing1))
302 Xapian::Weight::Internal::parameter_error("Parameter 3 (smoothing1) is invalid", "lm");
303 if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing2))
304 Xapian::Weight::Internal::parameter_error("Parameter 4 (smoothing2) is invalid", "lm");
305 if (*p)
306 Xapian::Weight::Internal::parameter_error("Extra data after parameter 4", "lm");
307 return new Xapian::LMWeight(param_log_, type, smoothing1, smoothing2);