2 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
4 /* Copyright (C) 2012 Gaurav Arora
5 * Copyright (C) 2016 Olly Betts
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "xapian/weight.h"
26 #include "weightinternal.h"
30 #include "serialise-double.h"
32 #include "xapian/error.h"
43 LMWeight::clone() const {
44 return new LMWeight(param_log
, select_smoothing
, param_smoothing1
, param_smoothing2
);
48 LMWeight::init(double factor_
)
52 // Storing collection frequency of current term in collection_freq to be
53 // accessed while smoothing of weights for the term, for term not present
55 double collection_freq
= get_collection_freq();
57 // Collection_freq of a term in collection should be always greater than or
58 // equal to zero (Non Negative).
59 AssertRel(collection_freq
,>=,0);
60 LOGVALUE(WTCALC
, collection_freq
);
62 // calculating approximate number of total terms in the collection to be
63 // accessed for smoothing of the document.
64 double total_collection_term
= get_collection_size() * get_average_length();
66 /* In case the within document frequency of term is zero smoothing will
67 * be required and should be return instead of returning zero, as returning
68 * LM score are multiplication of contribution of all terms, due to absence
69 * of single term whole document is scored zero, hence apply collection
70 * frequency smoothing.
72 weight_collection
= double(collection_freq
) / total_collection_term
;
74 // Total term should be greater than zero as there would be at least one
75 // document in collection.
76 AssertRel(total_collection_term
,>,0);
77 LOGVALUE(WTCALC
, total_collection_term
);
79 // There can't be more relevant term in collection than total number of
81 AssertRel(collection_freq
,<=,total_collection_term
);
83 /* Setting default values of the param_log to handle negative value of log.
84 * It is considered to be upperbound of document length.
85 * initializing param_log to upperbound of document_length.
88 if (param_log
== 0.0) {
89 param_log
= get_doclength_upper_bound();
92 /* Since the optimal parameter for Jelinek mercer smoothing
93 * is based on query length, so if query is title query changing
94 * default value of smoothing parameter.
97 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
||
98 select_smoothing
== TWO_STAGE_SMOOTHING
) {
99 if (param_smoothing1
== 0.7) {
100 if (get_query_length() <= 2) {
101 param_smoothing1
= 0.1;
106 /* param_smoothing1 default value should be 2000 in case
107 * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
108 * if user supply his own value for param_smoothing1 value will not be set
109 * to 2000(default value)
111 if (select_smoothing
== DIRICHLET_SMOOTHING
) {
112 if (param_smoothing1
== 0.7) {
113 param_smoothing1
= 2000;
117 /* Setting param_smoothing1 and param_smoothing2 default value to used when
118 * DIRICHLET_PLUS_SMOOTHING is selected.*/
119 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
120 if (param_smoothing1
== 0.7) {
121 param_smoothing1
= 2000;
127 LMWeight::name() const
129 return "Xapian::LMWeight";
133 LMWeight::short_name() const
139 LMWeight::serialise() const
141 string result
= serialise_double(param_log
);
142 result
+= static_cast<unsigned char>(select_smoothing
);
143 result
+= serialise_double(param_smoothing1
);
144 result
+= serialise_double(param_smoothing2
);
149 LMWeight::unserialise(const string
& s
) const
151 const char *ptr
= s
.data();
152 const char *end
= ptr
+ s
.size();
153 double param_log_
= unserialise_double(&ptr
, end
);
154 type_smoothing select_smoothing_
= static_cast<type_smoothing
>(*(ptr
)++);
155 double param_smoothing1_
= unserialise_double(&ptr
, end
);
156 double param_smoothing2_
= unserialise_double(&ptr
, end
);
157 if (rare(ptr
!= end
))
158 throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
159 return new LMWeight(param_log_
, select_smoothing_
, param_smoothing1_
, param_smoothing2_
);
163 LMWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
164 Xapian::termcount uniqterm
) const
166 // Within Document Frequency of the term in document being considered.
167 double wdf_double
= wdf
;
168 // Length of the Document in terms of number of terms.
169 double len_double
= len
;
170 // variable to store weight contribution of term in the document scoring for LM.
173 // Calculating weights considering different smoothing option available to user.
174 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
175 /* Maximum likelihood of current term, weight contribution of term in
176 * case query term is present in the document.
178 double weight_document
= wdf_double
/ len_double
;
179 weight_sum
= (param_smoothing1
* weight_collection
) +
180 ((1 - param_smoothing1
) * weight_document
);
181 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
182 weight_sum
= (wdf_double
+ (param_smoothing1
* weight_collection
)) /
183 (len_double
+ param_smoothing1
);
184 } else if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
185 /* In the Dir+ weighting formula, sumpart weight contribution is :-
187 * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and
188 * log of (1 + (delta/param_smoothing1 * weight_collection))).
189 * Since, sum of logs is log of product so weight_sum is calculated as product
190 * of terms in log in the Dir+ formula.
192 weight_sum
= (1 + (wdf_double
/ (param_smoothing1
* weight_collection
))) *
193 (1 + (param_smoothing2
/ (param_smoothing1
* weight_collection
)));
194 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
195 double uniqterm_double
= uniqterm
;
196 weight_sum
= ((((wdf_double
- param_smoothing1
) > 0) ? (wdf_double
- param_smoothing1
) : 0) / len_double
) + ((param_smoothing1
* weight_collection
* uniqterm_double
) / len_double
);
198 weight_sum
= (((1 - param_smoothing1
) * (wdf_double
+ (param_smoothing2
* weight_collection
)) / (len_double
+ param_smoothing2
)) + (param_smoothing1
* weight_collection
));
201 /* Since LM score is calculated with multiplication, instead of changing
202 * the current implementation log trick have been used to calculate the
203 * product since (sum of log is log of product and since aim is ranking
204 * ranking document by product or log of product won't make a large
205 * difference hence log(product) will be used for ranking.
207 double product
= weight_sum
* param_log
;
208 return (product
> 1.0) ? factor
* log(product
) : 0;
212 LMWeight::get_maxpart() const
214 // Variable to store the collection frequency
216 // Store upper bound on wdf in variable wdf_max
217 double wdf_max
= get_wdf_upper_bound();
219 // Calculating upper bound considering different smoothing option available to user.
220 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
221 upper_bound
= (param_smoothing1
* weight_collection
) + (1 - param_smoothing1
);
222 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
223 upper_bound
= (get_doclength_upper_bound() + (param_smoothing1
* weight_collection
)) / (get_doclength_upper_bound() + param_smoothing1
);
224 } else if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
225 upper_bound
= (1 + (wdf_max
/ (param_smoothing1
* weight_collection
))) *
226 (1 + (param_smoothing2
/ (param_smoothing1
* weight_collection
)));
227 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
228 upper_bound
= param_smoothing1
* weight_collection
+ 1;
230 upper_bound
= (((1 - param_smoothing1
) * (get_doclength_upper_bound() + (param_smoothing2
* weight_collection
)) / (get_doclength_upper_bound() + param_smoothing2
)) + (param_smoothing1
* weight_collection
));
233 /* Since weight are calculated using log trick, using same with the bounds. Refer
234 * comment in get_sumpart for the details.
236 double product
= upper_bound
* param_log
;
237 return (product
> 1.0) ? factor
* log(product
) : 1.0;
241 /* The extra weight component in the Dir+ formula is :-
243 * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
245 * where, |Q| is total query length.
246 * |D| is total document length.
249 LMWeight::get_sumextra(Xapian::termcount len
, Xapian::termcount
) const
251 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
252 double extra_weight
= param_smoothing1
/ (len
+ param_smoothing1
);
253 return get_query_length() * log(extra_weight
);
259 LMWeight::get_maxextra() const
261 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
262 double extra_weight
= param_smoothing1
/ (get_doclength_lower_bound() + param_smoothing1
);
263 return get_query_length() * log(extra_weight
);
269 type_smoothing_param(const char ** p
, Xapian::Weight::type_smoothing
* ptr_val
)
273 int v
= strtol(*p
, &end
, 10);
274 if (*p
== end
|| errno
|| v
< 1 || v
> 5)
277 static const Xapian::Weight::type_smoothing smooth_tab
[5] = {
278 Xapian::Weight::TWO_STAGE_SMOOTHING
,
279 Xapian::Weight::DIRICHLET_SMOOTHING
,
280 Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING
,
281 Xapian::Weight::JELINEK_MERCER_SMOOTHING
,
282 Xapian::Weight::DIRICHLET_PLUS_SMOOTHING
284 *ptr_val
= smooth_tab
[v
- 1];
289 LMWeight::create_from_parameters(const char * p
) const
292 return new Xapian::LMWeight();
293 double param_log_
= 0;
294 Xapian::Weight::type_smoothing type
= Xapian::Weight::TWO_STAGE_SMOOTHING
;
295 double smoothing1
= 0.7;
296 double smoothing2
= 2000;
297 if (!Xapian::Weight::Internal::double_param(&p
, ¶m_log_
))
298 Xapian::Weight::Internal::parameter_error("Parameter 1 (log) is invalid", "lm");
299 if (*p
&& !type_smoothing_param(&p
, &type
))
300 Xapian::Weight::Internal::parameter_error("Parameter 2 (smoothing_type) is invalid", "lm");
301 if (*p
&& !Xapian::Weight::Internal::double_param(&p
, &smoothing1
))
302 Xapian::Weight::Internal::parameter_error("Parameter 3 (smoothing1) is invalid", "lm");
303 if (*p
&& !Xapian::Weight::Internal::double_param(&p
, &smoothing2
))
304 Xapian::Weight::Internal::parameter_error("Parameter 4 (smoothing2) is invalid", "lm");
306 Xapian::Weight::Internal::parameter_error("Extra data after parameter 4", "lm");
307 return new Xapian::LMWeight(param_log_
, type
, smoothing1
, smoothing2
);