2 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
4 /* Copyright (C) 2012 Gaurav Arora
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "xapian/weight.h"
27 #include "serialise-double.h"
29 #include "xapian/error.h"
38 LMWeight::clone() const {
39 return new LMWeight(param_log
, select_smoothing
, param_smoothing1
, param_smoothing2
);
43 LMWeight::init(double)
45 // Storing collection frequency of current term in collection_freq to be
46 // accessed while smoothing of weights for the term, for term not present
48 double collection_freq
= get_collection_freq();
50 // Collection_freq of a term in collection should be always greater than or
51 // equal to zero (Non Negative).
52 AssertRel(collection_freq
,>=,0);
53 LOGVALUE(WTCALC
, collection_freq
);
55 // calculating approximate number of total terms in the collection to be
56 // accessed for smoothing of the document.
57 double total_collection_term
= get_collection_size() * get_average_length();
59 /* In case the within document frequency of term is zero smoothing will
60 * be required and should be return instead of returning zero, as returning
61 * LM score are multiplication of contribution of all terms, due to absence
62 * of single term whole document is scored zero, hence apply collection
63 * frequency smoothing.
65 weight_collection
= double(collection_freq
) / total_collection_term
;
67 // Total term should be greater than zero as there would be at least one
68 // document in collection.
69 AssertRel(total_collection_term
,>,0);
70 LOGVALUE(WTCALC
, total_collection_term
);
72 // There can't be more relevant term in collection than total number of
74 AssertRel(collection_freq
,<=,total_collection_term
);
76 /* Setting default values of the param_log to handle negative value of log.
77 * It is considered to be upperbound of document length.
78 * initializing param_log to upperbound of document_length.
81 if (param_log
== 0.0) {
82 param_log
= get_doclength_upper_bound();
85 /* Since the optimal parameter for Jelinek mercer smoothing
86 * is based on query length, so if query is title query changing
87 * default value of smoothing parameter.
90 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
||
91 select_smoothing
== TWO_STAGE_SMOOTHING
) {
92 if (param_smoothing1
== 0.7) {
93 if (get_query_length() <= 2) {
94 param_smoothing1
= 0.1;
99 /* param_smoothing1 default value should be 2000 in case
100 * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
101 * if user supply his own value for param_smoothing1 value will not be set
102 * to 2000(default value)
104 if (select_smoothing
== DIRICHLET_SMOOTHING
) {
105 if (param_smoothing1
== 0.7) {
106 param_smoothing1
= 2000;
112 LMWeight::name() const
114 return "Xapian::LMWeight";
118 LMWeight::serialise() const
120 string result
= serialise_double(param_log
);
121 result
+= static_cast<unsigned char>(select_smoothing
);
122 result
+= serialise_double(param_smoothing1
);
123 result
+= serialise_double(param_smoothing2
);
129 LMWeight::unserialise(const string
& s
) const
131 const char *ptr
= s
.data();
132 const char *end
= ptr
+ s
.size();
133 double param_log_
= unserialise_double(&ptr
,end
);
134 type_smoothing select_smoothing_
= static_cast<type_smoothing
>(*(ptr
)++);
135 double param_smoothing1_
= unserialise_double(&ptr
, end
);
136 double param_smoothing2_
= unserialise_double(&ptr
, end
);
138 throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
139 return new LMWeight(param_log_
, select_smoothing_
, param_smoothing1_
, param_smoothing2_
);
143 LMWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
144 Xapian::termcount uniqterm
) const
146 // Within Document Frequency of the term in document being considered.
147 double wdf_double
= wdf
;
148 // Length of the Document in terms of number of terms.
149 double len_double
= len
;
150 // variable to store weight contribution of term in the document scoring for LM.
153 // Calculating weights considering different smoothing option available to user.
154 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
155 /* Maximum likelihood of current term, weight contribution of term in
156 * case query term is present in the document.
158 double weight_document
= wdf_double
/ len_double
;
159 weight_sum
= (param_smoothing1
* weight_collection
) +
160 ((1 - param_smoothing1
) * weight_document
);
161 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
162 weight_sum
= (wdf_double
+ (param_smoothing1
* weight_collection
)) /
163 (len_double
+ param_smoothing1
);
164 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
165 double uniqterm_double
= uniqterm
;
166 weight_sum
= ((((wdf_double
- param_smoothing1
) > 0) ? (wdf_double
- param_smoothing1
) : 0) / len_double
) + ((param_smoothing1
* weight_collection
* uniqterm_double
) / len_double
);
168 weight_sum
= (((1 - param_smoothing1
) * (wdf_double
+ (param_smoothing2
* weight_collection
)) / (len_double
+ param_smoothing2
)) + (param_smoothing1
* weight_collection
));
171 /* Since LM score is calculated with multiplication, instead of changing
172 * the current implementation log trick have been used to calculate the
173 * product since (sum of log is log of product and since aim is ranking
174 * ranking document by product or log of product won't make a large
175 * difference hence log(product) will be used for ranking.
177 return (weight_sum
* param_log
> 1.0) ? log(weight_sum
* param_log
) : 0;
181 LMWeight::get_maxpart() const
183 // Variable to store the collection frequency
186 // Calculating upper bound considering different smoothing option available to user.
187 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
188 upper_bound
= (param_smoothing1
* weight_collection
) + (1 - param_smoothing1
);
189 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
190 upper_bound
= (get_doclength_upper_bound() + (param_smoothing1
* weight_collection
)) / (get_doclength_upper_bound() + param_smoothing1
);
191 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
192 upper_bound
= param_smoothing1
* weight_collection
+ 1;
194 upper_bound
= (((1 - param_smoothing1
) * (get_doclength_upper_bound() + (param_smoothing2
* weight_collection
)) / (get_doclength_upper_bound() + param_smoothing2
)) + (param_smoothing1
* weight_collection
));
197 /* Since weight are calculated using log trick, using same with the bounds. Refer
198 * comment in get_sumpart for the details.
200 return (upper_bound
* param_log
> 1.0) ? log(upper_bound
* param_log
) : 1.0;
204 LMWeight::get_sumextra(Xapian::termcount
, Xapian::termcount
) const
210 LMWeight::get_maxextra() const