/** @file lmweight.cc
 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
 */
/* Copyright (C) 2012 Gaurav Arora
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <config.h>

#include "xapian/weight.h"

#include "debuglog.h"
#include "omassert.h"
#include "serialise-double.h"

#include "xapian/error.h"

#include <cmath>

using namespace std;

namespace Xapian {

LMWeight *
LMWeight::clone() const {
    return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
}

void
LMWeight::init(double)
{
    // Store the collection frequency of the current term in collection_freq,
    // to be used when smoothing the weight of a term which isn't present in
    // the document.
    double collection_freq = get_collection_freq();

    // The collection frequency of a term should always be non-negative.
    AssertRel(collection_freq,>=,0);
    LOGVALUE(WTCALC, collection_freq);

    // Calculate the approximate total number of terms in the collection,
    // needed when smoothing the document score.
    double total_collection_term = get_collection_size() * get_average_length();
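    // For instance, a collection of 10000 documents with an average length of
    // 500 terms gives total_collection_term = 10000 * 500 = 5000000 (figures
    // here are purely illustrative, not from the source).
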
    /* If the within-document frequency of a term is zero, smoothing is
     * required, and a smoothed estimate should be returned instead of zero:
     * the LM score of a document is the product of the contributions of all
     * query terms, so a single absent term would otherwise make the whole
     * document score zero.  Hence apply collection frequency smoothing.
     */
    weight_collection = double(collection_freq) / total_collection_term;
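    // In effect this estimates the background (collection) language model
    //
    //     P(t|C) = cf(t) / (collection_size * average_length)
    //
    // so with the illustrative figures above, a term with collection
    // frequency 100 gets weight_collection = 100 / 5000000 = 2e-5.
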
    // The total number of terms should be greater than zero, as there is at
    // least one document in the collection.
    AssertRel(total_collection_term,>,0);
    LOGVALUE(WTCALC, total_collection_term);

    // A term can't occur in the collection more often than the total number
    // of terms in the collection.
    AssertRel(collection_freq,<=,total_collection_term);

    /* Set the default value of param_log, which is used to stop the log of
     * the weight going negative.  The upper bound on document length is used
     * as the default.
     */
    if (param_log == 0.0) {
        param_log = get_doclength_upper_bound();
    }

    /* The optimal parameter for Jelinek-Mercer smoothing depends on the
     * length of the query, so for a short (title-like) query the default
     * value of the smoothing parameter is changed.
     */
    if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
        select_smoothing == TWO_STAGE_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            if (get_query_length() <= 2) {
                param_smoothing1 = 0.1;
            }
        }
    }

    /* The default value of param_smoothing1 should be 2000 when
     * DIRICHLET_SMOOTHING is selected.  param_smoothing1 is only adjusted
     * here if it still holds the generic default (0.7), so a value the user
     * supplied won't be overwritten with 2000.
     */
    if (select_smoothing == DIRICHLET_SMOOTHING) {
        if (param_smoothing1 == 0.7) {
            param_smoothing1 = 2000;
        }
    }
}

string
LMWeight::name() const
{
    return "Xapian::LMWeight";
}

string
LMWeight::serialise() const
{
    string result = serialise_double(param_log);
    result += static_cast<unsigned char>(select_smoothing);
    result += serialise_double(param_smoothing1);
    result += serialise_double(param_smoothing2);

    return result;
}

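// The serialised form is therefore: param_log encoded by serialise_double(),
// then a single byte holding the smoothing selector, then the two smoothing
// parameters, each encoded by serialise_double().  unserialise() below reads
// the fields back in exactly that order.
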
LMWeight *
LMWeight::unserialise(const string & s) const
{
    const char *ptr = s.data();
    const char *end = ptr + s.size();
    double param_log_ = unserialise_double(&ptr, end);
    type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
    double param_smoothing1_ = unserialise_double(&ptr, end);
    double param_smoothing2_ = unserialise_double(&ptr, end);
    if (rare(ptr != end))
        throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
    return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
}

double
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
                      Xapian::termcount uniqterm) const
{
    // Within-document frequency of the term in the document being considered.
    double wdf_double = wdf;
    // Length of the document, measured as a number of terms.
    double len_double = len;
    // Weight contribution of the term to the document's LM score.
    double weight_sum;

    // Calculate the weight using the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        /* Maximum likelihood estimate for the current term: its weight
         * contribution when the query term is present in the document.
         */
        double weight_document = wdf_double / len_double;
        weight_sum = (param_smoothing1 * weight_collection) +
                     ((1 - param_smoothing1) * weight_document);
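        // i.e. the Jelinek-Mercer smoothed estimate
        //
        //     P(t|d) = lambda * P(t|C) + (1 - lambda) * wdf / len
        //
        // with lambda = param_smoothing1 and P(t|C) = weight_collection.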
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) /
                     (len_double + param_smoothing1);
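        // i.e. the Dirichlet prior smoothed estimate
        //
        //     P(t|d) = (wdf + mu * P(t|C)) / (len + mu)
        //
        // with mu = param_smoothing1.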
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        double uniqterm_double = uniqterm;
        weight_sum = ((((wdf_double - param_smoothing1) > 0) ?
                       (wdf_double - param_smoothing1) : 0) / len_double) +
                     ((param_smoothing1 * weight_collection * uniqterm_double) /
                      len_double);
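        // i.e. the absolute discounting estimate
        //
        //     P(t|d) = max(wdf - delta, 0) / len + delta * u * P(t|C) / len
        //
        // with delta = param_smoothing1 and u = uniqterm, the number of
        // unique terms in the document.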
    } else {
        // TWO_STAGE_SMOOTHING.
        weight_sum = ((1 - param_smoothing1) *
                      (wdf_double + (param_smoothing2 * weight_collection)) /
                      (len_double + param_smoothing2)) +
                     (param_smoothing1 * weight_collection);
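        // i.e. the two-stage smoothed estimate
        //
        //     P(t|d) = (1 - lambda) * (wdf + mu * P(t|C)) / (len + mu) +
        //              lambda * P(t|C)
        //
        // with lambda = param_smoothing1 and mu = param_smoothing2.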
    }

    /* The LM score of a document is the product of the per-term scores.
     * Rather than changing the current implementation, a log trick is used
     * to calculate the product: the sum of logs is the log of the product,
     * and since the aim is to rank documents, ranking by the product or by
     * the log of the product makes no difference.
     */
    return (weight_sum * param_log > 1.0) ? log(weight_sum * param_log) : 0;
}

double
LMWeight::get_maxpart() const
{
    // Upper bound on the weight contribution of a single term.
    double upper_bound;

    // Calculate the upper bound using the smoothing option selected by the user.
    if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
        upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1);
    } else if (select_smoothing == DIRICHLET_SMOOTHING) {
        upper_bound = (get_doclength_upper_bound() +
                       (param_smoothing1 * weight_collection)) /
                      (get_doclength_upper_bound() + param_smoothing1);
    } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
        upper_bound = param_smoothing1 * weight_collection + 1;
    } else {
        upper_bound = ((1 - param_smoothing1) *
                       (get_doclength_upper_bound() +
                        (param_smoothing2 * weight_collection)) /
                       (get_doclength_upper_bound() + param_smoothing2)) +
                      (param_smoothing1 * weight_collection);
    }

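    /* A sketch of why these are bounds: wdf <= len, so wdf / len <= 1,
     * which maximises the Jelinek-Mercer estimate; the Dirichlet and
     * two-stage estimates increase with document length, so substituting
     * the document length upper bound maximises them; and for absolute
     * discounting, max(wdf - delta, 0) / len <= 1 and uniqterm <= len
     * together give the bound delta * P(t|C) + 1.
     */
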
    /* Since the weights are calculated using the log trick, the same is
     * applied to the bounds.  See the comment in get_sumpart() for details.
     */
    return (upper_bound * param_log > 1.0) ? log(upper_bound * param_log) : 1.0;
}

double
LMWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
{
    return 0;
}

double
LMWeight::get_maxextra() const
{
    return 0;
}

}

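/* Example of selecting this scheme for a search - a sketch only; the
 * constructor argument order matches the clone() call above, but check the
 * matching xapian.h for the exact signature and defaults:
 *
 *     Xapian::Database db("/path/to/db");
 *     Xapian::Enquire enquire(db);
 *     enquire.set_query(Xapian::Query("example"));
 *     // Dirichlet smoothing with mu = 2000; param_log = 0.0 selects the
 *     // default (the document length upper bound), and param_smoothing2
 *     // is unused by this smoothing type.
 *     enquire.set_weighting_scheme(
 *         Xapian::LMWeight(0.0, Xapian::Weight::DIRICHLET_SMOOTHING,
 *                          2000.0, 0.0));
 *     Xapian::MSet mset = enquire.get_mset(0, 10);
 */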