[java] Improve build on FreeBSD and DragonFlyBSD
[xapian.git] / xapian-core / weight / bm25weight.cc
blob58591a91fa91eb91a6d7a94f9e863167e1f5b336
1 /** @file
2 * @brief Xapian::BM25Weight class - the BM25 probabilistic formula
3 */
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2024 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xapian/weight.h"
24 #include "weightinternal.h"
26 #include "debuglog.h"
27 #include "omassert.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 #include <algorithm>
33 #include <cmath>
35 using namespace std;
37 namespace Xapian {
39 BM25Weight *
40 BM25Weight::clone() const
42 return new BM25Weight(param_k1, param_k2, param_k3, param_b,
43 param_min_normlen);
46 void
47 BM25Weight::init(double factor)
49 Xapian::doccount tf = get_termfreq();
51 double tw = 0;
52 if (get_rset_size() != 0) {
53 Xapian::doccount reltermfreq = get_reltermfreq();
55 // There can't be more relevant documents indexed by a term than there
56 // are documents indexed by that term.
57 AssertRel(reltermfreq,<=,tf);
59 // There can't be more relevant documents indexed by a term than there
60 // are relevant documents.
61 AssertRel(reltermfreq,<=,get_rset_size());
63 Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
65 // There can't be more relevant documents not indexed by a term than
66 // there are documents not indexed by that term.
67 AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
69 Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
71 Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
72 double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
73 double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
74 tw = numerator / denom;
75 } else {
76 tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
79 AssertRel(tw,>,0);
81 // The "official" formula can give a negative termweight in unusual cases
82 // (without an RSet, when a term indexes more than half the documents in
83 // the database). These negative weights aren't actually helpful, and it
84 // is common for implementations to replace them with a small positive
85 // weight or similar.
87 // Truncating to zero doesn't seem a great approach in practice as it
88 // means that some terms in the query can have no effect at all on the
89 // ranking, and that some results can have zero weight, both of which
90 // are seem surprising.
92 // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
93 // more than a third of documents, which seems rather "intrusive". That's
94 // what the code currently enabled does, but perhaps it would be better to
95 // do something else. (FIXME)
96 #if 0
97 if (rare(tw <= 1.0)) {
98 termweight = 0;
99 } else {
100 termweight = log(tw) * factor;
101 if (param_k3 != 0) {
102 double wqf_double = get_wqf();
103 termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
106 #else
107 if (tw < 2) tw = tw * 0.5 + 1;
108 termweight = log(tw) * factor;
109 if (param_k3 != 0) {
110 double wqf_double = get_wqf();
111 termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
113 #endif
114 termweight *= (param_k1 + 1);
116 LOGVALUE(WTCALC, termweight);
118 if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
119 // If k2 is 0, and either param_b or param_k1 is 0 then the document
120 // length doesn't affect the weight.
121 len_factor = 0;
122 } else {
123 len_factor = get_average_length();
124 // len_factor can be zero if all documents are empty (or the database
125 // is empty!)
126 if (len_factor != 0) len_factor = 1 / len_factor;
129 LOGVALUE(WTCALC, len_factor);
132 string
133 BM25Weight::name() const
135 return "bm25";
138 string
139 BM25Weight::serialise() const
141 string result = serialise_double(param_k1);
142 result += serialise_double(param_k2);
143 result += serialise_double(param_k3);
144 result += serialise_double(param_b);
145 result += serialise_double(param_min_normlen);
146 return result;
149 BM25Weight *
150 BM25Weight::unserialise(const string & s) const
152 const char *ptr = s.data();
153 const char *end = ptr + s.size();
154 double k1 = unserialise_double(&ptr, end);
155 double k2 = unserialise_double(&ptr, end);
156 double k3 = unserialise_double(&ptr, end);
157 double b = unserialise_double(&ptr, end);
158 double min_normlen = unserialise_double(&ptr, end);
159 if (rare(ptr != end))
160 throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
161 return new BM25Weight(k1, k2, k3, b, min_normlen);
164 double
165 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
166 Xapian::termcount, Xapian::termcount) const
168 LOGCALL(WTCALC, double, "BM25Weight::get_sumpart", wdf | len);
169 Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
171 double wdf_double = wdf;
172 double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
173 AssertRel(denom,>,0);
174 RETURN(termweight * (wdf_double / denom));
177 double
178 BM25Weight::get_maxpart() const
180 LOGCALL(WTCALC, double, "BM25Weight::get_maxpart", NO_ARGS);
181 double denom = param_k1;
182 Xapian::termcount wdf_max = get_wdf_upper_bound();
183 if (param_k1 != 0.0) {
184 if (param_b != 0.0) {
185 // "Upper-bound Approximations for Dynamic Pruning" Craig
186 // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
187 // Information Systems. 29(4), 2011 shows that evaluating at
188 // doclen=wdf_max is a good bound.
190 // However, we can do better if doclen_min > wdf_max since then a
191 // better bound can be found by simply evaluating at
192 // doclen=doclen_min and wdf=wdf_max.
193 Xapian::doclength normlen_lb =
194 max(max(wdf_max, get_doclength_lower_bound()) * len_factor,
195 param_min_normlen);
196 denom *= (normlen_lb * param_b + (1 - param_b));
199 denom += wdf_max;
200 AssertRel(denom,>,0);
201 RETURN(termweight * (wdf_max / denom));
204 /* The BM25 formula gives:
206 * param_k2 * query_length * (1 - normlen) / (1 + normlen)
208 * To avoid negative sumextra we add the constant (param_k2 * query_length)
209 * to give:
211 * 2 * param_k2 * query_length / (1 + normlen)
213 double
214 BM25Weight::get_sumextra(Xapian::termcount len,
215 Xapian::termcount,
216 Xapian::termcount) const
218 LOGCALL(WTCALC, double, "BM25Weight::get_sumextra", len);
219 double num = (2.0 * param_k2 * get_query_length());
220 RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
223 double
224 BM25Weight::get_maxextra() const
226 LOGCALL(WTCALC, double, "BM25Weight::get_maxextra", NO_ARGS);
227 if (param_k2 == 0.0)
228 RETURN(0.0);
229 double num = (2.0 * param_k2 * get_query_length());
230 RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
231 param_min_normlen)));
234 [[noreturn]]
235 static inline void
236 parameter_error(const char* message, const char* params)
238 Xapian::Weight::Internal::parameter_error(message, "bm25", params);
241 BM25Weight*
242 BM25Weight::create_from_parameters(const char* params) const
244 const char* p = params;
245 if (*p == '\0')
246 return new Xapian::BM25Weight();
247 double k1 = 1;
248 double k2 = 0;
249 double k3 = 1;
250 double b = 0.5;
251 double min_normlen = 0.5;
252 if (!Xapian::Weight::Internal::double_param(&p, &k1))
253 parameter_error("Parameter 1 (k1) is invalid", params);
254 if (*p && !Xapian::Weight::Internal::double_param(&p, &k2))
255 parameter_error("Parameter 2 (k2) is invalid", params);
256 if (*p && !Xapian::Weight::Internal::double_param(&p, &k3))
257 parameter_error("Parameter 3 (k3) is invalid", params);
258 if (*p && !Xapian::Weight::Internal::double_param(&p, &b))
259 parameter_error("Parameter 4 (b) is invalid", params);
260 if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
261 parameter_error("Parameter 5 (min_normlen) is invalid", params);
262 if (*p)
263 parameter_error("Extra data after parameter 5", params);
264 return new Xapian::BM25Weight(k1, k2, k3, b, min_normlen);