Add test coverage for $set{weighting,...}
[xapian.git] / xapian-core / weight / tradweight.cc
blob81ac54425dcba2dc4553c2e7d665a505e5ca6098
1 /** @file
2 * @brief Xapian::TradWeight class - the "traditional" probabilistic formula
3 */
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2017,2024 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xapian/weight.h"
24 #include "weightinternal.h"
26 #include "debuglog.h"
27 #include "omassert.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 #include <algorithm>
33 #include <cmath>
35 using namespace std;
37 namespace Xapian {
39 TradWeight *
40 TradWeight::clone() const
42 return new TradWeight(param_k);
45 void
46 TradWeight::init(double factor)
48 if (factor == 0.0) {
49 // This object is for the term-independent contribution, and that's
50 // always zero for this scheme.
51 return;
54 Xapian::doccount tf = get_termfreq();
56 double tw = 0;
57 if (get_rset_size() != 0) {
58 Xapian::doccount reltermfreq = get_reltermfreq();
60 // There can't be more relevant documents indexed by a term than there
61 // are documents indexed by that term.
62 AssertRel(reltermfreq,<=,tf);
64 // There can't be more relevant documents indexed by a term than there
65 // are relevant documents.
66 AssertRel(reltermfreq,<=,get_rset_size());
68 Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
70 // There can't be more relevant documents not indexed by a term than
71 // there are documents not indexed by that term.
72 AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
74 Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
76 Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
77 double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
78 double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
79 tw = numerator / denom;
80 } else {
81 tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
84 AssertRel(tw,>,0);
86 // The "official" formula can give a negative termweight in unusual cases
87 // (without an RSet, when a term indexes more than half the documents in
88 // the database). These negative weights aren't actually helpful, and it
89 // is common for implementations to replace them with a small positive
90 // weight or similar.
92 // Truncating to zero doesn't seem a great approach in practice as it
93 // means that some terms in the query can have no effect at all on the
94 // ranking, and that some results can have zero weight, both of which
95 // are seem surprising.
97 // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
98 // more than a third of documents, which seems rather "intrusive". That's
99 // what the code currently enabled does, but perhaps it would be better to
100 // do something else. (FIXME)
101 #if 0
102 if (rare(tw <= 1.0)) {
103 termweight = 0;
104 } else {
105 termweight = log(tw) * factor;
107 #else
108 if (tw < 2) tw = tw * 0.5 + 1;
109 termweight = log(tw) * factor;
110 #endif
112 LOGVALUE(WTCALC, termweight);
114 if (param_k == 0) {
115 // If param_k is 0 then the document length doesn't affect the weight.
116 len_factor = 0;
117 } else {
118 len_factor = get_average_length();
119 // len_factor can be zero if all documents are empty (or the database is
120 // empty!)
121 if (len_factor != 0) len_factor = param_k / len_factor;
124 LOGVALUE(WTCALC, len_factor);
127 string
128 TradWeight::name() const
130 return "trad";
133 string
134 TradWeight::serialise() const
136 return serialise_double(param_k);
139 TradWeight *
140 TradWeight::unserialise(const string & s) const
142 const char *ptr = s.data();
143 const char *end = ptr + s.size();
144 double k = unserialise_double(&ptr, end);
145 if (rare(ptr != end))
146 throw Xapian::SerialisationError("Extra data in TradWeight::unserialise()");
147 return new TradWeight(k);
150 double
151 TradWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
152 Xapian::termcount, Xapian::termcount) const
154 double wdf_double = wdf;
155 return termweight * (wdf_double / (len * len_factor + wdf_double));
158 double
159 TradWeight::get_maxpart() const
161 double wdf_max = get_wdf_upper_bound();
162 Xapian::termcount doclen_lb = get_doclength_lower_bound();
163 return termweight * (wdf_max / (doclen_lb * len_factor + wdf_max));
166 static inline void
167 parameter_error(const char* message, const char* params)
169 Xapian::Weight::Internal::parameter_error(message, "trad", params);
172 TradWeight*
173 TradWeight::create_from_parameters(const char* params) const
175 const char* p = params;
176 if (*p == '\0')
177 return new Xapian::TradWeight();
178 double k = 1.0;
179 if (!Xapian::Weight::Internal::double_param(&p, &k))
180 parameter_error("Parameter is invalid", params);
181 if (*p)
182 parameter_error("Extra data after parameter", params);
183 return new Xapian::TradWeight(k);