lcd_clusterer.cc: Use 'auto' for loop variable
[xapian.git] / xapian-core / weight / bb2weight.cc
blobc6c4620f440cb58426d6bfc2665760ade637c74a
1 /** @file bb2weight.cc
2 * @brief Xapian::BB2Weight class - the BB2 weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013,2014 Aarsh Shah
5 * Copyright (C) 2014,2015,2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
25 #include "common/log2.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 using namespace std;
34 namespace Xapian {
/** Evaluate one half of the Stirling-style approximation used by BB2.
 *
 *  @param difference         Value multiplied by @a stirling_constant in the
 *                            additive term.
 *  @param y                  Value whose base-2 logarithm is subtracted from
 *                            @a stirling_constant; callers clamp their inputs
 *                            so that this logarithm is well defined.
 *  @param stirling_constant  Precomputed logarithmic constant.
 *
 *  @return (y + 0.5) * (stirling_constant - log2(y)) +
 *          difference * stirling_constant
 */
static double stirling_value(double difference, double y, double stirling_constant)
{
    const double log_gap = stirling_constant - log2(y);
    const double scaled_gap = (y + 0.5) * log_gap;
    const double offset = difference * stirling_constant;
    return scaled_gap + offset;
}
41 BB2Weight::BB2Weight(double c) : param_c(c)
43 if (param_c <= 0)
44 throw Xapian::InvalidArgumentError("Parameter c is invalid.");
45 need_stat(AVERAGE_LENGTH);
46 need_stat(DOC_LENGTH);
47 need_stat(DOC_LENGTH_MIN);
48 need_stat(DOC_LENGTH_MAX);
49 need_stat(COLLECTION_SIZE);
50 need_stat(COLLECTION_FREQ);
51 need_stat(WDF);
52 need_stat(WDF_MAX);
53 need_stat(WQF);
54 need_stat(TERMFREQ);
57 BB2Weight *
58 BB2Weight::clone() const
60 return new BB2Weight(param_c);
63 void
64 BB2Weight::init(double factor)
66 if (factor == 0.0) {
67 // This object is for the term-independent contribution, and that's
68 // always zero for this scheme.
69 return;
72 double wdfn_upper = get_wdf_upper_bound();
74 if (wdfn_upper == 0) {
75 upper_bound = 0.0;
76 return;
79 c_product_avlen = param_c * get_average_length();
80 double wdfn_lower(1.0);
81 wdfn_lower *= log2(1 + c_product_avlen / get_doclength_upper_bound());
82 wdfn_upper *= log2(1 + c_product_avlen / get_doclength_lower_bound());
84 double F = get_collection_freq();
86 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
87 // stirling_value().
88 if (rare(wdfn_lower >= F - 1))
89 wdfn_upper = F - 1;
90 if (rare(wdfn_upper >= F - 1))
91 wdfn_upper = F - 1;
93 B_constant = get_wqf() * factor * (F + 1.0) / get_termfreq();
95 // Clamp N to at least 2 to avoid ill-defined log calculations in
96 // stirling_value().
97 double N = rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
99 wt = -1.0 / log(2.0) - log2(N - 1.0);
100 stirling_constant_1 = log2(N + F - 1.0);
101 stirling_constant_2 = log2(F);
103 // Maximize the Stirling value to be used in the upper bound.
104 // Calculate the individual terms keeping the maximization of Stirling value
105 // in mind.
106 double y_min = F - wdfn_upper;
107 double y_max = N + F - wdfn_lower - 2.0;
109 double stirling_max = stirling_value(wdfn_upper + 1.0, y_max,
110 stirling_constant_1) -
111 stirling_value(wdfn_lower, y_min,
112 stirling_constant_2);
114 double B_max = B_constant / (wdfn_lower + 1.0);
115 upper_bound = B_max * (wt + stirling_max);
116 if (rare(upper_bound < 0.0))
117 upper_bound = 0.0;
120 string
121 BB2Weight::name() const
123 return "Xapian::BB2Weight";
126 string
127 BB2Weight::short_name() const
129 return "bb2";
132 string
133 BB2Weight::serialise() const
135 return serialise_double(param_c);
138 BB2Weight *
139 BB2Weight::unserialise(const string & s) const
141 const char *ptr = s.data();
142 const char *end = ptr + s.size();
143 double c = unserialise_double(&ptr, end);
144 if (rare(ptr != end))
145 throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
146 return new BB2Weight(c);
149 double
150 BB2Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
151 Xapian::termcount) const
153 if (wdf == 0) return 0.0;
155 double wdfn = wdf * log2(1 + c_product_avlen / len);
157 double F = get_collection_freq();
159 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
160 // stirling_value().
161 if (rare(wdfn >= F - 1))
162 wdfn = F - 1;
164 // Clamp N to at least 2 to avoid ill-defined log calculations in
165 // stirling_value().
166 Xapian::doccount N = get_collection_size();
167 Xapian::doccount N_less_2 = rare(N <= 2) ? 0 : N - 2;
169 double y2 = F - wdfn;
170 double y1 = N_less_2 + y2;
171 double stirling = stirling_value(wdfn + 1.0, y1, stirling_constant_1) -
172 stirling_value(wdfn, y2, stirling_constant_2);
174 double B = B_constant / (wdfn + 1.0);
175 double final_weight = B * (wt + stirling);
176 if (rare(final_weight < 0.0))
177 final_weight = 0.0;
178 return final_weight;
181 double
182 BB2Weight::get_maxpart() const
184 return upper_bound;
187 double
188 BB2Weight::get_sumextra(Xapian::termcount, Xapian::termcount) const
190 return 0;
193 double
194 BB2Weight::get_maxextra() const
196 return 0;
199 BB2Weight *
200 BB2Weight::create_from_parameters(const char * p) const
202 if (*p == '\0')
203 return new Xapian::BB2Weight();
204 double k = 1.0;
205 if (!Xapian::Weight::Internal::double_param(&p, &k))
206 Xapian::Weight::Internal::parameter_error("Parameter is invalid", "bb2");
207 if (*p)
208 Xapian::Weight::Internal::parameter_error("Extra data after parameter", "bb2");
209 return new Xapian::BB2Weight(k);