[ci] Try to get .gitignore check to work
[xapian.git] / xapian-core / weight / pl2weight.cc
blobb835ae90ee2e103c280eae62e9ba7c8a1ec1c172
1 /** @file pl2weight.cc
2 * @brief Xapian::PL2Weight class - the PL2 weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013 Aarsh Shah
5 * Copyright (C) 2013,2014,2016 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
25 #include "common/log2.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 #include <algorithm>
34 using namespace std;
36 namespace Xapian {
38 PL2Weight::PL2Weight(double c) : param_c(c)
40 if (param_c <= 0)
41 throw Xapian::InvalidArgumentError("Parameter c is invalid.");
42 need_stat(AVERAGE_LENGTH);
43 need_stat(DOC_LENGTH);
44 need_stat(DOC_LENGTH_MIN);
45 need_stat(DOC_LENGTH_MAX);
46 need_stat(COLLECTION_SIZE);
47 need_stat(COLLECTION_FREQ);
48 need_stat(WDF);
49 need_stat(WDF_MAX);
50 need_stat(WQF);
53 PL2Weight *
54 PL2Weight::clone() const
56 return new PL2Weight(param_c);
59 void
60 PL2Weight::init(double factor_)
62 if (factor_ == 0.0) {
63 // This object is for the term-independent contribution, and that's
64 // always zero for this scheme.
65 return;
68 factor = factor_;
70 if (get_wdf_upper_bound() == 0) {
71 // The "extra" weight object is cloned, init() called and then
72 // get_maxextra() is called and we discover that we don't need it.
73 // So we need to handle that case (which will give us 0 from
74 // get_wdf_upper_bound() here).
75 upper_bound = 0;
76 return;
79 factor *= get_wqf();
81 cl = param_c * get_average_length();
83 double base_change(1.0 / log(2.0));
84 double mean = double(get_collection_freq()) / get_collection_size();
85 P1 = mean * base_change + 0.5 * log2(2.0 * M_PI);
86 P2 = log2(mean) + base_change;
88 double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
89 double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound());
90 double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
92 // Calculate an upper bound on the weights which get_sumpart() can return.
94 // We consider the equation for P as the sum of two parts which we
95 // maximise individually:
97 // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
98 // (b) (P1 - P2 * wdfn) / (wdfn + 1)
100 // To maximise (a), the fractional part is always positive (since wdfn>0)
101 // and is maximised by maximising wdfn - clearer when rewritten as:
102 // (1 - 0.5 / (wdfn + 1))
104 // The log part of (a) is clearly also maximised by maximising wdfn,
105 // so we want to evaluate (a) at wdfn=wdfn_upper.
106 double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
107 // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
109 // (P1 + P2)/x - P2
111 // Differentiating wrt x gives:
113 // -(P1 + P2)/x²
115 // So there are no local minima or maxima, and the function is continuous
116 // in the range of interest, so the sign of this differential tells us
117 // whether we want to maximise or minimise wdfn, and since x>1, we can
118 // just consider the sign of: (P1 + P2)
120 // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
121 // giving us a bound that can't be bettered if wdfn_upper is tight.
122 double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
123 double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
124 upper_bound = factor * (P_max2a + P_max2b);
126 if (rare(upper_bound <= 0)) upper_bound = 0;
129 string
130 PL2Weight::name() const
132 return "Xapian::PL2Weight";
135 string
136 PL2Weight::short_name() const
138 return "pl2";
141 string
142 PL2Weight::serialise() const
144 return serialise_double(param_c);
147 PL2Weight *
148 PL2Weight::unserialise(const string & s) const
150 const char *ptr = s.data();
151 const char *end = ptr + s.size();
152 double c = unserialise_double(&ptr, end);
153 if (rare(ptr != end))
154 throw Xapian::SerialisationError("Extra data in PL2Weight::unserialise()");
155 return new PL2Weight(c);
158 double
159 PL2Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
160 Xapian::termcount) const
162 if (wdf == 0) return 0.0;
164 double wdfn = wdf * log2(1 + cl / len);
166 double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
167 if (rare(P <= 0)) return 0.0;
169 return factor * P / (wdfn + 1.0);
172 double
173 PL2Weight::get_maxpart() const
175 return upper_bound;
178 double
179 PL2Weight::get_sumextra(Xapian::termcount, Xapian::termcount) const
181 return 0;
184 double
185 PL2Weight::get_maxextra() const
187 return 0;
190 PL2Weight *
191 PL2Weight::create_from_parameters(const char * p) const
193 if (*p == '\0')
194 return new Xapian::PL2Weight();
195 double k = 1.0;
196 if (!Xapian::Weight::Internal::double_param(&p, &k))
197 Xapian::Weight::Internal::parameter_error("Parameter is invalid", "pl2");
198 if (*p)
199 Xapian::Weight::Internal::parameter_error("Extra data after parameter", "pl2");
200 return new Xapian::PL2Weight(k);