Add Weight::create() and Weight::create_from_parameters()
[xapian.git] / xapian-core / weight / dlhweight.cc
blob6f879a5aba94f77c08bd7bdbc6e4c9fb6cbcfca8
1 /** @file dlhweight.cc
2 * @brief Xapian::DLHWeight class - The DLH weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5 * Copyright (C) 2016 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
26 #include "xapian/error.h"
27 #include "common/log2.h"
28 #include <algorithm>
30 using namespace std;
32 namespace Xapian {
34 DLHWeight *
35 DLHWeight::clone() const
37 return new DLHWeight();
40 void
41 DLHWeight::init(double factor)
43 double wdf_upper = get_wdf_upper_bound();
44 if (wdf_upper == 0) {
45 upper_bound = 0.0;
46 return;
49 const double wdf_lower = 1.0;
50 double len_upper = get_doclength_upper_bound();
51 double len_lower = get_doclength_lower_bound();
53 double N = get_collection_size();
54 double F = get_collection_freq();
56 // Calculate constant values to be used in get_sumpart().
57 log_constant = get_average_length() * N / F;
58 wqf_product_factor = get_wqf() * factor;
60 // Calculate values for the upper bound.
62 // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
63 double max_wdf_over_l = wdf_upper < len_lower ? wdf_upper / len_lower : 1.0;
65 // First term A: w/(w+.5)*log2(w/l*L) where L=l_avg.N/coll_freq
66 // Assume log >= 0:
67 // w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
68 // log2(w/l*L) maximised when w/l maximised
69 // so max A at w=w_max, l=max(l_min, w_max)
70 // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
71 // factor outside the log.
72 double logged_expr = max_wdf_over_l * log_constant;
73 double w_for_A = logged_expr > 1.0 ? wdf_upper : wdf_lower;
74 double A = w_for_A / (w_for_A + 0.5) * log2(logged_expr);
76 // Second term B:
78 // (l-w)*log2(1-w/l)
80 // The log is negative, and w <= l so B <= 0 and its maximum is the value
81 // as close to zero as possible. So smaller (l-w) is better and smaller
82 // w/l is better.
84 // This function is ill defined at w=l, but -> 0 as w -> l.
86 // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
87 double B = 0;
88 if (len_lower > wdf_upper) {
89 // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
90 // and l=len_lower).
92 // The function is also 0 at w = 0 (there must be a local mimina at
93 // some value of w between 0 and l), so the other candidate is at
94 // w=wdf_lower.
96 // We need to find the optimum value of l in this case, so
97 // differentiate the formula by l:
99 // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
100 // = (log(1-w/l) + (1-w/l)²)/log(2)
101 // = (log(x) + x²)/log(2) [x=1-w/l]
103 // which is 0 at approx x=0.65291864
105 // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
107 // but l >= w so we can't attain that (and the log isn't valid there).
109 // Gradient at (without loss of generality) l=2*w is:
110 // (log(0.5) + 0.25)/log(2)
111 // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
112 // candidate is w=wdf_lower and l=len_lower.
114 // So evaluate both candidates and pick the larger:
115 double B1 = (len_lower - wdf_lower) * log2(1.0 - wdf_lower / len_lower);
116 double B2 = (len_lower - wdf_upper) * log2(1.0 - wdf_upper / len_lower);
117 B = max(B1, B2);
120 /* An upper bound of the term used in the third log can be obtained by:
122 * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
124 * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
125 * log is a monotonically increasing function on positive numbers) can
126 * be obtained by plugging in the upper bound of the length and
127 * differentiating the term w.r.t wdf which gives the value of wdf at which
128 * the function attains maximum value - at wdf = len_upper / 2 (or if the
129 * wdf can't be that large, at wdf = wdf_upper): */
130 double wdf_var = min(wdf_upper, len_upper / 2.0);
131 double max_product = wdf_var * (1.0 - wdf_var / len_upper);
132 #if 0
133 /* An upper bound can also be obtained by taking the minimum and maximum
134 * wdf value in the formula as shown (which isn't useful now as it's always
135 * >= the bound above, but could be useful if we tracked bounds on wdf/len):
137 double min_wdf_to_len = wdf_lower / len_upper;
138 double max_product_2 = wdf_upper * (1.0 - min_wdf_to_len);
139 /* Take the minimum of the two upper bounds. */
140 max_product = min(max_product, max_product_2);
141 #endif
142 double C = 0.5 * log2(2.0 * M_PI * max_product) / (wdf_lower + 0.5);
143 upper_bound = A + B + C;
145 if (rare(upper_bound < 0.0))
146 upper_bound = 0.0;
147 else
148 upper_bound *= wqf_product_factor;
151 string
152 DLHWeight::name() const
154 return "Xapian::DLHWeight";
157 string
158 DLHWeight::short_name() const
160 return "dlh";
163 string
164 DLHWeight::serialise() const
166 return string();
169 DLHWeight *
170 DLHWeight::unserialise(const string& s) const
172 if (rare(!s.empty()))
173 throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
174 return new DLHWeight();
177 double
178 DLHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
179 Xapian::termcount) const
181 if (wdf == 0 || wdf == len) return 0.0;
183 double wdf_to_len = double(wdf) / len;
184 double one_minus_wdf_to_len = 1.0 - wdf_to_len;
186 double wt = wdf * log2(wdf_to_len * log_constant) +
187 (len - wdf) * log2(one_minus_wdf_to_len) +
188 0.5 * log2(2.0 * M_PI * wdf * one_minus_wdf_to_len);
189 if (rare(wt <= 0.0)) return 0.0;
191 return wqf_product_factor * wt / (wdf + 0.5);
194 double
195 DLHWeight::get_maxpart() const
197 return upper_bound;
200 double
201 DLHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
203 return 0;
206 double
207 DLHWeight::get_maxextra() const
209 return 0;
212 DLHWeight *
213 DLHWeight::create_from_parameters(const char * p) const
215 if (*p != '\0')
216 throw InvalidArgumentError("No parameters are required for DLHWeight");
217 return new Xapian::DLHWeight();