[honey] Remove incorrect assertion
[xapian.git] / xapian-core / weight / dphweight.cc
blob4ebf0c08e26c3921cc383c5e31d2e707dc102914
1 /** @file dphweight.cc
2 * @brief Xapian::DPHWeight class - The DPH weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5 * Copyright (C) 2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
26 #include "xapian/error.h"
27 #include "common/log2.h"
28 #include <algorithm>
29 #include <cmath>
31 using namespace std;
33 namespace Xapian {
35 DPHWeight *
36 DPHWeight::clone() const
38 return new DPHWeight();
41 void
42 DPHWeight::init(double factor)
44 if (factor == 0.0) {
45 // This object is for the term-independent contribution, and that's
46 // always zero for this scheme.
47 return;
50 double F = get_collection_freq();
51 double N = get_collection_size();
52 double wdf_lower = 1.0;
53 double wdf_upper = get_wdf_upper_bound();
55 double len_upper = get_doclength_upper_bound();
57 double min_wdf_to_len = wdf_lower / len_upper;
59 if (wdf_upper == 0) {
60 upper_bound = 0.0;
61 return;
64 /* Calculate constant value to be used in get_sumpart(). */
65 log_constant = get_average_length() * N / F;
66 wqf_product_factor = get_wqf() * factor;
68 // Calculate the upper bound on the weight.
70 /* Calculations to decide the values to be used for calculating upper bound. */
71 /* The upper bound of the term appearing in the second log is obtained
72 by taking the minimum and maximum wdf value in the formula as shown. */
73 double max_product_1 = wdf_upper * (1.0 - min_wdf_to_len);
74 /* A second upper bound of the term can be obtained by plugging in the
75 upper bound of the length and differentiating the term w.r.t wdf
76 to find the value of wdf at which function attains maximum value. */
77 double wdf_var = min(wdf_upper, len_upper / 2.0);
78 double max_product_2 = wdf_var * (1.0 - wdf_var / len_upper);
79 /* Take the minimum of the two upper bounds. */
80 double max_product = min(max_product_1, max_product_2);
82 // Maximization of the product of wdf and normalized wdf.
83 /* The expression is (wdf * (1.0 - wdf / len) * (1.0 - wdf / len)) /
84 (wdf + 1.0). */
85 /* Now, assuming len to be len_upper for the purpose of maximization,
86 (d)/(dx) (x * (1 - x / c) * (1 - x / c)) / (x+1) =
87 ((c - x) * (c - x * (2 * x + 3))) / (c ^ 2 * (x + 1) ^ 2)
88 Thus, if (c - x * (2 * x + 3)) is positive, the differentiation
89 value will be positive and hence the function will be an
90 increasing function. By finding the positive root of the equation
91 2 * x ^ 2 + 3 * x - c = 0, we get the value of x(wdf)
92 at which the differentiation value turns to negative from positive,
93 and hence, the function will have maximum value for that value of wdf. */
94 double wdf_root = 0.25 * (sqrt(8.0 * len_upper + 9.0) - 3.0);
96 // If wdf_root outside valid range, use nearest value in range.
97 if (wdf_root > wdf_upper) {
98 wdf_root = wdf_upper;
99 } else if (wdf_root < wdf_lower) {
100 wdf_root = wdf_lower;
103 double max_wdf_product_normalization = wdf_root / (wdf_root + 1) *
104 pow((1 - wdf_root / len_upper), 2.0);
106 double max_weight = max_wdf_product_normalization *
107 (log2(log_constant) + (0.5 * log2(2 * M_PI * max_product)));
109 upper_bound = wqf_product_factor * max_weight;
110 if (rare(upper_bound < 0.0)) upper_bound = 0.0;
113 string
114 DPHWeight::name() const
116 return "Xapian::DPHWeight";
119 string
120 DPHWeight::short_name() const
122 return "dph";
125 string
126 DPHWeight::serialise() const
128 return string();
131 DPHWeight *
132 DPHWeight::unserialise(const string& s) const
134 if (rare(!s.empty()))
135 throw Xapian::SerialisationError("Extra data in DPHWeight::unserialise()");
136 return new DPHWeight();
139 double
140 DPHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
141 Xapian::termcount) const
143 if (wdf == 0 || wdf == len) return 0.0;
145 double wdf_to_len = double(wdf) / len;
147 double normalization = pow((1 - wdf_to_len), 2) / (wdf + 1);
149 double wt = normalization *
150 (wdf * log2(wdf_to_len * log_constant) +
151 (0.5 * log2(2 * M_PI * wdf * (1 - wdf_to_len))));
152 if (rare(wt <= 0.0)) return 0.0;
154 return wqf_product_factor * wt;
157 double
158 DPHWeight::get_maxpart() const
160 return upper_bound;
163 double
164 DPHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
166 return 0;
169 double
170 DPHWeight::get_maxextra() const
172 return 0;
175 DPHWeight *
176 DPHWeight::create_from_parameters(const char * p) const
178 if (*p != '\0')
179 throw InvalidArgumentError("No parameters are required for DPHWeight");
180 return new Xapian::DPHWeight();