Support: quest -f cjk_ngram
[xapian.git] / xapian-core / weight / dphweight.cc
bloba0da1bd9d037e8958d2c750b7646f974c88fc10b
1 /** @file dphweight.cc
2 * @brief Xapian::DPHWeight class - The DPH weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013, 2014 Aarsh Shah
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xapian/weight.h"
24 #include "common/log2.h"
25 #include <algorithm>
26 #include <cmath>
28 using namespace std;
30 namespace Xapian {
32 DPHWeight *
33 DPHWeight::clone() const
35 return new DPHWeight();
38 void
39 DPHWeight::init(double factor)
41 double F = get_collection_freq();
42 double N = get_collection_size();
43 double wdf_lower = 1.0;
44 double wdf_upper = get_wdf_upper_bound();
46 double len_upper = get_doclength_upper_bound();
48 double min_wdf_to_len = wdf_lower / len_upper;
49 double min_normalization = pow(1.0 / len_upper, 2) / (wdf_upper + 1.0);
51 if (wdf_upper == 0) {
52 lower_bound = upper_bound = 0.0;
53 return;
56 /* Calculate lower bound on the weight in order to deal with negative
57 * weights. */
58 double min_weight = min_normalization *
59 (wdf_lower *
60 log2((wdf_lower * get_average_length() / len_upper) *
61 (N / F)) +
62 (0.5 * log2(2.0 * M_PI * wdf_lower / len_upper)));
64 lower_bound = factor * get_wqf() * min_weight;
66 /* Calculate constant value to be used in get_sumpart(). */
67 log_constant = get_average_length() * N / F;
68 wqf_product_factor = get_wqf() * factor;
70 // Calculate the upper bound on the weight.
72 /* Calculations to decide the values to be used for calculating upper bound. */
73 /* The upper bound of the term appearing in the second log is obtained
74 by taking the minimum and maximum wdf value in the formula as shown. */
75 double max_product_1 = wdf_upper * (1.0 - min_wdf_to_len);
76 /* A second upper bound of the term can be obtained by plugging in the
77 upper bound of the length and differentiating the term w.r.t wdf
78 to find the value of wdf at which function attains maximum value. */
79 double wdf_var = min(wdf_upper, len_upper / 2.0);
80 double max_product_2 = wdf_var * (1.0 - wdf_var / len_upper) ;
81 /* Take the minimum of the two upper bounds. */
82 double max_product = min(max_product_1, max_product_2);
84 // Maximization of the product of wdf and normalized wdf.
85 /* The expression is (wdf * (1.0 - wdf / len) * (1.0 - wdf / len)) /
86 (wdf + 1.0). */
87 /* Now, assuming len to be len_upper for the purpose of maximization,
88 (d)/(dx) (x * (1 - x / c) * (1 - x / c)) / (x+1) =
89 ((c - x) * (c - x * (2 * x + 3))) / (c ^ 2 * (x + 1) ^ 2)
90 Thus, if (c - x * (2 * x + 3)) is positive, the differentiation
91 value will be positive and hence the function will be an
92 increasing function. By finding the positive root of the equation
93 2 * x ^ 2 + 3 * x - c = 0, we get the value of x(wdf)
94 at which the differentiation value turns to negative from positive,
95 and hence, the function will have maximum value for that value of wdf. */
96 double wdf_root = 0.25 * (sqrt(8.0 * len_upper + 9.0) + 3.0);
98 // Use the smaller value among the root and wdf_upper.
99 wdf_root = min(wdf_root, wdf_upper);
101 double max_wdf_product_normalization = (wdf_root *
102 pow((1 - wdf_root / len_upper),2.0)) /
103 (wdf_root + 1);
105 double max_weight = max_wdf_product_normalization *
106 (log2(log_constant) +
107 (0.5 * log2(2 * M_PI * max_product)));
109 upper_bound = ((wqf_product_factor * max_weight) - lower_bound);
112 string
113 DPHWeight::name() const
115 return "Xapian::DPHWeight";
118 string
119 DPHWeight::serialise() const
121 return string();
124 DPHWeight *
125 DPHWeight::unserialise(const string &) const
127 return new DPHWeight();
130 double
131 DPHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
132 Xapian::termcount) const
134 if (wdf == 0) return 0.0;
136 double wdf_to_len = double(wdf) / len;
138 double normalization = pow((1 - wdf_to_len), 2) / (wdf + 1);
140 double wt = normalization *
141 (wdf *
142 log2(wdf_to_len * log_constant) +
143 (0.5 * log2(2 * M_PI * wdf * (1 - wdf_to_len))));
145 // Subtract the lower bound from the actual weight to avoid negative weights.
146 return ((wqf_product_factor * wt) - lower_bound);
149 double
150 DPHWeight::get_maxpart() const
152 return upper_bound;
155 double
156 DPHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
158 return 0;
161 double
162 DPHWeight::get_maxextra() const
164 return 0;