2 * @brief Xapian::PL2Weight class - the PL2 weighting scheme of the DFR framework.
4 /* Copyright (C) 2013 Aarsh Shah
5 * Copyright (C) 2013,2014,2016 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
25 #include "common/log2.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
// Construct a PL2 weighting object: store the parameter c in param_c and
// register the statistics this scheme asks for via need_stat().
// Throws Xapian::InvalidArgumentError for an invalid c.
36 PL2Weight::PL2Weight(double c
) : param_c(c
)
// NOTE(review): the condition guarding this throw is not visible in this
// view; presumably it rejects a non-positive c - confirm.
39 throw Xapian::InvalidArgumentError("Parameter c is invalid.");
// Statistics requested up front; init() reads the average length, the
// document length bounds, the collection size and the collection frequency.
40 need_stat(AVERAGE_LENGTH
);
41 need_stat(DOC_LENGTH
);
42 need_stat(DOC_LENGTH_MIN
);
43 need_stat(DOC_LENGTH_MAX
);
44 need_stat(COLLECTION_SIZE
);
45 need_stat(COLLECTION_FREQ
);
// Return a newly allocated PL2Weight constructed from the same param_c.
52 PL2Weight::clone() const
54 return new PL2Weight(param_c
);
// Precompute the document-independent quantities used by get_sumpart()
// (cl, P1, P2) and an upper bound on its result (upper_bound).
// NOTE(review): `factor` is read below but its assignment from factor_ is
// not visible in this view - confirm where it is set.
58 PL2Weight::init(double factor_
)
// NOTE(review): the body of this branch is not visible in this view.
62 if (get_wdf_upper_bound() == 0) {
63 // The "extra" weight object is cloned, init() called and then
64 // get_maxextra() is called and we discover that we don't need it.
65 // So we need to handle that case (which will give us 0 from
66 // get_wdf_upper_bound() here).
// cl is the parameter c scaled by the average document length;
// get_sumpart() divides it by each document's length.
73 cl
= param_c
* get_average_length();
// 1 / ln(2), i.e. log2(e).
75 double base_change(1.0 / log(2.0));
// Collection frequency of the term divided by the collection size.
76 double mean
= double(get_collection_freq()) / get_collection_size();
// P1 and P2 are the parts of P which don't depend on the document; they
// are combined with wdfn in get_sumpart().
77 P1
= mean
* base_change
+ 0.5 * log2(2.0 * M_PI
);
78 P2
= log2(mean
) + base_change
;
// The wdfn formula from get_sumpart() evaluated with wdf=1 and the largest
// document length.
80 double wdfn_lower
= log2(1 + cl
/ get_doclength_upper_bound());
// The length used for the upper bound is the larger of the wdf upper bound
// and the document length lower bound.
81 double divisior
= max(get_wdf_upper_bound(), get_doclength_lower_bound());
82 double wdfn_upper
= get_wdf_upper_bound() * log2(1 + cl
/ divisior
);
84 // Calculate an upper bound on the weights which get_sumpart() can return.
86 // We consider the equation for P as the sum of two parts which we
87 // maximise individually:
89 // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
90 // (b) (P1 - P2 * wdfn) / (wdfn + 1)
92 // To maximise (a), the fractional part is always positive (since wdfn>0)
93 // and is maximised by maximising wdfn - clearer when rewritten as:
94 // (1 - 0.5 / (wdfn + 1))
96 // The log part of (a) is clearly also maximised by maximising wdfn,
97 // so we want to evaluate (a) at wdfn=wdfn_upper.
98 double P_max2a
= (wdfn_upper
+ 0.5) * log2(wdfn_upper
) / (wdfn_upper
+ 1.0);
99 // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
//
//     (P1 + P2) / x - P2
//
103 // Differentiating wrt x gives:
//
//     -(P1 + P2) / x^2
//
107 // So there are no local minima or maxima, and the function is continuous
108 // in the range of interest, so the sign of this differential tells us
109 // whether we want to maximise or minimise wdfn, and since x>1, we can
110 // just consider the sign of: (P1 + P2)
112 // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
113 // giving us a bound that can't be bettered if wdfn_upper is tight.
// NOTE(review): the differential above is negative when P1 + P2 > 0, so (b)
// on its own is largest at the smallest wdfn, yet wdfn_upper is selected in
// that case - confirm this still yields a valid upper bound.
114 double wdfn_optb
= P1
+ P2
> 0 ? wdfn_upper
: wdfn_lower
;
115 double P_max2b
= (P1
- P2
* wdfn_optb
) / (wdfn_optb
+ 1.0);
116 upper_bound
= factor
* (P_max2a
+ P_max2b
);
// Clamp a non-positive bound to zero.
118 if (rare(upper_bound
<= 0)) upper_bound
= 0;
// Return the name of this weighting scheme class: "Xapian::PL2Weight".
122 PL2Weight::name() const
124 return "Xapian::PL2Weight";
// NOTE(review): the body is not visible in this view; presumably returns the
// short scheme name ("pl2" is the name create_from_parameters() passes to
// parameter_error()) - confirm.
128 PL2Weight::short_name() const
// Serialise this object's parameters: only param_c, encoded with
// serialise_double(). unserialise() decodes the same single double.
134 PL2Weight::serialise() const
136 return serialise_double(param_c
);
// Build a new PL2Weight from serialised parameters in s: decode one double
// and pass it to the constructor as c.
// Throws Xapian::SerialisationError if s is not fully consumed by that double.
140 PL2Weight::unserialise(const string
& s
) const
142 const char *ptr
= s
.data();
143 const char *end
= ptr
+ s
.size();
// ptr is passed by address; presumably unserialise_double() advances it past
// the bytes it decodes - confirm.
144 double c
= unserialise_double(&ptr
, end
);
// Anything left between ptr and end is unexpected trailing data.
145 if (rare(ptr
!= end
))
146 throw Xapian::SerialisationError("Extra data in PL2Weight::unserialise()");
147 return new PL2Weight(c
);
// Compute the weight contribution for a document in which the term has
// within-document frequency wdf and whose length is len; the third (unnamed)
// parameter is not used.
// Returns 0.0 when wdf is 0 or when P is not positive, otherwise
// factor * P / (wdfn + 1).
151 PL2Weight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
152 Xapian::termcount
) const
154 if (wdf
== 0) return 0.0;
// Normalised wdf: wdf scaled by log2(1 + cl / len), with cl set by init().
156 double wdfn
= wdf
* log2(1 + cl
/ len
);
// P1 and P2 are the document-independent terms precomputed by init().
158 double P
= P1
+ (wdfn
+ 0.5) * log2(wdfn
) - P2
* wdfn
;
// Never return a negative weight.
159 if (rare(P
<= 0)) return 0.0;
161 return factor
* P
/ (wdfn
+ 1.0);
// NOTE(review): the body is not visible in this view; presumably returns the
// upper_bound computed by init() as the bound on get_sumpart() - confirm.
165 PL2Weight::get_maxpart() const
// Both parameters are unnamed, so neither is used by this method.
// NOTE(review): the body is not visible in this view; the comment in init()
// says the "extra" weight object turns out not to be needed, so this
// presumably returns 0 - confirm.
171 PL2Weight::get_sumextra(Xapian::termcount
, Xapian::termcount
) const
// NOTE(review): the body is not visible in this view; the comment in init()
// says calling this reveals the "extra" weight object isn't needed, so it
// presumably returns 0 - confirm.
177 PL2Weight::get_maxextra() const
// Create a PL2Weight from the textual parameter string p: either a
// default-constructed object, or one built from a single double parsed
// from p. Malformed input is reported through parameter_error() under the
// scheme name "pl2".
183 PL2Weight::create_from_parameters(const char * p
) const
// NOTE(review): the condition guarding this return is not visible in this
// view; presumably it is taken when p is empty - confirm.
186 return new Xapian::PL2Weight();
// NOTE(review): the declaration of k is not visible in this view.
// p is passed by address; presumably double_param() advances it past the
// parsed number - confirm.
188 if (!Xapian::Weight::Internal::double_param(&p
, &k
))
189 Xapian::Weight::Internal::parameter_error("Parameter is invalid", "pl2");
// NOTE(review): the condition guarding this call is not visible in this
// view; presumably it fires when input remains after the number - confirm.
191 Xapian::Weight::Internal::parameter_error("Extra data after parameter", "pl2");
192 return new Xapian::PL2Weight(k
);