2 * @brief Xapian::BB2Weight class - the BB2 weighting scheme of the DFR framework.
4 /* Copyright (C) 2013,2014 Aarsh Shah
5 * Copyright (C) 2014,2015,2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
25 #include "common/log2.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
/** Evaluate the Stirling-style expression shared by the BB2 weight and its
 *  upper bound:
 *
 *    (y + 0.5) * (stirling_constant - log2(y)) + difference * stirling_constant
 *
 *  @param difference		Multiplier of stirling_constant in the additive
 *				term.
 *  @param y			Value whose base-2 logarithm is taken; it must
 *				be positive for log2(y) to be defined.
 *  @param stirling_constant	Precomputed base-2 logarithm constant.
 *
 *  @return The value of the expression above.
 */
static double stirling_value(double difference, double y,
			     double stirling_constant)
{
    return ((y + 0.5) * (stirling_constant - log2(y)) +
	    (difference * stirling_constant));
}
// Construct a BB2Weight, storing the parameter c in param_c.
//
// Throws Xapian::InvalidArgumentError("Parameter c is invalid.") when c is
// rejected.
// NOTE(review): the condition guarding the throw is not visible in this view -
// confirm which values of c are rejected.
41 BB2Weight::BB2Weight(double c
) : param_c(c
)
44 throw Xapian::InvalidArgumentError("Parameter c is invalid.");
// Register the statistics this scheme asks for: average length, document
// length and its minimum/maximum, collection size and collection frequency.
45 need_stat(AVERAGE_LENGTH
);
46 need_stat(DOC_LENGTH
);
47 need_stat(DOC_LENGTH_MIN
);
48 need_stat(DOC_LENGTH_MAX
);
49 need_stat(COLLECTION_SIZE
);
50 need_stat(COLLECTION_FREQ
);
// Return a new heap-allocated BB2Weight constructed from this object's
// param_c.
58 BB2Weight::clone() const
60 return new BB2Weight(param_c
);
// Precompute the values get_sumpart() reads (c_product_avlen, B_constant, wt,
// stirling_constant_1, stirling_constant_2) and the weight upper bound
// (upper_bound).  `factor` is folded into B_constant.
64 BB2Weight::init(double factor
)
// NOTE(review): the test that selects the term-independent case described
// below is not visible in this view.
67 // This object is for the term-independent contribution, and that's
68 // always zero for this scheme.
72 double wdfn_upper
= get_wdf_upper_bound();
// Special case for a zero wdf upper bound.
// NOTE(review): the body of this branch is not visible in this view.
74 if (wdfn_upper
== 0) {
// c multiplied by the average document length; get_sumpart() divides this by
// the document length when normalising wdf.
79 c_product_avlen
= param_c
* get_average_length();
// Bounds on the normalised wdf: the lower bound takes wdf = 1 with the
// document length upper bound, the upper bound takes the wdf upper bound with
// the document length lower bound.
80 double wdfn_lower(1.0);
81 wdfn_lower
*= log2(1 + c_product_avlen
/ get_doclength_upper_bound());
82 wdfn_upper
*= log2(1 + c_product_avlen
/ get_doclength_lower_bound());
84 double F
= get_collection_freq();
86 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// stirling_value(), which is passed F - wdfn_upper as its log2 argument below.
// NOTE(review): the assignment performed when each test below holds is not
// visible in this view.
88 if (rare(wdfn_lower
>= F
- 1))
90 if (rare(wdfn_upper
>= F
- 1))
// Term-dependent constant; it is divided by (wdfn + 1) to obtain the B factor.
93 B_constant
= get_wqf() * factor
* (F
+ 1.0) / get_termfreq();
95 // Clamp N to at least 2 to avoid ill-defined log calculations in
// the log2(N - 1.0) term of wt below.
97 double N
= rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
99 wt
= -1.0 / log(2.0) - log2(N
- 1.0);
100 stirling_constant_1
= log2(N
+ F
- 1.0);
101 stirling_constant_2
= log2(F
);
103 // Maximize the Stirling value to be used in the upper bound.
104 // Calculate the individual terms keeping the maximization of Stirling value
// The first stirling_value() term uses wdfn_upper with y_max; the subtracted
// term uses wdfn_lower with y_min.
106 double y_min
= F
- wdfn_upper
;
107 double y_max
= N
+ F
- wdfn_lower
- 2.0;
109 double stirling_max
= stirling_value(wdfn_upper
+ 1.0, y_max
,
110 stirling_constant_1
) -
111 stirling_value(wdfn_lower
, y_min
,
112 stirling_constant_2
);
// The B factor is largest at the smallest normalised wdf.
114 double B_max
= B_constant
/ (wdfn_lower
+ 1.0);
115 upper_bound
= B_max
* (wt
+ stirling_max
);
// Handle a negative computed bound.
// NOTE(review): the statement executed when this test holds is not visible in
// this view.
116 if (rare(upper_bound
< 0.0))
// Return the full name of this weighting scheme, "Xapian::BB2Weight".
121 BB2Weight::name() const
123 return "Xapian::BB2Weight";
// NOTE(review): the body of short_name() is not visible in this view;
// presumably it returns the short scheme name ("bb2" is the name passed to
// parameter_error() in create_from_parameters()) - confirm.
127 BB2Weight::short_name() const
// Serialise this object's only parameter, param_c, using serialise_double().
133 BB2Weight::serialise() const
135 return serialise_double(param_c
);
// Build a new BB2Weight from serialised data `s`: decode one double (the
// parameter c) with unserialise_double() and construct the object from it.
//
// Throws Xapian::SerialisationError if any bytes remain after the double.
139 BB2Weight::unserialise(const string
& s
) const
141 const char *ptr
= s
.data();
142 const char *end
= ptr
+ s
.size();
// unserialise_double() is handed &ptr; the check below relies on ptr having
// been advanced past the decoded value.
143 double c
= unserialise_double(&ptr
, end
);
144 if (rare(ptr
!= end
))
145 throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
146 return new BB2Weight(c
);
// Compute the weight for a document in which the term has within-document
// frequency `wdf` and whose length is `len`.  The third (unnamed) argument is
// not used.  Returns 0.0 when wdf is 0.
150 BB2Weight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
151 Xapian::termcount
) const
153 if (wdf
== 0) return 0.0;
// Length-normalised wdf, using c_product_avlen as set up by init().
155 double wdfn
= wdf
* log2(1 + c_product_avlen
/ len
);
157 double F
= get_collection_freq();
159 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// stirling_value(), which is passed y2 = F - wdfn as its log2 argument below.
// NOTE(review): the assignment performed when this test holds is not visible
// in this view.
161 if (rare(wdfn
>= F
- 1))
164 // Clamp N to at least 2 to avoid ill-defined log calculations in
// stirling_value(): N_less_2 is N - 2, or 0 when N <= 2, so that
// y1 = N_less_2 + F - wdfn.
166 Xapian::doccount N
= get_collection_size();
167 Xapian::doccount N_less_2
= rare(N
<= 2) ? 0 : N
- 2;
169 double y2
= F
- wdfn
;
170 double y1
= N_less_2
+ y2
;
171 double stirling
= stirling_value(wdfn
+ 1.0, y1
, stirling_constant_1
) -
172 stirling_value(wdfn
, y2
, stirling_constant_2
);
174 double B
= B_constant
/ (wdfn
+ 1.0);
175 double final_weight
= B
* (wt
+ stirling
);
// Handle a negative computed weight.
// NOTE(review): the statement executed when this test holds, and the final
// return, are not visible in this view.
176 if (rare(final_weight
< 0.0))
// NOTE(review): the body of get_maxpart() is not visible in this view;
// presumably it returns the upper_bound computed by init() - confirm.
182 BB2Weight::get_maxpart() const
// Both arguments are unnamed, so they are not used by this function.
// NOTE(review): the body is not visible in this view; a comment in init()
// states that the term-independent contribution is always zero for this
// scheme, so presumably this returns 0 - confirm.
188 BB2Weight::get_sumextra(Xapian::termcount
, Xapian::termcount
) const
// NOTE(review): the body of get_maxextra() is not visible in this view; a
// comment in init() states that the term-independent contribution is always
// zero for this scheme, so presumably this returns 0 - confirm.
194 BB2Weight::get_maxextra() const
// Create a BB2Weight from the textual parameter string `p`.
//
// One path returns a default-constructed BB2Weight.
// NOTE(review): the condition selecting that path is not visible in this view
// (presumably an empty parameter string) - confirm.
// Otherwise a double `k` is parsed from `p` with double_param() and
// BB2Weight(k) is returned; parameter_error() is called with scheme name
// "bb2" if the value cannot be parsed or extra data follows it.
200 BB2Weight::create_from_parameters(const char * p
) const
203 return new Xapian::BB2Weight();
// NOTE(review): the declaration of k is not visible in this view.
205 if (!Xapian::Weight::Internal::double_param(&p
, &k
))
206 Xapian::Weight::Internal::parameter_error("Parameter is invalid", "bb2");
// NOTE(review): the test for trailing data that guards the next call is not
// visible in this view.
208 Xapian::Weight::Internal::parameter_error("Extra data after parameter", "bb2");
209 return new Xapian::BB2Weight(k
);