2 * @brief Xapian::MSet class
4 /* Copyright (C) 2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "msetinternal.h"
24 #include "xapian/mset.h"
26 #include "net/length.h"
27 #include "net/serialise.h"
28 #include "matcher/msetcmp.h"
29 #include "roundestimate.h"
30 #include "serialise-double.h"
32 #include "unicode/description_append.h"
42 MSet::MSet(const MSet
& o
) : internal(o
.internal
) {}
// Copy assignment: assigns o's `internal` handle to this object's
// `internal` member.
// NOTE(review): presumably this makes both objects share one
// MSet::Internal - confirm against the type of `internal`.
45 MSet::operator=(const MSet
& o
)
47 internal
= o
.internal
;
51 MSet::MSet() : internal(new MSet::Internal
) {}
// Forwards the request for the range first..last to
// MSet::Internal::fetch() on the `internal` object.
56 MSet::fetch_(Xapian::doccount first
, Xapian::doccount last
) const
58 internal
->fetch(first
, last
);
// Forwards to MSet::Internal::set_item_weight(), which stores `weight` on
// item `i` and updates the max_attained / max_possible bookkeeping.
62 MSet::set_item_weight(Xapian::doccount i
, double weight
)
64 internal
->set_item_weight(i
, weight
);
// Re-sorts internal->items in place with std::sort, using the comparison
// function returned by
// get_msetcmp_function(Enquire::Internal::REL, true, false).
68 MSet::sort_by_relevance()
70 std::sort(internal
->items
.begin(), internal
->items
.end(),
71 get_msetcmp_function(Enquire::Internal::REL
, true, false));
// Returns the result of MSet::Internal::convert_to_percent() for `weight`.
75 MSet::convert_to_percent(double weight
) const
77 return internal
->convert_to_percent(weight
);
// Looks up the term frequency for `term`.
//
// The cached statistics in internal->stats are consulted first (when set).
// If they do not supply the answer, the lookup is delegated to
// internal->enquire->get_termfreq().  A null internal->enquire is handled as
// a special case before that fallback; the statements inside that branch and
// inside the cached-stats branch are not visible in this extract.
81 MSet::get_termfreq(const std::string
& term
) const
83 // Check the cached data for query terms first.
84 Xapian::doccount termfreq
;
85 if (usual(internal
->stats
&& internal
->stats
->get_stats(term
, termfreq
))) {
89 if (rare(internal
->enquire
.get() == NULL
)) {
90 // Consistent with get_termfreq() on an empty database which always
95 // Fall back to asking the database via enquire.
96 return internal
->enquire
->get_termfreq(term
);
// Looks up the weight of `term` via the cached statistics.
//
// When internal->stats is set, it is asked to fill in `weight`; the value
// returned by that get_termweight() call is explicitly discarded with the
// (void) cast.  The declaration of `weight` and the function's return
// statement are on lines not visible in this extract.
100 MSet::get_termweight(const std::string
& term
) const
102 // A term not in the query has no termweight, so 0.0 makes sense as the
103 // answer in such cases.
105 if (usual(internal
->stats
)) {
106 (void)internal
->stats
->get_termweight(term
, weight
);
// Returns the stored internal->first value.
112 MSet::get_firstitem() const
114 return internal
->first
;
// Returns the stored internal->matches_lower_bound value unchanged.
118 MSet::get_matches_lower_bound() const
120 return internal
->matches_lower_bound
;
// Returns round_estimate() applied to the stored lower bound, upper bound
// and raw estimate of the number of matches.  The rounding is performed on
// every call rather than being cached.
124 MSet::get_matches_estimated() const
126 // Doing this here avoids calculating if the estimate is never looked at,
127 // though does mean we recalculate if this method is called more than once.
128 return round_estimate(internal
->matches_lower_bound
,
129 internal
->matches_upper_bound
,
130 internal
->matches_estimated
);
// Returns the stored internal->matches_upper_bound value unchanged.
134 MSet::get_matches_upper_bound() const
136 return internal
->matches_upper_bound
;
// Returns the stored internal->uncollapsed_lower_bound value unchanged.
140 MSet::get_uncollapsed_matches_lower_bound() const
142 return internal
->uncollapsed_lower_bound
;
// Returns round_estimate() applied to the stored uncollapsed lower bound,
// upper bound and raw estimate.  The rounding is performed on every call
// rather than being cached.
146 MSet::get_uncollapsed_matches_estimated() const
148 // Doing this here avoids calculating if the estimate is never looked at,
149 // though does mean we recalculate if this method is called more than once.
150 return round_estimate(internal
->uncollapsed_lower_bound
,
151 internal
->uncollapsed_upper_bound
,
152 internal
->uncollapsed_estimated
);
// Returns the stored internal->uncollapsed_upper_bound value unchanged.
156 MSet::get_uncollapsed_matches_upper_bound() const
158 return internal
->uncollapsed_upper_bound
;
// Returns the stored internal->max_attained value.
162 MSet::get_max_attained() const
164 return internal
->max_attained
;
// Returns the stored internal->max_possible value.
168 MSet::get_max_possible() const
170 return internal
->max_possible
;
// NOTE(review): the function header for this body is not visible in this
// extract - presumably MSet::size(); confirm against the full file.
//
// Asserts that `internal` is non-null, then returns the number of entries
// held in internal->items.
176 Assert(internal
.get());
177 return internal
->items
.size();
// Thin wrapper which passes all of its arguments straight through to
// MSet::Internal::snippet() and returns the result.
//
// The `length` and `flags` arguments used in the call below are parameters
// declared on lines not visible in this extract.
181 MSet::snippet(const std::string
& text
,
183 const Xapian::Stem
& stemmer
,
185 const std::string
& hi_start
,
186 const std::string
& hi_end
,
187 const std::string
& omit
) const
189 // The actual implementation is in queryparser/termgenerator_internal.cc.
190 return internal
->snippet(text
, length
, stemmer
, flags
,
191 hi_start
, hi_end
, omit
);
// Returns the description string built by MSet::Internal::get_description().
195 MSet::get_description() const
197 return internal
->get_description();
// Returns the document for the item at position `index` in `items`.
//
// Throws Xapian::RangeError, with a message naming the MSet size, when
// `index` is not less than items.size().  Otherwise asserts that `enquire`
// is non-null and asks it for the document with that item's docid.
201 MSet::Internal::get_document(Xapian::doccount index
) const
203 if (index
>= items
.size()) {
204 string msg
= "Requested index ";
206 msg
+= " in MSet of size ";
207 msg
+= str(items
.size());
208 throw Xapian::RangeError(msg
);
210 Assert(enquire
.get());
211 return enquire
->get_document(items
[index
].get_docid());
// Asks `enquire` to request documents for a range of MSet items.
//
// There is a guard for an empty `items` or a null `enquire`; its body is not
// visible in this extract.  `last` is then clamped to the final valid index
// (items.size() - 1), and if first_ <= last, last - first_ + 1 documents are
// requested via enquire->request_document().
215 MSet::Internal::fetch(Xapian::doccount first_
, Xapian::doccount last
) const
217 if (items
.empty() || enquire
.get() == NULL
) {
220 if (last
> items
.size() - 1) {
221 last
= items
.size() - 1;
223 if (first_
<= last
) {
224 Xapian::doccount n
= last
- first_
;
// NOTE(review): the loop below indexes items[i] for i in 0..n, not
// items[first_ + i], so when first_ > 0 it requests the first n + 1 items
// rather than the items in first_..last.  This looks like a missing offset -
// confirm against the callers before changing it.
225 for (Xapian::doccount i
= 0; i
<= n
; ++i
) {
226 enquire
->request_document(items
[i
].get_docid());
// Stores `weight` on items[i] and keeps max_attained and max_possible up to
// date.
//
// max_attained is either reset to `weight` or raised to
// max(max_attained, weight); the condition choosing between those two
// assignments is on a line not visible in this extract.  max_possible is
// then raised, if necessary, so it is never below max_attained.
232 MSet::Internal::set_item_weight(Xapian::doccount i
, double weight
)
234 // max_attained is updated assuming that set_item_weight is called on every
235 // MSet item from 0 up. While assigning new weights max_attained is updated
236 // as the maximum of the new weights set till Xapian::doccount i.
238 max_attained
= weight
;
240 max_attained
= max(max_attained
, weight
);
241 // Ideally the max_possible should be the maximum possible weight that
242 // can be assigned by the reranking algorithm, but since it is not always
243 // possible to calculate the max possible weight for a reranking algorithm
244 // we use this approach.
245 max_possible
= max(max_possible
, max_attained
);
246 items
[i
].set_weight(weight
);
// Converts `weight` to a percentage using percent_scale_factor.
//
// Three cases are distinguished: percent_scale_factor == 0.0 (unweighted
// search), weight <= 0.0, and the general case, where the percentage is
// int(weight * percent_scale_factor + 100.0 * DBL_EPSILON) followed by
// adjustments for a zero result and for results above 100.  The statements
// inside most of these branches are not visible in this extract.
250 MSet::Internal::convert_to_percent(double weight
) const
253 if (percent_scale_factor
== 0.0) {
254 // For an unweighted search, give all matches 100%.
256 } else if (weight
<= 0.0) {
257 // Some weighting schemes can return zero relevance while matching,
258 // so give such matches 0%.
261 // Adding on 100 * DBL_EPSILON was a hack to work around excess
262 // precision (e.g. on x86 when not using SSE), but this code seems like
263 // it's generally asking for problems with floating point rounding
264 // issues - maybe we ought to carry through the matching and total
265 // number of subqueries and calculate using those instead.
267 // There are corresponding hacks in matcher/multimatch.cc.
268 percent
= int(weight
* percent_scale_factor
+ 100.0 * DBL_EPSILON
);
270 // Make any non-zero weight give a non-zero percentage.
272 } else if (percent
> 100) {
273 // Make sure we don't ever exceed 100%.
276 // FIXME: Ideally we should also make sure any non-exact match gives
// Appends this object's state to `result` in a fixed field order.
//
// Field order: first; the three matches_* values; the three uncollapsed_*
// values; max_possible; max_attained; percent_scale_factor; the item count;
// then for each item its weight, docid, length-prefixed sort key,
// length-prefixed collapse key and collapse count; finally the serialised
// stats.  MSet::Internal::unserialise() reads the fields back in this same
// order, so the two functions must be changed together.
//
// The declaration of `result` and the return statement are on lines not
// visible in this extract.
283 MSet::Internal::serialise() const
287 result
+= encode_length(first
);
288 // Send back the raw matches_* values. MSet::get_matches_estimated()
289 // rounds the estimate lazily, but MSetPostList::get_termfreq_est()
290 // returns the estimate, and the raw estimate is better for that.
292 // It is also cleaner that a round-trip through serialisation gives you an
293 // object which is as close to the original as possible.
294 result
+= encode_length(matches_lower_bound
);
295 result
+= encode_length(matches_estimated
);
296 result
+= encode_length(matches_upper_bound
);
297 result
+= encode_length(uncollapsed_lower_bound
);
298 result
+= encode_length(uncollapsed_estimated
);
299 result
+= encode_length(uncollapsed_upper_bound
);
300 result
+= serialise_double(max_possible
);
301 result
+= serialise_double(max_attained
);
303 result
+= serialise_double(percent_scale_factor
);
305 result
+= encode_length(items
.size());
306 for (auto&& item
: items
) {
307 result
+= serialise_double(item
.get_weight());
308 result
+= encode_length(item
.get_docid());
309 result
+= encode_length(item
.get_sort_key().size());
310 result
+= item
.get_sort_key();
311 result
+= encode_length(item
.get_collapse_key().size());
312 result
+= item
.get_collapse_key();
313 result
+= encode_length(item
.get_collapse_count());
// NOTE(review): `stats` is dereferenced here; any null check guarding this
// statement is on a line not visible in this extract - confirm.
317 result
+= serialise_stats(*stats
);
// Rebuilds this object's state from the encoded bytes in [p, p_end).
//
// Fields are decoded in the same order that MSet::Internal::serialise()
// writes them: first, the matches_* and uncollapsed_* values, max_possible,
// max_attained, percent_scale_factor, then `msize` items, and finally the
// stats from whatever bytes remain.
//
// The declarations of `msize`, `did`, `len` and `key`, and any statements
// which advance `p` past the key bytes, are on lines not visible in this
// extract.
323 MSet::Internal::unserialise(const char * p
, const char * p_end
)
327 decode_length(&p
, p_end
, first
);
328 decode_length(&p
, p_end
, matches_lower_bound
);
329 decode_length(&p
, p_end
, matches_estimated
);
330 decode_length(&p
, p_end
, matches_upper_bound
);
331 decode_length(&p
, p_end
, uncollapsed_lower_bound
);
332 decode_length(&p
, p_end
, uncollapsed_estimated
);
333 decode_length(&p
, p_end
, uncollapsed_upper_bound
);
334 max_possible
= unserialise_double(&p
, p_end
);
335 max_attained
= unserialise_double(&p
, p_end
);
337 percent_scale_factor
= unserialise_double(&p
, p_end
);
340 decode_length(&p
, p_end
, msize
);
// One iteration per serialised item: weight, docid, sort key, collapse key,
// collapse count.
341 while (msize
-- > 0) {
342 double wt
= unserialise_double(&p
, p_end
);
344 decode_length(&p
, p_end
, did
);
346 decode_length_and_check(&p
, p_end
, len
);
347 string
sort_key(p
, len
);
349 decode_length_and_check(&p
, p_end
, len
);
352 Xapian::doccount collapse_cnt
;
353 decode_length(&p
, p_end
, collapse_cnt
);
// Note the argument order here: the collapse key comes before the collapse
// count, and the sort key is passed last.
354 items
.emplace_back(wt
, did
, std::move(key
), collapse_cnt
,
355 std::move(sort_key
));
// Everything left in the buffer is the serialised stats.
359 stats
.reset(new Xapian::Weight::Internal());
360 unserialise_stats(string(p
, p_end
- p
), *stats
);
// Builds a human-readable description string starting with
// "MSet(matches_lower_bound=".
//
// The three matches_* values are always included.  Each uncollapsed_* value
// is included only when it differs from the corresponding matches_* value,
// and max_possible / max_attained are included only when greater than zero.
// The description of every entry in `items` is then appended.  The end of
// the function is not visible in this extract.
365 MSet::Internal::get_description() const
367 string desc
= "MSet(matches_lower_bound=";
368 desc
+= str(matches_lower_bound
);
369 desc
+= ", matches_estimated=";
370 desc
+= str(matches_estimated
);
371 desc
+= ", matches_upper_bound=";
372 desc
+= str(matches_upper_bound
);
373 if (uncollapsed_lower_bound
!= matches_lower_bound
) {
374 desc
+= ", uncollapsed_lower_bound=";
375 desc
+= str(uncollapsed_lower_bound
);
377 if (uncollapsed_estimated
!= matches_estimated
) {
378 desc
+= ", uncollapsed_estimated=";
379 desc
+= str(uncollapsed_estimated
);
381 if (uncollapsed_upper_bound
!= matches_upper_bound
) {
382 desc
+= ", uncollapsed_upper_bound=";
383 desc
+= str(uncollapsed_upper_bound
);
389 if (max_possible
> 0) {
390 desc
+= ", max_possible=";
391 desc
+= str(max_possible
);
393 if (max_attained
> 0) {
394 desc
+= ", max_attained=";
395 desc
+= str(max_attained
);
399 for (auto&& item
: items
) {
405 desc
+= item
.get_description();