2 * @brief Xapian::MSet class
4 /* Copyright (C) 2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "msetinternal.h"
24 #include "xapian/mset.h"
26 #include "net/length.h"
27 #include "net/serialise.h"
28 #include "matcher/msetcmp.h"
29 #include "roundestimate.h"
30 #include "serialise-double.h"
32 #include "unicode/description_append.h"
42 MSet::MSet(const MSet
& o
) : internal(o
.internal
) {}
45 MSet::operator=(const MSet
& o
)
47 internal
= o
.internal
;
51 MSet::MSet() : internal(new MSet::Internal
) {}
53 MSet::MSet(Internal
* internal_
) : internal(internal_
) {}
58 MSet::fetch_(Xapian::doccount first
, Xapian::doccount last
) const
60 internal
->fetch(first
, last
);
64 MSet::set_item_weight(Xapian::doccount i
, double weight
)
66 internal
->set_item_weight(i
, weight
);
70 MSet::sort_by_relevance()
72 std::sort(internal
->items
.begin(), internal
->items
.end(),
73 get_msetcmp_function(Enquire::Internal::REL
, true, false));
77 MSet::convert_to_percent(double weight
) const
79 return internal
->convert_to_percent(weight
);
83 MSet::get_termfreq(const std::string
& term
) const
85 // Check the cached data for query terms first.
86 Xapian::doccount termfreq
;
87 if (usual(internal
->stats
&& internal
->stats
->get_stats(term
, termfreq
))) {
91 if (rare(internal
->enquire
.get() == NULL
)) {
92 // Consistent with get_termfreq() on an empty database which always
97 // Fall back to asking the database via enquire.
98 return internal
->enquire
->get_termfreq(term
);
102 MSet::get_termweight(const std::string
& term
) const
104 // A term not in the query has no termweight, so 0.0 makes sense as the
105 // answer in such cases.
107 if (usual(internal
->stats
)) {
108 (void)internal
->stats
->get_termweight(term
, weight
);
114 MSet::get_firstitem() const
116 return internal
->first
;
120 MSet::get_matches_lower_bound() const
122 return internal
->matches_lower_bound
;
126 MSet::get_matches_estimated() const
128 // Doing this here avoids calculating if the estimate is never looked at,
129 // though does mean we recalculate if this method is called more than once.
130 return round_estimate(internal
->matches_lower_bound
,
131 internal
->matches_upper_bound
,
132 internal
->matches_estimated
);
136 MSet::get_matches_upper_bound() const
138 return internal
->matches_upper_bound
;
142 MSet::get_uncollapsed_matches_lower_bound() const
144 return internal
->uncollapsed_lower_bound
;
148 MSet::get_uncollapsed_matches_estimated() const
150 // Doing this here avoids calculating if the estimate is never looked at,
151 // though does mean we recalculate if this method is called more than once.
152 return round_estimate(internal
->uncollapsed_lower_bound
,
153 internal
->uncollapsed_upper_bound
,
154 internal
->uncollapsed_estimated
);
158 MSet::get_uncollapsed_matches_upper_bound() const
160 return internal
->uncollapsed_upper_bound
;
164 MSet::get_max_attained() const
166 return internal
->max_attained
;
170 MSet::get_max_possible() const
172 return internal
->max_possible
;
178 Assert(internal
.get());
179 return internal
->items
.size();
183 MSet::snippet(const std::string
& text
,
185 const Xapian::Stem
& stemmer
,
187 const std::string
& hi_start
,
188 const std::string
& hi_end
,
189 const std::string
& omit
) const
191 // The actual implementation is in queryparser/termgenerator_internal.cc.
192 return internal
->snippet(text
, length
, stemmer
, flags
,
193 hi_start
, hi_end
, omit
);
197 MSet::get_description() const
199 return internal
->get_description();
203 MSet::Internal::get_document(Xapian::doccount index
) const
205 if (index
>= items
.size()) {
206 string msg
= "Requested index ";
208 msg
+= " in MSet of size ";
209 msg
+= str(items
.size());
210 throw Xapian::RangeError(msg
);
212 Assert(enquire
.get());
213 return enquire
->get_document(items
[index
].get_docid());
217 MSet::Internal::fetch(Xapian::doccount first_
, Xapian::doccount last
) const
219 if (items
.empty() || enquire
.get() == NULL
) {
222 if (last
> items
.size() - 1) {
223 last
= items
.size() - 1;
225 if (first_
<= last
) {
226 Xapian::doccount n
= last
- first_
;
227 for (Xapian::doccount i
= 0; i
<= n
; ++i
) {
228 enquire
->request_document(items
[i
].get_docid());
234 MSet::Internal::set_item_weight(Xapian::doccount i
, double weight
)
236 // max_attained is updated assuming that set_item_weight is called on every
237 // MSet item from 0 up. While assigning new weights max_attained is updated
238 // as the maximum of the new weights set till Xapian::doccount i.
240 max_attained
= weight
;
242 max_attained
= max(max_attained
, weight
);
243 // Ideally the max_possible should be the maximum possible weight that
244 // can be assigned by the reranking algorithm, but since it is not always
245 // possible to calculate the max possible weight for a reranking algorithm
246 // we use this approach.
247 max_possible
= max(max_possible
, max_attained
);
248 items
[i
].set_weight(weight
);
252 MSet::Internal::convert_to_percent(double weight
) const
255 if (percent_scale_factor
== 0.0) {
256 // For an unweighted search, give all matches 100%.
258 } else if (weight
<= 0.0) {
259 // Some weighting schemes can return zero relevance while matching,
260 // so give such matches 0%.
263 // Adding on 100 * DBL_EPSILON was a hack to work around excess
264 // precision (e.g. on x86 when not using SSE), but this code seems like
265 // it's generally asking for problems with floating point rounding
266 // issues - maybe we ought to carry through the matching and total
267 // number of subqueries and calculate using those instead.
269 // There are corresponding hacks in matcher/matcher.cc.
270 percent
= int(weight
* percent_scale_factor
+ 100.0 * DBL_EPSILON
);
272 // Make any non-zero weight give a non-zero percentage.
274 } else if (percent
> 100) {
275 // Make sure we don't ever exceed 100%.
278 // FIXME: Ideally we should also make sure any non-exact match gives
285 MSet::Internal::unshard_docids(Xapian::doccount shard
,
286 Xapian::doccount n_shards
)
288 for (auto& result
: items
) {
289 result
.unshard_docid(shard
, n_shards
);
294 MSet::Internal::merge_stats(const Internal
* o
)
296 if (snippet_bg_relevance
.empty()) {
297 snippet_bg_relevance
= o
->snippet_bg_relevance
;
299 Assert(snippet_bg_relevance
== o
->snippet_bg_relevance
);
301 matches_lower_bound
+= o
->matches_lower_bound
;
302 matches_estimated
+= o
->matches_estimated
;
303 matches_upper_bound
+= o
->matches_upper_bound
;
304 uncollapsed_lower_bound
+= o
->uncollapsed_lower_bound
;
305 uncollapsed_estimated
+= o
->uncollapsed_estimated
;
306 uncollapsed_upper_bound
+= o
->uncollapsed_upper_bound
;
307 max_possible
= max(max_possible
, o
->max_possible
);
308 if (o
->max_attained
> max_attained
) {
309 max_attained
= o
->max_attained
;
310 percent_scale_factor
= o
->percent_scale_factor
;
315 MSet::Internal::serialise() const
319 result
+= encode_length(first
);
320 // Send back the raw matches_* values. MSet::get_matches_estimated()
321 // rounds the estimate lazily, but when we merge MSet objects we really
322 // want to merge based on the raw estimates.
324 // It is also cleaner that a round-trip through serialisation gives you an
325 // object which is as close to the original as possible.
326 result
+= encode_length(matches_lower_bound
);
327 result
+= encode_length(matches_estimated
);
328 result
+= encode_length(matches_upper_bound
);
329 result
+= encode_length(uncollapsed_lower_bound
);
330 result
+= encode_length(uncollapsed_estimated
);
331 result
+= encode_length(uncollapsed_upper_bound
);
332 result
+= serialise_double(max_possible
);
333 result
+= serialise_double(max_attained
);
335 result
+= serialise_double(percent_scale_factor
);
337 result
+= encode_length(items
.size());
338 for (auto&& item
: items
) {
339 result
+= serialise_double(item
.get_weight());
340 result
+= encode_length(item
.get_docid());
341 result
+= encode_length(item
.get_sort_key().size());
342 result
+= item
.get_sort_key();
343 result
+= encode_length(item
.get_collapse_key().size());
344 result
+= item
.get_collapse_key();
345 result
+= encode_length(item
.get_collapse_count());
349 result
+= serialise_stats(*stats
);
355 MSet::Internal::unserialise(const char * p
, const char * p_end
)
359 decode_length(&p
, p_end
, first
);
360 decode_length(&p
, p_end
, matches_lower_bound
);
361 decode_length(&p
, p_end
, matches_estimated
);
362 decode_length(&p
, p_end
, matches_upper_bound
);
363 decode_length(&p
, p_end
, uncollapsed_lower_bound
);
364 decode_length(&p
, p_end
, uncollapsed_estimated
);
365 decode_length(&p
, p_end
, uncollapsed_upper_bound
);
366 max_possible
= unserialise_double(&p
, p_end
);
367 max_attained
= unserialise_double(&p
, p_end
);
369 percent_scale_factor
= unserialise_double(&p
, p_end
);
372 decode_length(&p
, p_end
, msize
);
373 while (msize
-- > 0) {
374 double wt
= unserialise_double(&p
, p_end
);
376 decode_length(&p
, p_end
, did
);
378 decode_length_and_check(&p
, p_end
, len
);
379 string
sort_key(p
, len
);
381 decode_length_and_check(&p
, p_end
, len
);
384 Xapian::doccount collapse_cnt
;
385 decode_length(&p
, p_end
, collapse_cnt
);
386 items
.emplace_back(wt
, did
, std::move(key
), collapse_cnt
,
387 std::move(sort_key
));
391 stats
.reset(new Xapian::Weight::Internal());
392 unserialise_stats(string(p
, p_end
- p
), *stats
);
397 MSet::Internal::get_description() const
399 string desc
= "MSet(matches_lower_bound=";
400 desc
+= str(matches_lower_bound
);
401 desc
+= ", matches_estimated=";
402 desc
+= str(matches_estimated
);
403 desc
+= ", matches_upper_bound=";
404 desc
+= str(matches_upper_bound
);
405 if (uncollapsed_lower_bound
!= matches_lower_bound
) {
406 desc
+= ", uncollapsed_lower_bound=";
407 desc
+= str(uncollapsed_lower_bound
);
409 if (uncollapsed_estimated
!= matches_estimated
) {
410 desc
+= ", uncollapsed_estimated=";
411 desc
+= str(uncollapsed_estimated
);
413 if (uncollapsed_upper_bound
!= matches_upper_bound
) {
414 desc
+= ", uncollapsed_upper_bound=";
415 desc
+= str(uncollapsed_upper_bound
);
421 if (max_possible
> 0) {
422 desc
+= ", max_possible=";
423 desc
+= str(max_possible
);
425 if (max_attained
> 0) {
426 desc
+= ", max_attained=";
427 desc
+= str(max_attained
);
431 for (auto&& item
: items
) {
437 desc
+= item
.get_description();