Fix clamping of maxitems and time limit handling
[xapian.git] / xapian-core / matcher / multimatch.cc
blob 5813f793c8f266b01decbd104bd056ff4fd33c87
1 /* multimatch.cc
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
6 * Copyright 2003 Orange PCS Ltd
7 * Copyright 2003 Sam Liddicott
8 * Copyright 2007,2008,2009 Lemur Consulting Ltd
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include "multimatch.h"
30 #include "api/msetinternal.h"
31 #include "backends/multi/multi_database.h"
32 #include "collapser.h"
33 #include "debuglog.h"
34 #include "submatch.h"
35 #include "localsubmatch.h"
36 #include "omassert.h"
37 #include "api/enquireinternal.h"
38 #include "api/rsetinternal.h"
39 #include "realtime.h"
41 #include "deciderpostlist.h"
42 #include "mergepostlist.h"
43 #include "postlisttree.h"
44 #include "spymaster.h"
46 #include "matchtimeout.h"
47 #include "msetcmp.h"
49 #include "valuestreamdocument.h"
50 #include "weight/weightinternal.h"
52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
54 #ifdef XAPIAN_HAS_REMOTE_BACKEND
55 #include "remotesubmatch.h"
56 #include "backends/remote/remote-database.h"
57 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
59 #include <algorithm>
60 #include <cfloat> // For DBL_EPSILON.
61 #include <climits> // For UINT_MAX.
62 #include <vector>
64 using namespace std;
65 using Xapian::Internal::intrusive_ptr;
67 const Xapian::Enquire::Internal::sort_setting REL =
68 Xapian::Enquire::Internal::REL;
69 const Xapian::Enquire::Internal::sort_setting REL_VAL =
70 Xapian::Enquire::Internal::REL_VAL;
71 const Xapian::Enquire::Internal::sort_setting VAL =
72 Xapian::Enquire::Internal::VAL;
73 #if 0 // VAL_REL isn't currently used so avoid compiler warnings.
74 const Xapian::Enquire::Internal::sort_setting VAL_REL =
75 Xapian::Enquire::Internal::VAL_REL;
76 #endif
78 /** Prepare some SubMatches.
80 * This calls the prepare_match() method on each SubMatch object, causing
81 * each to look up various statistics.
83 * This method is rather complicated in order to handle remote matches
84 * efficiently. Instead of simply calling "prepare_match()" on each submatch
85 * and waiting for it to return, it first calls "prepare_match(true)" on each
86 * submatch. If any of these calls return false, indicating that the required
87 * information has not yet been received from the server, the method will try
88 * those which returned false again, passing "false" as a parameter to
89 * indicate that they should block until the information is ready.
91 * This should improve performance in the case of mixed local-and-remote
92 * searches - the local searchers will all fetch their statistics from disk
93 * without waiting for the remote searchers, so as soon as the remote searcher
94 * statistics arrive, we can move on to the next step.
96 static void
97 prepare_sub_matches(vector<intrusive_ptr<SubMatch> > & leaves,
98 Xapian::Weight::Internal & stats)
100 LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | stats);
101 // We use a vector<bool> to track which SubMatches we've already prepared.
102 vector<bool> prepared;
103 prepared.resize(leaves.size(), false);
104 size_t unprepared = leaves.size();
105 bool nowait = true;
106 while (unprepared) {
107 for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
108 if (prepared[leaf]) continue;
109 SubMatch * submatch = leaves[leaf].get();
110 if (!submatch || submatch->prepare_match(nowait, stats)) {
111 prepared[leaf] = true;
112 --unprepared;
115 // Use blocking IO on subsequent passes, so that we don't go into
116 // a tight loop.
117 nowait = false;
121 ////////////////////////////////////
122 // Initialisation and cleaning up //
123 ////////////////////////////////////
124 MultiMatch::MultiMatch(const Xapian::Database &db_,
125 const Xapian::Query & query_,
126 Xapian::termcount qlen,
127 const Xapian::RSet * omrset,
128 Xapian::doccount collapse_max_,
129 Xapian::valueno collapse_key_,
130 int percent_cutoff_, double weight_cutoff_,
131 Xapian::Enquire::docid_order order_,
132 Xapian::valueno sort_key_,
133 Xapian::Enquire::Internal::sort_setting sort_by_,
134 bool sort_value_forward_,
135 double time_limit_,
136 Xapian::Weight::Internal & stats,
137 const Xapian::Weight * weight_,
138 const vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & matchspies_,
139 bool have_sorter, bool have_mdecider)
140 : db(db_), query(query_),
141 collapse_max(collapse_max_), collapse_key(collapse_key_),
142 percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
143 order(order_),
144 sort_key(sort_key_), sort_by(sort_by_),
145 sort_value_forward(sort_value_forward_),
146 time_limit(time_limit_),
147 weight(weight_),
148 is_remote(db.internal->size()),
149 matchspies(matchspies_)
151 LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | time_limit_| stats | weight_ | matchspies_ | have_sorter | have_mdecider);
153 Assert(!query.empty());
155 Xapian::doccount number_of_subdbs = db.internal->size();
156 vector<Xapian::RSet> subrsets;
157 if (omrset && omrset->internal.get()) {
158 omrset->internal->shard(number_of_subdbs, subrsets);
159 } else {
160 subrsets.resize(number_of_subdbs);
163 for (size_t i = 0; i != number_of_subdbs; ++i) {
164 const Xapian::Database::Internal *subdb = db.internal.get();
165 if (number_of_subdbs > 1) {
166 auto multidb = static_cast<const MultiDatabase*>(subdb);
167 subdb = multidb->shards[i];
169 Assert(subdb);
170 intrusive_ptr<SubMatch> smatch;
172 // There is currently only one special case, for network databases.
173 #ifdef XAPIAN_HAS_REMOTE_BACKEND
174 if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
175 auto rem_db = static_cast<const RemoteDatabase*>(subdb);
176 if (have_sorter) {
177 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
179 if (have_mdecider) {
180 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
182 // FIXME: Remote handling for time_limit with multiple
183 // databases may need some work.
184 rem_db->set_query(query, qlen, collapse_max, collapse_key,
185 order, sort_key, sort_by, sort_value_forward,
186 time_limit,
187 percent_cutoff, weight_cutoff, weight,
188 subrsets[i], matchspies);
189 bool decreasing_relevance =
190 (sort_by == REL || sort_by == REL_VAL);
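// A remote match streams results back in descending weight order only when
// relevance is the primary sort key, so tell RemoteSubMatch whether that holds.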
191 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
192 is_remote[i] = true;
193 } else {
194 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight,
195 db.has_positions());
196 subdb->readahead_for_query(query);
198 #else
199 // Avoid unused parameter warnings.
200 (void)have_sorter;
201 (void)have_mdecider;
202 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight,
203 db.has_positions());
204 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
205 leaves.push_back(smatch);
208 stats.set_query(query);
209 prepare_sub_matches(leaves, stats);
210 stats.set_bounds_from_db(db);
213 void
214 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
215 Xapian::doccount check_at_least,
216 Xapian::MSet & mset,
217 Xapian::Weight::Internal & stats,
218 const Xapian::MatchDecider *mdecider,
219 const Xapian::KeyMaker *sorter)
221 LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("sorter"));
222 AssertRel(check_at_least,>=,first + maxitems);
224 Assert(!query.empty());
226 Assert(!leaves.empty());
228 TimeOut timeout(time_limit);
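// timeout is checked in the main matcher loop below: once time_limit expires,
// check_at_least is clamped to first + maxitems so we stop examining extra
// candidates beyond those needed for the requested MSet.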
230 #ifdef XAPIAN_HAS_REMOTE_BACKEND
231 // If there's only one database and it's remote, we can just unserialise
232 // its MSet and return that.
233 if (leaves.size() == 1 && is_remote[0]) {
234 RemoteSubMatch * rem_match;
235 rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
236 rem_match->start_match(first, maxitems, check_at_least, stats);
237 rem_match->get_mset(mset);
238 return;
240 #endif
242 // Start matchers.
243 for (auto && leaf : leaves) {
244 leaf->start_match(0, first + maxitems, first + check_at_least, stats);
247 ValueStreamDocument vsdoc(db);
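// Bump the reference count by hand so that the Xapian::Document wrapper below
// doesn't delete the stack-allocated vsdoc when it is destroyed.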
248 ++vsdoc._refs;
249 Xapian::Document doc(&vsdoc);
251 // Get postlists and term info
252 vector<PostList *> postlists;
253 PostListTree pltree;
254 Xapian::termcount total_subqs = 0;
255 // Keep a count of matches which we know exist but which we won't see. This
256 // occurs when a submatch is remote and returns a lower bound on the
257 // number of matching documents which is higher than the number of
258 // documents it returns (because it wasn't asked for more documents).
259 Xapian::doccount definite_matches_not_seen = 0;
260 for (size_t i = 0; i != leaves.size(); ++i) {
261 // Pick the highest total subqueries answer amongst the subdatabases,
262 // as the query to postlist conversion doesn't recurse into positional
263 // queries for shards that don't have positional data when at least one
264 // other shard does.
265 Xapian::termcount total_subqs_i = 0;
266 PostList * pl = leaves[i]->get_postlist(&pltree, &total_subqs_i);
267 total_subqs = max(total_subqs, total_subqs_i);
268 if (is_remote[i]) {
269 if (pl->get_termfreq_min() > first + maxitems) {
270 LOGLINE(MATCH, "Found " <<
271 pl->get_termfreq_min() - (first + maxitems)
272 << " definite matches in remote submatch "
273 "which aren't passed to local match");
274 definite_matches_not_seen += pl->get_termfreq_min();
275 definite_matches_not_seen -= first + maxitems;
277 } else if (mdecider) {
278 pl = new DeciderPostList(pl, mdecider, &vsdoc);
280 postlists.push_back(pl);
282 Assert(!postlists.empty());
284 Xapian::doccount n_shards = postlists.size();
285 if (n_shards == 1) {
286 pltree.set_postlist(postlists[0]);
287 } else {
288 pltree.set_postlist(new MergePostList(&postlists[0], n_shards, vsdoc));
291 LOGLINE(MATCH, "pl = (" << pltree.get_description() << ")");
293 // Empty result set
294 Xapian::doccount docs_matched = 0;
295 double greatest_wt = 0;
296 Xapian::termcount greatest_wt_subqs_matched = 0;
297 #ifdef XAPIAN_HAS_REMOTE_BACKEND
298 unsigned greatest_wt_subqs_db_num = UINT_MAX;
299 #endif
300 vector<Result> items;
302 // maximum weight a document could possibly have
303 const double max_possible = pltree.recalc_maxweight();
305 LOGLINE(MATCH, "pl = (" << pltree.get_description() << ")");
307 Xapian::doccount matches_upper_bound = pltree.get_termfreq_max();
308 Xapian::doccount matches_lower_bound = pltree.get_termfreq_min();
309 Xapian::doccount matches_estimated = pltree.get_termfreq_est();
311 SpyMaster spymaster(&matchspies, &is_remote);
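// The matchspies were already passed to set_query() for remote shards and run
// on the server there, so SpyMaster is told which shards are remote and only
// applies the spies to documents from local shards.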
313 // Check if any results have been asked for (might just be wanting
314 // maxweight).
315 if (check_at_least == 0) {
316 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
317 if (collapse_max) {
318 // Lower bound must be set to no more than collapse_max, since it's
319 // possible that all matching documents have the same collapse_key
320 // value and so are collapsed together.
321 if (matches_lower_bound > collapse_max)
322 matches_lower_bound = collapse_max;
325 mset.internal = new Xapian::MSet::Internal(
326 first,
327 matches_upper_bound,
328 matches_lower_bound,
329 matches_estimated,
330 matches_upper_bound,
331 uncollapsed_lower_bound,
332 matches_estimated,
333 max_possible, greatest_wt,
334 std::move(items),
336 return;
339 // Set max number of results that we want - this is used to decide
340 // when to throw away unwanted items.
341 Xapian::doccount max_msize = first + maxitems;
342 items.reserve(max_msize + 1);
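// Reserve one extra slot so pushing a candidate onto a full proto-mset before
// popping the worst entry doesn't force a reallocation.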
344 // Tracks the minimum item currently eligible for the MSet - we compare
345 // candidate items against this.
346 Result min_item(0.0, 0);
348 // Minimum weight an item must have to be worth considering.
349 double min_weight = weight_cutoff;
351 // Factor to multiply maximum weight seen by to get the cutoff weight.
352 double percent_cutoff_factor = percent_cutoff / 100.0;
353 // Corresponding correction to that in omenquire.cc to account for excess
354 // precision on x86.
355 percent_cutoff_factor -= DBL_EPSILON;
357 // Object to handle collapsing.
358 Collapser collapser(collapse_key, collapse_max);
360 /// Comparison functor for sorting MSet
361 bool sort_forward = (order != Xapian::Enquire::DESCENDING);
362 MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
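// mcmp(a, b) returns true when a should rank above b in the MSet, taking the
// sort_by setting and requested docid order into account.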
364 // Perform query
366 // We form the mset in two stages. In the first we fill up our working
367 // mset. Adding a new document does not remove another.
369 // In the second, we consider documents which rank higher than the current
370 // lowest ranking document in the mset. Each document added expels the
371 // current lowest ranking document.
373 // If a percentage cutoff is in effect, it can cause the matcher to drop
374 // back from the second stage to the first.
376 // Is the mset a valid heap?
377 bool is_heap = false;
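// items is only heapified lazily, the first time a heap operation is actually
// needed; is_heap records whether that has happened, and is cleared again if
// the vector is modified in a way which may break the heap property.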
379 while (true) {
380 bool pushback;
382 if (!pltree.next(min_weight)) {
383 LOGLINE(MATCH, "Reached end of potential matches");
384 break;
387 // Only calculate the weight if we need it for mcmp, or there's a
388 // percentage or weight cutoff in effect. Otherwise we calculate it
389 // below if we haven't already rejected this candidate.
390 double wt = 0.0;
391 bool calculated_weight = false;
392 if (sort_by != VAL || min_weight > 0.0) {
393 wt = pltree.get_weight();
394 if (wt < min_weight) {
395 LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
396 continue;
398 calculated_weight = true;
401 Xapian::docid did = pltree.get_docid();
402 vsdoc.set_document(did);
403 LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
404 Result new_item(wt, did);
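// Once the time limit has expired, stop looking for matches beyond those
// needed to fill the requested MSet.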
405 if (check_at_least > first + maxitems && timeout.timed_out()) {
406 check_at_least = first + maxitems;
409 if (sort_by != REL) {
410 const string * ptr = pltree.get_sort_key();
411 if (ptr) {
412 new_item.set_sort_key(*ptr);
413 } else if (sorter) {
414 new_item.set_sort_key((*sorter)(doc));
415 } else {
416 new_item.set_sort_key(vsdoc.get_value(sort_key));
419 // We're sorting by value (in part at least), so compare the item
420 // against the lowest currently in the proto-mset. If sort_by is
421 // VAL, then new_item.get_weight() won't yet be set, but that
422 // doesn't matter since it's not used by the sort function.
423 if (!mcmp(new_item, min_item)) {
424 if (mdecider == NULL && !collapser) {
425 // Document was definitely suitable for mset - no more
426 // processing needed.
427 LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
428 ++docs_matched;
429 if (!calculated_weight) wt = pltree.get_weight();
430 spymaster(doc, wt, did);
431 if (wt > greatest_wt) goto new_greatest_weight;
432 continue;
434 if (docs_matched >= check_at_least) {
435 // We've seen enough items - we can drop this one.
436 LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
437 // FIXME: hmm, match decider might have rejected this...
438 if (!calculated_weight) wt = pltree.get_weight();
439 if (wt > greatest_wt) goto new_greatest_weight;
440 continue;
442 // We can't drop the item, because we need to test whether the
443 // mdecider would accept it and/or test whether it would be
444 // collapsed.
445 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
449 // Apply any MatchSpy objects.
450 if (spymaster) {
451 if (!calculated_weight) {
452 wt = pltree.get_weight();
453 new_item.set_weight(wt);
454 calculated_weight = true;
456 spymaster(doc, wt, did);
459 if (!calculated_weight) {
460 // we didn't calculate the weight above, but now we will need it
461 wt = pltree.get_weight();
462 new_item.set_weight(wt);
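// pushback tracks whether new_item still needs to be appended to items; it is
// cleared below if the item instead replaces a collapsed entry in place.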
465 pushback = true;
467 // Perform collapsing on key if requested.
468 if (collapser) {
469 collapse_result res;
470 res = collapser.process(new_item, pltree.get_collapse_key(), vsdoc,
471 mcmp);
472 if (res == REJECTED) {
473 // If we're sorting by relevance primarily, then we throw away
474 // the lower weighted document anyway.
475 if (sort_by != REL && sort_by != REL_VAL) {
476 if (wt > greatest_wt) goto new_greatest_weight;
478 continue;
481 if (res == REPLACED) {
482 // There was a previous item in the collapse tab so
483 // the MSet can't be empty.
484 Assert(!items.empty());
486 const Result& old_item = collapser.old_item;
487 // This is one of the best collapse_max potential MSet entries
488 // with this key which we've seen so far. Check if the
489 // entry with this key which it displaced might still be in the
490 // proto-MSet. If it might be, we need to check through for
491 // it.
492 double old_wt = old_item.get_weight();
493 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
494 // Scan through (unsorted) MSet looking for entry.
495 // FIXME: more efficient way than just scanning?
496 Xapian::docid olddid = old_item.get_docid();
497 vector<Result>::iterator i;
498 for (i = items.begin(); i != items.end(); ++i) {
499 if (i->get_docid() == olddid) {
500 LOGLINE(MATCH, "collapse: removing " <<
501 olddid << ": " <<
502 new_item.get_collapse_key());
503 // We can replace an arbitrary element in O(log N)
504 // but have to do it by hand (in this case the new
505 // elt is bigger, so we just swap down the tree).
506 // FIXME: implement this, and clean up is_heap
507 // handling
508 *i = new_item;
509 pushback = false;
510 is_heap = false;
511 break;
518 // OK, actually add the item to the mset.
519 if (pushback) {
520 ++docs_matched;
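// The proto-mset is already full: push the new item, then pop the
// lowest-ranking entry so the size stays at max_msize. The front of the heap
// is the lowest-ranking survivor, which becomes the new min_item.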
521 if (items.size() >= max_msize) {
522 items.push_back(new_item);
523 if (!is_heap) {
524 is_heap = true;
525 make_heap(items.begin(), items.end(), mcmp);
526 } else {
527 push_heap<vector<Result>::iterator,
528 MSetCmp>(items.begin(), items.end(), mcmp);
530 pop_heap<vector<Result>::iterator,
531 MSetCmp>(items.begin(), items.end(), mcmp);
532 items.pop_back();
534 min_item = items.front();
535 if (sort_by == REL || sort_by == REL_VAL) {
536 if (docs_matched >= check_at_least) {
537 if (sort_by == REL) {
538 // We're done if this is a forward boolean match
539 // with only one database (bodgetastic, FIXME
540 // better if we can!)
541 if (rare(max_possible == 0 && sort_forward)) {
542 // In the multi database case, MergePostList
543 // currently processes each database
544 // sequentially (which actually may well be
545 // more efficient) so the docids in general
546 // won't arrive in order.
547 if (leaves.size() == 1) break;
550 if (min_item.get_weight() > min_weight) {
551 LOGLINE(MATCH, "Setting min_weight to " <<
552 min_item.get_weight() << " from " << min_weight);
553 min_weight = min_item.get_weight();
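// If even the best still-unseen candidate can't reach min_weight, nothing
// else can enter the proto-mset, so we can stop early.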
557 if (rare(pltree.recalc_maxweight() < min_weight)) {
558 LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
559 break;
561 } else {
562 items.push_back(new_item);
563 is_heap = false;
564 if (sort_by == REL && items.size() == max_msize) {
565 if (docs_matched >= check_at_least) {
566 // We're done if this is a forward boolean match
567 // with only one database (bodgetastic, FIXME
568 // better if we can!)
569 if (rare(max_possible == 0 && sort_forward)) {
570 // In the multi database case, MergePostList
571 // currently processes each database
572 // sequentially (which actually may well be
573 // more efficient) so the docids in general
574 // won't arrive in order.
575 if (leaves.size() == 1) break;
582 // Keep track of the greatest weight we've seen.
583 if (wt > greatest_wt) {
584 new_greatest_weight:
585 greatest_wt = wt;
586 #ifdef XAPIAN_HAS_REMOTE_BACKEND
587 const unsigned int multiplier = db.internal->size();
588 unsigned int db_num = (did - 1) % multiplier;
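// Docids in a sharded database are interleaved across the shards, so
// (did - 1) % multiplier gives the shard this document came from.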
589 if (is_remote[db_num]) {
590 // Note that the greatest weighted document came from a remote
591 // database, and which one.
592 greatest_wt_subqs_db_num = db_num;
593 } else
594 #endif
596 greatest_wt_subqs_matched = pltree.count_matching_subqs();
597 #ifdef XAPIAN_HAS_REMOTE_BACKEND
598 greatest_wt_subqs_db_num = UINT_MAX;
599 #endif
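// A new greatest weight raises the absolute threshold implied by the
// percentage cutoff, which may let us prune existing proto-mset entries.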
601 if (percent_cutoff) {
602 double w = wt * percent_cutoff_factor;
603 if (w > min_weight) {
604 min_weight = w;
605 if (!is_heap) {
606 is_heap = true;
607 make_heap<vector<Result>::iterator,
608 MSetCmp>(items.begin(), items.end(), mcmp);
610 while (!items.empty() && items.front().get_weight() < min_weight) {
611 pop_heap<vector<Result>::iterator,
612 MSetCmp>(items.begin(), items.end(), mcmp);
613 Assert(items.back().get_weight() < min_weight);
614 items.pop_back();
616 #ifdef XAPIAN_ASSERTIONS_PARANOID
617 vector<Result>::const_iterator i;
618 for (i = items.begin(); i != items.end(); ++i) {
619 Assert(i->get_weight() >= min_weight);
621 #endif
627 double percent_scale = 0;
628 if (!items.empty() && greatest_wt > 0) {
629 #ifdef XAPIAN_HAS_REMOTE_BACKEND
630 if (greatest_wt_subqs_db_num != UINT_MAX) {
631 const unsigned int n = greatest_wt_subqs_db_num;
632 RemoteSubMatch * rem_match;
633 rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
634 percent_scale = rem_match->get_percent_factor() / 100.0;
635 } else
636 #endif
638 percent_scale = greatest_wt_subqs_matched / double(total_subqs);
639 percent_scale /= greatest_wt;
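// With this scaling, a document with weight greatest_wt which matched all the
// subqueries corresponds to a score of 100%.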
641 Assert(percent_scale > 0);
642 if (percent_cutoff) {
643 // FIXME: better to sort and binary chop maybe?
644 // Or we could just use a linear scan here instead.
646 // trim the mset to the correct answer...
647 double min_wt = percent_cutoff_factor / percent_scale;
648 if (!is_heap) {
649 is_heap = true;
650 make_heap<vector<Result>::iterator,
651 MSetCmp>(items.begin(), items.end(), mcmp);
653 while (!items.empty() && items.front().get_weight() < min_wt) {
654 pop_heap<vector<Result>::iterator,
655 MSetCmp>(items.begin(), items.end(), mcmp);
656 Assert(items.back().get_weight() < min_wt);
657 items.pop_back();
659 #ifdef XAPIAN_ASSERTIONS_PARANOID
660 vector<Result>::const_iterator j;
661 for (j = items.begin(); j != items.end(); ++j) {
662 Assert(j->get_weight() >= min_wt);
664 #endif
668 LOGLINE(MATCH,
669 "docs_matched = " << docs_matched <<
670 ", definite_matches_not_seen = " << definite_matches_not_seen <<
671 ", matches_lower_bound = " << matches_lower_bound <<
672 ", matches_estimated = " << matches_estimated <<
673 ", matches_upper_bound = " << matches_upper_bound);
675 // Adjust docs_matched to take account of documents which matched remotely
676 // but weren't sent across.
677 docs_matched += definite_matches_not_seen;
679 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
680 Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
681 Xapian::doccount uncollapsed_estimated = matches_estimated;
682 if (items.size() < max_msize) {
683 // We have fewer items in the mset than we tried to get for it, so we
684 // must have all the matches in it.
685 LOGLINE(MATCH, "items.size() = " << items.size() <<
686 ", max_msize = " << max_msize << ", setting bounds equal");
687 Assert(definite_matches_not_seen == 0);
688 Assert(percent_cutoff || docs_matched == items.size());
689 matches_lower_bound = matches_upper_bound = matches_estimated
690 = items.size();
691 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
692 uncollapsed_lower_bound = matches_lower_bound;
693 } else if (!collapser && docs_matched < check_at_least) {
694 // We have seen fewer matches than we checked for, so we must have seen
695 // all the matches.
696 LOGLINE(MATCH, "Setting bounds equal");
697 matches_lower_bound = matches_upper_bound = matches_estimated
698 = docs_matched;
699 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
700 uncollapsed_lower_bound = matches_lower_bound;
701 } else {
702 AssertRel(matches_estimated,>=,matches_lower_bound);
703 AssertRel(matches_estimated,<=,matches_upper_bound);
705 // We can end up scaling the estimate more than once, so collect
706 // the scale factors and apply them in one go to avoid rounding
707 // more than once.
708 double estimate_scale = 1.0;
709 double unique_rate = 1.0;
711 if (collapser) {
712 LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
713 matches_lower_bound = collapser.get_matches_lower_bound();
714 if (matches_lower_bound > uncollapsed_lower_bound)
715 uncollapsed_lower_bound = matches_lower_bound;
717 // The estimate for the number of hits can be modified by
718 // multiplying it by the rate at which we've been finding
719 // unique documents.
720 Xapian::doccount docs_considered = collapser.get_docs_considered();
721 Xapian::doccount dups_ignored = collapser.get_dups_ignored();
722 if (docs_considered > 0) {
723 double unique = double(docs_considered - dups_ignored);
724 unique_rate = unique / double(docs_considered);
727 // We can safely reduce the upper bound by the number of duplicates
728 // we've ignored.
729 matches_upper_bound -= dups_ignored;
731 LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
732 ", matches_estimated=" << matches_estimated <<
733 "*" << estimate_scale << "*" << unique_rate <<
734 ", matches_upper_bound=" << matches_upper_bound);
737 if (mdecider) {
738 if (!percent_cutoff) {
739 if (!collapser) {
740 // We're not collapsing or doing a percentage cutoff, so
741 // docs_matched is a lower bound on the total number of
742 // matches.
743 matches_lower_bound = max(docs_matched, matches_lower_bound);
747 Xapian::doccount decider_denied = mdecider->docs_denied_;
748 Xapian::doccount decider_considered =
749 mdecider->docs_allowed_ + mdecider->docs_denied_;
751 // The estimate for the number of hits can be modified by
752 // multiplying it by the rate at which the match decider has
753 // been accepting documents.
754 if (decider_considered > 0) {
755 double accept = double(decider_considered - decider_denied);
756 double accept_rate = accept / double(decider_considered);
757 estimate_scale *= accept_rate;
760 // If a document is denied by a match decider, it is not possible
761 // for it to be found to be a duplicate, so it is safe to also reduce
762 // the upper bound by the number of documents denied by a match
763 // decider.
764 matches_upper_bound -= decider_denied;
765 if (collapser) uncollapsed_upper_bound -= decider_denied;
768 if (percent_cutoff) {
769 estimate_scale *= (1.0 - percent_cutoff_factor);
770 // another approach:
771 // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
772 // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
774 // Very likely an underestimate, but we can't really do better
775 // without checking further matches... The only possibility would be
776 // to track how many docs made the min weight test but didn't make
777 // the candidate set since the last greatest_wt change, which we
778 // could use if the top documents matched all the prob terms.
779 matches_lower_bound = items.size();
780 if (collapser) uncollapsed_lower_bound = matches_lower_bound;
782 // matches_upper_bound could be reduced by the number of documents
783 // which fail the min weight test (FIXME)
785 LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
786 percent_cutoff << "): now have matches_estimated=" <<
787 matches_estimated << ", matches_lower_bound=" <<
788 matches_lower_bound);
791 if (collapser && estimate_scale != 1.0) {
792 uncollapsed_estimated =
793 Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
796 estimate_scale *= unique_rate;
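// unique_rate is folded in only after scaling uncollapsed_estimated, since
// ignoring collapsed duplicates shouldn't shrink the uncollapsed estimate.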
798 if (estimate_scale != 1.0) {
799 matches_estimated =
800 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
801 if (matches_estimated < matches_lower_bound)
802 matches_estimated = matches_lower_bound;
805 if (collapser || mdecider) {
806 LOGLINE(MATCH, "Clamping estimate between bounds: "
807 "matches_lower_bound = " << matches_lower_bound <<
808 ", matches_estimated = " << matches_estimated <<
809 ", matches_upper_bound = " << matches_upper_bound);
810 AssertRel(matches_lower_bound,<=,matches_upper_bound);
811 matches_estimated = max(matches_estimated, matches_lower_bound);
812 matches_estimated = min(matches_estimated, matches_upper_bound);
813 } else if (!percent_cutoff) {
814 AssertRel(docs_matched,<=,matches_upper_bound);
815 if (docs_matched > matches_lower_bound)
816 matches_lower_bound = docs_matched;
817 if (docs_matched > matches_estimated)
818 matches_estimated = docs_matched;
821 if (collapser && !mdecider && !percent_cutoff) {
822 AssertRel(docs_matched,<=,uncollapsed_upper_bound);
823 if (docs_matched > uncollapsed_lower_bound)
824 uncollapsed_lower_bound = docs_matched;
828 if (collapser) {
829 AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
830 if (uncollapsed_estimated < uncollapsed_lower_bound) {
831 uncollapsed_estimated = uncollapsed_lower_bound;
832 } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
833 uncollapsed_estimated = uncollapsed_upper_bound;
835 } else {
836 // We're not collapsing, so the bounds must be the same.
837 uncollapsed_lower_bound = matches_lower_bound;
838 uncollapsed_upper_bound = matches_upper_bound;
839 uncollapsed_estimated = matches_estimated;
842 LOGLINE(MATCH, items.size() << " items in potential mset");
844 if (first > 0) {
845 // Remove unwanted leading entries
846 if (items.size() <= first) {
847 items.clear();
848 } else {
849 LOGLINE(MATCH, "finding " << first << "th");
850 // We perform nth_element() on reverse iterators so that the
851 // unwanted elements end up at the end of items, which means
852 // that the call to erase() to remove them doesn't have to copy
853 // any elements.
854 vector<Result>::reverse_iterator nth;
855 nth = items.rbegin() + first;
856 nth_element(items.rbegin(), nth, items.rend(), mcmp);
857 // Erase the trailing "first" elements
858 items.erase(items.begin() + items.size() - first, items.end());
862 LOGLINE(MATCH, "sorting " << items.size() << " entries");
864 // We could use std::sort_heap if is_heap is true, but profiling
865 // suggests that's actually slower. Cast to void to suppress
866 // compiler warnings that the last set value of is_heap is never
867 // used.
868 (void)is_heap;
870 // We need a stable sort, but this is provided by the comparison operator.
871 sort(items.begin(), items.end(), mcmp);
873 if (!items.empty()) {
874 LOGLINE(MATCH, "min weight in mset = " << items.back().get_weight());
875 LOGLINE(MATCH, "max weight in mset = " << items[0].get_weight());
878 AssertRel(matches_estimated,>=,matches_lower_bound);
879 AssertRel(matches_estimated,<=,matches_upper_bound);
881 AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
882 AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
884 // We may need to qualify any collapse_count to check whether the highest
885 // weighted collapsed item would have passed the percent_cutoff.
886 // We WILL need to restore collapse_count to the mset by taking it from
887 // collapse_tab; this is what comes of copying around whole objects
888 // instead of taking references: we find it hard to update the collapse_count
889 // of an item that has already been pushed back because we no longer know
890 // where it is. If we kept or could find references we wouldn't need to mess
891 // with is_heap so much, maybe.
892 if (!items.empty() && collapser && !collapser.empty()) {
893 // Nicked this formula from above.
894 double min_wt = 0.0;
895 if (percent_scale > 0.0)
896 min_wt = percent_cutoff_factor / percent_scale;
897 Xapian::doccount entries = collapser.entries();
898 vector<Result>::iterator i;
899 for (i = items.begin(); i != items.end(); ++i) {
900 // Skip entries without a collapse key.
901 if (i->get_collapse_key().empty()) continue;
903 // Set collapse_count appropriately.
904 i->set_collapse_count(collapser.get_collapse_count(i->get_collapse_key(), percent_cutoff, min_wt));
905 if (--entries == 0) {
906 // Stop once we've processed all items with collapse keys.
907 break;
912 mset.internal = new Xapian::MSet::Internal(
913 first,
914 matches_upper_bound,
915 matches_lower_bound,
916 matches_estimated,
917 uncollapsed_upper_bound,
918 uncollapsed_lower_bound,
919 uncollapsed_estimated,
920 max_possible, greatest_wt,
921 std::move(items),
922 percent_scale * 100.0);