/* multimatch.cc
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
 * Copyright 2003 Orange PCS Ltd
 * Copyright 2003 Sam Liddicott
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

#include <config.h>

#include "multimatch.h"

#include "autoptr.h"
#include "collapser.h"
#include "debuglog.h"
#include "submatch.h"
#include "localsubmatch.h"
#include "omassert.h"
#include "api/omenquireinternal.h"
#include "realtime.h"

#include "api/emptypostlist.h"
#include "branchpostlist.h"
#include "mergepostlist.h"

#include "backends/document.h"

#include "msetcmp.h"

#include "valuestreamdocument.h"
#include "weight/weightinternal.h"

#include <xapian/matchspy.h>
#include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND

#ifdef XAPIAN_HAS_REMOTE_BACKEND
#include "remotesubmatch.h"
#include "backends/remote/remote-database.h"
#endif /* XAPIAN_HAS_REMOTE_BACKEND */

#include <algorithm>
#include <cfloat> // For DBL_EPSILON.
#include <climits> // For UINT_MAX.
#include <vector>
#include <map>
#include <set>

#ifdef HAVE_TIMER_CREATE
#include <signal.h>
#include <time.h>

extern "C" {

static void
set_timeout_flag(union sigval sv)
{
    *(reinterpret_cast<volatile bool*>(sv.sival_ptr)) = true;
}

}

#ifdef __sun
// Solaris defines CLOCK_MONOTONIC, but "man timer_create" doesn't mention it
// and using it fails.
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#elif defined __CYGWIN__
// https://cygwin.com/cygwin-api/std-notes.html currently (2015-11-22) says:
//
//     clock_setres, clock_settime, and timer_create currently support only
//     CLOCK_REALTIME.
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#else
const clockid_t TIMEOUT_CLOCK = CLOCK_MONOTONIC;
#endif
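
// RAII helper which arms a one-shot POSIX timer for the given time limit and
// flags expiry via a callback thread.  If the timer can't be created, the
// helper simply never reports a timeout.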
class TimeOut {
    struct sigevent sev;
    timer_t timerid;
    volatile bool expired;

  public:
    TimeOut(double limit) : expired(false) {
        if (limit > 0) {
            sev.sigev_notify = SIGEV_THREAD;
            sev.sigev_notify_function = set_timeout_flag;
            sev.sigev_notify_attributes = NULL;
            sev.sigev_value.sival_ptr =
                static_cast<void*>(const_cast<bool*>(&expired));
            if (usual(timer_create(TIMEOUT_CLOCK, &sev, &timerid) == 0)) {
                struct itimerspec interval;
                interval.it_interval.tv_sec = 0;
                interval.it_interval.tv_nsec = 0;
                RealTime::to_timespec(limit, &interval.it_value);
                if (usual(timer_settime(timerid, 0, &interval, NULL) == 0)) {
                    // Timeout successfully set.
                    return;
                }
                timer_delete(timerid);
            }
        }
        sev.sigev_notify = SIGEV_NONE;
    }

    ~TimeOut() {
        if (sev.sigev_notify != SIGEV_NONE) {
            timer_delete(timerid);
            sev.sigev_notify = SIGEV_NONE;
        }
    }

    bool timed_out() const { return expired; }
};
#else
class TimeOut {
  public:
    TimeOut(double) { }
    bool timed_out() const { return false; }
};
#endif

using namespace std;
using Xapian::Internal::intrusive_ptr;

const Xapian::Enquire::Internal::sort_setting REL =
        Xapian::Enquire::Internal::REL;
const Xapian::Enquire::Internal::sort_setting REL_VAL =
        Xapian::Enquire::Internal::REL_VAL;
const Xapian::Enquire::Internal::sort_setting VAL =
        Xapian::Enquire::Internal::VAL;
#if 0 // VAL_REL isn't currently used so avoid compiler warnings.
const Xapian::Enquire::Internal::sort_setting VAL_REL =
        Xapian::Enquire::Internal::VAL_REL;
#endif

/** Split an RSet into several sub rsets, one for each database.
 *
 *  @param rset The RSet to split.
 *  @param number_of_subdbs The number of sub databases which exist.
 *  @param subrsets Vector of RSets which the sub rsets will be placed in.
 *                  This should be empty when the function is called.
 */
static void
split_rset_by_db(const Xapian::RSet * rset,
                 Xapian::doccount number_of_subdbs,
                 vector<Xapian::RSet> & subrsets)
{
    LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
    if (rset && !rset->empty()) {
        if (number_of_subdbs == 1) {
            // The common case of a single database is easy to handle.
            subrsets.push_back(*rset);
        } else {
            // Can't just use vector::resize() here, since that creates N
            // copies of the same RSet!
            subrsets.reserve(number_of_subdbs);
            for (size_t i = 0; i < number_of_subdbs; ++i) {
                subrsets.push_back(Xapian::RSet());
            }

            const set<Xapian::docid> & rsetitems = rset->internal->get_items();
            set<Xapian::docid>::const_iterator j;
            for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
                Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
                Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
                subrsets[subdatabase].add_document(local_docid);
            }
        }
    } else {
        // NB vector::resize() creates N copies of the same empty RSet.
        subrsets.resize(number_of_subdbs);
    }
    Assert(subrsets.size() == number_of_subdbs);
}

/** Prepare some SubMatches.
 *
 *  This calls the prepare_match() method on each SubMatch object, causing them
 *  to lookup various statistics.
 *
 *  This method is rather complicated in order to handle remote matches
 *  efficiently.  Instead of simply calling "prepare_match()" on each submatch
 *  and waiting for it to return, it first calls "prepare_match(true)" on each
 *  submatch.  If any of these calls return false, indicating that the required
 *  information has not yet been received from the server, the method will try
 *  those which returned false again, passing "false" as a parameter to
 *  indicate that they should block until the information is ready.
 *
 *  This should improve performance in the case of mixed local-and-remote
 *  searches - the local searchers will all fetch their statistics from disk
 *  without waiting for the remote searchers, so as soon as the remote searcher
 *  statistics arrive, we can move on to the next step.
 */
static void
prepare_sub_matches(vector<intrusive_ptr<SubMatch> > & leaves,
                    Xapian::Weight::Internal & stats)
{
    LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | stats);
    // We use a vector<bool> to track which SubMatches we've already prepared.
    vector<bool> prepared;
    prepared.resize(leaves.size(), false);
    size_t unprepared = leaves.size();
    bool nowait = true;
    while (unprepared) {
        for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
            if (prepared[leaf]) continue;
            SubMatch * submatch = leaves[leaf].get();
            if (!submatch || submatch->prepare_match(nowait, stats)) {
                prepared[leaf] = true;
                --unprepared;
            }
        }
        // Use blocking IO on subsequent passes, so that we don't go into
        // a tight loop.
        nowait = false;
    }
}

/// Class which applies several match spies in turn.
class MultipleMatchSpy : public Xapian::MatchSpy {
  private:
    /// List of match spies to call, in order.
    const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies;

  public:
    MultipleMatchSpy(const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies_)
        : spies(spies_) {}

    /** Implementation of virtual operator().
     *
     *  This implementation calls all the spies in turn.
     */
    void operator()(const Xapian::Document &doc, double wt);
};

void
MultipleMatchSpy::operator()(const Xapian::Document &doc, double wt) {
    LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
    for (auto i : spies) {
        (*i)(doc, wt);
    }
}

////////////////////////////////////
// Initialisation and cleaning up //
////////////////////////////////////
MultiMatch::MultiMatch(const Xapian::Database &db_,
                       const Xapian::Query & query_,
                       Xapian::termcount qlen,
                       const Xapian::RSet * omrset,
                       Xapian::doccount collapse_max_,
                       Xapian::valueno collapse_key_,
                       int percent_cutoff_, double weight_cutoff_,
                       Xapian::Enquire::docid_order order_,
                       Xapian::valueno sort_key_,
                       Xapian::Enquire::Internal::sort_setting sort_by_,
                       bool sort_value_forward_,
                       double time_limit_,
                       Xapian::Weight::Internal & stats,
                       const Xapian::Weight * weight_,
                       const vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & matchspies_,
                       bool have_sorter, bool have_mdecider)
        : db(db_), query(query_),
          collapse_max(collapse_max_), collapse_key(collapse_key_),
          percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
          order(order_),
          sort_key(sort_key_), sort_by(sort_by_),
          sort_value_forward(sort_value_forward_),
          time_limit(time_limit_),
          weight(weight_),
          is_remote(db.internal.size()),
          matchspies(matchspies_)
{
    LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | time_limit_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);

    if (query.empty()) return;

    Xapian::doccount number_of_subdbs = db.internal.size();
    vector<Xapian::RSet> subrsets;
    split_rset_by_db(omrset, number_of_subdbs, subrsets);

    for (size_t i = 0; i != number_of_subdbs; ++i) {
        Xapian::Database::Internal *subdb = db.internal[i].get();
        Assert(subdb);
        intrusive_ptr<SubMatch> smatch;

        // There is currently only one special case, for network databases.
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
            RemoteDatabase *rem_db = static_cast<RemoteDatabase*>(subdb);
            if (have_sorter) {
                throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
            }
            if (have_mdecider) {
                throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
            }
            // FIXME: Remote handling for time_limit with multiple
            // databases may need some work.
            rem_db->set_query(query, qlen, collapse_max, collapse_key,
                              order, sort_key, sort_by, sort_value_forward,
                              time_limit,
                              percent_cutoff, weight_cutoff, weight,
                              subrsets[i], matchspies);
            bool decreasing_relevance =
                (sort_by == REL || sort_by == REL_VAL);
            smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
            is_remote[i] = true;
        } else {
            smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
            subdb->readahead_for_query(query);
        }
#else
        // Avoid unused parameter warnings.
        (void)have_sorter;
        (void)have_mdecider;
        smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
#endif /* XAPIAN_HAS_REMOTE_BACKEND */
        leaves.push_back(smatch);
    }

    stats.set_query(query);
    prepare_sub_matches(leaves, stats);
    stats.set_bounds_from_db(db);
}
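
// Return the maximum possible weight for the current state of the postlist
// tree, recalculating it only when a prune has flagged that the cached value
// may be stale.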
double
MultiMatch::getorrecalc_maxweight(PostList *pl)
{
    LOGCALL(MATCH, double, "MultiMatch::getorrecalc_maxweight", pl);
    double wt;
    if (recalculate_w_max) {
        LOGLINE(MATCH, "recalculating max weight");
        wt = pl->recalc_maxweight();
        recalculate_w_max = false;
    } else {
        wt = pl->get_maxweight();
        LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
        AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
    }
    LOGLINE(MATCH, "max possible doc weight = " << wt);
    RETURN(wt);
}

void
MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
                     Xapian::doccount check_at_least,
                     Xapian::MSet & mset,
                     Xapian::Weight::Internal & stats,
                     const Xapian::MatchDecider *mdecider,
                     const Xapian::KeyMaker *sorter)
{
    LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("sorter"));
    AssertRel(check_at_least,>=,maxitems);

    if (query.empty()) {
        mset = Xapian::MSet();
        mset.internal->firstitem = first;
        return;
    }

    Assert(!leaves.empty());

    TimeOut timeout(time_limit);

#ifdef XAPIAN_HAS_REMOTE_BACKEND
    // If there's only one database and it's remote, we can just unserialise
    // its MSet and return that.
    if (leaves.size() == 1 && is_remote[0]) {
        RemoteSubMatch * rem_match;
        rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
        rem_match->start_match(first, maxitems, check_at_least, stats);
        rem_match->get_mset(mset);
        return;
    }
#endif

    // Start matchers.
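    // Each submatch is asked for the full first + maxitems results, since the
    // merge across subdatabases happens locally and any one of them could
    // supply all of the top hits.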
    for (auto && leaf : leaves) {
        leaf->start_match(0, first + maxitems, first + check_at_least, stats);
    }

    // Get postlists and term info
    vector<PostList *> postlists;
    Xapian::termcount total_subqs = 0;
    // Keep a count of matches which we know exist, but we won't see.  This
    // occurs when a submatch is remote, and returns a lower bound on the
    // number of matching documents which is higher than the number of
    // documents it returns (because it wasn't asked for more documents).
    Xapian::doccount definite_matches_not_seen = 0;
    for (size_t i = 0; i != leaves.size(); ++i) {
        PostList * pl = leaves[i]->get_postlist(this, &total_subqs);
        if (is_remote[i]) {
            if (pl->get_termfreq_min() > first + maxitems) {
                LOGLINE(MATCH, "Found " <<
                        pl->get_termfreq_min() - (first + maxitems)
                        << " definite matches in remote submatch "
                        "which aren't passed to local match");
                definite_matches_not_seen += pl->get_termfreq_min();
                definite_matches_not_seen -= first + maxitems;
            }
        }
        postlists.push_back(pl);
    }
    Assert(!postlists.empty());
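
    // Bump the reference count on the stack-allocated ValueStreamDocument so
    // that wrapping it in a Xapian::Document below doesn't end up trying to
    // delete it.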
    ValueStreamDocument vsdoc(db);
    ++vsdoc._refs;
    Xapian::Document doc(&vsdoc);

    // Get a single combined postlist
    AutoPtr<PostList> pl;
    if (postlists.size() == 1) {
        pl.reset(postlists.front());
    } else {
        pl.reset(new MergePostList(postlists, this, vsdoc));
    }

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");

    // Empty result set
    Xapian::doccount docs_matched = 0;
    double greatest_wt = 0;
    Xapian::termcount greatest_wt_subqs_matched = 0;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    unsigned greatest_wt_subqs_db_num = UINT_MAX;
#endif
    vector<Xapian::Internal::MSetItem> items;

    // maximum weight a document could possibly have
    const double max_possible = pl->recalc_maxweight();

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
    recalculate_w_max = false;

    Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
    Xapian::doccount matches_lower_bound = 0;
    Xapian::doccount matches_estimated = pl->get_termfreq_est();

    if (mdecider == NULL) {
        // If we have a match decider, the lower bound must be
        // set to 0 as we could discard all hits.  Otherwise set it to the
        // minimum number of entries which the postlist could return.
        matches_lower_bound = pl->get_termfreq_min();
    }

    // Prepare the matchspy
    Xapian::MatchSpy *matchspy = NULL;
    MultipleMatchSpy multispy(matchspies);
    if (!matchspies.empty()) {
        if (matchspies.size() == 1) {
            matchspy = matchspies[0].get();
        } else {
            matchspy = &multispy;
        }
    }

    // Check if any results have been asked for (might just be wanting
    // maxweight).
    if (check_at_least == 0) {
        pl.reset(NULL);
        Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
        if (collapse_max) {
            // Lower bound must be set to no more than collapse_max, since it's
            // possible that all matching documents have the same collapse_key
            // value and so are collapsed together.
            if (matches_lower_bound > collapse_max)
                matches_lower_bound = collapse_max;
        }
        mset.internal = new Xapian::MSet::Internal(
                                           first,
                                           matches_upper_bound,
                                           matches_lower_bound,
                                           matches_estimated,
                                           matches_upper_bound,
                                           uncollapsed_lower_bound,
                                           matches_estimated,
                                           max_possible, greatest_wt, items,
                                           0);
        return;
    }

    // Number of documents considered by a decider.
    Xapian::doccount decider_considered = 0;
    // Number of documents denied by the decider.
    Xapian::doccount decider_denied = 0;

    // Set max number of results that we want - this is used to decide
    // when to throw away unwanted items.
    Xapian::doccount max_msize = first + maxitems;
    items.reserve(max_msize + 1);

    // Tracks the minimum item currently eligible for the MSet - we compare
    // candidate items against this.
    Xapian::Internal::MSetItem min_item(0.0, 0);

    // Minimum weight an item must have to be worth considering.
    double min_weight = weight_cutoff;

    // Factor to multiply maximum weight seen by to get the cutoff weight.
    double percent_cutoff_factor = percent_cutoff / 100.0;
    // Corresponding correction to that in omenquire.cc to account for excess
    // precision on x86.
    percent_cutoff_factor -= DBL_EPSILON;

    // Object to handle collapsing.
    Collapser collapser(collapse_key, collapse_max);

    /// Comparison functor for sorting MSet
    bool sort_forward = (order != Xapian::Enquire::DESCENDING);
    MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));

    // Perform query

    // We form the mset in two stages.  In the first we fill up our working
    // mset.  Adding a new document does not remove another.
    //
    // In the second, we consider documents which rank higher than the current
    // lowest ranking document in the mset.  Each document added expels the
    // current lowest ranking document.
    //
    // If a percentage cutoff is in effect, it can cause the matcher to return
    // from the second stage to the first.

    // Is the mset a valid heap?
    bool is_heap = false;
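
    // Each iteration of this loop advances the postlist tree to the next
    // candidate document, then decides whether it belongs in the proto-mset.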
    while (true) {
        bool pushback;

        if (rare(recalculate_w_max)) {
            if (min_weight > 0.0) {
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
                    break;
                }
            }
        }

        PostList * pl_copy = pl.get();
        if (rare(next_handling_prune(pl_copy, min_weight, this))) {
            (void)pl.release();
            pl.reset(pl_copy);
            LOGLINE(MATCH, "*** REPLACING ROOT");

            if (min_weight > 0.0) {
                // No need for a full recalc (unless we've got to do one
                // because of a prune elsewhere) - we're just switching to a
                // subtree.
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
                    break;
                }
            }
        }

        if (rare(pl->at_end())) {
            LOGLINE(MATCH, "Reached end of potential matches");
            break;
        }

        // Only calculate the weight if we need it for mcmp, or there's a
        // percentage or weight cutoff in effect.  Otherwise we calculate it
        // below if we haven't already rejected this candidate.
        double wt = 0.0;
        bool calculated_weight = false;
        if (sort_by != VAL || min_weight > 0.0) {
            wt = pl->get_weight();
            if (wt < min_weight) {
                LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
                continue;
            }
            calculated_weight = true;
        }

        Xapian::docid did = pl->get_docid();
        vsdoc.set_document(did);
        LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
        Xapian::Internal::MSetItem new_item(wt, did);
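        // Once the time limit has expired, stop checking documents beyond
        // those needed to fill the requested MSet.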
        if (check_at_least > maxitems && timeout.timed_out()) {
            check_at_least = maxitems;
        }

        if (sort_by != REL) {
            const string * ptr = pl->get_sort_key();
            if (ptr) {
                new_item.sort_key = *ptr;
            } else if (sorter) {
                new_item.sort_key = (*sorter)(doc);
            } else {
                new_item.sort_key = vsdoc.get_value(sort_key);
            }

            // We're sorting by value (in part at least), so compare the item
            // against the lowest currently in the proto-mset.  If sort_by is
            // VAL, then new_item.wt won't yet be set, but that doesn't
            // matter since it's not used by the sort function.
            if (!mcmp(new_item, min_item)) {
                if (mdecider == NULL && !collapser) {
                    // Document was definitely suitable for mset - no more
                    // processing needed.
                    LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
                    ++docs_matched;
                    if (!calculated_weight) wt = pl->get_weight();
                    if (matchspy) {
                        matchspy->operator()(doc, wt);
                    }
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }
                if (docs_matched >= check_at_least) {
                    // We've seen enough items - we can drop this one.
                    LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
                    // FIXME: hmm, match decider might have rejected this...
                    if (!calculated_weight) wt = pl->get_weight();
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }

                // We can't drop the item, because we need to test whether the
                // mdecider would accept it and/or test whether it would be
                // collapsed.
                LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
            }
        }

        // Use the match spy and/or decision functors (if specified).
        if (matchspy != NULL || mdecider != NULL) {
            const unsigned int multiplier = db.internal.size();
            Assert(multiplier != 0);
            Xapian::doccount n = (did - 1) % multiplier; // which actual database
            // If the results are from a remote database, then the functor will
            // already have been applied there so we can skip this step.
            if (!is_remote[n]) {
                ++decider_considered;
                if (mdecider && !mdecider->operator()(doc)) {
                    ++decider_denied;
                    continue;
                }
                if (matchspy) {
                    if (!calculated_weight) {
                        wt = pl->get_weight();
                        new_item.wt = wt;
                        calculated_weight = true;
                    }
                    matchspy->operator()(doc, wt);
                }
            }
        }

        if (!calculated_weight) {
            // we didn't calculate the weight above, but now we will need it
            wt = pl->get_weight();
            new_item.wt = wt;
        }

        pushback = true;

        // Perform collapsing on key if requested.
        if (collapser) {
            collapse_result res;
            res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
            if (res == REJECTED) {
                // If we're sorting by relevance primarily, then we throw away
                // the lower weighted document anyway.
                if (sort_by != REL && sort_by != REL_VAL) {
                    if (wt > greatest_wt) goto new_greatest_weight;
                }
                continue;
            }

            if (res == REPLACED) {
                // There was a previous item in the collapse tab so
                // the MSet can't be empty.
                Assert(!items.empty());

                const Xapian::Internal::MSetItem & old_item =
                    collapser.old_item;
                // This is one of the best collapse_max potential MSet entries
                // with this key which we've seen so far.  Check if the
                // entry with this key which it displaced might still be in the
                // proto-MSet.  If it might be, we need to check through for
                // it.
                double old_wt = old_item.wt;
                if (old_wt >= min_weight && mcmp(old_item, min_item)) {
                    // Scan through (unsorted) MSet looking for entry.
                    // FIXME: more efficient way than just scanning?
                    Xapian::docid olddid = old_item.did;
                    vector<Xapian::Internal::MSetItem>::iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        if (i->did == olddid) {
                            LOGLINE(MATCH, "collapse: removing " <<
                                    olddid << ": " <<
                                    new_item.collapse_key);
                            // We can replace an arbitrary element in O(log N)
                            // but have to do it by hand (in this case the new
                            // elt is bigger, so we just swap down the tree).
                            // FIXME: implement this, and clean up is_heap
                            // handling
                            *i = new_item;
                            pushback = false;
                            is_heap = false;
                            break;
                        }
                    }
                }
            }
        }

        // OK, actually add the item to the mset.
        if (pushback) {
            ++docs_matched;
            if (items.size() >= max_msize) {
                items.push_back(new_item);
                if (!is_heap) {
                    is_heap = true;
                    make_heap(items.begin(), items.end(), mcmp);
                } else {
                    push_heap<vector<Xapian::Internal::MSetItem>::iterator,
                              MSetCmp>(items.begin(), items.end(), mcmp);
                }
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                items.pop_back();

                min_item = items.front();
                if (sort_by == REL || sort_by == REL_VAL) {
                    if (docs_matched >= check_at_least) {
                        if (sort_by == REL) {
                            // We're done if this is a forward boolean match
                            // with only one database (bodgetastic, FIXME
                            // better if we can!)
                            if (rare(max_possible == 0 && sort_forward)) {
                                // In the multi database case, MergePostList
                                // currently processes each database
                                // sequentially (which actually may well be
                                // more efficient) so the docids in general
                                // won't arrive in order.
                                if (leaves.size() == 1) break;
                            }
                        }
                        if (min_item.wt > min_weight) {
                            LOGLINE(MATCH, "Setting min_weight to " <<
                                    min_item.wt << " from " << min_weight);
                            min_weight = min_item.wt;
                        }
                    }
                }
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
                    break;
                }
            } else {
                items.push_back(new_item);
                is_heap = false;
                if (sort_by == REL && items.size() == max_msize) {
                    if (docs_matched >= check_at_least) {
                        // We're done if this is a forward boolean match
                        // with only one database (bodgetastic, FIXME
                        // better if we can!)
                        if (rare(max_possible == 0 && sort_forward)) {
                            // In the multi database case, MergePostList
                            // currently processes each database
                            // sequentially (which actually may well be
                            // more efficient) so the docids in general
                            // won't arrive in order.
                            if (leaves.size() == 1) break;
                        }
                    }
                }
            }
        }

        // Keep a track of the greatest weight we've seen.
        if (wt > greatest_wt) {
new_greatest_weight:
            greatest_wt = wt;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            const unsigned int multiplier = db.internal.size();
            unsigned int db_num = (did - 1) % multiplier;
            if (is_remote[db_num]) {
                // Note that the greatest weighted document came from a remote
                // database, and which one.
                greatest_wt_subqs_db_num = db_num;
            } else
#endif
            {
                greatest_wt_subqs_matched = pl->count_matching_subqs();
#ifdef XAPIAN_HAS_REMOTE_BACKEND
                greatest_wt_subqs_db_num = UINT_MAX;
#endif
            }
            if (percent_cutoff) {
                double w = wt * percent_cutoff_factor;
                if (w > min_weight) {
                    min_weight = w;
                    if (!is_heap) {
                        is_heap = true;
                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                  MSetCmp>(items.begin(), items.end(), mcmp);
                    }
                    while (!items.empty() && items.front().wt < min_weight) {
                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                 MSetCmp>(items.begin(), items.end(), mcmp);
                        Assert(items.back().wt < min_weight);
                        items.pop_back();
                    }
#ifdef XAPIAN_ASSERTIONS_PARANOID
                    vector<Xapian::Internal::MSetItem>::const_iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        Assert(i->wt >= min_weight);
                    }
#endif
                }
            }
        }
    }

    // done with posting list tree
    pl.reset(NULL);
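
    // percent_scale is the factor which maps weights to percentages: it is
    // based on the fraction of subqueries matched by the highest weighted
    // document (or, if that document came from a remote submatch, on the
    // factor reported by that submatch).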
    double percent_scale = 0;
    if (!items.empty() && greatest_wt > 0) {
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (greatest_wt_subqs_db_num != UINT_MAX) {
            const unsigned int n = greatest_wt_subqs_db_num;
            RemoteSubMatch * rem_match;
            rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
            percent_scale = rem_match->get_percent_factor() / 100.0;
        } else
#endif
        {
            percent_scale = greatest_wt_subqs_matched / double(total_subqs);
            percent_scale /= greatest_wt;
        }
        Assert(percent_scale > 0);
        if (percent_cutoff) {
            // FIXME: better to sort and binary chop maybe?
            // Or we could just use a linear scan here instead.

            // trim the mset to the correct answer...
            double min_wt = percent_cutoff_factor / percent_scale;
            if (!is_heap) {
                is_heap = true;
                make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                          MSetCmp>(items.begin(), items.end(), mcmp);
            }
            while (!items.empty() && items.front().wt < min_wt) {
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                Assert(items.back().wt < min_wt);
                items.pop_back();
            }
#ifdef XAPIAN_ASSERTIONS_PARANOID
            vector<Xapian::Internal::MSetItem>::const_iterator j;
            for (j = items.begin(); j != items.end(); ++j) {
                Assert(j->wt >= min_wt);
            }
#endif
        }
    }

    LOGLINE(MATCH,
            "docs_matched = " << docs_matched <<
            ", definite_matches_not_seen = " << definite_matches_not_seen <<
            ", matches_lower_bound = " << matches_lower_bound <<
            ", matches_estimated = " << matches_estimated <<
            ", matches_upper_bound = " << matches_upper_bound);

    // Adjust docs_matched to take account of documents which matched remotely
    // but weren't sent across.
    docs_matched += definite_matches_not_seen;
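
    // The uncollapsed_* bounds track the statistics which would apply if no
    // collapsing were in effect, so that MSet::get_uncollapsed_matches_*()
    // can report them.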
    Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
    Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
    Xapian::doccount uncollapsed_estimated = matches_estimated;
    if (items.size() < max_msize) {
        // We have fewer items in the mset than we tried to get for it, so we
        // must have all the matches in it.
        LOGLINE(MATCH, "items.size() = " << items.size() <<
                ", max_msize = " << max_msize << ", setting bounds equal");
        Assert(definite_matches_not_seen == 0);
        Assert(percent_cutoff || docs_matched == items.size());
        matches_lower_bound = matches_upper_bound = matches_estimated
            = items.size();
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    } else if (!collapser && docs_matched < check_at_least) {
        // We have seen fewer matches than we checked for, so we must have seen
        // all the matches.
        LOGLINE(MATCH, "Setting bounds equal");
        matches_lower_bound = matches_upper_bound = matches_estimated
            = docs_matched;
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    } else {
        AssertRel(matches_estimated,>=,matches_lower_bound);
        AssertRel(matches_estimated,<=,matches_upper_bound);

        // We can end up scaling the estimate more than once, so collect
        // the scale factors and apply them in one go to avoid rounding
        // more than once.
        double estimate_scale = 1.0;
        double unique_rate = 1.0;

        if (collapser) {
            LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
            matches_lower_bound = collapser.get_matches_lower_bound();
            if (matches_lower_bound > uncollapsed_lower_bound)
                uncollapsed_lower_bound = matches_lower_bound;

            // The estimate for the number of hits can be modified by
            // multiplying it by the rate at which we've been finding
            // unique documents.
            Xapian::doccount docs_considered = collapser.get_docs_considered();
            Xapian::doccount dups_ignored = collapser.get_dups_ignored();
            if (docs_considered > 0) {
                double unique = double(docs_considered - dups_ignored);
                unique_rate = unique / double(docs_considered);
            }

            // We can safely reduce the upper bound by the number of duplicates
            // we've ignored.
            matches_upper_bound -= dups_ignored;

            LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
                    ", matches_estimated=" << matches_estimated <<
                    "*" << estimate_scale << "*" << unique_rate <<
                    ", matches_upper_bound=" << matches_upper_bound);
        }

        if (mdecider) {
            if (!percent_cutoff) {
                if (!collapser) {
                    // We're not collapsing or doing a percentage cutoff, so
                    // docs_matched is a lower bound on the total number of
                    // matches.
                    matches_lower_bound = max(docs_matched, matches_lower_bound);
                }
            }

            // The estimate for the number of hits can be modified by
            // multiplying it by the rate at which the match decider has
            // been accepting documents.
            if (decider_considered > 0) {
                double accept = double(decider_considered - decider_denied);
                double accept_rate = accept / double(decider_considered);
                estimate_scale *= accept_rate;
            }

            // If a document is denied by a match decider, it is not possible
            // for it to be found to be a duplicate, so it is safe to also
            // reduce the upper bound by the number of documents denied by a
            // match decider.
            matches_upper_bound -= decider_denied;
            if (collapser) uncollapsed_upper_bound -= decider_denied;
        }

        if (percent_cutoff) {
            estimate_scale *= (1.0 - percent_cutoff_factor);
            // another approach:
            // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
            // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());

            // Very likely an underestimate, but we can't really do better
            // without checking further matches...  Only possibility would be
            // to track how many docs made the min weight test but didn't make
            // the candidate set since the last greatest_wt change, which we
            // could use if the top documents matched all the prob terms.
            matches_lower_bound = items.size();
            if (collapser) uncollapsed_lower_bound = matches_lower_bound;

            // matches_upper_bound could be reduced by the number of documents
            // which fail the min weight test (FIXME)

            LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
                    percent_cutoff << "): now have matches_estimated=" <<
                    matches_estimated << ", matches_lower_bound=" <<
                    matches_lower_bound);
        }

        if (collapser && estimate_scale != 1.0) {
            uncollapsed_estimated =
                Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
        }

        estimate_scale *= unique_rate;

        if (estimate_scale != 1.0) {
            matches_estimated =
                Xapian::doccount(matches_estimated * estimate_scale + 0.5);
            if (matches_estimated < matches_lower_bound)
                matches_estimated = matches_lower_bound;
        }

        if (collapser || mdecider) {
            LOGLINE(MATCH, "Clamping estimate between bounds: "
                    "matches_lower_bound = " << matches_lower_bound <<
                    ", matches_estimated = " << matches_estimated <<
                    ", matches_upper_bound = " << matches_upper_bound);
            AssertRel(matches_lower_bound,<=,matches_upper_bound);
            matches_estimated = max(matches_estimated, matches_lower_bound);
            matches_estimated = min(matches_estimated, matches_upper_bound);
        } else if (!percent_cutoff) {
            AssertRel(docs_matched,<=,matches_upper_bound);
            if (docs_matched > matches_lower_bound)
                matches_lower_bound = docs_matched;
            if (docs_matched > matches_estimated)
                matches_estimated = docs_matched;
        }

        if (collapser && !mdecider && !percent_cutoff) {
            AssertRel(docs_matched,<=,uncollapsed_upper_bound);
            if (docs_matched > uncollapsed_lower_bound)
                uncollapsed_lower_bound = docs_matched;
        }
    }

    if (collapser) {
        AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
        if (uncollapsed_estimated < uncollapsed_lower_bound) {
            uncollapsed_estimated = uncollapsed_lower_bound;
        } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
            uncollapsed_estimated = uncollapsed_upper_bound;
        }
    } else {
        // We're not collapsing, so the bounds must be the same.
        uncollapsed_lower_bound = matches_lower_bound;
        uncollapsed_upper_bound = matches_upper_bound;
        uncollapsed_estimated = matches_estimated;
    }

    LOGLINE(MATCH, items.size() << " items in potential mset");

    if (first > 0) {
        // Remove unwanted leading entries
        if (items.size() <= first) {
            items.clear();
        } else {
            LOGLINE(MATCH, "finding " << first << "th");
            // We perform nth_element() on reverse iterators so that the
            // unwanted elements end up at the end of items, which means
            // that the call to erase() to remove them doesn't have to copy
            // any elements.
            vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
            nth = items.rbegin() + first;
            nth_element(items.rbegin(), nth, items.rend(), mcmp);
            // Erase the trailing "first" elements
            items.erase(items.begin() + items.size() - first, items.end());
        }
    }

    LOGLINE(MATCH, "sorting " << items.size() << " entries");

    // Need a stable sort, but this is provided by comparison operator
    sort(items.begin(), items.end(), mcmp);

    if (!items.empty()) {
        LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
        LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
    }

    AssertRel(matches_estimated,>=,matches_lower_bound);
    AssertRel(matches_estimated,<=,matches_upper_bound);

    AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
    AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);

    // We may need to qualify any collapse_count to see if the highest weight
    // collapsed item would have qualified for percent_cutoff.
    // We WILL need to restore collapse_count to the mset by taking from
    // collapse_tab; this is what comes of copying around whole objects
    // instead of taking references - we find it hard to update collapse_count
    // of an item that has already been pushed back, as we don't know where it
    // is any more.  If we keep or find references we won't need to mess with
    // is_heap so much maybe?
    if (!items.empty() && collapser && !collapser.empty()) {
        // Nicked this formula from above.
        double min_wt = 0.0;
        if (percent_scale > 0.0)
            min_wt = percent_cutoff_factor / percent_scale;
        Xapian::doccount entries = collapser.entries();
        vector<Xapian::Internal::MSetItem>::iterator i;
        for (i = items.begin(); i != items.end(); ++i) {
            // Skip entries without a collapse key.
            if (i->collapse_key.empty()) continue;

            // Set collapse_count appropriately.
            i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
            if (--entries == 0) {
                // Stop once we've processed all items with collapse keys.
                break;
            }
        }
    }

    mset.internal = new Xapian::MSet::Internal(
                                       first,
                                       matches_upper_bound,
                                       matches_lower_bound,
                                       matches_estimated,
                                       uncollapsed_upper_bound,
                                       uncollapsed_lower_bound,
                                       uncollapsed_estimated,
                                       max_possible, greatest_wt, items,
                                       percent_scale * 100.0);
}