3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
6 * Copyright 2003 Orange PCS Ltd
7 * Copyright 2003 Sam Liddicott
8 * Copyright 2007,2008,2009 Lemur Consulting Ltd
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
28 #include "multimatch.h"
31 #include "collapser.h"
34 #include "localsubmatch.h"
36 #include "api/omenquireinternal.h"
39 #include "api/emptypostlist.h"
40 #include "branchpostlist.h"
41 #include "mergepostlist.h"
43 #include "backends/document.h"
47 #include "valuestreamdocument.h"
48 #include "weight/weightinternal.h"
50 #include <xapian/matchspy.h>
51 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
53 #ifdef XAPIAN_HAS_REMOTE_BACKEND
54 #include "remotesubmatch.h"
55 #include "backends/remote/remote-database.h"
56 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
59 #include <cfloat> // For DBL_EPSILON.
60 #include <climits> // For UINT_MAX.
65 #ifdef HAVE_TIMER_CREATE
// Timer-expiry callback: invoked when the POSIX timer armed by TimeOut's
// constructor fires, and sets the flag which sival_ptr points at so the
// match loop can notice the time limit has passed.
// NOTE(review): the extracted text is missing the lines declaring this
// function (presumably `extern "C" static void`) -- confirm upstream.
72 set_timeout_flag(union sigval sv
)
// The flag is volatile because it is written from the timer notification
// context and polled from the matcher thread.
74 *(reinterpret_cast<volatile bool*>(sv
.sival_ptr
)) = true;
// Per-platform choice of clock for the match time limit: CLOCK_MONOTONIC
// where it works, falling back to CLOCK_REALTIME on platforms where
// timer_create() doesn't support the monotonic clock.
80 // Solaris defines CLOCK_MONOTONIC, but "man timer_create" doesn't mention it
81 // and using it fails.
82 const clockid_t TIMEOUT_CLOCK
= CLOCK_REALTIME
;
83 #elif defined __CYGWIN__
84 // https://cygwin.com/cygwin-api/std-notes.html currently (2015-11-22) says:
86 // clock_setres, clock_settime, and timer_create currently support only
88 const clockid_t TIMEOUT_CLOCK
= CLOCK_REALTIME
;
// Default case: a monotonic clock is preferable as it is unaffected by
// wall-clock adjustments while a match is running.
90 const clockid_t TIMEOUT_CLOCK
= CLOCK_MONOTONIC
;
// Fragments of class TimeOut: RAII wrapper around a POSIX per-process timer
// which sets `expired` once the given time limit elapses.
// NOTE(review): the extraction is missing the `class TimeOut {` line and the
// declarations of the `sev` (struct sigevent) and `timerid` (timer_t)
// members used below -- confirm against the upstream source.
96 volatile bool expired
;
// Constructor: arm a one-shot timer for `limit` seconds which calls
// set_timeout_flag() on a new thread (SIGEV_THREAD) to set `expired`.
// NOTE(review): a missing line here presumably guards against limit <= 0.
99 TimeOut(double limit
) : expired(false) {
101 sev
.sigev_notify
= SIGEV_THREAD
;
102 sev
.sigev_notify_function
= set_timeout_flag
;
103 sev
.sigev_notify_attributes
= NULL
;
// Pass the address of `expired` to the callback via sival_ptr; the
// const_cast/static_cast dance is needed because sival_ptr is void*.
104 sev
.sigev_value
.sival_ptr
=
105 static_cast<void*>(const_cast<bool*>(&expired
));
106 if (usual(timer_create(TIMEOUT_CLOCK
, &sev
, &timerid
) == 0)) {
// One-shot timer: zero interval, initial expiry set from `limit`.
107 struct itimerspec interval
;
108 interval
.it_interval
.tv_sec
= 0;
109 interval
.it_interval
.tv_nsec
= 0;
110 RealTime::to_timespec(limit
, &interval
.it_value
);
111 if (usual(timer_settime(timerid
, 0, &interval
, NULL
) == 0)) {
112 // Timeout successfully set.
// Arming failed: release the timer we created.  (A missing line here
// presumably `return`s on the success path above.)
115 timer_delete(timerid
);
// sigev_notify doubles as a "timer is armed" flag: SIGEV_NONE means
// there is no timer for the destructor to tear down.
118 sev
.sigev_notify
= SIGEV_NONE
;
// Destructor fragment: delete the timer iff one was successfully armed.
// NOTE(review): the `~TimeOut()` declaration line itself is missing from
// the extraction.
122 if (sev
.sigev_notify
!= SIGEV_NONE
) {
123 timer_delete(timerid
);
124 sev
.sigev_notify
= SIGEV_NONE
;
// Poll whether the time limit has been reached.
128 bool timed_out() const { return expired
; }
// Stub used when HAVE_TIMER_CREATE is not defined: the time limit is
// simply never enforced, so timed_out() always reports false.
134 bool timed_out() const { return false; }
139 using Xapian::Internal::intrusive_ptr
;
// File-local shorthand for the Enquire sort-setting enumerators, used
// heavily when deciding document ordering below.
141 const Xapian::Enquire::Internal::sort_setting REL
=
142 Xapian::Enquire::Internal::REL
;
143 const Xapian::Enquire::Internal::sort_setting REL_VAL
=
144 Xapian::Enquire::Internal::REL_VAL
;
145 const Xapian::Enquire::Internal::sort_setting VAL
=
146 Xapian::Enquire::Internal::VAL
;
147 #if 0 // VAL_REL isn't currently used so avoid compiler warnings.
148 const Xapian::Enquire::Internal::sort_setting VAL_REL
=
149 Xapian::Enquire::Internal::VAL_REL
;
152 /** Split an RSet into several sub rsets, one for each database.
154 * @param rset The RSet to split.
155 * @param number_of_subdbs The number of sub databases which exist.
156 * @param subrsets Vector of RSets which will the sub rsets will be placed in.
157 * This should be empty when the function is called.
// NOTE(review): the extraction is missing the `static void` line of this
// definition and several braces/`else` lines in the body below.
160 split_rset_by_db(const Xapian::RSet
* rset
,
161 Xapian::doccount number_of_subdbs
,
162 vector
<Xapian::RSet
> & subrsets
)
164 LOGCALL_STATIC_VOID(MATCH
, "split_rset_by_db", rset
| number_of_subdbs
| subrsets
);
165 if (rset
&& !rset
->empty()) {
166 if (number_of_subdbs
== 1) {
167 // The common case of a single database is easy to handle.
168 subrsets
.push_back(*rset
);
170 // Can't just use vector::resize() here, since that creates N
171 // copies of the same RSet!
172 subrsets
.reserve(number_of_subdbs
);
173 for (size_t i
= 0; i
< number_of_subdbs
; ++i
) {
174 subrsets
.push_back(Xapian::RSet());
177 const set
<Xapian::docid
> & rsetitems
= rset
->internal
->get_items();
178 set
<Xapian::docid
>::const_iterator j
;
// Combined docids are striped across the subdatabases: subdatabase
// index is (did - 1) % number_of_subdbs and the local docid within it
// is (did - 1) / number_of_subdbs + 1, as computed below.
179 for (j
= rsetitems
.begin(); j
!= rsetitems
.end(); ++j
) {
180 Xapian::doccount local_docid
= (*j
- 1) / number_of_subdbs
+ 1;
181 Xapian::doccount subdatabase
= (*j
- 1) % number_of_subdbs
;
182 subrsets
[subdatabase
].add_document(local_docid
);
// Empty/NULL rset case: just make one empty RSet per subdatabase.
186 // NB vector::resize() creates N copies of the same empty RSet.
187 subrsets
.resize(number_of_subdbs
);
189 Assert(subrsets
.size() == number_of_subdbs
);
192 /** Prepare some SubMatches.
194 * This calls the prepare_match() method on each SubMatch object, causing them
195 * to lookup various statistics.
197 * This method is rather complicated in order to handle remote matches
198 * efficiently. Instead of simply calling "prepare_match()" on each submatch
199 * and waiting for it to return, it first calls "prepare_match(true)" on each
200 * submatch. If any of these calls return false, indicating that the required
201 * information has not yet been received from the server, the method will try
202 * those which returned false again, passing "false" as a parameter to
203 * indicate that they should block until the information is ready.
205 * This should improve performance in the case of mixed local-and-remote
206 * searches - the local searchers will all fetch their statistics from disk
207 * without waiting for the remote searchers, so as soon as the remote searcher
208 * statistics arrive, we can move on to the next step.
// NOTE(review): the extraction is missing the `static void` line of this
// definition and the outer loop lines which declare `nowait` and decrement
// `unprepared` -- confirm against the upstream source.
211 prepare_sub_matches(vector
<intrusive_ptr
<SubMatch
> > & leaves
,
212 Xapian::Weight::Internal
& stats
)
214 LOGCALL_STATIC_VOID(MATCH
, "prepare_sub_matches", leaves
| stats
);
215 // We use a vector<bool> to track which SubMatches we're already prepared.
216 vector
<bool> prepared
;
217 prepared
.resize(leaves
.size(), false);
218 size_t unprepared
= leaves
.size();
// Inner pass: try to prepare every leaf which isn't done yet.
221 for (size_t leaf
= 0; leaf
< leaves
.size(); ++leaf
) {
222 if (prepared
[leaf
]) continue;
223 SubMatch
* submatch
= leaves
[leaf
].get();
// A NULL submatch counts as prepared; otherwise prepare_match() returns
// true once this leaf's statistics are available (`nowait` controls
// whether a remote leaf may return false instead of blocking).
224 if (!submatch
|| submatch
->prepare_match(nowait
, stats
)) {
225 prepared
[leaf
] = true;
229 // Use blocking IO on subsequent passes, so that we don't go into
235 /// Class which applies several match spies in turn.
236 class MultipleMatchSpy
: public Xapian::MatchSpy
{
238 /// List of match spies to call, in order.
239 const std::vector
<Xapian::Internal::opt_intrusive_ptr
<Xapian::MatchSpy
>> & spies
;
// Constructor just stores a reference to the caller's spy list; the list
// must outlive this object.  NOTE(review): the member-initialiser and
// `public:` lines are missing from the extraction.
242 MultipleMatchSpy(const std::vector
<Xapian::Internal::opt_intrusive_ptr
<Xapian::MatchSpy
>> & spies_
)
245 /** Implementation of virtual operator().
247 * This implementation calls all the spies in turn.
249 void operator()(const Xapian::Document
&doc
, double wt
);
// Out-of-line definition: forward the (document, weight) pair to each spy
// in list order.  NOTE(review): the `void` return-type line and the loop
// body line invoking each spy are missing from the extraction.
253 MultipleMatchSpy::operator()(const Xapian::Document
&doc
, double wt
) {
254 LOGCALL_VOID(MATCH
, "MultipleMatchSpy::operator()", doc
| wt
);
255 for (auto i
: spies
) {
260 ////////////////////////////////////
261 // Initialisation and cleaning up //
262 ////////////////////////////////////
// Constructor: records the match parameters, builds one SubMatch per
// subdatabase (remote or local), then gathers statistics from all of them.
// NOTE(review): the extraction is missing several parameter lines (e.g.
// time_limit_ and order_ initialisers), interior braces/`else` lines, and
// the closing brace -- confirm against the upstream source.
263 MultiMatch::MultiMatch(const Xapian::Database
&db_
,
264 const Xapian::Query
& query_
,
265 Xapian::termcount qlen
,
266 const Xapian::RSet
* omrset
,
267 Xapian::doccount collapse_max_
,
268 Xapian::valueno collapse_key_
,
269 int percent_cutoff_
, double weight_cutoff_
,
270 Xapian::Enquire::docid_order order_
,
271 Xapian::valueno sort_key_
,
272 Xapian::Enquire::Internal::sort_setting sort_by_
,
273 bool sort_value_forward_
,
275 Xapian::Weight::Internal
& stats
,
276 const Xapian::Weight
* weight_
,
277 const vector
<Xapian::Internal::opt_intrusive_ptr
<Xapian::MatchSpy
>> & matchspies_
,
278 bool have_sorter
, bool have_mdecider
)
279 : db(db_
), query(query_
),
280 collapse_max(collapse_max_
), collapse_key(collapse_key_
),
281 percent_cutoff(percent_cutoff_
), weight_cutoff(weight_cutoff_
),
283 sort_key(sort_key_
), sort_by(sort_by_
),
284 sort_value_forward(sort_value_forward_
),
285 time_limit(time_limit_
),
// is_remote is sized to the number of subdatabases; each entry records
// whether the corresponding subdatabase is a remote one (set below).
287 is_remote(db
.internal
.size()),
288 matchspies(matchspies_
)
290 LOGCALL_CTOR(MATCH
, "MultiMatch", db_
| query_
| qlen
| omrset
| collapse_max_
| collapse_key_
| percent_cutoff_
| weight_cutoff_
| int(order_
) | sort_key_
| int(sort_by_
) | sort_value_forward_
| time_limit_
| stats
| weight_
| matchspies_
| have_sorter
| have_mdecider
);
// An empty query matches nothing, so there's nothing to set up.
292 if (query
.empty()) return;
// Split the relevance set so each subdatabase gets its own share.
294 Xapian::doccount number_of_subdbs
= db
.internal
.size();
295 vector
<Xapian::RSet
> subrsets
;
296 split_rset_by_db(omrset
, number_of_subdbs
, subrsets
);
// Build one SubMatch per subdatabase.
298 for (size_t i
= 0; i
!= number_of_subdbs
; ++i
) {
299 Xapian::Database::Internal
*subdb
= db
.internal
[i
].get();
301 intrusive_ptr
<SubMatch
> smatch
;
303 // There is currently only one special case, for network databases.
304 #ifdef XAPIAN_HAS_REMOTE_BACKEND
305 if (subdb
->get_backend_info(NULL
) == BACKEND_REMOTE
) {
306 RemoteDatabase
*rem_db
= static_cast<RemoteDatabase
*>(subdb
);
// Sorters and match deciders can't be serialised to the remote end, so
// reject them up front.  NOTE(review): the `if (have_sorter)` /
// `if (have_mdecider)` guard lines are missing from the extraction.
308 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
311 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
313 // FIXME: Remote handling for time_limit with multiple
314 // databases may need some work.
// Push the query and all match parameters across to the remote server.
315 rem_db
->set_query(query
, qlen
, collapse_max
, collapse_key
,
316 order
, sort_key
, sort_by
, sort_value_forward
,
318 percent_cutoff
, weight_cutoff
, weight
,
319 subrsets
[i
], matchspies
);
// The remote submatch needs to know whether weights will arrive in
// decreasing order (true when relevance is the primary sort).
320 bool decreasing_relevance
=
321 (sort_by
== REL
|| sort_by
== REL_VAL
);
322 smatch
= new RemoteSubMatch(rem_db
, decreasing_relevance
, matchspies
);
// Local (non-remote) subdatabase: plain LocalSubMatch, plus a hint to
// the backend to start reading ahead for this query's terms.
325 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
);
326 subdb
->readahead_for_query(query
);
// Branch taken when the remote backend is compiled out entirely.
329 // Avoid unused parameter warnings.
332 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
);
333 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
334 leaves
.push_back(smatch
);
// Gather the collated statistics all submatches will weight against.
337 stats
.set_query(query
);
338 prepare_sub_matches(leaves
, stats
);
339 stats
.set_bounds_from_db(db
);
// Return the maximum possible weight of the postlist tree, doing the
// (expensive) full recalculation only when the recalculate_w_max flag has
// been raised; otherwise trust the cached value from get_maxweight().
// NOTE(review): the extraction is missing the `double` return-type line,
// the declaration of `wt`, the `} else {`, and the RETURN/closing brace.
343 MultiMatch::getorrecalc_maxweight(PostList
*pl
)
345 LOGCALL(MATCH
, double, "MultiMatch::getorrecalc_maxweight", pl
);
347 if (recalculate_w_max
) {
348 LOGLINE(MATCH
, "recalculating max weight");
349 wt
= pl
->recalc_maxweight();
// Clear the flag so later calls can use the cheap cached path.
350 recalculate_w_max
= false;
// Cached path: get_maxweight() is cheap; the paranoid assertion checks
// it still agrees with a full recalculation.
352 wt
= pl
->get_maxweight();
353 LOGLINE(MATCH
, "pl = (" << pl
->get_description() << ")");
354 AssertEqDoubleParanoid(wt
, pl
->recalc_maxweight());
356 LOGLINE(MATCH
, "max possible doc weight = " << wt
);
361 MultiMatch::get_mset(Xapian::doccount first
, Xapian::doccount maxitems
,
362 Xapian::doccount check_at_least
,
364 Xapian::Weight::Internal
& stats
,
365 const Xapian::MatchDecider
*mdecider
,
366 const Xapian::KeyMaker
*sorter
)
368 LOGCALL_VOID(MATCH
, "MultiMatch::get_mset", first
| maxitems
| check_at_least
| Literal("mset") | stats
| Literal("mdecider") | Literal("sorter"));
369 AssertRel(check_at_least
,>=,maxitems
);
372 mset
= Xapian::MSet();
373 mset
.internal
->firstitem
= first
;
377 Assert(!leaves
.empty());
379 TimeOut
timeout(time_limit
);
381 #ifdef XAPIAN_HAS_REMOTE_BACKEND
382 // If there's only one database and it's remote, we can just unserialise
383 // its MSet and return that.
384 if (leaves
.size() == 1 && is_remote
[0]) {
385 RemoteSubMatch
* rem_match
;
386 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[0].get());
387 rem_match
->start_match(first
, maxitems
, check_at_least
, stats
);
388 rem_match
->get_mset(mset
);
394 for (auto && leaf
: leaves
) {
395 leaf
->start_match(0, first
+ maxitems
, first
+ check_at_least
, stats
);
398 // Get postlists and term info
399 vector
<PostList
*> postlists
;
400 Xapian::termcount total_subqs
= 0;
401 // Keep a count of matches which we know exist, but we won't see. This
402 // occurs when a submatch is remote, and returns a lower bound on the
403 // number of matching documents which is higher than the number of
404 // documents it returns (because it wasn't asked for more documents).
405 Xapian::doccount definite_matches_not_seen
= 0;
406 for (size_t i
= 0; i
!= leaves
.size(); ++i
) {
407 PostList
* pl
= leaves
[i
]->get_postlist(this, &total_subqs
);
409 if (pl
->get_termfreq_min() > first
+ maxitems
) {
410 LOGLINE(MATCH
, "Found " <<
411 pl
->get_termfreq_min() - (first
+ maxitems
)
412 << " definite matches in remote submatch "
413 "which aren't passed to local match");
414 definite_matches_not_seen
+= pl
->get_termfreq_min();
415 definite_matches_not_seen
-= first
+ maxitems
;
418 postlists
.push_back(pl
);
420 Assert(!postlists
.empty());
422 ValueStreamDocument
vsdoc(db
);
424 Xapian::Document
doc(&vsdoc
);
426 // Get a single combined postlist
427 AutoPtr
<PostList
> pl
;
428 if (postlists
.size() == 1) {
429 pl
.reset(postlists
.front());
431 pl
.reset(new MergePostList(postlists
, this, vsdoc
));
434 LOGLINE(MATCH
, "pl = (" << pl
->get_description() << ")");
437 Xapian::doccount docs_matched
= 0;
438 double greatest_wt
= 0;
439 Xapian::termcount greatest_wt_subqs_matched
= 0;
440 #ifdef XAPIAN_HAS_REMOTE_BACKEND
441 unsigned greatest_wt_subqs_db_num
= UINT_MAX
;
443 vector
<Xapian::Internal::MSetItem
> items
;
445 // maximum weight a document could possibly have
446 const double max_possible
= pl
->recalc_maxweight();
448 LOGLINE(MATCH
, "pl = (" << pl
->get_description() << ")");
449 recalculate_w_max
= false;
451 Xapian::doccount matches_upper_bound
= pl
->get_termfreq_max();
452 Xapian::doccount matches_lower_bound
= 0;
453 Xapian::doccount matches_estimated
= pl
->get_termfreq_est();
455 if (mdecider
== NULL
) {
456 // If we have a match decider, the lower bound must be
457 // set to 0 as we could discard all hits. Otherwise set it to the
458 // minimum number of entries which the postlist could return.
459 matches_lower_bound
= pl
->get_termfreq_min();
462 // Prepare the matchspy
463 Xapian::MatchSpy
*matchspy
= NULL
;
464 MultipleMatchSpy
multispy(matchspies
);
465 if (!matchspies
.empty()) {
466 if (matchspies
.size() == 1) {
467 matchspy
= matchspies
[0].get();
469 matchspy
= &multispy
;
473 // Check if any results have been asked for (might just be wanting
475 if (check_at_least
== 0) {
477 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
479 // Lower bound must be set to no more than collapse_max, since it's
480 // possible that all matching documents have the same collapse_key
481 // value and so are collapsed together.
482 if (matches_lower_bound
> collapse_max
)
483 matches_lower_bound
= collapse_max
;
486 mset
.internal
= new Xapian::MSet::Internal(
492 uncollapsed_lower_bound
,
494 max_possible
, greatest_wt
, items
,
499 // Number of documents considered by a decider.
500 Xapian::doccount decider_considered
= 0;
501 // Number of documents denied by the decider.
502 Xapian::doccount decider_denied
= 0;
504 // Set max number of results that we want - this is used to decide
505 // when to throw away unwanted items.
506 Xapian::doccount max_msize
= first
+ maxitems
;
507 items
.reserve(max_msize
+ 1);
509 // Tracks the minimum item currently eligible for the MSet - we compare
510 // candidate items against this.
511 Xapian::Internal::MSetItem
min_item(0.0, 0);
513 // Minimum weight an item must have to be worth considering.
514 double min_weight
= weight_cutoff
;
516 // Factor to multiply maximum weight seen by to get the cutoff weight.
517 double percent_cutoff_factor
= percent_cutoff
/ 100.0;
518 // Corresponding correction to that in omenquire.cc to account for excess
520 percent_cutoff_factor
-= DBL_EPSILON
;
522 // Object to handle collapsing.
523 Collapser
collapser(collapse_key
, collapse_max
);
525 /// Comparison functor for sorting MSet
526 bool sort_forward
= (order
!= Xapian::Enquire::DESCENDING
);
527 MSetCmp
mcmp(get_msetcmp_function(sort_by
, sort_forward
, sort_value_forward
));
531 // We form the mset in two stages. In the first we fill up our working
532 // mset. Adding a new document does not remove another.
534 // In the second, we consider documents which rank higher than the current
535 // lowest ranking document in the mset. Each document added expels the
536 // current lowest ranking document.
538 // If a percentage cutoff is in effect, it can cause the matcher to return
539 // from the second stage from the first.
541 // Is the mset a valid heap?
542 bool is_heap
= false;
547 if (rare(recalculate_w_max
)) {
548 if (min_weight
> 0.0) {
549 if (rare(getorrecalc_maxweight(pl
.get()) < min_weight
)) {
550 LOGLINE(MATCH
, "*** TERMINATING EARLY (1)");
556 PostList
* pl_copy
= pl
.get();
557 if (rare(next_handling_prune(pl_copy
, min_weight
, this))) {
560 LOGLINE(MATCH
, "*** REPLACING ROOT");
562 if (min_weight
> 0.0) {
563 // No need for a full recalc (unless we've got to do one
564 // because of a prune elsewhere) - we're just switching to a
566 if (rare(getorrecalc_maxweight(pl
.get()) < min_weight
)) {
567 LOGLINE(MATCH
, "*** TERMINATING EARLY (2)");
573 if (rare(pl
->at_end())) {
574 LOGLINE(MATCH
, "Reached end of potential matches");
578 // Only calculate the weight if we need it for mcmp, or there's a
579 // percentage or weight cutoff in effect. Otherwise we calculate it
580 // below if we haven't already rejected this candidate.
582 bool calculated_weight
= false;
583 if (sort_by
!= VAL
|| min_weight
> 0.0) {
584 wt
= pl
->get_weight();
585 if (wt
< min_weight
) {
586 LOGLINE(MATCH
, "Rejecting potential match due to insufficient weight");
589 calculated_weight
= true;
592 Xapian::docid did
= pl
->get_docid();
593 vsdoc
.set_document(did
);
594 LOGLINE(MATCH
, "Candidate document id " << did
<< " wt " << wt
);
595 Xapian::Internal::MSetItem
new_item(wt
, did
);
596 if (check_at_least
> maxitems
&& timeout
.timed_out()) {
597 check_at_least
= maxitems
;
600 if (sort_by
!= REL
) {
601 const string
* ptr
= pl
->get_sort_key();
603 new_item
.sort_key
= *ptr
;
605 new_item
.sort_key
= (*sorter
)(doc
);
607 new_item
.sort_key
= vsdoc
.get_value(sort_key
);
610 // We're sorting by value (in part at least), so compare the item
611 // against the lowest currently in the proto-mset. If sort_by is
612 // VAL, then new_item.wt won't yet be set, but that doesn't
613 // matter since it's not used by the sort function.
614 if (!mcmp(new_item
, min_item
)) {
615 if (mdecider
== NULL
&& !collapser
) {
616 // Document was definitely suitable for mset - no more
617 // processing needed.
618 LOGLINE(MATCH
, "Making note of match item which sorts lower than min_item");
620 if (!calculated_weight
) wt
= pl
->get_weight();
622 matchspy
->operator()(doc
, wt
);
624 if (wt
> greatest_wt
) goto new_greatest_weight
;
627 if (docs_matched
>= check_at_least
) {
628 // We've seen enough items - we can drop this one.
629 LOGLINE(MATCH
, "Dropping candidate which sorts lower than min_item");
630 // FIXME: hmm, match decider might have rejected this...
631 if (!calculated_weight
) wt
= pl
->get_weight();
632 if (wt
> greatest_wt
) goto new_greatest_weight
;
635 // We can't drop the item, because we need to test whether the
636 // mdecider would accept it and/or test whether it would be
638 LOGLINE(MATCH
, "Keeping candidate which sorts lower than min_item for further investigation");
642 // Use the match spy and/or decision functors (if specified).
643 if (matchspy
!= NULL
|| mdecider
!= NULL
) {
644 const unsigned int multiplier
= db
.internal
.size();
645 Assert(multiplier
!= 0);
646 Xapian::doccount n
= (did
- 1) % multiplier
; // which actual database
647 // If the results are from a remote database, then the functor will
648 // already have been applied there so we can skip this step.
650 ++decider_considered
;
651 if (mdecider
&& !mdecider
->operator()(doc
)) {
656 if (!calculated_weight
) {
657 wt
= pl
->get_weight();
659 calculated_weight
= true;
661 matchspy
->operator()(doc
, wt
);
666 if (!calculated_weight
) {
667 // we didn't calculate the weight above, but now we will need it
668 wt
= pl
->get_weight();
674 // Perform collapsing on key if requested.
677 res
= collapser
.process(new_item
, pl
.get(), vsdoc
, mcmp
);
678 if (res
== REJECTED
) {
679 // If we're sorting by relevance primarily, then we throw away
680 // the lower weighted document anyway.
681 if (sort_by
!= REL
&& sort_by
!= REL_VAL
) {
682 if (wt
> greatest_wt
) goto new_greatest_weight
;
687 if (res
== REPLACED
) {
688 // There was a previous item in the collapse tab so
689 // the MSet can't be empty.
690 Assert(!items
.empty());
692 const Xapian::Internal::MSetItem
& old_item
=
694 // This is one of the best collapse_max potential MSet entries
695 // with this key which we've seen so far. Check if the
696 // entry with this key which it displaced might still be in the
697 // proto-MSet. If it might be, we need to check through for
699 double old_wt
= old_item
.wt
;
700 if (old_wt
>= min_weight
&& mcmp(old_item
, min_item
)) {
701 // Scan through (unsorted) MSet looking for entry.
702 // FIXME: more efficient way than just scanning?
703 Xapian::docid olddid
= old_item
.did
;
704 vector
<Xapian::Internal::MSetItem
>::iterator i
;
705 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
706 if (i
->did
== olddid
) {
707 LOGLINE(MATCH
, "collapse: removing " <<
709 new_item
.collapse_key
);
710 // We can replace an arbitrary element in O(log N)
711 // but have to do it by hand (in this case the new
712 // elt is bigger, so we just swap down the tree).
713 // FIXME: implement this, and clean up is_heap
725 // OK, actually add the item to the mset.
728 if (items
.size() >= max_msize
) {
729 items
.push_back(new_item
);
732 make_heap(items
.begin(), items
.end(), mcmp
);
734 push_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
735 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
737 pop_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
738 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
741 min_item
= items
.front();
742 if (sort_by
== REL
|| sort_by
== REL_VAL
) {
743 if (docs_matched
>= check_at_least
) {
744 if (sort_by
== REL
) {
745 // We're done if this is a forward boolean match
746 // with only one database (bodgetastic, FIXME
747 // better if we can!)
748 if (rare(max_possible
== 0 && sort_forward
)) {
749 // In the multi database case, MergePostList
750 // currently processes each database
751 // sequentially (which actually may well be
752 // more efficient) so the docids in general
753 // won't arrive in order.
754 if (leaves
.size() == 1) break;
757 if (min_item
.wt
> min_weight
) {
758 LOGLINE(MATCH
, "Setting min_weight to " <<
759 min_item
.wt
<< " from " << min_weight
);
760 min_weight
= min_item
.wt
;
764 if (rare(getorrecalc_maxweight(pl
.get()) < min_weight
)) {
765 LOGLINE(MATCH
, "*** TERMINATING EARLY (3)");
769 items
.push_back(new_item
);
771 if (sort_by
== REL
&& items
.size() == max_msize
) {
772 if (docs_matched
>= check_at_least
) {
773 // We're done if this is a forward boolean match
774 // with only one database (bodgetastic, FIXME
775 // better if we can!)
776 if (rare(max_possible
== 0 && sort_forward
)) {
777 // In the multi database case, MergePostList
778 // currently processes each database
779 // sequentially (which actually may well be
780 // more efficient) so the docids in general
781 // won't arrive in order.
782 if (leaves
.size() == 1) break;
789 // Keep a track of the greatest weight we've seen.
790 if (wt
> greatest_wt
) {
793 #ifdef XAPIAN_HAS_REMOTE_BACKEND
794 const unsigned int multiplier
= db
.internal
.size();
795 unsigned int db_num
= (did
- 1) % multiplier
;
796 if (is_remote
[db_num
]) {
797 // Note that the greatest weighted document came from a remote
798 // database, and which one.
799 greatest_wt_subqs_db_num
= db_num
;
803 greatest_wt_subqs_matched
= pl
->count_matching_subqs();
804 #ifdef XAPIAN_HAS_REMOTE_BACKEND
805 greatest_wt_subqs_db_num
= UINT_MAX
;
808 if (percent_cutoff
) {
809 double w
= wt
* percent_cutoff_factor
;
810 if (w
> min_weight
) {
814 make_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
815 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
817 while (!items
.empty() && items
.front().wt
< min_weight
) {
818 pop_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
819 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
820 Assert(items
.back().wt
< min_weight
);
823 #ifdef XAPIAN_ASSERTIONS_PARANOID
824 vector
<Xapian::Internal::MSetItem
>::const_iterator i
;
825 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
826 Assert(i
->wt
>= min_weight
);
834 // done with posting list tree
837 double percent_scale
= 0;
838 if (!items
.empty() && greatest_wt
> 0) {
839 #ifdef XAPIAN_HAS_REMOTE_BACKEND
840 if (greatest_wt_subqs_db_num
!= UINT_MAX
) {
841 const unsigned int n
= greatest_wt_subqs_db_num
;
842 RemoteSubMatch
* rem_match
;
843 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[n
].get());
844 percent_scale
= rem_match
->get_percent_factor() / 100.0;
848 percent_scale
= greatest_wt_subqs_matched
/ double(total_subqs
);
849 percent_scale
/= greatest_wt
;
851 Assert(percent_scale
> 0);
852 if (percent_cutoff
) {
853 // FIXME: better to sort and binary chop maybe?
854 // Or we could just use a linear scan here instead.
856 // trim the mset to the correct answer...
857 double min_wt
= percent_cutoff_factor
/ percent_scale
;
860 make_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
861 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
863 while (!items
.empty() && items
.front().wt
< min_wt
) {
864 pop_heap
<vector
<Xapian::Internal::MSetItem
>::iterator
,
865 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
866 Assert(items
.back().wt
< min_wt
);
869 #ifdef XAPIAN_ASSERTIONS_PARANOID
870 vector
<Xapian::Internal::MSetItem
>::const_iterator j
;
871 for (j
= items
.begin(); j
!= items
.end(); ++j
) {
872 Assert(j
->wt
>= min_wt
);
879 "docs_matched = " << docs_matched
<<
880 ", definite_matches_not_seen = " << definite_matches_not_seen
<<
881 ", matches_lower_bound = " << matches_lower_bound
<<
882 ", matches_estimated = " << matches_estimated
<<
883 ", matches_upper_bound = " << matches_upper_bound
);
885 // Adjust docs_matched to take account of documents which matched remotely
886 // but weren't sent across.
887 docs_matched
+= definite_matches_not_seen
;
889 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
890 Xapian::doccount uncollapsed_upper_bound
= matches_upper_bound
;
891 Xapian::doccount uncollapsed_estimated
= matches_estimated
;
892 if (items
.size() < max_msize
) {
893 // We have fewer items in the mset than we tried to get for it, so we
894 // must have all the matches in it.
895 LOGLINE(MATCH
, "items.size() = " << items
.size() <<
896 ", max_msize = " << max_msize
<< ", setting bounds equal");
897 Assert(definite_matches_not_seen
== 0);
898 Assert(percent_cutoff
|| docs_matched
== items
.size());
899 matches_lower_bound
= matches_upper_bound
= matches_estimated
901 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
902 uncollapsed_lower_bound
= matches_lower_bound
;
903 } else if (!collapser
&& docs_matched
< check_at_least
) {
904 // We have seen fewer matches than we checked for, so we must have seen
906 LOGLINE(MATCH
, "Setting bounds equal");
907 matches_lower_bound
= matches_upper_bound
= matches_estimated
909 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
910 uncollapsed_lower_bound
= matches_lower_bound
;
912 AssertRel(matches_estimated
,>=,matches_lower_bound
);
913 AssertRel(matches_estimated
,<=,matches_upper_bound
);
915 // We can end up scaling the estimate more than once, so collect
916 // the scale factors and apply them in one go to avoid rounding
918 double estimate_scale
= 1.0;
919 double unique_rate
= 1.0;
922 LOGLINE(MATCH
, "Adjusting bounds due to collapse_key");
923 matches_lower_bound
= collapser
.get_matches_lower_bound();
924 if (matches_lower_bound
> uncollapsed_lower_bound
)
925 uncollapsed_lower_bound
= matches_lower_bound
;
927 // The estimate for the number of hits can be modified by
928 // multiplying it by the rate at which we've been finding
930 Xapian::doccount docs_considered
= collapser
.get_docs_considered();
931 Xapian::doccount dups_ignored
= collapser
.get_dups_ignored();
932 if (docs_considered
> 0) {
933 double unique
= double(docs_considered
- dups_ignored
);
934 unique_rate
= unique
/ double(docs_considered
);
937 // We can safely reduce the upper bound by the number of duplicates
939 matches_upper_bound
-= dups_ignored
;
941 LOGLINE(MATCH
, "matches_lower_bound=" << matches_lower_bound
<<
942 ", matches_estimated=" << matches_estimated
<<
943 "*" << estimate_scale
<< "*" << unique_rate
<<
944 ", matches_upper_bound=" << matches_upper_bound
);
948 if (!percent_cutoff
) {
950 // We're not collapsing or doing a percentage cutoff, so
951 // docs_matched is a lower bound on the total number of
953 matches_lower_bound
= max(docs_matched
, matches_lower_bound
);
957 // The estimate for the number of hits can be modified by
958 // multiplying it by the rate at which the match decider has
959 // been accepting documents.
960 if (decider_considered
> 0) {
961 double accept
= double(decider_considered
- decider_denied
);
962 double accept_rate
= accept
/ double(decider_considered
);
963 estimate_scale
*= accept_rate
;
966 // If a document is denied by a match decider, it is not possible
967 // for it to found to be a duplicate, so it is safe to also reduce
968 // the upper bound by the number of documents denied by a match
970 matches_upper_bound
-= decider_denied
;
971 if (collapser
) uncollapsed_upper_bound
-= decider_denied
;
974 if (percent_cutoff
) {
975 estimate_scale
*= (1.0 - percent_cutoff_factor
);
977 // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
978 // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
980 // Very likely an underestimate, but we can't really do better
981 // without checking further matches... Only possibility would be
982 // to track how many docs made the min weight test but didn't make
983 // the candidate set since the last greatest_wt change, which we
984 // could use if the top documents matched all the prob terms.
985 matches_lower_bound
= items
.size();
986 if (collapser
) uncollapsed_lower_bound
= matches_lower_bound
;
988 // matches_upper_bound could be reduced by the number of documents
989 // which fail the min weight test (FIXME)
991 LOGLINE(MATCH
, "Adjusted bounds due to percent_cutoff (" <<
992 percent_cutoff
<< "): now have matches_estimated=" <<
993 matches_estimated
<< ", matches_lower_bound=" <<
994 matches_lower_bound
);
997 if (collapser
&& estimate_scale
!= 1.0) {
998 uncollapsed_estimated
=
999 Xapian::doccount(uncollapsed_estimated
* estimate_scale
+ 0.5);
1002 estimate_scale
*= unique_rate
;
1004 if (estimate_scale
!= 1.0) {
1006 Xapian::doccount(matches_estimated
* estimate_scale
+ 0.5);
1007 if (matches_estimated
< matches_lower_bound
)
1008 matches_estimated
= matches_lower_bound
;
1011 if (collapser
|| mdecider
) {
1012 LOGLINE(MATCH
, "Clamping estimate between bounds: "
1013 "matches_lower_bound = " << matches_lower_bound
<<
1014 ", matches_estimated = " << matches_estimated
<<
1015 ", matches_upper_bound = " << matches_upper_bound
);
1016 AssertRel(matches_lower_bound
,<=,matches_upper_bound
);
1017 matches_estimated
= max(matches_estimated
, matches_lower_bound
);
1018 matches_estimated
= min(matches_estimated
, matches_upper_bound
);
1019 } else if (!percent_cutoff
) {
1020 AssertRel(docs_matched
,<=,matches_upper_bound
);
1021 if (docs_matched
> matches_lower_bound
)
1022 matches_lower_bound
= docs_matched
;
1023 if (docs_matched
> matches_estimated
)
1024 matches_estimated
= docs_matched
;
1027 if (collapser
&& !mdecider
&& !percent_cutoff
) {
1028 AssertRel(docs_matched
,<=,uncollapsed_upper_bound
);
1029 if (docs_matched
> uncollapsed_lower_bound
)
1030 uncollapsed_lower_bound
= docs_matched
;
1035 AssertRel(uncollapsed_lower_bound
,<=,uncollapsed_upper_bound
);
1036 if (uncollapsed_estimated
< uncollapsed_lower_bound
) {
1037 uncollapsed_estimated
= uncollapsed_lower_bound
;
1038 } else if (uncollapsed_estimated
> uncollapsed_upper_bound
) {
1039 uncollapsed_estimated
= uncollapsed_upper_bound
;
1042 // We're not collapsing, so the bounds must be the same.
1043 uncollapsed_lower_bound
= matches_lower_bound
;
1044 uncollapsed_upper_bound
= matches_upper_bound
;
1045 uncollapsed_estimated
= matches_estimated
;
1048 LOGLINE(MATCH
, items
.size() << " items in potential mset");
1051 // Remove unwanted leading entries
1052 if (items
.size() <= first
) {
1055 LOGLINE(MATCH
, "finding " << first
<< "th");
1056 // We perform nth_element() on reverse iterators so that the
1057 // unwanted elements end up at the end of items, which means
1058 // that the call to erase() to remove them doesn't have to copy
1060 vector
<Xapian::Internal::MSetItem
>::reverse_iterator nth
;
1061 nth
= items
.rbegin() + first
;
1062 nth_element(items
.rbegin(), nth
, items
.rend(), mcmp
);
1063 // Erase the trailing "first" elements
1064 items
.erase(items
.begin() + items
.size() - first
, items
.end());
1068 LOGLINE(MATCH
, "sorting " << items
.size() << " entries");
1070 // Need a stable sort, but this is provided by comparison operator
1071 sort(items
.begin(), items
.end(), mcmp
);
1073 if (!items
.empty()) {
1074 LOGLINE(MATCH
, "min weight in mset = " << items
.back().wt
);
1075 LOGLINE(MATCH
, "max weight in mset = " << items
[0].wt
);
1078 AssertRel(matches_estimated
,>=,matches_lower_bound
);
1079 AssertRel(matches_estimated
,<=,matches_upper_bound
);
1081 AssertRel(uncollapsed_estimated
,>=,uncollapsed_lower_bound
);
1082 AssertRel(uncollapsed_estimated
,<=,uncollapsed_upper_bound
);
1084 // We may need to qualify any collapse_count to see if the highest weight
1085 // collapsed item would have qualified percent_cutoff
1086 // We WILL need to restore collapse_count to the mset by taking from
1087 // collapse_tab; this is what comes of copying around whole objects
1088 // instead of taking references, we find it hard to update collapse_count
1089 // of an item that has already been pushed-back as we don't know where it
1090 // is any more. If we keep or find references we won't need to mess with
1091 // is_heap so much maybe?
1092 if (!items
.empty() && collapser
&& !collapser
.empty()) {
1093 // Nicked this formula from above.
1094 double min_wt
= 0.0;
1095 if (percent_scale
> 0.0)
1096 min_wt
= percent_cutoff_factor
/ percent_scale
;
1097 Xapian::doccount entries
= collapser
.entries();
1098 vector
<Xapian::Internal::MSetItem
>::iterator i
;
1099 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
1100 // Skip entries without a collapse key.
1101 if (i
->collapse_key
.empty()) continue;
1103 // Set collapse_count appropriately.
1104 i
->collapse_count
= collapser
.get_collapse_count(i
->collapse_key
, percent_cutoff
, min_wt
);
1105 if (--entries
== 0) {
1106 // Stop once we've processed all items with collapse keys.
1112 mset
.internal
= new Xapian::MSet::Internal(
1114 matches_upper_bound
,
1115 matches_lower_bound
,
1117 uncollapsed_upper_bound
,
1118 uncollapsed_lower_bound
,
1119 uncollapsed_estimated
,
1120 max_possible
, greatest_wt
, items
,
1121 percent_scale
* 100.0);