3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
6 * Copyright 2003 Orange PCS Ltd
7 * Copyright 2003 Sam Liddicott
8 * Copyright 2007,2008,2009 Lemur Consulting Ltd
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include "multimatch.h"
30 #include "api/msetinternal.h"
31 #include "backends/multi/multi_database.h"
32 #include "collapser.h"
35 #include "localsubmatch.h"
37 #include "api/enquireinternal.h"
38 #include "api/rsetinternal.h"
41 #include "deciderpostlist.h"
42 #include "mergepostlist.h"
43 #include "postlisttree.h"
44 #include "spymaster.h"
46 #include "matchtimeout.h"
49 #include "valuestreamdocument.h"
50 #include "weight/weightinternal.h"
52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
54 #ifdef XAPIAN_HAS_REMOTE_BACKEND
55 #include "remotesubmatch.h"
56 #include "backends/remote/remote-database.h"
57 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
60 #include <cfloat> // For DBL_EPSILON.
61 #include <climits> // For UINT_MAX.
65 using Xapian::Internal::intrusive_ptr
;
67 const Xapian::Enquire::Internal::sort_setting REL
=
68 Xapian::Enquire::Internal::REL
;
69 const Xapian::Enquire::Internal::sort_setting REL_VAL
=
70 Xapian::Enquire::Internal::REL_VAL
;
71 const Xapian::Enquire::Internal::sort_setting VAL
=
72 Xapian::Enquire::Internal::VAL
;
73 #if 0 // VAL_REL isn't currently used so avoid compiler warnings.
74 const Xapian::Enquire::Internal::sort_setting VAL_REL
=
75 Xapian::Enquire::Internal::VAL_REL
;
78 /** Prepare some SubMatches.
80 * This calls the prepare_match() method on each SubMatch object, causing them
81 * to look up various statistics.
83 * This method is rather complicated in order to handle remote matches
84 * efficiently. Instead of simply calling "prepare_match()" on each submatch
85 * and waiting for it to return, it first calls "prepare_match(true)" on each
86 * submatch. If any of these calls return false, indicating that the required
87 * information has not yet been received from the server, the method will try
88 * those which returned false again, passing "false" as a parameter to
89 * indicate that they should block until the information is ready.
91 * This should improve performance in the case of mixed local-and-remote
92 * searches - the local searchers will all fetch their statistics from disk
93 * without waiting for the remote searchers, so as soon as the remote searcher
94 * statistics arrive, we can move on to the next step.
97 prepare_sub_matches(vector
<intrusive_ptr
<SubMatch
> > & leaves
,
98 Xapian::Weight::Internal
& stats
)
100 LOGCALL_STATIC_VOID(MATCH
, "prepare_sub_matches", leaves
| stats
);
101 // We use a vector<bool> to track which SubMatches we're already prepared.
102 vector
<bool> prepared
;
103 prepared
.resize(leaves
.size(), false);
104 size_t unprepared
= leaves
.size();
107 for (size_t leaf
= 0; leaf
< leaves
.size(); ++leaf
) {
108 if (prepared
[leaf
]) continue;
109 SubMatch
* submatch
= leaves
[leaf
].get();
110 if (!submatch
|| submatch
->prepare_match(nowait
, stats
)) {
111 prepared
[leaf
] = true;
115 // Use blocking IO on subsequent passes, so that we don't go into
121 ////////////////////////////////////
122 // Initialisation and cleaning up //
123 ////////////////////////////////////
124 MultiMatch::MultiMatch(const Xapian::Database
&db_
,
125 const Xapian::Query
& query_
,
126 Xapian::termcount qlen
,
127 const Xapian::RSet
* omrset
,
128 Xapian::doccount collapse_max_
,
129 Xapian::valueno collapse_key_
,
130 int percent_cutoff_
, double weight_cutoff_
,
131 Xapian::Enquire::docid_order order_
,
132 Xapian::valueno sort_key_
,
133 Xapian::Enquire::Internal::sort_setting sort_by_
,
134 bool sort_value_forward_
,
136 Xapian::Weight::Internal
& stats
,
137 const Xapian::Weight
* weight_
,
138 const vector
<Xapian::Internal::opt_intrusive_ptr
<Xapian::MatchSpy
>> & matchspies_
,
139 bool have_sorter
, bool have_mdecider
)
140 : db(db_
), query(query_
),
141 collapse_max(collapse_max_
), collapse_key(collapse_key_
),
142 percent_cutoff(percent_cutoff_
), weight_cutoff(weight_cutoff_
),
144 sort_key(sort_key_
), sort_by(sort_by_
),
145 sort_value_forward(sort_value_forward_
),
146 time_limit(time_limit_
),
148 is_remote(db
.internal
->size()),
149 matchspies(matchspies_
)
151 LOGCALL_CTOR(MATCH
, "MultiMatch", db_
| query_
| qlen
| omrset
| collapse_max_
| collapse_key_
| percent_cutoff_
| weight_cutoff_
| int(order_
) | sort_key_
| int(sort_by_
) | sort_value_forward_
| time_limit_
| stats
| weight_
| matchspies_
| have_sorter
| have_mdecider
);
153 Assert(!query
.empty());
155 Xapian::doccount number_of_subdbs
= db
.internal
->size();
156 vector
<Xapian::RSet
> subrsets
;
157 if (omrset
&& omrset
->internal
.get()) {
158 omrset
->internal
->shard(number_of_subdbs
, subrsets
);
160 subrsets
.resize(number_of_subdbs
);
163 for (size_t i
= 0; i
!= number_of_subdbs
; ++i
) {
164 const Xapian::Database::Internal
*subdb
= db
.internal
.get();
165 if (number_of_subdbs
> 1) {
166 auto multidb
= static_cast<const MultiDatabase
*>(subdb
);
167 subdb
= multidb
->shards
[i
];
170 intrusive_ptr
<SubMatch
> smatch
;
172 // There is currently only one special case, for network databases.
173 #ifdef XAPIAN_HAS_REMOTE_BACKEND
174 if (subdb
->get_backend_info(NULL
) == BACKEND_REMOTE
) {
175 auto rem_db
= static_cast<const RemoteDatabase
*>(subdb
);
177 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
180 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
182 // FIXME: Remote handling for time_limit with multiple
183 // databases may need some work.
184 rem_db
->set_query(query
, qlen
, collapse_max
, collapse_key
,
185 order
, sort_key
, sort_by
, sort_value_forward
,
187 percent_cutoff
, weight_cutoff
, weight
,
188 subrsets
[i
], matchspies
);
189 bool decreasing_relevance
=
190 (sort_by
== REL
|| sort_by
== REL_VAL
);
191 smatch
= new RemoteSubMatch(rem_db
, decreasing_relevance
, matchspies
);
194 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
,
196 subdb
->readahead_for_query(query
);
199 // Avoid unused parameter warnings.
202 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
,
204 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
205 leaves
.push_back(smatch
);
208 stats
.set_query(query
);
209 prepare_sub_matches(leaves
, stats
);
210 stats
.set_bounds_from_db(db
);
214 MultiMatch::get_mset(Xapian::doccount first
, Xapian::doccount maxitems
,
215 Xapian::doccount check_at_least
,
217 Xapian::Weight::Internal
& stats
,
218 const Xapian::MatchDecider
*mdecider
,
219 const Xapian::KeyMaker
*sorter
)
221 LOGCALL_VOID(MATCH
, "MultiMatch::get_mset", first
| maxitems
| check_at_least
| Literal("mset") | stats
| Literal("mdecider") | Literal("sorter"));
222 AssertRel(check_at_least
,>=,first
+ maxitems
);
224 Assert(!query
.empty());
226 Assert(!leaves
.empty());
228 TimeOut
timeout(time_limit
);
230 #ifdef XAPIAN_HAS_REMOTE_BACKEND
231 // If there's only one database and it's remote, we can just unserialise
232 // its MSet and return that.
233 if (leaves
.size() == 1 && is_remote
[0]) {
234 RemoteSubMatch
* rem_match
;
235 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[0].get());
236 rem_match
->start_match(first
, maxitems
, check_at_least
, stats
);
237 rem_match
->get_mset(mset
);
243 for (auto && leaf
: leaves
) {
244 leaf
->start_match(0, first
+ maxitems
, first
+ check_at_least
, stats
);
247 ValueStreamDocument
vsdoc(db
);
249 Xapian::Document
doc(&vsdoc
);
251 // Get postlists and term info
252 vector
<PostList
*> postlists
;
254 Xapian::termcount total_subqs
= 0;
255 // Keep a count of matches which we know exist, but we won't see. This
256 // occurs when a submatch is remote, and returns a lower bound on the
257 // number of matching documents which is higher than the number of
258 // documents it returns (because it wasn't asked for more documents).
259 Xapian::doccount definite_matches_not_seen
= 0;
260 for (size_t i
= 0; i
!= leaves
.size(); ++i
) {
261 // Pick the highest total subqueries answer amongst the subdatabases,
262 // as the query to postlist conversion doesn't recurse into positional
263 // queries for shards that don't have positional data when at least one
265 Xapian::termcount total_subqs_i
= 0;
266 PostList
* pl
= leaves
[i
]->get_postlist(&pltree
, &total_subqs_i
);
267 total_subqs
= max(total_subqs
, total_subqs_i
);
269 if (pl
->get_termfreq_min() > first
+ maxitems
) {
270 LOGLINE(MATCH
, "Found " <<
271 pl
->get_termfreq_min() - (first
+ maxitems
)
272 << " definite matches in remote submatch "
273 "which aren't passed to local match");
274 definite_matches_not_seen
+= pl
->get_termfreq_min();
275 definite_matches_not_seen
-= first
+ maxitems
;
277 } else if (mdecider
) {
278 pl
= new DeciderPostList(pl
, mdecider
, &vsdoc
);
280 postlists
.push_back(pl
);
282 Assert(!postlists
.empty());
284 Xapian::doccount n_shards
= postlists
.size();
286 pltree
.set_postlist(postlists
[0]);
288 pltree
.set_postlist(new MergePostList(&postlists
[0], n_shards
, vsdoc
));
291 LOGLINE(MATCH
, "pl = (" << pltree
.get_description() << ")");
294 Xapian::doccount docs_matched
= 0;
295 double greatest_wt
= 0;
296 Xapian::termcount greatest_wt_subqs_matched
= 0;
297 #ifdef XAPIAN_HAS_REMOTE_BACKEND
298 unsigned greatest_wt_subqs_db_num
= UINT_MAX
;
300 vector
<Result
> items
;
302 // maximum weight a document could possibly have
303 const double max_possible
= pltree
.recalc_maxweight();
305 LOGLINE(MATCH
, "pl = (" << pltree
.get_description() << ")");
307 Xapian::doccount matches_upper_bound
= pltree
.get_termfreq_max();
308 Xapian::doccount matches_lower_bound
= pltree
.get_termfreq_min();
309 Xapian::doccount matches_estimated
= pltree
.get_termfreq_est();
311 SpyMaster
spymaster(&matchspies
, &is_remote
);
313 // Check if any results have been asked for (might just be wanting
315 if (check_at_least
== 0) {
316 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
318 // Lower bound must be set to no more than collapse_max, since it's
319 // possible that all matching documents have the same collapse_key
320 // value and so are collapsed together.
321 if (matches_lower_bound
> collapse_max
)
322 matches_lower_bound
= collapse_max
;
325 mset
.internal
= new Xapian::MSet::Internal(
331 uncollapsed_lower_bound
,
333 max_possible
, greatest_wt
,
339 // Set max number of results that we want - this is used to decide
340 // when to throw away unwanted items.
341 Xapian::doccount max_msize
= first
+ maxitems
;
342 items
.reserve(max_msize
+ 1);
344 // Tracks the minimum item currently eligible for the MSet - we compare
345 // candidate items against this.
346 Result
min_item(0.0, 0);
348 // Minimum weight an item must have to be worth considering.
349 double min_weight
= weight_cutoff
;
351 // Factor to multiply maximum weight seen by to get the cutoff weight.
352 double percent_cutoff_factor
= percent_cutoff
/ 100.0;
353 // Corresponding correction to that in omenquire.cc to account for excess
355 percent_cutoff_factor
-= DBL_EPSILON
;
357 // Object to handle collapsing.
358 Collapser
collapser(collapse_key
, collapse_max
);
360 /// Comparison functor for sorting MSet
361 bool sort_forward
= (order
!= Xapian::Enquire::DESCENDING
);
362 MSetCmp
mcmp(get_msetcmp_function(sort_by
, sort_forward
, sort_value_forward
));
366 // We form the mset in two stages. In the first we fill up our working
367 // mset. Adding a new document does not remove another.
369 // In the second, we consider documents which rank higher than the current
370 // lowest ranking document in the mset. Each document added expels the
371 // current lowest ranking document.
373 // If a percentage cutoff is in effect, it can cause the matcher to return
374 // from the second stage from the first.
376 // Is the mset a valid heap?
377 bool is_heap
= false;
382 if (!pltree
.next(min_weight
)) {
383 LOGLINE(MATCH
, "Reached end of potential matches");
387 // Only calculate the weight if we need it for mcmp, or there's a
388 // percentage or weight cutoff in effect. Otherwise we calculate it
389 // below if we haven't already rejected this candidate.
391 bool calculated_weight
= false;
392 if (sort_by
!= VAL
|| min_weight
> 0.0) {
393 wt
= pltree
.get_weight();
394 if (wt
< min_weight
) {
395 LOGLINE(MATCH
, "Rejecting potential match due to insufficient weight");
398 calculated_weight
= true;
401 Xapian::docid did
= pltree
.get_docid();
402 vsdoc
.set_document(did
);
403 LOGLINE(MATCH
, "Candidate document id " << did
<< " wt " << wt
);
404 Result
new_item(wt
, did
);
405 if (check_at_least
> first
+ maxitems
&& timeout
.timed_out()) {
406 check_at_least
= first
+ maxitems
;
409 if (sort_by
!= REL
) {
410 const string
* ptr
= pltree
.get_sort_key();
412 new_item
.set_sort_key(*ptr
);
414 new_item
.set_sort_key((*sorter
)(doc
));
416 new_item
.set_sort_key(vsdoc
.get_value(sort_key
));
419 // We're sorting by value (in part at least), so compare the item
420 // against the lowest currently in the proto-mset. If sort_by is
421 // VAL, then new_item.get_weight() won't yet be set, but that
422 // doesn't matter since it's not used by the sort function.
423 if (!mcmp(new_item
, min_item
)) {
424 if (mdecider
== NULL
&& !collapser
) {
425 // Document was definitely suitable for mset - no more
426 // processing needed.
427 LOGLINE(MATCH
, "Making note of match item which sorts lower than min_item");
429 if (!calculated_weight
) wt
= pltree
.get_weight();
430 spymaster(doc
, wt
, did
);
431 if (wt
> greatest_wt
) goto new_greatest_weight
;
434 if (docs_matched
>= check_at_least
) {
435 // We've seen enough items - we can drop this one.
436 LOGLINE(MATCH
, "Dropping candidate which sorts lower than min_item");
437 // FIXME: hmm, match decider might have rejected this...
438 if (!calculated_weight
) wt
= pltree
.get_weight();
439 if (wt
> greatest_wt
) goto new_greatest_weight
;
442 // We can't drop the item, because we need to test whether the
443 // mdecider would accept it and/or test whether it would be
445 LOGLINE(MATCH
, "Keeping candidate which sorts lower than min_item for further investigation");
449 // Apply any MatchSpy objects.
451 if (!calculated_weight
) {
452 wt
= pltree
.get_weight();
453 new_item
.set_weight(wt
);
454 calculated_weight
= true;
456 spymaster(doc
, wt
, did
);
459 if (!calculated_weight
) {
460 // we didn't calculate the weight above, but now we will need it
461 wt
= pltree
.get_weight();
462 new_item
.set_weight(wt
);
467 // Perform collapsing on key if requested.
470 res
= collapser
.process(new_item
, pltree
.get_collapse_key(), vsdoc
,
472 if (res
== REJECTED
) {
473 // If we're sorting by relevance primarily, then we throw away
474 // the lower weighted document anyway.
475 if (sort_by
!= REL
&& sort_by
!= REL_VAL
) {
476 if (wt
> greatest_wt
) goto new_greatest_weight
;
481 if (res
== REPLACED
) {
482 // There was a previous item in the collapse tab so
483 // the MSet can't be empty.
484 Assert(!items
.empty());
486 const Result
& old_item
= collapser
.old_item
;
487 // This is one of the best collapse_max potential MSet entries
488 // with this key which we've seen so far. Check if the
489 // entry with this key which it displaced might still be in the
490 // proto-MSet. If it might be, we need to check through for
492 double old_wt
= old_item
.get_weight();
493 if (old_wt
>= min_weight
&& mcmp(old_item
, min_item
)) {
494 // Scan through (unsorted) MSet looking for entry.
495 // FIXME: more efficient way than just scanning?
496 Xapian::docid olddid
= old_item
.get_docid();
497 vector
<Result
>::iterator i
;
498 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
499 if (i
->get_docid() == olddid
) {
500 LOGLINE(MATCH
, "collapse: removing " <<
502 new_item
.get_collapse_key());
503 // We can replace an arbitrary element in O(log N)
504 // but have to do it by hand (in this case the new
505 // elt is bigger, so we just swap down the tree).
506 // FIXME: implement this, and clean up is_heap
518 // OK, actually add the item to the mset.
521 if (items
.size() >= max_msize
) {
522 items
.push_back(new_item
);
525 make_heap(items
.begin(), items
.end(), mcmp
);
527 push_heap
<vector
<Result
>::iterator
,
528 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
530 pop_heap
<vector
<Result
>::iterator
,
531 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
534 min_item
= items
.front();
535 if (sort_by
== REL
|| sort_by
== REL_VAL
) {
536 if (docs_matched
>= check_at_least
) {
537 if (sort_by
== REL
) {
538 // We're done if this is a forward boolean match
539 // with only one database (bodgetastic, FIXME
540 // better if we can!)
541 if (rare(max_possible
== 0 && sort_forward
)) {
542 // In the multi database case, MergePostList
543 // currently processes each database
544 // sequentially (which actually may well be
545 // more efficient) so the docids in general
546 // won't arrive in order.
547 if (leaves
.size() == 1) break;
550 if (min_item
.get_weight() > min_weight
) {
551 LOGLINE(MATCH
, "Setting min_weight to " <<
552 min_item
.get_weight() << " from " << min_weight
);
553 min_weight
= min_item
.get_weight();
557 if (rare(pltree
.recalc_maxweight() < min_weight
)) {
558 LOGLINE(MATCH
, "*** TERMINATING EARLY (3)");
562 items
.push_back(new_item
);
564 if (sort_by
== REL
&& items
.size() == max_msize
) {
565 if (docs_matched
>= check_at_least
) {
566 // We're done if this is a forward boolean match
567 // with only one database (bodgetastic, FIXME
568 // better if we can!)
569 if (rare(max_possible
== 0 && sort_forward
)) {
570 // In the multi database case, MergePostList
571 // currently processes each database
572 // sequentially (which actually may well be
573 // more efficient) so the docids in general
574 // won't arrive in order.
575 if (leaves
.size() == 1) break;
582 // Keep a track of the greatest weight we've seen.
583 if (wt
> greatest_wt
) {
586 #ifdef XAPIAN_HAS_REMOTE_BACKEND
587 const unsigned int multiplier
= db
.internal
->size();
588 unsigned int db_num
= (did
- 1) % multiplier
;
589 if (is_remote
[db_num
]) {
590 // Note that the greatest weighted document came from a remote
591 // database, and which one.
592 greatest_wt_subqs_db_num
= db_num
;
596 greatest_wt_subqs_matched
= pltree
.count_matching_subqs();
597 #ifdef XAPIAN_HAS_REMOTE_BACKEND
598 greatest_wt_subqs_db_num
= UINT_MAX
;
601 if (percent_cutoff
) {
602 double w
= wt
* percent_cutoff_factor
;
603 if (w
> min_weight
) {
607 make_heap
<vector
<Result
>::iterator
,
608 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
610 while (!items
.empty() && items
.front().get_weight() < min_weight
) {
611 pop_heap
<vector
<Result
>::iterator
,
612 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
613 Assert(items
.back().get_weight() < min_weight
);
616 #ifdef XAPIAN_ASSERTIONS_PARANOID
617 vector
<Result
>::const_iterator i
;
618 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
619 Assert(i
->get_weight() >= min_weight
);
627 double percent_scale
= 0;
628 if (!items
.empty() && greatest_wt
> 0) {
629 #ifdef XAPIAN_HAS_REMOTE_BACKEND
630 if (greatest_wt_subqs_db_num
!= UINT_MAX
) {
631 const unsigned int n
= greatest_wt_subqs_db_num
;
632 RemoteSubMatch
* rem_match
;
633 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[n
].get());
634 percent_scale
= rem_match
->get_percent_factor() / 100.0;
638 percent_scale
= greatest_wt_subqs_matched
/ double(total_subqs
);
639 percent_scale
/= greatest_wt
;
641 Assert(percent_scale
> 0);
642 if (percent_cutoff
) {
643 // FIXME: better to sort and binary chop maybe?
644 // Or we could just use a linear scan here instead.
646 // trim the mset to the correct answer...
647 double min_wt
= percent_cutoff_factor
/ percent_scale
;
650 make_heap
<vector
<Result
>::iterator
,
651 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
653 while (!items
.empty() && items
.front().get_weight() < min_wt
) {
654 pop_heap
<vector
<Result
>::iterator
,
655 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
656 Assert(items
.back().get_weight() < min_wt
);
659 #ifdef XAPIAN_ASSERTIONS_PARANOID
660 vector
<Result
>::const_iterator j
;
661 for (j
= items
.begin(); j
!= items
.end(); ++j
) {
662 Assert(j
->get_weight() >= min_wt
);
669 "docs_matched = " << docs_matched
<<
670 ", definite_matches_not_seen = " << definite_matches_not_seen
<<
671 ", matches_lower_bound = " << matches_lower_bound
<<
672 ", matches_estimated = " << matches_estimated
<<
673 ", matches_upper_bound = " << matches_upper_bound
);
675 // Adjust docs_matched to take account of documents which matched remotely
676 // but weren't sent across.
677 docs_matched
+= definite_matches_not_seen
;
679 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
680 Xapian::doccount uncollapsed_upper_bound
= matches_upper_bound
;
681 Xapian::doccount uncollapsed_estimated
= matches_estimated
;
682 if (items
.size() < max_msize
) {
683 // We have fewer items in the mset than we tried to get for it, so we
684 // must have all the matches in it.
685 LOGLINE(MATCH
, "items.size() = " << items
.size() <<
686 ", max_msize = " << max_msize
<< ", setting bounds equal");
687 Assert(definite_matches_not_seen
== 0);
688 Assert(percent_cutoff
|| docs_matched
== items
.size());
689 matches_lower_bound
= matches_upper_bound
= matches_estimated
691 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
692 uncollapsed_lower_bound
= matches_lower_bound
;
693 } else if (!collapser
&& docs_matched
< check_at_least
) {
694 // We have seen fewer matches than we checked for, so we must have seen
696 LOGLINE(MATCH
, "Setting bounds equal");
697 matches_lower_bound
= matches_upper_bound
= matches_estimated
699 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
700 uncollapsed_lower_bound
= matches_lower_bound
;
702 AssertRel(matches_estimated
,>=,matches_lower_bound
);
703 AssertRel(matches_estimated
,<=,matches_upper_bound
);
705 // We can end up scaling the estimate more than once, so collect
706 // the scale factors and apply them in one go to avoid rounding
708 double estimate_scale
= 1.0;
709 double unique_rate
= 1.0;
712 LOGLINE(MATCH
, "Adjusting bounds due to collapse_key");
713 matches_lower_bound
= collapser
.get_matches_lower_bound();
714 if (matches_lower_bound
> uncollapsed_lower_bound
)
715 uncollapsed_lower_bound
= matches_lower_bound
;
717 // The estimate for the number of hits can be modified by
718 // multiplying it by the rate at which we've been finding
720 Xapian::doccount docs_considered
= collapser
.get_docs_considered();
721 Xapian::doccount dups_ignored
= collapser
.get_dups_ignored();
722 if (docs_considered
> 0) {
723 double unique
= double(docs_considered
- dups_ignored
);
724 unique_rate
= unique
/ double(docs_considered
);
727 // We can safely reduce the upper bound by the number of duplicates
729 matches_upper_bound
-= dups_ignored
;
731 LOGLINE(MATCH
, "matches_lower_bound=" << matches_lower_bound
<<
732 ", matches_estimated=" << matches_estimated
<<
733 "*" << estimate_scale
<< "*" << unique_rate
<<
734 ", matches_upper_bound=" << matches_upper_bound
);
738 if (!percent_cutoff
) {
740 // We're not collapsing or doing a percentage cutoff, so
741 // docs_matched is a lower bound on the total number of
743 matches_lower_bound
= max(docs_matched
, matches_lower_bound
);
747 Xapian::doccount decider_denied
= mdecider
->docs_denied_
;
748 Xapian::doccount decider_considered
=
749 mdecider
->docs_allowed_
+ mdecider
->docs_denied_
;
751 // The estimate for the number of hits can be modified by
752 // multiplying it by the rate at which the match decider has
753 // been accepting documents.
754 if (decider_considered
> 0) {
755 double accept
= double(decider_considered
- decider_denied
);
756 double accept_rate
= accept
/ double(decider_considered
);
757 estimate_scale
*= accept_rate
;
760 // If a document is denied by a match decider, it is not possible
761 // for it to found to be a duplicate, so it is safe to also reduce
762 // the upper bound by the number of documents denied by a match
764 matches_upper_bound
-= decider_denied
;
765 if (collapser
) uncollapsed_upper_bound
-= decider_denied
;
768 if (percent_cutoff
) {
769 estimate_scale
*= (1.0 - percent_cutoff_factor
);
771 // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
772 // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
774 // Very likely an underestimate, but we can't really do better
775 // without checking further matches... Only possibility would be
776 // to track how many docs made the min weight test but didn't make
777 // the candidate set since the last greatest_wt change, which we
778 // could use if the top documents matched all the prob terms.
779 matches_lower_bound
= items
.size();
780 if (collapser
) uncollapsed_lower_bound
= matches_lower_bound
;
782 // matches_upper_bound could be reduced by the number of documents
783 // which fail the min weight test (FIXME)
785 LOGLINE(MATCH
, "Adjusted bounds due to percent_cutoff (" <<
786 percent_cutoff
<< "): now have matches_estimated=" <<
787 matches_estimated
<< ", matches_lower_bound=" <<
788 matches_lower_bound
);
791 if (collapser
&& estimate_scale
!= 1.0) {
792 uncollapsed_estimated
=
793 Xapian::doccount(uncollapsed_estimated
* estimate_scale
+ 0.5);
796 estimate_scale
*= unique_rate
;
798 if (estimate_scale
!= 1.0) {
800 Xapian::doccount(matches_estimated
* estimate_scale
+ 0.5);
801 if (matches_estimated
< matches_lower_bound
)
802 matches_estimated
= matches_lower_bound
;
805 if (collapser
|| mdecider
) {
806 LOGLINE(MATCH
, "Clamping estimate between bounds: "
807 "matches_lower_bound = " << matches_lower_bound
<<
808 ", matches_estimated = " << matches_estimated
<<
809 ", matches_upper_bound = " << matches_upper_bound
);
810 AssertRel(matches_lower_bound
,<=,matches_upper_bound
);
811 matches_estimated
= max(matches_estimated
, matches_lower_bound
);
812 matches_estimated
= min(matches_estimated
, matches_upper_bound
);
813 } else if (!percent_cutoff
) {
814 AssertRel(docs_matched
,<=,matches_upper_bound
);
815 if (docs_matched
> matches_lower_bound
)
816 matches_lower_bound
= docs_matched
;
817 if (docs_matched
> matches_estimated
)
818 matches_estimated
= docs_matched
;
821 if (collapser
&& !mdecider
&& !percent_cutoff
) {
822 AssertRel(docs_matched
,<=,uncollapsed_upper_bound
);
823 if (docs_matched
> uncollapsed_lower_bound
)
824 uncollapsed_lower_bound
= docs_matched
;
829 AssertRel(uncollapsed_lower_bound
,<=,uncollapsed_upper_bound
);
830 if (uncollapsed_estimated
< uncollapsed_lower_bound
) {
831 uncollapsed_estimated
= uncollapsed_lower_bound
;
832 } else if (uncollapsed_estimated
> uncollapsed_upper_bound
) {
833 uncollapsed_estimated
= uncollapsed_upper_bound
;
836 // We're not collapsing, so the bounds must be the same.
837 uncollapsed_lower_bound
= matches_lower_bound
;
838 uncollapsed_upper_bound
= matches_upper_bound
;
839 uncollapsed_estimated
= matches_estimated
;
842 LOGLINE(MATCH
, items
.size() << " items in potential mset");
845 // Remove unwanted leading entries
846 if (items
.size() <= first
) {
849 LOGLINE(MATCH
, "finding " << first
<< "th");
850 // We perform nth_element() on reverse iterators so that the
851 // unwanted elements end up at the end of items, which means
852 // that the call to erase() to remove them doesn't have to copy
854 vector
<Result
>::reverse_iterator nth
;
855 nth
= items
.rbegin() + first
;
856 nth_element(items
.rbegin(), nth
, items
.rend(), mcmp
);
857 // Erase the trailing "first" elements
858 items
.erase(items
.begin() + items
.size() - first
, items
.end());
862 LOGLINE(MATCH
, "sorting " << items
.size() << " entries");
864 // We could use std::sort_heap if is_heap is true, but profiling
865 // suggests that's actually slower. Cast to void to suppress
866 // compiler warnings that the last set value of is_heap is never
870 // Need a stable sort, but this is provided by comparison operator
871 sort(items
.begin(), items
.end(), mcmp
);
873 if (!items
.empty()) {
874 LOGLINE(MATCH
, "min weight in mset = " << items
.back().get_weight());
875 LOGLINE(MATCH
, "max weight in mset = " << items
[0].get_weight());
878 AssertRel(matches_estimated
,>=,matches_lower_bound
);
879 AssertRel(matches_estimated
,<=,matches_upper_bound
);
881 AssertRel(uncollapsed_estimated
,>=,uncollapsed_lower_bound
);
882 AssertRel(uncollapsed_estimated
,<=,uncollapsed_upper_bound
);
884 // We may need to qualify any collapse_count to see if the highest weight
885 // collapsed item would have qualified percent_cutoff
886 // We WILL need to restore collapse_count to the mset by taking from
887 // collapse_tab; this is what comes of copying around whole objects
888 // instead of taking references, we find it hard to update collapse_count
889 // of an item that has already been pushed-back as we don't know where it
890 // is any more. If we keep or find references we won't need to mess with
891 // is_heap so much maybe?
892 if (!items
.empty() && collapser
&& !collapser
.empty()) {
893 // Nicked this formula from above.
895 if (percent_scale
> 0.0)
896 min_wt
= percent_cutoff_factor
/ percent_scale
;
897 Xapian::doccount entries
= collapser
.entries();
898 vector
<Result
>::iterator i
;
899 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
900 // Skip entries without a collapse key.
901 if (i
->get_collapse_key().empty()) continue;
903 // Set collapse_count appropriately.
904 i
->set_collapse_count(collapser
.get_collapse_count(i
->get_collapse_key(), percent_cutoff
, min_wt
));
905 if (--entries
== 0) {
906 // Stop once we've processed all items with collapse keys.
912 mset
.internal
= new Xapian::MSet::Internal(
917 uncollapsed_upper_bound
,
918 uncollapsed_lower_bound
,
919 uncollapsed_estimated
,
920 max_possible
, greatest_wt
,
922 percent_scale
* 100.0);