3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
6 * Copyright 2003 Orange PCS Ltd
7 * Copyright 2003 Sam Liddicott
8 * Copyright 2007,2008,2009 Lemur Consulting Ltd
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include "multimatch.h"
30 #include "api/msetinternal.h"
31 #include "backends/multi/multi_database.h"
32 #include "collapser.h"
35 #include "localsubmatch.h"
37 #include "api/enquireinternal.h"
38 #include "api/rsetinternal.h"
41 #include "deciderpostlist.h"
42 #include "mergepostlist.h"
43 #include "postlisttree.h"
44 #include "spymaster.h"
46 #include "matchtimeout.h"
49 #include "valuestreamdocument.h"
50 #include "weight/weightinternal.h"
52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
54 #ifdef XAPIAN_HAS_REMOTE_BACKEND
55 #include "remotesubmatch.h"
56 #include "backends/remote/remote-database.h"
57 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
60 #include <cfloat> // For DBL_EPSILON.
61 #include <climits> // For UINT_MAX.
65 using Xapian::Internal::intrusive_ptr
;
67 const Xapian::Enquire::Internal::sort_setting REL
=
68 Xapian::Enquire::Internal::REL
;
69 const Xapian::Enquire::Internal::sort_setting REL_VAL
=
70 Xapian::Enquire::Internal::REL_VAL
;
71 const Xapian::Enquire::Internal::sort_setting VAL
=
72 Xapian::Enquire::Internal::VAL
;
73 #if 0 // VAL_REL isn't currently used so avoid compiler warnings.
74 const Xapian::Enquire::Internal::sort_setting VAL_REL
=
75 Xapian::Enquire::Internal::VAL_REL
;
78 /** Prepare some SubMatches.
80 * This calls the prepare_match() method on each SubMatch object, causing them
81 * to look up various statistics.
83 * This method is rather complicated in order to handle remote matches
84 * efficiently. Instead of simply calling "prepare_match()" on each submatch
85 * and waiting for it to return, it first calls "prepare_match(true)" on each
86 * submatch. If any of these calls return false, indicating that the required
87 * information has not yet been received from the server, the method will try
88 * those which returned false again, passing "false" as a parameter to
89 * indicate that they should block until the information is ready.
91 * This should improve performance in the case of mixed local-and-remote
92 * searches - the local searchers will all fetch their statistics from disk
93 * without waiting for the remote searchers, so as soon as the remote searcher
94 * statistics arrive, we can move on to the next step.
97 prepare_sub_matches(vector
<intrusive_ptr
<SubMatch
> > & leaves
,
98 Xapian::Weight::Internal
& stats
)
100 LOGCALL_STATIC_VOID(MATCH
, "prepare_sub_matches", leaves
| stats
);
101 // We use a vector<bool> to track which SubMatches we're already prepared.
102 vector
<bool> prepared
;
103 prepared
.resize(leaves
.size(), false);
104 size_t unprepared
= leaves
.size();
107 for (size_t leaf
= 0; leaf
< leaves
.size(); ++leaf
) {
108 if (prepared
[leaf
]) continue;
109 SubMatch
* submatch
= leaves
[leaf
].get();
110 if (!submatch
|| submatch
->prepare_match(nowait
, stats
)) {
111 prepared
[leaf
] = true;
115 // Use blocking IO on subsequent passes, so that we don't go into
121 ////////////////////////////////////
122 // Initialisation and cleaning up //
123 ////////////////////////////////////
124 MultiMatch::MultiMatch(const Xapian::Database
&db_
,
125 const Xapian::Query
& query_
,
126 Xapian::termcount qlen
,
127 const Xapian::RSet
* omrset
,
128 Xapian::doccount collapse_max_
,
129 Xapian::valueno collapse_key_
,
130 int percent_cutoff_
, double weight_cutoff_
,
131 Xapian::Enquire::docid_order order_
,
132 Xapian::valueno sort_key_
,
133 Xapian::Enquire::Internal::sort_setting sort_by_
,
134 bool sort_value_forward_
,
136 Xapian::Weight::Internal
& stats
,
137 const Xapian::Weight
* weight_
,
138 const vector
<Xapian::Internal::opt_intrusive_ptr
<Xapian::MatchSpy
>> & matchspies_
,
139 bool have_sorter
, bool have_mdecider
)
140 : db(db_
), query(query_
),
141 collapse_max(collapse_max_
), collapse_key(collapse_key_
),
142 percent_cutoff(percent_cutoff_
), weight_cutoff(weight_cutoff_
),
144 sort_key(sort_key_
), sort_by(sort_by_
),
145 sort_value_forward(sort_value_forward_
),
146 time_limit(time_limit_
),
148 is_remote(db
.internal
->size()),
149 matchspies(matchspies_
)
151 LOGCALL_CTOR(MATCH
, "MultiMatch", db_
| query_
| qlen
| omrset
| collapse_max_
| collapse_key_
| percent_cutoff_
| weight_cutoff_
| int(order_
) | sort_key_
| int(sort_by_
) | sort_value_forward_
| time_limit_
| stats
| weight_
| matchspies_
| have_sorter
| have_mdecider
);
153 Assert(!query
.empty());
155 Xapian::doccount number_of_subdbs
= db
.internal
->size();
156 vector
<Xapian::RSet
> subrsets
;
157 if (omrset
&& omrset
->internal
.get()) {
158 omrset
->internal
->shard(number_of_subdbs
, subrsets
);
160 subrsets
.resize(number_of_subdbs
);
163 for (size_t i
= 0; i
!= number_of_subdbs
; ++i
) {
164 const Xapian::Database::Internal
*subdb
= db
.internal
.get();
165 if (number_of_subdbs
> 1) {
166 auto multidb
= static_cast<const MultiDatabase
*>(subdb
);
167 subdb
= multidb
->shards
[i
];
170 intrusive_ptr
<SubMatch
> smatch
;
172 // There is currently only one special case, for network databases.
173 #ifdef XAPIAN_HAS_REMOTE_BACKEND
174 if (subdb
->get_backend_info(NULL
) == BACKEND_REMOTE
) {
175 auto rem_db
= static_cast<const RemoteDatabase
*>(subdb
);
177 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
180 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
182 // FIXME: Remote handling for time_limit with multiple
183 // databases may need some work.
184 rem_db
->set_query(query
, qlen
, collapse_max
, collapse_key
,
185 order
, sort_key
, sort_by
, sort_value_forward
,
187 percent_cutoff
, weight_cutoff
, weight
,
188 subrsets
[i
], matchspies
);
189 bool decreasing_relevance
=
190 (sort_by
== REL
|| sort_by
== REL_VAL
);
191 smatch
= new RemoteSubMatch(rem_db
, decreasing_relevance
, matchspies
);
194 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
,
196 subdb
->readahead_for_query(query
);
199 // Avoid unused parameter warnings.
202 smatch
= new LocalSubMatch(subdb
, query
, qlen
, subrsets
[i
], weight
,
204 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
205 leaves
.push_back(smatch
);
208 stats
.set_query(query
);
209 prepare_sub_matches(leaves
, stats
);
210 stats
.set_bounds_from_db(db
);
214 MultiMatch::get_mset(Xapian::doccount first
, Xapian::doccount maxitems
,
215 Xapian::doccount check_at_least
,
217 Xapian::Weight::Internal
& stats
,
218 const Xapian::MatchDecider
*mdecider
,
219 const Xapian::KeyMaker
*sorter
)
221 LOGCALL_VOID(MATCH
, "MultiMatch::get_mset", first
| maxitems
| check_at_least
| Literal("mset") | stats
| Literal("mdecider") | Literal("sorter"));
222 AssertRel(check_at_least
,>=,first
+ maxitems
);
224 Assert(!query
.empty());
226 Assert(!leaves
.empty());
228 TimeOut
timeout(time_limit
);
230 #ifdef XAPIAN_HAS_REMOTE_BACKEND
231 // If there's only one database and it's remote, we can just unserialise
232 // its MSet and return that.
233 if (leaves
.size() == 1 && is_remote
[0]) {
234 RemoteSubMatch
* rem_match
;
235 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[0].get());
236 rem_match
->start_match(first
, maxitems
, check_at_least
, stats
);
237 rem_match
->get_mset(mset
);
243 for (auto && leaf
: leaves
) {
244 leaf
->start_match(0, first
+ maxitems
, first
+ check_at_least
, stats
);
247 ValueStreamDocument
vsdoc(db
);
249 Xapian::Document
doc(&vsdoc
);
251 // Get postlists and term info
252 vector
<PostList
*> postlists
;
254 Xapian::termcount total_subqs
= 0;
255 // Keep a count of matches which we know exist, but we won't see. This
256 // occurs when a submatch is remote, and returns a lower bound on the
257 // number of matching documents which is higher than the number of
258 // documents it returns (because it wasn't asked for more documents).
259 Xapian::doccount definite_matches_not_seen
= 0;
260 for (size_t i
= 0; i
!= leaves
.size(); ++i
) {
261 // Pick the highest total subqueries answer amongst the subdatabases,
262 // as the query to postlist conversion doesn't recurse into positional
263 // queries for shards that don't have positional data when at least one
265 Xapian::termcount total_subqs_i
= 0;
266 PostList
* pl
= leaves
[i
]->get_postlist(&pltree
, &total_subqs_i
);
267 total_subqs
= max(total_subqs
, total_subqs_i
);
269 if (pl
->get_termfreq_min() > first
+ maxitems
) {
270 LOGLINE(MATCH
, "Found " <<
271 pl
->get_termfreq_min() - (first
+ maxitems
)
272 << " definite matches in remote submatch "
273 "which aren't passed to local match");
274 definite_matches_not_seen
+= pl
->get_termfreq_min();
275 definite_matches_not_seen
-= first
+ maxitems
;
277 } else if (mdecider
) {
278 pl
= new DeciderPostList(pl
, mdecider
, &vsdoc
);
280 postlists
.push_back(pl
);
282 Assert(!postlists
.empty());
284 Xapian::doccount n_shards
= postlists
.size();
286 pltree
.set_postlist(postlists
[0]);
288 pltree
.set_postlist(new MergePostList(&postlists
[0], n_shards
, vsdoc
));
291 LOGLINE(MATCH
, "pl = (" << pltree
.get_description() << ")");
294 Xapian::doccount docs_matched
= 0;
295 double greatest_wt
= 0;
296 Xapian::termcount greatest_wt_subqs_matched
= 0;
297 #ifdef XAPIAN_HAS_REMOTE_BACKEND
298 unsigned greatest_wt_subqs_db_num
= UINT_MAX
;
300 vector
<Result
> items
;
302 // maximum weight a document could possibly have
303 const double max_possible
= pltree
.recalc_maxweight();
305 LOGLINE(MATCH
, "pl = (" << pltree
.get_description() << ")");
307 Xapian::doccount matches_upper_bound
= pltree
.get_termfreq_max();
308 Xapian::doccount matches_lower_bound
= pltree
.get_termfreq_min();
309 Xapian::doccount matches_estimated
= pltree
.get_termfreq_est();
311 SpyMaster
spymaster(&matchspies
, &is_remote
);
313 // Check if any results have been asked for (might just be wanting
315 if (check_at_least
== 0) {
316 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
318 // Lower bound must be set to no more than collapse_max, since it's
319 // possible that all matching documents have the same collapse_key
320 // value and so are collapsed together.
321 if (matches_lower_bound
> collapse_max
)
322 matches_lower_bound
= collapse_max
;
325 mset
.internal
= new Xapian::MSet::Internal(
331 uncollapsed_lower_bound
,
333 max_possible
, greatest_wt
,
339 // Set max number of results that we want - this is used to decide
340 // when to throw away unwanted items.
341 Xapian::doccount max_msize
= first
+ maxitems
;
342 items
.reserve(max_msize
+ 1);
344 // Tracks the minimum item currently eligible for the MSet - we compare
345 // candidate items against this.
346 Result
min_item(0.0, 0);
348 // Minimum weight an item must have to be worth considering.
349 double min_weight
= weight_cutoff
;
351 // Factor to multiply maximum weight seen by to get the cutoff weight.
352 double percent_cutoff_factor
= percent_cutoff
/ 100.0;
353 // Corresponding correction to that in omenquire.cc to account for excess
355 percent_cutoff_factor
-= DBL_EPSILON
;
357 // Object to handle collapsing.
358 Collapser
collapser(collapse_key
, collapse_max
);
360 /// Comparison functor for sorting MSet
361 bool sort_forward
= (order
!= Xapian::Enquire::DESCENDING
);
362 MSetCmp
mcmp(get_msetcmp_function(sort_by
, sort_forward
, sort_value_forward
));
366 // We form the mset in two stages. In the first we fill up our working
367 // mset. Adding a new document does not remove another.
369 // In the second, we consider documents which rank higher than the current
370 // lowest ranking document in the mset. Each document added expels the
371 // current lowest ranking document.
373 // If a percentage cutoff is in effect, it can cause the matcher to return
374 // from the second stage from the first.
376 // Is the mset a valid heap?
377 bool is_heap
= false;
382 if (!pltree
.next(min_weight
)) {
383 LOGLINE(MATCH
, "Reached end of potential matches");
387 // Only calculate the weight if we need it for mcmp, or there's a
388 // percentage or weight cutoff in effect. Otherwise we calculate it
389 // below if we haven't already rejected this candidate.
391 bool calculated_weight
= false;
392 if (sort_by
!= VAL
|| min_weight
> 0.0) {
393 wt
= pltree
.get_weight();
394 if (wt
< min_weight
) {
395 LOGLINE(MATCH
, "Rejecting potential match due to insufficient weight");
398 calculated_weight
= true;
401 Xapian::docid did
= pltree
.get_docid();
402 vsdoc
.set_document(did
);
403 LOGLINE(MATCH
, "Candidate document id " << did
<< " wt " << wt
);
404 Result
new_item(wt
, did
);
405 if (check_at_least
> first
+ maxitems
&& timeout
.timed_out()) {
406 check_at_least
= first
+ maxitems
;
409 if (sort_by
!= REL
) {
410 const string
* ptr
= pltree
.get_sort_key();
412 new_item
.set_sort_key(*ptr
);
414 new_item
.set_sort_key((*sorter
)(doc
));
416 new_item
.set_sort_key(vsdoc
.get_value(sort_key
));
419 // We're sorting by value (in part at least), so compare the item
420 // against the lowest currently in the proto-mset. If sort_by is
421 // VAL, then new_item.get_weight() won't yet be set, but that
422 // doesn't matter since it's not used by the sort function.
423 if (!mcmp(new_item
, min_item
)) {
424 if (mdecider
== NULL
&& !collapser
) {
425 // Document was definitely suitable for mset - no more
426 // processing needed.
427 LOGLINE(MATCH
, "Making note of match item which sorts lower than min_item");
429 if (!calculated_weight
) wt
= pltree
.get_weight();
430 spymaster(doc
, wt
, did
);
431 if (wt
> greatest_wt
) goto new_greatest_weight
;
434 if (docs_matched
>= check_at_least
) {
435 // We've seen enough items - we can drop this one.
436 LOGLINE(MATCH
, "Dropping candidate which sorts lower than min_item");
437 // FIXME: hmm, match decider might have rejected this...
438 if (!calculated_weight
) wt
= pltree
.get_weight();
439 if (wt
> greatest_wt
) goto new_greatest_weight
;
442 // We can't drop the item, because we need to test whether the
443 // mdecider would accept it and/or test whether it would be
445 LOGLINE(MATCH
, "Keeping candidate which sorts lower than min_item for further investigation");
449 // Apply any MatchSpy objects.
451 if (!calculated_weight
) {
452 wt
= pltree
.get_weight();
453 new_item
.set_weight(wt
);
454 calculated_weight
= true;
456 spymaster(doc
, wt
, did
);
459 if (!calculated_weight
) {
460 // we didn't calculate the weight above, but now we will need it
461 wt
= pltree
.get_weight();
462 new_item
.set_weight(wt
);
467 // Perform collapsing on key if requested.
470 res
= collapser
.process(new_item
, pltree
.get_collapse_key(), vsdoc
,
472 if (res
== REJECTED
) {
473 // If we're sorting by relevance primarily, then we throw away
474 // the lower weighted document anyway.
475 if (sort_by
!= REL
&& sort_by
!= REL_VAL
) {
476 if (wt
> greatest_wt
) goto new_greatest_weight
;
481 if (res
== REPLACED
) {
482 // There was a previous item in the collapse tab so
483 // the MSet can't be empty.
484 Assert(!items
.empty());
486 const Result
& old_item
= collapser
.old_item
;
487 // This is one of the best collapse_max potential MSet entries
488 // with this key which we've seen so far. Check if the
489 // entry with this key which it displaced might still be in the
490 // proto-MSet. If it might be, we need to check through for
492 double old_wt
= old_item
.get_weight();
493 if (old_wt
>= min_weight
&& mcmp(old_item
, min_item
)) {
494 // Scan through (unsorted) MSet looking for entry.
495 // FIXME: more efficient way than just scanning?
496 Xapian::docid olddid
= old_item
.get_docid();
497 vector
<Result
>::iterator i
;
498 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
499 if (i
->get_docid() == olddid
) {
500 LOGLINE(MATCH
, "collapse: removing " <<
502 new_item
.get_collapse_key());
503 // We can replace an arbitrary element in O(log N)
504 // but have to do it by hand (in this case the new
505 // elt is bigger, so we just swap down the tree).
506 // FIXME: implement this, and clean up is_heap
518 // OK, actually add the item to the mset.
521 if (items
.size() >= max_msize
) {
522 items
.push_back(new_item
);
525 make_heap(items
.begin(), items
.end(), mcmp
);
527 push_heap
<vector
<Result
>::iterator
,
528 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
530 pop_heap
<vector
<Result
>::iterator
,
531 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
534 min_item
= items
.front();
535 if (sort_by
== REL
|| sort_by
== REL_VAL
) {
536 if (docs_matched
>= check_at_least
) {
537 if (sort_by
== REL
) {
538 // We're done if this is a forward boolean match
539 // with only one database (bodgetastic, FIXME
540 // better if we can!)
541 if (rare(max_possible
== 0 && sort_forward
)) {
542 // In the multi database case, MergePostList
543 // currently processes each database
544 // sequentially (which actually may well be
545 // more efficient) so the docids in general
546 // won't arrive in order.
547 if (leaves
.size() == 1) break;
550 if (min_item
.get_weight() > min_weight
) {
551 LOGLINE(MATCH
, "Setting min_weight to " <<
552 min_item
.get_weight() << " from " << min_weight
);
553 min_weight
= min_item
.get_weight();
557 if (rare(pltree
.recalc_maxweight() < min_weight
)) {
558 LOGLINE(MATCH
, "*** TERMINATING EARLY (3)");
562 items
.push_back(new_item
);
564 if (sort_by
== REL
&& items
.size() == max_msize
) {
565 if (docs_matched
>= check_at_least
) {
566 // We're done if this is a forward boolean match
567 // with only one database (bodgetastic, FIXME
568 // better if we can!)
569 if (rare(max_possible
== 0 && sort_forward
)) {
570 // In the multi database case, MergePostList
571 // currently processes each database
572 // sequentially (which actually may well be
573 // more efficient) so the docids in general
574 // won't arrive in order.
575 if (leaves
.size() == 1) break;
582 // Keep a track of the greatest weight we've seen.
583 if (wt
> greatest_wt
) {
586 #ifdef XAPIAN_HAS_REMOTE_BACKEND
587 const unsigned int multiplier
= db
.internal
->size();
588 unsigned int db_num
= (did
- 1) % multiplier
;
589 if (is_remote
[db_num
]) {
590 // Note that the greatest weighted document came from a remote
591 // database, and which one.
592 greatest_wt_subqs_db_num
= db_num
;
596 greatest_wt_subqs_matched
= pltree
.count_matching_subqs();
597 #ifdef XAPIAN_HAS_REMOTE_BACKEND
598 greatest_wt_subqs_db_num
= UINT_MAX
;
601 if (percent_cutoff
) {
602 double w
= wt
* percent_cutoff_factor
;
603 if (w
> min_weight
) {
607 make_heap
<vector
<Result
>::iterator
,
608 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
610 while (!items
.empty() && items
.front().get_weight() < min_weight
) {
611 pop_heap
<vector
<Result
>::iterator
,
612 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
613 Assert(items
.back().get_weight() < min_weight
);
616 #ifdef XAPIAN_ASSERTIONS_PARANOID
617 vector
<Result
>::const_iterator i
;
618 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
619 Assert(i
->get_weight() >= min_weight
);
627 double percent_scale
= 0;
628 if (!items
.empty() && greatest_wt
> 0) {
629 #ifdef XAPIAN_HAS_REMOTE_BACKEND
630 if (greatest_wt_subqs_db_num
!= UINT_MAX
) {
631 const unsigned int n
= greatest_wt_subqs_db_num
;
632 RemoteSubMatch
* rem_match
;
633 rem_match
= static_cast<RemoteSubMatch
*>(leaves
[n
].get());
634 percent_scale
= rem_match
->get_percent_factor() / 100.0;
638 percent_scale
= greatest_wt_subqs_matched
/ double(total_subqs
);
639 percent_scale
/= greatest_wt
;
641 Assert(percent_scale
> 0);
642 if (percent_cutoff
) {
643 // FIXME: better to sort and binary chop maybe?
644 // Or we could just use a linear scan here instead.
646 // trim the mset to the correct answer...
647 double min_wt
= percent_cutoff_factor
/ percent_scale
;
650 make_heap
<vector
<Result
>::iterator
,
651 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
653 while (!items
.empty() && items
.front().get_weight() < min_wt
) {
654 pop_heap
<vector
<Result
>::iterator
,
655 MSetCmp
>(items
.begin(), items
.end(), mcmp
);
656 Assert(items
.back().get_weight() < min_wt
);
659 #ifdef XAPIAN_ASSERTIONS_PARANOID
660 vector
<Result
>::const_iterator j
;
661 for (j
= items
.begin(); j
!= items
.end(); ++j
) {
662 Assert(j
->get_weight() >= min_wt
);
669 "docs_matched = " << docs_matched
<<
670 ", definite_matches_not_seen = " << definite_matches_not_seen
<<
671 ", matches_lower_bound = " << matches_lower_bound
<<
672 ", matches_estimated = " << matches_estimated
<<
673 ", matches_upper_bound = " << matches_upper_bound
);
675 // Adjust docs_matched to take account of documents which matched remotely
676 // but weren't sent across.
677 docs_matched
+= definite_matches_not_seen
;
679 Xapian::doccount uncollapsed_lower_bound
= matches_lower_bound
;
680 Xapian::doccount uncollapsed_upper_bound
= matches_upper_bound
;
681 Xapian::doccount uncollapsed_estimated
= matches_estimated
;
682 if (items
.size() < max_msize
) {
683 // We have fewer items in the mset than we tried to get for it, so we
684 // must have all the matches in it.
685 LOGLINE(MATCH
, "items.size() = " << items
.size() <<
686 ", max_msize = " << max_msize
<< ", setting bounds equal");
687 Assert(definite_matches_not_seen
== 0);
688 Assert(percent_cutoff
|| docs_matched
== items
.size());
689 matches_lower_bound
= matches_upper_bound
= matches_estimated
691 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
692 uncollapsed_lower_bound
= matches_lower_bound
;
693 } else if (!collapser
&& docs_matched
< check_at_least
) {
694 // We have seen fewer matches than we checked for, so we must have seen
696 LOGLINE(MATCH
, "Setting bounds equal");
697 matches_lower_bound
= matches_upper_bound
= matches_estimated
699 if (collapser
&& matches_lower_bound
> uncollapsed_lower_bound
)
700 uncollapsed_lower_bound
= matches_lower_bound
;
702 AssertRel(matches_estimated
,>=,matches_lower_bound
);
703 AssertRel(matches_estimated
,<=,matches_upper_bound
);
705 // We can end up scaling the estimate more than once, so collect
706 // the scale factors and apply them in one go to avoid rounding
708 double estimate_scale
= 1.0;
709 double unique_rate
= 1.0;
712 LOGLINE(MATCH
, "Adjusting bounds due to collapse_key");
713 matches_lower_bound
= collapser
.get_matches_lower_bound();
714 if (matches_lower_bound
> uncollapsed_lower_bound
)
715 uncollapsed_lower_bound
= matches_lower_bound
;
717 // The estimate for the number of hits can be modified by
718 // multiplying it by the rate at which we've been finding
720 Xapian::doccount docs_considered
= collapser
.get_docs_considered();
721 Xapian::doccount dups_ignored
= collapser
.get_dups_ignored();
722 if (docs_considered
> 0) {
723 double unique
= double(docs_considered
- dups_ignored
);
724 unique_rate
= unique
/ double(docs_considered
);
727 // We can safely reduce the upper bound by the number of duplicates
729 matches_upper_bound
-= dups_ignored
;
731 LOGLINE(MATCH
, "matches_lower_bound=" << matches_lower_bound
<<
732 ", matches_estimated=" << matches_estimated
<<
733 "*" << estimate_scale
<< "*" << unique_rate
<<
734 ", matches_upper_bound=" << matches_upper_bound
);
738 if (!percent_cutoff
) {
740 // We're not collapsing or doing a percentage cutoff, so
741 // docs_matched is a lower bound on the total number of
743 matches_lower_bound
= max(docs_matched
, matches_lower_bound
);
747 Xapian::doccount decider_denied
= mdecider
->docs_denied_
;
748 Xapian::doccount decider_considered
=
749 mdecider
->docs_allowed_
+ mdecider
->docs_denied_
;
751 // The estimate for the number of hits can be modified by
752 // multiplying it by the rate at which the match decider has
753 // been accepting documents.
754 if (decider_considered
> 0) {
755 double accept
= double(decider_considered
- decider_denied
);
756 double accept_rate
= accept
/ double(decider_considered
);
757 estimate_scale
*= accept_rate
;
760 // If a document is denied by a match decider, it is not possible
761 // for it to found to be a duplicate, so it is safe to also reduce
762 // the upper bound by the number of documents denied by a match
764 matches_upper_bound
-= decider_denied
;
765 if (collapser
) uncollapsed_upper_bound
-= decider_denied
;
768 if (percent_cutoff
) {
769 estimate_scale
*= (1.0 - percent_cutoff_factor
);
771 // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
772 // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
774 // Very likely an underestimate, but we can't really do better
775 // without checking further matches... Only possibility would be
776 // to track how many docs made the min weight test but didn't make
777 // the candidate set since the last greatest_wt change, which we
778 // could use if the top documents matched all the prob terms.
779 matches_lower_bound
= items
.size();
780 if (collapser
) uncollapsed_lower_bound
= matches_lower_bound
;
782 // matches_upper_bound could be reduced by the number of documents
783 // which fail the min weight test (FIXME)
785 LOGLINE(MATCH
, "Adjusted bounds due to percent_cutoff (" <<
786 percent_cutoff
<< "): now have matches_estimated=" <<
787 matches_estimated
<< ", matches_lower_bound=" <<
788 matches_lower_bound
);
791 if (collapser
&& estimate_scale
!= 1.0) {
792 uncollapsed_estimated
=
793 Xapian::doccount(uncollapsed_estimated
* estimate_scale
+ 0.5);
796 estimate_scale
*= unique_rate
;
798 if (estimate_scale
!= 1.0) {
800 Xapian::doccount(matches_estimated
* estimate_scale
+ 0.5);
801 if (matches_estimated
< matches_lower_bound
)
802 matches_estimated
= matches_lower_bound
;
805 if (collapser
|| mdecider
) {
806 LOGLINE(MATCH
, "Clamping estimate between bounds: "
807 "matches_lower_bound = " << matches_lower_bound
<<
808 ", matches_estimated = " << matches_estimated
<<
809 ", matches_upper_bound = " << matches_upper_bound
);
810 AssertRel(matches_lower_bound
,<=,matches_upper_bound
);
811 matches_estimated
= max(matches_estimated
, matches_lower_bound
);
812 matches_estimated
= min(matches_estimated
, matches_upper_bound
);
813 } else if (!percent_cutoff
) {
814 AssertRel(docs_matched
,<=,matches_upper_bound
);
815 if (docs_matched
> matches_lower_bound
)
816 matches_lower_bound
= docs_matched
;
817 if (docs_matched
> matches_estimated
)
818 matches_estimated
= docs_matched
;
821 if (collapser
&& !mdecider
&& !percent_cutoff
) {
822 AssertRel(docs_matched
,<=,uncollapsed_upper_bound
);
823 if (docs_matched
> uncollapsed_lower_bound
)
824 uncollapsed_lower_bound
= docs_matched
;
829 AssertRel(uncollapsed_lower_bound
,<=,uncollapsed_upper_bound
);
830 if (uncollapsed_estimated
< uncollapsed_lower_bound
) {
831 uncollapsed_estimated
= uncollapsed_lower_bound
;
832 } else if (uncollapsed_estimated
> uncollapsed_upper_bound
) {
833 uncollapsed_estimated
= uncollapsed_upper_bound
;
836 // We're not collapsing, so the bounds must be the same.
837 uncollapsed_lower_bound
= matches_lower_bound
;
838 uncollapsed_upper_bound
= matches_upper_bound
;
839 uncollapsed_estimated
= matches_estimated
;
842 LOGLINE(MATCH
, items
.size() << " items in potential mset");
845 // Remove unwanted leading entries
846 if (items
.size() <= first
) {
849 LOGLINE(MATCH
, "finding " << first
<< "th");
850 // We perform nth_element() on reverse iterators so that the
851 // unwanted elements end up at the end of items, which means
852 // that the call to erase() to remove them doesn't have to copy
854 vector
<Result
>::reverse_iterator nth
;
855 nth
= items
.rbegin() + first
;
856 nth_element(items
.rbegin(), nth
, items
.rend(), mcmp
);
857 // Erase the trailing "first" elements
858 items
.erase(items
.begin() + items
.size() - first
, items
.end());
862 LOGLINE(MATCH
, "sorting " << items
.size() << " entries");
864 // We could use std::sort_heap if is_heap is true, but profiling
865 // suggests that's actually slower. Cast to void to suppress
866 // compiler warnings that the last set value of is_heap is never
870 // Need a stable sort, but this is provided by comparison operator
871 sort(items
.begin(), items
.end(), mcmp
);
873 if (!items
.empty()) {
874 LOGLINE(MATCH
, "min weight in mset = " << items
.back().get_weight());
875 LOGLINE(MATCH
, "max weight in mset = " << items
[0].get_weight());
878 AssertRel(matches_estimated
,>=,matches_lower_bound
);
879 AssertRel(matches_estimated
,<=,matches_upper_bound
);
881 AssertRel(uncollapsed_estimated
,>=,uncollapsed_lower_bound
);
882 AssertRel(uncollapsed_estimated
,<=,uncollapsed_upper_bound
);
884 // We may need to qualify any collapse_count to see if the highest weight
885 // collapsed item would have qualified percent_cutoff
886 // We WILL need to restore collapse_count to the mset by taking from
887 // collapse_tab; this is what comes of copying around whole objects
888 // instead of taking references, we find it hard to update collapse_count
889 // of an item that has already been pushed-back as we don't know where it
890 // is any more. If we keep or find references we won't need to mess with
891 // is_heap so much maybe?
892 if (!items
.empty() && collapser
&& !collapser
.empty()) {
893 // Nicked this formula from above.
895 if (percent_scale
> 0.0)
896 min_wt
= percent_cutoff_factor
/ percent_scale
;
897 Xapian::doccount entries
= collapser
.entries();
898 vector
<Result
>::iterator i
;
899 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
900 // Skip entries without a collapse key.
901 if (i
->get_collapse_key().empty()) continue;
903 // Set collapse_count appropriately.
904 i
->set_collapse_count(collapser
.get_collapse_count(i
->get_collapse_key(), percent_cutoff
, min_wt
));
905 if (--entries
== 0) {
906 // Stop once we've processed all items with collapse keys.
912 mset
.internal
= new Xapian::MSet::Internal(
917 uncollapsed_upper_bound
,
918 uncollapsed_lower_bound
,
919 uncollapsed_estimated
,
920 max_possible
, greatest_wt
,
922 percent_scale
* 100.0);