Reimplement the matcher
[xapian.git] / xapian-core / api / enquire.cc
blobeabbd74a0f8aceb3e2363f155de2da0f5dfc4aab
1 /** @file enquire.cc
2 * @brief Xapian::Enquire class
3 */
4 /* Copyright (C) 2009,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xapian/enquire.h"
24 #include "enquireinternal.h"
26 #include "expand/esetinternal.h"
27 #include "expand/expandweight.h"
28 #include "matcher/matcher.h"
29 #include "msetinternal.h"
30 #include "vectortermlist.h"
31 #include "weight/weightinternal.h"
32 #include "xapian/database.h"
33 #include "xapian/error.h"
34 #include "xapian/expanddecider.h"
35 #include "xapian/intrusive_ptr.h"
36 #include "xapian/keymaker.h"
37 #include "xapian/matchspy.h"
38 #include "xapian/query.h"
39 #include "xapian/rset.h"
40 #include "xapian/weight.h"
42 #include <memory>
43 #include <string>
44 #include <vector>
46 using namespace std;
48 [[noreturn]]
49 static void
50 throw_invalid_arg(const char* msg) {
51 throw Xapian::InvalidArgumentError(msg);
54 namespace Xapian {
56 Enquire::Enquire(const Enquire& o) : internal(o.internal) {}
58 Enquire&
59 Enquire::operator=(const Enquire& o)
61 internal = o.internal;
62 return *this;
65 Enquire::Enquire(const Database& db) : internal(new Enquire::Internal(db)) {}
67 Enquire::~Enquire() {}
69 void
70 Enquire::set_query(const Query& query, termcount query_length)
72 internal->query = query;
73 internal->query_length = query_length;
76 const Query&
77 Enquire::get_query() const
79 return internal->query;
82 void
83 Enquire::set_weighting_scheme(const Weight& weight)
85 internal->weight.reset(weight.clone());
88 void
89 Enquire::set_docid_order(docid_order order)
91 internal->order = order;
94 void
95 Enquire::set_sort_by_relevance()
97 internal->sort_by = Internal::REL;
100 void
101 Enquire::set_sort_by_value(valueno sort_key, bool reverse)
103 internal->sort_by = Internal::VAL;
104 internal->sort_functor = NULL;
105 internal->sort_key = sort_key;
106 internal->sort_val_reverse = reverse;
109 void
110 Enquire::set_sort_by_key(KeyMaker* sorter, bool reverse)
112 if (sorter == NULL) {
113 throw_invalid_arg("Enquire::set_sort_by_key(): sorter cannot be NULL");
115 internal->sort_by = Internal::VAL;
116 internal->sort_functor = sorter;
117 internal->sort_val_reverse = reverse;
120 void
121 Enquire::set_sort_by_value_then_relevance(valueno sort_key, bool reverse)
123 internal->sort_by = Internal::VAL_REL;
124 internal->sort_functor = NULL;
125 internal->sort_key = sort_key;
126 internal->sort_val_reverse = reverse;
129 void
130 Enquire::set_sort_by_key_then_relevance(KeyMaker* sorter, bool reverse)
132 if (sorter == NULL) {
133 throw_invalid_arg("Enquire::set_sort_by_key_then_relevance(): "
134 "sorter cannot be NULL");
136 internal->sort_by = Internal::VAL_REL;
137 internal->sort_functor = sorter;
138 internal->sort_val_reverse = reverse;
141 void
142 Enquire::set_sort_by_relevance_then_value(valueno sort_key, bool reverse)
144 internal->sort_by = Internal::REL_VAL;
145 internal->sort_functor = NULL;
146 internal->sort_key = sort_key;
147 internal->sort_val_reverse = reverse;
150 void
151 Enquire::set_sort_by_relevance_then_key(KeyMaker* sorter, bool reverse)
153 if (sorter == NULL) {
154 throw_invalid_arg("Enquire::set_sort_by_relevance_then_key(): "
155 "sorter cannot be NULL");
157 internal->sort_by = Internal::REL_VAL;
158 internal->sort_functor = sorter;
159 internal->sort_val_reverse = reverse;
162 void
163 Enquire::set_collapse_key(valueno collapse_key, doccount collapse_max)
165 internal->collapse_key = collapse_key;
166 internal->collapse_max = collapse_max;
169 void
170 Enquire::set_cutoff(int percent_threshold, double weight_threshold)
172 internal->percent_threshold = percent_threshold;
173 internal->weight_threshold = weight_threshold;
176 void
177 Enquire::add_matchspy(MatchSpy* spy)
179 using Xapian::Internal::opt_intrusive_ptr;
180 if (spy == NULL)
181 throw_invalid_arg("Enquire::add_matchspy(): spy cannot be NULL");
182 internal->matchspies.push_back(opt_intrusive_ptr<MatchSpy>(spy));
185 void
186 Enquire::clear_matchspies()
188 internal->matchspies.clear();
191 void
192 Enquire::set_time_limit(double time_limit)
194 internal->time_limit = time_limit;
197 MSet
198 Enquire::get_mset(doccount first,
199 doccount maxitems,
200 doccount checkatleast,
201 const RSet* rset,
202 const MatchDecider* mdecider) const
204 return internal->get_mset(first, maxitems, checkatleast, rset, mdecider);
207 TermIterator
208 Enquire::get_matching_terms_begin(docid did) const
210 return internal->get_matching_terms_begin(did);
213 void
214 Enquire::set_expansion_scheme(const std::string &eweightname, double expand_k) const
216 if (eweightname == "bo1") {
217 internal->eweight = Enquire::Internal::EXPAND_BO1;
218 } else if (eweightname == "trad") {
219 internal->eweight = Enquire::Internal::EXPAND_TRAD;
220 } else {
221 throw_invalid_arg("Enquire::set_expansion_scheme(): eweightname must "
222 "be 'bo1' or 'trad'");
224 internal->expand_k = expand_k;
227 ESet
228 Enquire::get_eset(termcount maxitems,
229 const RSet& rset,
230 int flags,
231 const ExpandDecider* edecider,
232 double min_weight) const
234 return internal->get_eset(maxitems, rset, flags, edecider, min_weight);
237 std::string
238 Enquire::get_description() const
240 string desc = "Enquire(db=";
241 desc += internal->db.get_description();
242 if (!internal->query.empty()) {
243 desc += ", query=";
244 desc += internal->query.get_description();
246 desc += ')';
247 return desc;
250 Enquire::Internal::Internal(const Database& db_)
251 : db(db_) {}
253 MSet
254 Enquire::Internal::get_mset(doccount first,
255 doccount maxitems,
256 doccount checkatleast,
257 const RSet* rset,
258 const MatchDecider* mdecider) const
260 if (query.empty()) {
261 MSet mset;
262 mset.internal->set_first(first);
263 return mset;
266 if (percent_threshold && (sort_by == VAL || sort_by == VAL_REL)) {
267 throw Xapian::UnimplementedError("Use of a percentage cutoff while "
268 "sorting primary by value isn't "
269 "currently supported");
272 // Lazily initialise weight to its default if necessary.
273 if (!weight.get())
274 weight.reset(new BM25Weight);
276 // Lazily initialise query_length if it wasn't explicitly specified.
277 if (query_length == 0) {
278 query_length = query.get_length();
281 Xapian::doccount first_orig = first;
283 Xapian::doccount docs = db.get_doccount();
284 first = min(first, docs);
285 maxitems = min(maxitems, docs - first);
286 checkatleast = min(checkatleast, docs);
287 checkatleast = max(checkatleast, first + maxitems);
290 unique_ptr<Xapian::Weight::Internal> stats(new Xapian::Weight::Internal);
291 ::Matcher match(db,
292 query,
293 query_length,
294 rset,
295 *stats,
296 weight.get(),
297 (sort_functor.get() != NULL),
298 (mdecider != NULL),
299 collapse_key,
300 collapse_max,
301 percent_threshold,
302 weight_threshold,
303 order,
304 sort_key,
305 sort_by,
306 sort_val_reverse,
307 time_limit,
308 matchspies);
310 MSet mset = match.get_mset(first,
311 maxitems,
312 checkatleast,
313 *stats,
314 mdecider,
315 sort_functor.get(),
316 collapse_key,
317 collapse_max,
318 percent_threshold,
319 weight_threshold,
320 order,
321 sort_key,
322 sort_by,
323 sort_val_reverse,
324 time_limit,
325 matchspies);
327 if (first_orig != first && mset.internal.get()) {
328 mset.internal->set_first(first_orig);
331 mset.internal->set_enquire(this);
333 if (!mset.internal->get_stats()) {
334 mset.internal->set_stats(stats.release());
337 return mset;
340 TermIterator
341 Enquire::Internal::get_matching_terms_begin(docid did) const
343 if (query.empty())
344 return TermIterator();
346 struct term_and_pos {
347 string term;
348 Xapian::termpos pos;
350 term_and_pos(const string& term_, Xapian::termpos pos_)
351 : term(term_), pos(pos_) {}
354 vector<term_and_pos> query_terms;
355 Xapian::termpos pos = 1;
356 for (auto t = query.get_terms_begin(); t != query.get_terms_end(); ++t) {
357 query_terms.emplace_back(*t, pos++);
360 if (query_terms.empty())
361 return TermIterator();
363 // Reorder by term, secondary sort by position.
364 sort(query_terms.begin(), query_terms.end(),
365 [](const term_and_pos& a, const term_and_pos& b) {
366 int cmp = a.term.compare(b.term);
367 return cmp ? cmp < 0 : a.pos < b.pos;
370 // Loop through the query terms, skipping the document terms for each to
371 // see which match, and shuffling down the matching ones. Also discard
372 // repeats, keeping the smallest position.
373 size_t i = 0, j = 0;
374 auto t = db.termlist_begin(did);
375 do {
376 const string& term = query_terms[i].term;
377 if (j == 0 || term != query_terms[j - 1].term) {
378 t.skip_to(term);
379 if (t == db.termlist_end(did)) {
380 break;
383 if (*t == term) {
384 // Matched, so move down if necessary.
385 if (i != j)
386 query_terms[j] = std::move(query_terms[i]);
387 ++j;
390 } while (++i != query_terms.size());
392 // Truncate to leave just the matching terms.
393 query_terms.erase(query_terms.begin() + j, query_terms.end());
395 // Reorder by ascending query position.
396 sort(query_terms.begin(), query_terms.end(),
397 [](const term_and_pos& a, const term_and_pos& b) {
398 return a.pos < b.pos;
401 // Iterator adaptor to present query_terms as a container of just strings.
402 struct Itor {
403 vector<term_and_pos>::const_iterator it;
405 explicit
406 Itor(vector<term_and_pos>::const_iterator it_) : it(it_) {}
408 const std::string& operator*() const {
409 return it->term;
412 Itor& operator++() {
413 ++it;
414 return *this;
417 Itor operator++(int) {
418 Itor retval = *this;
419 ++it;
420 return retval;
423 bool operator!=(const Itor& o) { return it != o.it; }
426 return TermIterator(new VectorTermList(Itor(query_terms.cbegin()),
427 Itor(query_terms.cend())));
430 ESet
431 Enquire::Internal::get_eset(termcount maxitems,
432 const RSet& rset,
433 int flags,
434 const ExpandDecider* edecider_,
435 double min_weight) const
437 using Xapian::Internal::opt_intrusive_ptr;
438 opt_intrusive_ptr<const ExpandDecider> edecider(edecider_);
440 Xapian::ESet eset;
442 if (maxitems == 0 || rset.empty()) {
443 // Either we were asked for no results, or wouldn't produce any
444 // because no documents were marked as relevant.
445 return eset;
448 // Excluding query terms is a no-op without a query.
449 if ((flags & Enquire::INCLUDE_QUERY_TERMS) == 0 && !query.empty()) {
450 auto edft = new ExpandDeciderFilterTerms(query.get_terms_begin(),
451 query.get_terms_end());
452 if (edecider.get() == NULL) {
453 edecider = edft->release();
454 } else {
455 // Make sure ExpandDeciderFilterTerms doesn't leak if new throws.
456 opt_intrusive_ptr<const ExpandDecider> ptr(edft->release());
457 edecider = (new ExpandDeciderAnd(ptr.get(),
458 edecider.get()))->release();
462 bool use_exact_termfreq = flags & Enquire::USE_EXACT_TERMFREQ;
463 if (eweight == Enquire::Internal::EXPAND_BO1) {
464 using Xapian::Internal::Bo1EWeight;
465 Bo1EWeight bo1eweight(db, rset.size(), use_exact_termfreq);
466 eset.internal->expand(maxitems, db, rset, edecider.get(), bo1eweight,
467 min_weight);
468 } else {
469 AssertEq(eweight, Enquire::Internal::EXPAND_TRAD);
470 using Xapian::Internal::TradEWeight;
471 TradEWeight tradeweight(db, rset.size(), use_exact_termfreq, expand_k);
472 eset.internal->expand(maxitems, db, rset, edecider.get(), tradeweight,
473 min_weight);
476 return eset;