Reimplement the matcher
[xapian.git] / xapian-core / matcher / localsubmatch.cc
blob5dcf3f22bc261954a68e214cf7245241376d13f0
/** @file localsubmatch.cc
 * @brief SubMatch class for a local database.
 */
/* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #include <config.h>
24 #include "localsubmatch.h"
26 #include "backends/databaseinternal.h"
27 #include "debuglog.h"
28 #include "api/emptypostlist.h"
29 #include "extraweightpostlist.h"
30 #include "api/leafpostlist.h"
31 #include "omassert.h"
32 #include "queryoptimiser.h"
33 #include "synonympostlist.h"
34 #include "api/termlist.h"
35 #include "weight/weightinternal.h"
37 #include "xapian/error.h"
39 #include <memory>
40 #include <map>
41 #include <string>
43 using namespace std;
45 /** Xapian::Weight subclass which adds laziness.
47 * For terms from a wildcard when remote databases are involved, we need to
48 * delay calling init_() on the weight object until the stats for the terms
49 * from the wildcard have been collated.
51 class LazyWeight : public Xapian::Weight {
52 LeafPostList * pl;
54 Xapian::Weight * real_wt;
56 Xapian::Weight::Internal * stats;
58 Xapian::termcount qlen;
60 Xapian::termcount wqf;
62 double factor;
64 LazyWeight * clone() const;
66 void init(double factor_);
68 public:
69 LazyWeight(LeafPostList * pl_,
70 Xapian::Weight * real_wt_,
71 Xapian::Weight::Internal * stats_,
72 Xapian::termcount qlen_,
73 Xapian::termcount wqf__,
74 double factor_)
75 : pl(pl_),
76 real_wt(real_wt_),
77 stats(stats_),
78 qlen(qlen_),
79 wqf(wqf__),
80 factor(factor_)
81 { }
83 std::string name() const;
85 std::string serialise() const;
86 LazyWeight * unserialise(const std::string & serialised) const;
88 double get_sumpart(Xapian::termcount wdf,
89 Xapian::termcount doclen,
90 Xapian::termcount uniqterms) const;
91 double get_maxpart() const;
93 double get_sumextra(Xapian::termcount doclen,
94 Xapian::termcount uniqterms) const;
95 double get_maxextra() const;
98 LazyWeight *
99 LazyWeight::clone() const
101 throw Xapian::InvalidOperationError("LazyWeight::clone()");
104 void
105 LazyWeight::init(double factor_)
107 (void)factor_;
108 throw Xapian::InvalidOperationError("LazyWeight::init()");
111 string
112 LazyWeight::name() const
114 string desc = "LazyWeight(";
115 desc += real_wt->name();
116 desc += ")";
117 return desc;
120 string
121 LazyWeight::serialise() const
123 throw Xapian::InvalidOperationError("LazyWeight::serialise()");
126 LazyWeight *
127 LazyWeight::unserialise(const string &) const
129 throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
132 double
133 LazyWeight::get_sumpart(Xapian::termcount wdf,
134 Xapian::termcount doclen,
135 Xapian::termcount uniqterms) const
137 (void)wdf;
138 (void)doclen;
139 (void)uniqterms;
140 throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
143 double
144 LazyWeight::get_sumextra(Xapian::termcount doclen,
145 Xapian::termcount uniqterms) const
147 (void)doclen;
148 (void)uniqterms;
149 throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
152 double
153 LazyWeight::get_maxpart() const
155 // This gets called first for the case we care about.
156 return pl->resolve_lazy_termweight(real_wt, stats, qlen, wqf, factor);
159 double
160 LazyWeight::get_maxextra() const
162 throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
165 PostList *
166 LocalSubMatch::get_postlist(PostListTree * matcher,
167 Xapian::termcount * total_subqs_ptr)
169 LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist", matcher | total_subqs_ptr);
171 if (query.empty())
172 RETURN(new EmptyPostList); // MatchNothing
174 // Build the postlist tree for the query. This calls
175 // LocalSubMatch::open_post_list() for each term in the query.
176 PostList * pl;
178 QueryOptimiser opt(*db, *this, matcher, full_db_has_positions);
179 pl = query.internal->postlist(&opt, 1.0);
180 *total_subqs_ptr = opt.get_total_subqs();
183 unique_ptr<Xapian::Weight> extra_wt(wt_factory->clone());
184 // Only uses term-independent stats.
185 extra_wt->init_(*total_stats, qlen);
186 if (extra_wt->get_maxextra() != 0.0) {
187 // There's a term-independent weight contribution, so we combine the
188 // postlist tree with an ExtraWeightPostList which adds in this
189 // contribution.
190 pl = new ExtraWeightPostList(pl, db, extra_wt.release(), matcher);
193 RETURN(pl);
196 PostList *
197 LocalSubMatch::make_synonym_postlist(PostList * or_pl, double factor)
199 LOGCALL(MATCH, PostList *, "LocalSubMatch::make_synonym_postlist", or_pl | factor);
200 if (rare(or_pl->get_termfreq_max() == 0)) {
201 // or_pl is an EmptyPostList or equivalent.
202 return or_pl;
204 LOGVALUE(MATCH, or_pl->get_termfreq_est());
205 unique_ptr<SynonymPostList> res(new SynonymPostList(or_pl, db));
206 unique_ptr<Xapian::Weight> wt(wt_factory->clone());
208 TermFreqs freqs;
209 // Avoid calling get_termfreq_est_using_stats() if the database is empty
210 // so we don't need to special case that repeatedly when implementing it.
211 // FIXME: it would be nicer to handle an empty database higher up, though
212 // we need to catch the case where all the non-empty subdatabases have
213 // failed, so we can't just push this right up to the start of get_mset().
214 if (usual(total_stats->collection_size != 0)) {
215 freqs = or_pl->get_termfreq_est_using_stats(*total_stats);
217 wt->init_(*total_stats, qlen, factor,
218 freqs.termfreq, freqs.reltermfreq, freqs.collfreq);
220 res->set_weight(wt.release());
221 RETURN(res.release());
224 PostList *
225 LocalSubMatch::open_post_list(const string& term,
226 Xapian::termcount wqf,
227 double factor,
228 bool need_positions,
229 bool in_synonym,
230 QueryOptimiser * qopt,
231 bool lazy_weight)
233 LOGCALL(MATCH, PostList *, "LocalSubMatch::open_post_list", term | wqf | factor | need_positions | qopt | lazy_weight);
235 bool weighted = (factor != 0.0 && !term.empty());
237 LeafPostList * pl = NULL;
238 if (!term.empty() && !need_positions) {
239 if ((!weighted && !in_synonym) ||
240 !wt_factory->get_sumpart_needs_wdf_()) {
241 Xapian::doccount sub_tf;
242 db->get_freqs(term, &sub_tf, NULL);
243 if (sub_tf == db->get_doccount()) {
244 // If we're not going to use the wdf or term positions, and the
245 // term indexes all documents, we can replace it with the
246 // MatchAll postlist, which is especially efficient if there
247 // are no gaps in the docids.
248 pl = db->open_leaf_post_list(string());
250 // Set the term name so the postlist looks up the correct term
251 // frequencies - this is necessary if the weighting scheme
252 // needs collection frequency or reltermfreq (termfreq would be
253 // correct anyway since it's just the collection size in this
254 // case).
255 pl->set_term(term);
260 if (!pl) {
261 const LeafPostList * hint = qopt->get_hint_postlist();
262 if (hint)
263 pl = hint->open_nearby_postlist(term);
264 if (!pl) {
265 pl = db->open_leaf_post_list(term);
267 qopt->set_hint_postlist(pl);
270 if (lazy_weight) {
271 // Term came from a wildcard, but we may already have that term in the
272 // query anyway, so check before accumulating its TermFreqs.
273 map<string, TermFreqs>::iterator i = total_stats->termfreqs.find(term);
274 if (i == total_stats->termfreqs.end()) {
275 Xapian::doccount sub_tf;
276 Xapian::termcount sub_cf;
277 db->get_freqs(term, &sub_tf, &sub_cf);
278 total_stats->termfreqs.insert({term, TermFreqs(sub_tf, 0, sub_cf)});
282 if (weighted) {
283 Xapian::Weight * wt = wt_factory->clone();
284 if (!lazy_weight) {
285 wt->init_(*total_stats, qlen, term, wqf, factor);
286 total_stats->set_max_part(term, wt->get_maxpart());
287 } else {
288 // Delay initialising the actual weight object, so that we can
289 // gather stats for the terms lazily expanded from a wildcard
290 // (needed for the remote database case).
291 wt = new LazyWeight(pl, wt, total_stats, qlen, wqf, factor);
293 pl->set_termweight(wt);
295 RETURN(pl);