1 /** @file localsubmatch.cc
2 * @brief SubMatch class for a local database.
4 /* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
5 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "localsubmatch.h"
26 #include "backends/databaseinternal.h"
28 #include "api/emptypostlist.h"
29 #include "extraweightpostlist.h"
30 #include "api/leafpostlist.h"
32 #include "queryoptimiser.h"
33 #include "synonympostlist.h"
34 #include "api/termlist.h"
35 #include "weight/weightinternal.h"
37 #include "xapian/error.h"
45 /** Xapian::Weight subclass which adds laziness.
47 * For terms from a wildcard when remote databases are involved, we need to
48 * delay calling init_() on the weight object until the stats for the terms
49 * from the wildcard have been collated.
// Xapian::Weight subclass that stores a real weight object together with the
// values needed to resolve it later.  Of the member functions defined in this
// file, name() and get_maxpart() contain real logic; the visible bodies of the
// others consist of a throw of Xapian::InvalidOperationError.
// NOTE(review): this listing does not show the access specifiers, the
// declarations of the `pl` and `factor` members that get_maxpart() uses, the
// end of the constructor, or the closing brace - confirm against the full
// source before relying on this summary.
51 class LazyWeight
: public Xapian::Weight
{
// Weight object whose name() is reported by LazyWeight::name() and which is
// handed on by get_maxpart().
54 Xapian::Weight
* real_wt
;
// Statistics object handed on by get_maxpart().
56 Xapian::Weight::Internal
* stats
;
// The qlen value handed on by get_maxpart().
58 Xapian::termcount qlen
;
// The wqf value handed on by get_maxpart().
60 Xapian::termcount wqf
;
// The definition in this file throws Xapian::InvalidOperationError.
64 LazyWeight
* clone() const;
// The definition in this file throws Xapian::InvalidOperationError.
66 void init(double factor_
);
// Constructor.  Takes the postlist plus the real weight object, stats, qlen
// and wqf; the remaining parameter(s) and the body are not visible here.
69 LazyWeight(LeafPostList
* pl_
,
70 Xapian::Weight
* real_wt_
,
71 Xapian::Weight::Internal
* stats_
,
72 Xapian::termcount qlen_
,
73 Xapian::termcount wqf__
,
// Builds a description starting "LazyWeight(" followed by real_wt->name().
83 std::string
name() const;
// serialise() and unserialise(): the definitions in this file throw
// Xapian::InvalidOperationError.
85 std::string
serialise() const;
86 LazyWeight
* unserialise(const std::string
& serialised
) const;
// The definition in this file throws Xapian::InvalidOperationError.
88 double get_sumpart(Xapian::termcount wdf
,
89 Xapian::termcount doclen
,
90 Xapian::termcount uniqterms
) const;
// Forwards the stored values to pl->resolve_lazy_termweight() and returns
// that call's result.
91 double get_maxpart() const;
// get_sumextra() and get_maxextra(): the definitions in this file throw
// Xapian::InvalidOperationError.
93 double get_sumextra(Xapian::termcount doclen
,
94 Xapian::termcount uniqterms
) const;
95 double get_maxextra() const;
// Cloning a LazyWeight is not supported: throws
// Xapian::InvalidOperationError with the message "LazyWeight::clone()".
99 LazyWeight::clone() const
101 throw Xapian::InvalidOperationError("LazyWeight::clone()");
// Initialising a LazyWeight through init() is not supported: throws
// Xapian::InvalidOperationError with the message "LazyWeight::init()".
// factor_ is not used by the visible statement.
105 LazyWeight::init(double factor_
)
108 throw Xapian::InvalidOperationError("LazyWeight::init()");
// Builds a description string for this object: the literal "LazyWeight("
// followed by the name() of the wrapped real_wt.
// NOTE(review): the remainder of the function (presumably closing the
// parenthesis and returning desc) is not visible in this listing - confirm.
112 LazyWeight::name() const
114 string desc
= "LazyWeight(";
115 desc
+= real_wt
->name();
// Serialising a LazyWeight is not supported: throws
// Xapian::InvalidOperationError with the message "LazyWeight::serialise()".
121 LazyWeight::serialise() const
123 throw Xapian::InvalidOperationError("LazyWeight::serialise()");
// Unserialising into a LazyWeight is not supported: throws
// Xapian::InvalidOperationError with the message
// "LazyWeight::unserialise()".  The string argument is unnamed and unused.
127 LazyWeight::unserialise(const string
&) const
129 throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
// Not supported on a LazyWeight: throws Xapian::InvalidOperationError with
// the message "LazyWeight::get_sumpart()".  None of wdf, doclen or uniqterms
// is used by the visible statement.
133 LazyWeight::get_sumpart(Xapian::termcount wdf
,
134 Xapian::termcount doclen
,
135 Xapian::termcount uniqterms
) const
140 throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
// Not supported on a LazyWeight: throws Xapian::InvalidOperationError with
// the message "LazyWeight::get_sumextra()".  Neither doclen nor uniqterms is
// used by the visible statement.
144 LazyWeight::get_sumextra(Xapian::termcount doclen
,
145 Xapian::termcount uniqterms
) const
149 throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
// The one weighting method of LazyWeight with a non-throwing body in this
// file.  It passes the stored real_wt, stats, qlen, wqf and factor to
// pl->resolve_lazy_termweight() and returns whatever that call returns.
// NOTE(review): presumably resolve_lazy_termweight() is where the real weight
// object finally gets initialised - its definition is not in this listing, so
// confirm there.
153 LazyWeight::get_maxpart() const
155 // This gets called first for the case we care about.
156 return pl
->resolve_lazy_termweight(real_wt
, stats
, qlen
, wqf
, factor
);
// Not supported on a LazyWeight: throws Xapian::InvalidOperationError with
// the message "LazyWeight::get_maxextra()".
160 LazyWeight::get_maxextra() const
162 throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
// Builds the postlist tree for this sub-match's query.
//
// Visible steps:
//  - one path returns a freshly allocated EmptyPostList (the "MatchNothing"
//    case; the condition guarding it is not visible in this listing);
//  - otherwise a QueryOptimiser is constructed from *db, *this, matcher and
//    full_db_has_positions, and query.internal->postlist(&opt, 1.0) builds
//    the tree;
//  - the optimiser's get_total_subqs() value is stored through
//    total_subqs_ptr;
//  - a clone of the weighting scheme is initialised with the total stats and
//    qlen, and if its get_maxextra() is non-zero the tree is wrapped in an
//    ExtraWeightPostList.
//
// matcher is passed to both the QueryOptimiser and the ExtraWeightPostList.
// total_subqs_ptr is an output parameter and is dereferenced unconditionally
// on the tree-building path.
// NOTE(review): the declaration of `pl` and the final return are not visible
// in this listing - confirm against the full source.
166 LocalSubMatch::get_postlist(PostListTree
* matcher
,
167 Xapian::termcount
* total_subqs_ptr
)
169 LOGCALL(MATCH
, PostList
*, "LocalSubMatch::get_postlist", matcher
| total_subqs_ptr
);
172 RETURN(new EmptyPostList
); // MatchNothing
174 // Build the postlist tree for the query. This calls
175 // LocalSubMatch::open_post_list() for each term in the query.
178 QueryOptimiser
opt(*db
, *this, matcher
, full_db_has_positions
);
// 1.0 is the factor argument passed to postlist() for the query's root.
179 pl
= query
.internal
->postlist(&opt
, 1.0);
180 *total_subqs_ptr
= opt
.get_total_subqs();
// Held in a unique_ptr so the clone is freed if it turns out not to be
// needed (get_maxextra() == 0.0) or if a call below throws.
183 unique_ptr
<Xapian::Weight
> extra_wt(wt_factory
->clone());
184 // Only uses term-independent stats.
185 extra_wt
->init_(*total_stats
, qlen
);
186 if (extra_wt
->get_maxextra() != 0.0) {
187 // There's a term-independent weight contribution, so we combine the
188 // postlist tree with an ExtraWeightPostList which adds in this
// term-independent contribution.  release() hands the raw weight pointer to
// the ExtraWeightPostList constructor call.
190 pl
= new ExtraWeightPostList(pl
, db
, extra_wt
.release(), matcher
);
// Wraps or_pl in a SynonymPostList that carries its own weight object.
//
// Visible steps:
//  - or_pl->get_termfreq_max() == 0 is special-cased as "an EmptyPostList or
//    equivalent" (what that branch does is not visible in this listing);
//  - a SynonymPostList is created from or_pl and db, and the weighting scheme
//    is cloned;
//  - when total_stats->collection_size is non-zero, `freqs` is set from
//    or_pl->get_termfreq_est_using_stats(*total_stats);
//  - the cloned weight is initialised with the total stats, qlen, `factor`
//    and the termfreq / reltermfreq / collfreq members of `freqs`;
//  - the weight is attached with set_weight() and the SynonymPostList is
//    returned.
//
// Both objects are held in unique_ptr until they are handed over with
// release(), so they are freed if an intermediate call throws.
// NOTE(review): the declaration of `freqs` is not visible here; presumably
// it is default-constructed so the empty-database case uses zeros - confirm.
197 LocalSubMatch::make_synonym_postlist(PostList
* or_pl
, double factor
)
199 LOGCALL(MATCH
, PostList
*, "LocalSubMatch::make_synonym_postlist", or_pl
| factor
);
200 if (rare(or_pl
->get_termfreq_max() == 0)) {
201 // or_pl is an EmptyPostList or equivalent.
204 LOGVALUE(MATCH
, or_pl
->get_termfreq_est());
205 unique_ptr
<SynonymPostList
> res(new SynonymPostList(or_pl
, db
));
206 unique_ptr
<Xapian::Weight
> wt(wt_factory
->clone());
209 // Avoid calling get_termfreq_est_using_stats() if the database is empty
210 // so we don't need to special case that repeatedly when implementing it.
211 // FIXME: it would be nicer to handle an empty database higher up, though
212 // we need to catch the case where all the non-empty subdatabases have
213 // failed, so we can't just push this right up to the start of get_mset().
214 if (usual(total_stats
->collection_size
!= 0)) {
215 freqs
= or_pl
->get_termfreq_est_using_stats(*total_stats
);
// Initialise the cloned weight from the frequencies held in `freqs`.
217 wt
->init_(*total_stats
, qlen
, factor
,
218 freqs
.termfreq
, freqs
.reltermfreq
, freqs
.collfreq
);
// release() passes the raw weight pointer to set_weight().
220 res
->set_weight(wt
.release());
221 RETURN(res
.release());
// Opens the leaf postlist for `term` and attaches a weight object to it.
//
// Visible steps:
//  - `weighted` is true only for a non-empty term with a non-zero factor;
//  - for a non-empty term that needs no positions, when either the term is
//    neither weighted nor in a synonym, or the weighting scheme's
//    get_sumpart_needs_wdf_() is false, the term frequency is compared with
//    db->get_doccount() and, if equal, the postlist for the empty term
//    (string()) is opened instead;
//  - the term's postlist is opened either via the optimiser's hint postlist
//    (open_nearby_postlist) or directly with db->open_leaf_post_list(term),
//    and is then recorded as the new hint;
//  - the term's frequencies are added to total_stats->termfreqs if the term
//    is not already present there;
//  - a weight is cloned from wt_factory, then either initialised directly
//    (recording its get_maxpart() in total_stats) or wrapped in a LazyWeight,
//    and finally attached with pl->set_termweight().
//
// NOTE(review): the declarations of the factor, need_positions, in_synonym
// and lazy_weight parameters, several branch conditions (presumably tests of
// `hint`, `pl`, lazy_weight and weighted) and the end of the function are not
// visible in this listing - confirm against the full source.
225 LocalSubMatch::open_post_list(const string
& term
,
226 Xapian::termcount wqf
,
230 QueryOptimiser
* qopt
,
233 LOGCALL(MATCH
, PostList
*, "LocalSubMatch::open_post_list", term
| wqf
| factor
| need_positions
| qopt
| lazy_weight
);
// A weight is only wanted for a non-empty term with a non-zero factor.
235 bool weighted
= (factor
!= 0.0 && !term
.empty());
237 LeafPostList
* pl
= NULL
;
238 if (!term
.empty() && !need_positions
) {
239 if ((!weighted
&& !in_synonym
) ||
240 !wt_factory
->get_sumpart_needs_wdf_()) {
// Only the term frequency is requested; the collection-frequency output
// argument is NULL.
241 Xapian::doccount sub_tf
;
242 db
->get_freqs(term
, &sub_tf
, NULL
);
243 if (sub_tf
== db
->get_doccount()) {
244 // If we're not going to use the wdf or term positions, and the
245 // term indexes all documents, we can replace it with the
246 // MatchAll postlist, which is especially efficient if there
247 // are no gaps in the docids.
248 pl
= db
->open_leaf_post_list(string());
250 // Set the term name so the postlist looks up the correct term
251 // frequencies - this is necessary if the weighting scheme
252 // needs collection frequency or reltermfreq (termfreq would be
253 // correct anyway since it's just the collection size in this
// Fetch the hint postlist recorded on the optimiser (it is updated below via
// set_hint_postlist()).
261 const LeafPostList
* hint
= qopt
->get_hint_postlist();
// Two ways of opening the term's postlist are visible: through the hint's
// open_nearby_postlist(), or directly from the database.  The conditions
// choosing between them are not visible in this listing.
263 pl
= hint
->open_nearby_postlist(term
);
265 pl
= db
->open_leaf_post_list(term
);
267 qopt
->set_hint_postlist(pl
);
271 // Term came from a wildcard, but we may already have that term in the
272 // query anyway, so check before accumulating its TermFreqs.
273 map
<string
, TermFreqs
>::iterator i
= total_stats
->termfreqs
.find(term
);
274 if (i
== total_stats
->termfreqs
.end()) {
// Term not recorded yet: store the frequencies reported by db->get_freqs(),
// with 0 as the middle TermFreqs argument.
275 Xapian::doccount sub_tf
;
276 Xapian::termcount sub_cf
;
277 db
->get_freqs(term
, &sub_tf
, &sub_cf
);
278 total_stats
->termfreqs
.insert({term
, TermFreqs(sub_tf
, 0, sub_cf
)});
// Clone the weighting scheme for this term.
283 Xapian::Weight
* wt
= wt_factory
->clone();
// Initialise the clone for this term straight away and record its
// get_maxpart() in total_stats.
285 wt
->init_(*total_stats
, qlen
, term
, wqf
, factor
);
286 total_stats
->set_max_part(term
, wt
->get_maxpart());
288 // Delay initialising the actual weight object, so that we can
289 // gather stats for the terms lazily expanded from a wildcard
290 // (needed for the remote database case).
291 wt
= new LazyWeight(pl
, wt
, total_stats
, qlen
, wqf
, factor
);
// Attach the resulting weight object to the postlist.
293 pl
->set_termweight(wt
);