Fix misordered commands in source
[xapian.git] / xapian-core / matcher / nearpostlist.cc
blob14fd490e712cf414fe9c05af522d652fbb9c8076
1 /** @file nearpostlist.cc
2 * @brief Return docs containing terms within a specified window.
3 */
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017,2018 Olly Betts
5 * Copyright (C) 2007 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "nearpostlist.h"
26 #include "debuglog.h"
27 #include "backends/positionlist.h"
28 #include "heap.h"
29 #include "omassert.h"
30 #include "str.h"
32 #include <algorithm>
33 #include <vector>
35 using namespace std;
37 NearPostList::NearPostList(PostList *source_,
38 Xapian::termpos window_,
39 const vector<PostList*>::const_iterator &terms_begin,
40 const vector<PostList*>::const_iterator &terms_end,
41 PostListTree* pltree_)
42 : SelectPostList(source_, pltree_),
43 window(window_),
44 terms(terms_begin, terms_end)
46 size_t n = terms.size();
47 Assert(n > 1);
48 poslists = new PositionList*[n];
51 NearPostList::~NearPostList()
53 delete [] poslists;
56 struct TermCmp {
57 bool operator()(const PostList * a, const PostList * b) const {
58 return a->get_wdf() < b->get_wdf();
62 struct Cmp {
63 bool operator()(const PositionList * a, const PositionList * b) const {
64 return a->get_position() > b->get_position();
68 bool
69 NearPostList::test_doc()
71 LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
73 // Sort to put least frequent terms first, to try to minimise the number of
74 // position lists we need to read if there are no matches.
76 // We use wdf as a proxy for the length of the position lists, since we'd
77 // need to read each position list to find its length and we're trying to
78 // avoid having to read them all if we can.
79 sort(terms.begin(), terms.end(), TermCmp());
81 poslists[0] = terms[0]->read_position_list();
82 if (!poslists[0]->next())
83 RETURN(false);
85 Xapian::termpos last = poslists[0]->get_position();
86 PositionList ** end = poslists + 1;
88 while (true) {
89 if (last - poslists[0]->get_position() < window) {
90 if (size_t(end - poslists) != terms.size()) {
91 // We haven't started all the position lists yet, so start the
92 // next one.
93 PositionList * posl =
94 terms[end - poslists]->read_position_list();
95 if (last < window) {
96 if (!posl->next())
97 RETURN(false);
98 } else {
99 if (!posl->skip_to(last - window + 1))
100 RETURN(false);
102 Xapian::termpos pos = posl->get_position();
103 if (pos > last) last = pos;
104 *end++ = posl;
105 Heap::push(poslists, end, Cmp());
106 continue;
109 // We have all the terms within the specified window, but some may
110 // be at the same position (in particular if the same term is
111 // listed twice). So we work through the terms in ascending
112 // position order, and each time we find one with a duplicate
113 // position, we advance it - if that pushes us out of the window,
114 // we return to the outer loop, otherwise we reinsert it into the
115 // heap at its new position and continue to look for duplicates
116 // we need to adjust.
117 Xapian::termpos pos = poslists[0]->get_position();
118 Heap::pop(poslists, end, Cmp());
119 PositionList ** i = end - 1;
120 while (true) {
121 if (poslists[0]->get_position() == pos) {
122 if (!poslists[0]->next())
123 RETURN(false);
124 Xapian::termpos newpos = poslists[0]->get_position();
125 if (newpos - end[-1]->get_position() >= window) {
126 // No longer fits in the window.
127 last = newpos;
128 break;
130 Heap::replace(poslists, i, Cmp());
131 continue;
133 pos = poslists[0]->get_position();
134 Heap::pop(poslists, i, Cmp());
135 if (--i == poslists) {
136 Assert(pos - end[-1]->get_position() < window);
137 RETURN(true);
141 Heap::make(poslists, end, Cmp());
142 continue;
144 if (!poslists[0]->skip_to(last - window + 1))
145 break;
146 last = max(last, poslists[0]->get_position());
147 Heap::replace(poslists, end, Cmp());
150 RETURN(false);
153 Xapian::termcount
154 NearPostList::get_wdf() const
156 // Calculate an estimate for the wdf of a near postlist.
158 // The natural definition of the wdf for a near postlist is the number of
159 // times the terms occur in a group within the windowsize in the document -
160 // in other words, the number of times a routine like do_test() could find
161 // a match, if it kept going after finding the first one. However,
162 // calculating this would be expensive (in CPU time at least - the position
163 // lists are probably already read from disk), and the benefit is dubious
164 // (given that the estimate here is probably only going to be used for
165 // calculating the weight for synonym postlists, and it's not clear that
166 // the above definition is exactly appropriate in this situation, anyway).
168 // Instead, we could do an estimate of this value, based on the lengths
169 // of the position lists. Rather than having to open the position lists,
170 // we could use the wdfs, which will be the same value unless the wdfs have
171 // been artificially inflated - in which case we probably want to use the
172 // inflated value anyway (it will have been inflated to represent the fact
173 // that this term is more important than others, so we should make use of
174 // this hint).
176 // A reasonable way to calculate an estimate would be to consider the
177 // document to be split into W (overlapping) windows, each 1 term apart,
178 // and of the same length as the windowsize used for the phrase. Then,
179 // calculate the probability that each term is found in this window (as
180 // Prob = wdf / doclen), multiply these probabilities to get the
181 // probability that they're all found in the window, and then multiply by
182 // the windowsize again to get an estimate.
184 // However, this requires the doclen, which won't always be available (ie,
185 // if the weighting scheme doesn't use it, we won't have read it from
186 // disk).
188 // Rather than force an access to disk to get the doclen, we use an even
189 // rougher estimate: the minimum wdf in the sub-lists. This is actually
190 // the upper bound for the wdf (since the phrase can only occur the same
191 // number of times as the least frequent of its constituent terms).
193 // If this estimate proves to give bad results, we can revisit this and try
194 // a better approximation.
195 vector<PostList *>::const_iterator i = terms.begin();
196 Xapian::termcount wdf = (*i)->get_wdf();
197 while (++i != terms.end()) {
198 wdf = min(wdf, (*i)->get_wdf());
200 return wdf;
203 Xapian::doccount
204 NearPostList::get_termfreq_est() const
206 // It's hard to estimate how many times the postlist will match as it
207 // depends a lot on the terms and window, but usually it will occur
208 // significantly less often than the individual terms.
209 return pl->get_termfreq_est() / 2;
212 TermFreqs
213 NearPostList::get_termfreq_est_using_stats(
214 const Xapian::Weight::Internal & stats) const
216 LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
217 // No idea how to estimate this - do the same as get_termfreq_est() for
218 // now.
219 TermFreqs result(pl->get_termfreq_est_using_stats(stats));
220 result.termfreq /= 2;
221 result.reltermfreq /= 2;
222 RETURN(result);
225 string
226 NearPostList::get_description() const
228 string m = "(Near ";
229 m += str(window);
230 m += ' ';
231 m += pl->get_description();
232 m += ")";
233 return m;