/** @file nearpostlist.cc
 * @brief Return docs containing terms within a specified window.
 */
/* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017,2018 Olly Betts
 * Copyright (C) 2007 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
24 #include "nearpostlist.h"
27 #include "backends/positionlist.h"
37 NearPostList::NearPostList(PostList
*source_
,
38 Xapian::termpos window_
,
39 const vector
<PostList
*>::const_iterator
&terms_begin
,
40 const vector
<PostList
*>::const_iterator
&terms_end
,
41 PostListTree
* pltree_
)
42 : SelectPostList(source_
, pltree_
),
44 terms(terms_begin
, terms_end
)
46 size_t n
= terms
.size();
48 poslists
= new PositionList
*[n
];
51 NearPostList::~NearPostList()
57 bool operator()(const PostList
* a
, const PostList
* b
) const {
58 return a
->get_wdf() < b
->get_wdf();
63 bool operator()(const PositionList
* a
, const PositionList
* b
) const {
64 return a
->get_position() > b
->get_position();
69 NearPostList::test_doc()
71 LOGCALL(MATCH
, bool, "NearPostList::test_doc", NO_ARGS
);
73 // Sort to put least frequent terms first, to try to minimise the number of
74 // position lists we need to read if there are no matches.
76 // We use wdf as a proxy for the length of the position lists, since we'd
77 // need to read each position list to find its length and we're trying to
78 // avoid having to read them all if we can.
79 sort(terms
.begin(), terms
.end(), TermCmp());
81 poslists
[0] = terms
[0]->read_position_list();
82 if (!poslists
[0]->next())
85 Xapian::termpos last
= poslists
[0]->get_position();
86 PositionList
** end
= poslists
+ 1;
89 if (last
- poslists
[0]->get_position() < window
) {
90 if (size_t(end
- poslists
) != terms
.size()) {
91 // We haven't started all the position lists yet, so start the
94 terms
[end
- poslists
]->read_position_list();
99 if (!posl
->skip_to(last
- window
+ 1))
102 Xapian::termpos pos
= posl
->get_position();
103 if (pos
> last
) last
= pos
;
105 Heap::push(poslists
, end
, Cmp());
109 // We have all the terms within the specified window, but some may
110 // be at the same position (in particular if the same term is
111 // listed twice). So we work through the terms in ascending
112 // position order, and each time we find one with a duplicate
113 // position, we advance it - if that pushes us out of the window,
114 // we return to the outer loop, otherwise we reinsert it into the
115 // heap at its new position and continue to look for duplicates
116 // we need to adjust.
117 Xapian::termpos pos
= poslists
[0]->get_position();
118 Heap::pop(poslists
, end
, Cmp());
119 PositionList
** i
= end
- 1;
121 if (poslists
[0]->get_position() == pos
) {
122 if (!poslists
[0]->next())
124 Xapian::termpos newpos
= poslists
[0]->get_position();
125 if (newpos
- end
[-1]->get_position() >= window
) {
126 // No longer fits in the window.
130 Heap::replace(poslists
, i
, Cmp());
133 pos
= poslists
[0]->get_position();
134 Heap::pop(poslists
, i
, Cmp());
135 if (--i
== poslists
) {
136 Assert(pos
- end
[-1]->get_position() < window
);
141 Heap::make(poslists
, end
, Cmp());
144 if (!poslists
[0]->skip_to(last
- window
+ 1))
146 last
= max(last
, poslists
[0]->get_position());
147 Heap::replace(poslists
, end
, Cmp());
154 NearPostList::get_wdf() const
156 // Calculate an estimate for the wdf of a near postlist.
158 // The natural definition of the wdf for a near postlist is the number of
159 // times the terms occur in a group within the windowsize in the document -
160 // in other words, the number of times a routine like do_test() could find
161 // a match, if it kept going after finding the first one. However,
162 // calculating this would be expensive (in CPU time at least - the position
163 // lists are probably already read from disk), and the benefit is dubious
164 // (given that the estimate here is probably only going to be used for
165 // calculating the weight for synonym postlists, and it's not clear that
166 // the above definition is exactly appropriate in this situation, anyway).
168 // Instead, we could do an estimate of this value, based on the lengths
169 // of the position lists. Rather than having to open the position lists,
170 // we could use the wdfs, which will be the same value unless the wdfs have
171 // been artificially inflated - in which case we probably want to use the
172 // inflated value anyway (it will have been inflated to represent the fact
173 // that this term is more important than others, so we should make use of
176 // A reasonable way to calculate an estimate would be to consider the
177 // document to be split into W (overlapping) windows, each 1 term apart,
178 // and of the same length as the windowsize used for the phrase. Then,
179 // calculate the probability that each term is found in this window (as
180 // Prob = wdf / doclen), multiply these probabilities to get the
181 // probability that they're all found in the window, and then multiply by
182 // the windowsize again to get an estimate.
184 // However, this requires the doclen, which won't always be available (ie,
185 // if the weighting scheme doesn't use it, we won't have read it from
188 // Rather than force an access to disk to get the doclen, we use an even
189 // rougher estimate: the minimum wdf in the sub-lists. This is actually
190 // the upper bound for the wdf (since the phrase can only occur the same
191 // number of times as the least frequent of its constituent terms).
193 // If this estimate proves to give bad results, we can revisit this and try
194 // a better approximation.
195 vector
<PostList
*>::const_iterator i
= terms
.begin();
196 Xapian::termcount wdf
= (*i
)->get_wdf();
197 while (++i
!= terms
.end()) {
198 wdf
= min(wdf
, (*i
)->get_wdf());
204 NearPostList::get_termfreq_est() const
206 // It's hard to estimate how many times the postlist will match as it
207 // depends a lot on the terms and window, but usually it will occur
208 // significantly less often than the individual terms.
209 return pl
->get_termfreq_est() / 2;
213 NearPostList::get_termfreq_est_using_stats(
214 const Xapian::Weight::Internal
& stats
) const
216 LOGCALL(MATCH
, TermFreqs
, "NearPostList::get_termfreq_est_using_stats", stats
);
217 // No idea how to estimate this - do the same as get_termfreq_est() for
219 TermFreqs
result(pl
->get_termfreq_est_using_stats(stats
));
220 result
.termfreq
/= 2;
221 result
.reltermfreq
/= 2;
226 NearPostList::get_description() const
231 m
+= pl
->get_description();