/** @file nearpostlist.cc
 * @brief Return docs containing terms within a specified window.
 */
/* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017,2018 Olly Betts
 * Copyright (C) 2007 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
24 #include "nearpostlist.h"
27 #include "backends/positionlist.h"
37 NearPostList::NearPostList(PostList
*source_
,
38 Xapian::termpos window_
,
39 const vector
<PostList
*>::const_iterator
&terms_begin
,
40 const vector
<PostList
*>::const_iterator
&terms_end
,
41 PostListTree
* pltree_
)
42 : SelectPostList(source_
, pltree_
),
44 terms(terms_begin
, terms_end
)
46 size_t n
= terms
.size();
48 poslists
= new PositionList
*[n
];
51 NearPostList::~NearPostList()
57 bool operator()(const PostList
* a
, const PostList
* b
) const {
58 return a
->get_wdf() < b
->get_wdf();
63 bool operator()(const PositionList
* a
, const PositionList
* b
) const {
64 return a
->get_position() > b
->get_position();
69 NearPostList::test_doc()
71 LOGCALL(MATCH
, bool, "NearPostList::test_doc", NO_ARGS
);
73 // Sort to put least frequent terms first, to try to minimise the number of
74 // position lists we need to read if there are no matches.
76 // We use wdf as a proxy for the length of the position lists, since we'd
77 // need to read each position list to find its length and we're trying to
78 // avoid having to read them all if we can.
79 sort(terms
.begin(), terms
.end(), TermCmp());
81 poslists
[0] = terms
[0]->read_position_list();
82 if (!poslists
[0]->next())
85 Xapian::termpos last
= poslists
[0]->get_position();
86 PositionList
** end
= poslists
+ 1;
89 if (last
- poslists
[0]->get_position() < window
) {
90 if (size_t(end
- poslists
) != terms
.size()) {
91 // We haven't started all the position lists yet, so start the
94 terms
[end
- poslists
]->read_position_list();
99 if (!posl
->skip_to(last
- window
+ 1))
102 Xapian::termpos pos
= posl
->get_position();
103 if (pos
> last
) last
= pos
;
105 Heap::push(poslists
, end
, Cmp());
109 // We have all the terms within the specified window, but some may
110 // be at the same position (in particular if the same term is
111 // listed twice). So we work through the terms in ascending
112 // position order, and each time we find one with a duplicate
113 // position, we advance it - if that pushes us out of the window,
114 // we return to the outer loop, otherwise we reinsert it into the
115 // heap at its new position and continue to look for duplicates
116 // we need to adjust.
117 Xapian::termpos pos
= poslists
[0]->get_position();
118 Heap::pop(poslists
, end
, Cmp());
119 PositionList
** i
= end
- 1;
121 if (poslists
[0]->get_position() == pos
) {
122 if (!poslists
[0]->next())
124 Xapian::termpos newpos
= poslists
[0]->get_position();
125 if (newpos
- end
[-1]->get_position() >= window
) {
126 // No longer fits in the window.
130 Heap::replace(poslists
, i
, Cmp());
133 pos
= poslists
[0]->get_position();
134 Heap::pop(poslists
, i
, Cmp());
135 if (--i
== poslists
) {
136 Assert(pos
- end
[-1]->get_position() < window
);
141 Heap::make(poslists
, end
, Cmp());
144 if (!poslists
[0]->skip_to(last
- window
+ 1))
146 last
= max(last
, poslists
[0]->get_position());
147 Heap::replace(poslists
, end
, Cmp());
154 NearPostList::get_wdf() const
156 // Calculate an estimate for the wdf of a near postlist.
158 // The natural definition of the wdf for a near postlist is the number of
159 // times the terms occur in a group within the windowsize in the document -
160 // in other words, the number of times a routine like do_test() could find
161 // a match, if it kept going after finding the first one. However,
162 // calculating this would be expensive (in CPU time at least - the position
163 // lists are probably already read from disk), and the benefit is dubious
164 // (given that the estimate here is probably only going to be used for
165 // calculating the weight for synonym postlists, and it's not clear that
166 // the above definition is exactly appropriate in this situation, anyway).
168 // Instead, we could do an estimate of this value, based on the lengths
169 // of the position lists. Rather than having to open the position lists,
170 // we could use the wdfs, which will be the same value unless the wdfs have
171 // been artificially inflated - in which case we probably want to use the
172 // inflated value anyway (it will have been inflated to represent the fact
173 // that this term is more important than others, so we should make use of
176 // A reasonable way to calculate an estimate would be to consider the
177 // document to be split into W (overlapping) windows, each 1 term apart,
178 // and of the same length as the windowsize used for the phrase. Then,
179 // calculate the probability that each term is found in this window (as
180 // Prob = wdf / doclen), multiply these probabilities to get the
181 // probability that they're all found in the window, and then multiply by
182 // the windowsize again to get an estimate.
184 // However, this requires the doclen, which won't always be available (ie,
185 // if the weighting scheme doesn't use it, we won't have read it from
188 // Rather than force an access to disk to get the doclen, we use an even
189 // rougher estimate: the minimum wdf in the sub-lists. This is actually
190 // the upper bound for the wdf (since the phrase can only occur the same
191 // number of times as the least frequent of its constituent terms).
193 // If this estimate proves to give bad results, we can revisit this and try
194 // a better approximation.
195 vector
<PostList
*>::const_iterator i
= terms
.begin();
196 Xapian::termcount wdf
= (*i
)->get_wdf();
197 while (++i
!= terms
.end()) {
198 wdf
= min(wdf
, (*i
)->get_wdf());
204 NearPostList::get_termfreq_est() const
206 // It's hard to estimate how many times the postlist will match as it
207 // depends a lot on the terms and window, but usually it will occur
208 // significantly less often than the individual terms.
209 return pl
->get_termfreq_est() / 2;
213 NearPostList::get_termfreq_est_using_stats(
214 const Xapian::Weight::Internal
& stats
) const
216 LOGCALL(MATCH
, TermFreqs
, "NearPostList::get_termfreq_est_using_stats", stats
);
217 // No idea how to estimate this - do the same as get_termfreq_est() for
219 TermFreqs
result(pl
->get_termfreq_est_using_stats(stats
));
220 result
.termfreq
/= 2;
221 result
.reltermfreq
/= 2;
226 NearPostList::get_description() const
231 m
+= pl
->get_description();