xapian-core/api/mset.cc

   1 /** @file mset.cc
   2  * @brief Xapian::MSet class
   3  */
   4 /* Copyright (C) 2017 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "msetinternal.h"
  24 #include "xapian/mset.h"
  25
  26 #include "net/length.h"
  27 #include "net/serialise.h"
  28 #include "matcher/msetcmp.h"
  29 #include "roundestimate.h"
  30 #include "serialise-double.h"
  31 #include "str.h"
  32 #include "unicode/description_append.h"
  33
  34 #include <algorithm>
  35 #include <cfloat>
  36 #include <string>
  37
  38 using namespace std;
  39
  40 namespace Xapian {
  41
  42 MSet::MSet(const MSet& o) : internal(o.internal) {}
  43
  44 MSet&
  45 MSet::operator=(const MSet& o)
  46 {
  47     internal = o.internal;
  48     return *this;
  49 }
  50
  51 MSet::MSet() : internal(new MSet::Internal) {}
  52
  53 MSet::~MSet() {}
  54
  55 void
  56 MSet::fetch_(Xapian::doccount first, Xapian::doccount last) const
  57 {
  58     internal->fetch(first, last);
  59 }
  60
  61 void
  62 MSet::set_item_weight(Xapian::doccount i, double weight)
  63 {
  64     internal->set_item_weight(i, weight);
  65 }
  66
  67 void
  68 MSet::sort_by_relevance()
  69 {
  70     std::sort(internal->items.begin(), internal->items.end(),
  71               get_msetcmp_function(Enquire::Internal::REL, true, false));
  72 }
  73
  74 int
  75 MSet::convert_to_percent(double weight) const
  76 {
  77     return internal->convert_to_percent(weight);
  78 }
  79
  80 Xapian::doccount
  81 MSet::get_termfreq(const std::string& term) const
  82 {
  83     // Check the cached data for query terms first.
  84     Xapian::doccount termfreq;
  85     if (usual(internal->stats && internal->stats->get_stats(term, termfreq))) {
  86         return termfreq;
  87     }
  88
  89     if (rare(internal->enquire.get() == NULL)) {
  90         // Consistent with get_termfreq() on an empty database which always
  91         // returns 0.
  92         return 0;
  93     }
  94
  95     // Fall back to asking the database via enquire.
  96     return internal->enquire->get_termfreq(term);
  97 }
  98
  99 double
 100 MSet::get_termweight(const std::string& term) const
 101 {
 102     // A term not in the query has no termweight, so 0.0 makes sense as the
 103     // answer in such cases.
 104     double weight = 0.0;
 105     if (usual(internal->stats)) {
 106         (void)internal->stats->get_termweight(term, weight);
 107     }
 108     return weight;
 109 }
 110
 111 Xapian::doccount
 112 MSet::get_firstitem() const
 113 {
 114     return internal->first;
 115 }
 116
 117 Xapian::doccount
 118 MSet::get_matches_lower_bound() const
 119 {
 120     return internal->matches_lower_bound;
 121 }
 122
 123 Xapian::doccount
 124 MSet::get_matches_estimated() const
 125 {
 126     // Doing this here avoids calculating if the estimate is never looked at,
 127     // though does mean we recalculate if this method is called more than once.
 128     return round_estimate(internal->matches_lower_bound,
 129                           internal->matches_upper_bound,
 130                           internal->matches_estimated);
 131 }
 132
 133 Xapian::doccount
 134 MSet::get_matches_upper_bound() const
 135 {
 136     return internal->matches_upper_bound;
 137 }
 138
 139 Xapian::doccount
 140 MSet::get_uncollapsed_matches_lower_bound() const
 141 {
 142     return internal->uncollapsed_lower_bound;
 143 }
 144
 145 Xapian::doccount
 146 MSet::get_uncollapsed_matches_estimated() const
 147 {
 148     // Doing this here avoids calculating if the estimate is never looked at,
 149     // though does mean we recalculate if this method is called more than once.
 150     return round_estimate(internal->uncollapsed_lower_bound,
 151                           internal->uncollapsed_upper_bound,
 152                           internal->uncollapsed_estimated);
 153 }
 154
 155 Xapian::doccount
 156 MSet::get_uncollapsed_matches_upper_bound() const
 157 {
 158     return internal->uncollapsed_upper_bound;
 159 }
 160
 161 double
 162 MSet::get_max_attained() const
 163 {
 164     return internal->max_attained;
 165 }
 166
 167 double
 168 MSet::get_max_possible() const
 169 {
 170     return internal->max_possible;
 171 }
 172
 173 Xapian::doccount
 174 MSet::size() const
 175 {
 176     Assert(internal.get());
 177     return internal->items.size();
 178 }
 179
 180 std::string
 181 MSet::snippet(const std::string& text,
 182               size_t length,
 183               const Xapian::Stem& stemmer,
 184               unsigned flags,
 185               const std::string& hi_start,
 186               const std::string& hi_end,
 187               const std::string& omit) const
 188 {
 189     // The actual implementation is in queryparser/termgenerator_internal.cc.
 190     return internal->snippet(text, length, stemmer, flags,
 191                              hi_start, hi_end, omit);
 192 }
 193
 194 std::string
 195 MSet::get_description() const
 196 {
 197     return internal->get_description();
 198 }
 199
 200 Document
 201 MSet::Internal::get_document(Xapian::doccount index) const
 202 {
 203     if (index >= items.size()) {
 204         string msg = "Requested index ";
 205         msg += str(index);
 206         msg += " in MSet of size ";
 207         msg += str(items.size());
 208         throw Xapian::RangeError(msg);
 209     }
 210     Assert(enquire.get());
 211     return enquire->get_document(items[index].get_docid());
 212 }
 213
 214 void
 215 MSet::Internal::fetch(Xapian::doccount first_, Xapian::doccount last) const
 216 {
 217     if (items.empty() || enquire.get() == NULL) {
 218         return;
 219     }
 220     if (last > items.size() - 1) {
 221         last = items.size() - 1;
 222     }
 223     if (first_ <= last) {
 224         Xapian::doccount n = last - first_;
 225         for (Xapian::doccount i = 0; i <= n; ++i) {
 226             enquire->request_document(items[i].get_docid());
 227         }
 228     }
 229 }
 230
 231 void
 232 MSet::Internal::set_item_weight(Xapian::doccount i, double weight)
 233 {
 234     // max_attained is updated assuming that set_item_weight is called on every
 235     // MSet item from 0 up. While assigning new weights max_attained is updated
 236     // as the maximum of the new weights set till Xapian::doccount i.
 237     if (i == 0)
 238         max_attained = weight;
 239     else
 240         max_attained = max(max_attained, weight);
 241     // Ideally the max_possible should be the maximum possible weight that
 242     // can be assigned by the reranking algorithm, but since it is not always
 243     // possible to calculate the max possible weight for a reranking algorithm
 244     // we use this approach.
 245     max_possible = max(max_possible, max_attained);
 246     items[i].set_weight(weight);
 247 }
 248
 249 int
 250 MSet::Internal::convert_to_percent(double weight) const
 251 {
 252     int percent;
 253     if (percent_scale_factor == 0.0) {
 254         // For an unweighted search, give all matches 100%.
 255         percent = 100;
 256     } else if (weight <= 0.0) {
 257         // Some weighting schemes can return zero relevance while matching,
 258         // so give such matches 0%.
 259         percent = 0;
 260     } else {
 261         // Adding on 100 * DBL_EPSILON was a hack to work around excess
 262         // precision (e.g. on x86 when not using SSE), but this code seems like
 263         // it's generally asking for problems with floating point rounding
 264         // issues - maybe we ought to carry through the matching and total
 265         // number of subqueries and calculate using those instead.
 266         //
 267         // There are corresponding hacks in matcher/multimatch.cc.
 268         percent = int(weight * percent_scale_factor + 100.0 * DBL_EPSILON);
 269         if (percent <= 0) {
 270             // Make any non-zero weight give a non-zero percentage.
 271             percent = 1;
 272         } else if (percent > 100) {
 273             // Make sure we don't ever exceed 100%.
 274             percent = 100;
 275         }
 276         // FIXME: Ideally we should also make sure any non-exact match gives
 277         // < 100%.
 278     }
 279     return percent;
 280 }
 281
 282 string
 283 MSet::Internal::serialise() const
 284 {
 285     string result;
 286
 287     result += encode_length(first);
 288     // Send back the raw matches_* values.  MSet::get_matches_estimated()
 289     // rounds the estimate lazily, but MSetPostList::get_termfreq_est()
 290     // returns the estimate, and the raw estimate is better for that.
 291     //
 292     // It is also cleaner that a round-trip through serialisation gives you an
 293     // object which is as close to the original as possible.
 294     result += encode_length(matches_lower_bound);
 295     result += encode_length(matches_estimated);
 296     result += encode_length(matches_upper_bound);
 297     result += encode_length(uncollapsed_lower_bound);
 298     result += encode_length(uncollapsed_estimated);
 299     result += encode_length(uncollapsed_upper_bound);
 300     result += serialise_double(max_possible);
 301     result += serialise_double(max_attained);
 302
 303     result += serialise_double(percent_scale_factor);
 304
 305     result += encode_length(items.size());
 306     for (auto&& item : items) {
 307         result += serialise_double(item.get_weight());
 308         result += encode_length(item.get_docid());
 309         result += encode_length(item.get_sort_key().size());
 310         result += item.get_sort_key();
 311         result += encode_length(item.get_collapse_key().size());
 312         result += item.get_collapse_key();
 313         result += encode_length(item.get_collapse_count());
 314     }
 315
 316     if (stats)
 317         result += serialise_stats(*stats);
 318
 319     return result;
 320 }
 321
 322 void
 323 MSet::Internal::unserialise(const char * p, const char * p_end)
 324 {
 325     items.clear();
 326
 327     decode_length(&p, p_end, first);
 328     decode_length(&p, p_end, matches_lower_bound);
 329     decode_length(&p, p_end, matches_estimated);
 330     decode_length(&p, p_end, matches_upper_bound);
 331     decode_length(&p, p_end, uncollapsed_lower_bound);
 332     decode_length(&p, p_end, uncollapsed_estimated);
 333     decode_length(&p, p_end, uncollapsed_upper_bound);
 334     max_possible = unserialise_double(&p, p_end);
 335     max_attained = unserialise_double(&p, p_end);
 336
 337     percent_scale_factor = unserialise_double(&p, p_end);
 338
 339     size_t msize;
 340     decode_length(&p, p_end, msize);
 341     while (msize-- > 0) {
 342         double wt = unserialise_double(&p, p_end);
 343         Xapian::docid did;
 344         decode_length(&p, p_end, did);
 345         size_t len;
 346         decode_length_and_check(&p, p_end, len);
 347         string sort_key(p, len);
 348         p += len;
 349         decode_length_and_check(&p, p_end, len);
 350         string key(p, len);
 351         p += len;
 352         Xapian::doccount collapse_cnt;
 353         decode_length(&p, p_end, collapse_cnt);
 354         items.emplace_back(wt, did, std::move(key), collapse_cnt,
 355                            std::move(sort_key));
 356     }
 357
 358     if (p != p_end) {
 359         stats.reset(new Xapian::Weight::Internal());
 360         unserialise_stats(string(p, p_end - p), *stats);
 361     }
 362 }
 363
 364 string
 365 MSet::Internal::get_description() const
 366 {
 367     string desc = "MSet(matches_lower_bound=";
 368     desc += str(matches_lower_bound);
 369     desc += ", matches_estimated=";
 370     desc += str(matches_estimated);
 371     desc += ", matches_upper_bound=";
 372     desc += str(matches_upper_bound);
 373     if (uncollapsed_lower_bound != matches_lower_bound) {
 374         desc += ", uncollapsed_lower_bound=";
 375         desc += str(uncollapsed_lower_bound);
 376     }
 377     if (uncollapsed_estimated != matches_estimated) {
 378         desc += ", uncollapsed_estimated=";
 379         desc += str(uncollapsed_estimated);
 380     }
 381     if (uncollapsed_upper_bound != matches_upper_bound) {
 382         desc += ", uncollapsed_upper_bound=";
 383         desc += str(uncollapsed_upper_bound);
 384     }
 385     if (first != 0) {
 386         desc += ", first=";
 387         desc += str(first);
 388     }
 389     if (max_possible > 0) {
 390         desc += ", max_possible=";
 391         desc += str(max_possible);
 392     }
 393     if (max_attained > 0) {
 394         desc += ", max_attained=";
 395         desc += str(max_attained);
 396     }
 397     desc += ", [";
 398     bool comma = false;
 399     for (auto&& item : items) {
 400         if (comma) {
 401             desc += ", ";
 402         } else {
 403             comma = true;
 404         }
 405         desc += item.get_description();
 406     }
 407     desc += "])";
 408     return desc;
 409 }
 410
 411 }