xapian-core/api/mset.cc

   1 /** @file mset.cc
   2  * @brief Xapian::MSet class
   3  */
   4 /* Copyright (C) 2017 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "msetinternal.h"
  24 #include "xapian/mset.h"
  25
  26 #include "net/length.h"
  27 #include "net/serialise.h"
  28 #include "matcher/msetcmp.h"
  29 #include "roundestimate.h"
  30 #include "serialise-double.h"
  31 #include "str.h"
  32 #include "unicode/description_append.h"
  33
  34 #include <algorithm>
  35 #include <cfloat>
  36 #include <string>
  37
  38 using namespace std;
  39
  40 namespace Xapian {
  41
  42 MSet::MSet(const MSet& o) : internal(o.internal) {}
  43
  44 MSet&
  45 MSet::operator=(const MSet& o)
  46 {
  47     internal = o.internal;
  48     return *this;
  49 }
  50
  51 MSet::MSet() : internal(new MSet::Internal) {}
  52
  53 MSet::MSet(Internal* internal_) : internal(internal_) {}
  54
  55 MSet::~MSet() {}
  56
  57 void
  58 MSet::fetch_(Xapian::doccount first, Xapian::doccount last) const
  59 {
  60     internal->fetch(first, last);
  61 }
  62
  63 void
  64 MSet::set_item_weight(Xapian::doccount i, double weight)
  65 {
  66     internal->set_item_weight(i, weight);
  67 }
  68
  69 void
  70 MSet::sort_by_relevance()
  71 {
  72     std::sort(internal->items.begin(), internal->items.end(),
  73               get_msetcmp_function(Enquire::Internal::REL, true, false));
  74 }
  75
  76 int
  77 MSet::convert_to_percent(double weight) const
  78 {
  79     return internal->convert_to_percent(weight);
  80 }
  81
  82 Xapian::doccount
  83 MSet::get_termfreq(const std::string& term) const
  84 {
  85     // Check the cached data for query terms first.
  86     Xapian::doccount termfreq;
  87     if (usual(internal->stats && internal->stats->get_stats(term, termfreq))) {
  88         return termfreq;
  89     }
  90
  91     if (rare(internal->enquire.get() == NULL)) {
  92         // Consistent with get_termfreq() on an empty database which always
  93         // returns 0.
  94         return 0;
  95     }
  96
  97     // Fall back to asking the database via enquire.
  98     return internal->enquire->get_termfreq(term);
  99 }
 100
 101 double
 102 MSet::get_termweight(const std::string& term) const
 103 {
 104     // A term not in the query has no termweight, so 0.0 makes sense as the
 105     // answer in such cases.
 106     double weight = 0.0;
 107     if (usual(internal->stats)) {
 108         (void)internal->stats->get_termweight(term, weight);
 109     }
 110     return weight;
 111 }
 112
 113 Xapian::doccount
 114 MSet::get_firstitem() const
 115 {
 116     return internal->first;
 117 }
 118
 119 Xapian::doccount
 120 MSet::get_matches_lower_bound() const
 121 {
 122     return internal->matches_lower_bound;
 123 }
 124
 125 Xapian::doccount
 126 MSet::get_matches_estimated() const
 127 {
 128     // Doing this here avoids calculating if the estimate is never looked at,
 129     // though does mean we recalculate if this method is called more than once.
 130     return round_estimate(internal->matches_lower_bound,
 131                           internal->matches_upper_bound,
 132                           internal->matches_estimated);
 133 }
 134
 135 Xapian::doccount
 136 MSet::get_matches_upper_bound() const
 137 {
 138     return internal->matches_upper_bound;
 139 }
 140
 141 Xapian::doccount
 142 MSet::get_uncollapsed_matches_lower_bound() const
 143 {
 144     return internal->uncollapsed_lower_bound;
 145 }
 146
 147 Xapian::doccount
 148 MSet::get_uncollapsed_matches_estimated() const
 149 {
 150     // Doing this here avoids calculating if the estimate is never looked at,
 151     // though does mean we recalculate if this method is called more than once.
 152     return round_estimate(internal->uncollapsed_lower_bound,
 153                           internal->uncollapsed_upper_bound,
 154                           internal->uncollapsed_estimated);
 155 }
 156
 157 Xapian::doccount
 158 MSet::get_uncollapsed_matches_upper_bound() const
 159 {
 160     return internal->uncollapsed_upper_bound;
 161 }
 162
 163 double
 164 MSet::get_max_attained() const
 165 {
 166     return internal->max_attained;
 167 }
 168
 169 double
 170 MSet::get_max_possible() const
 171 {
 172     return internal->max_possible;
 173 }
 174
 175 Xapian::doccount
 176 MSet::size() const
 177 {
 178     Assert(internal.get());
 179     return internal->items.size();
 180 }
 181
 182 std::string
 183 MSet::snippet(const std::string& text,
 184               size_t length,
 185               const Xapian::Stem& stemmer,
 186               unsigned flags,
 187               const std::string& hi_start,
 188               const std::string& hi_end,
 189               const std::string& omit) const
 190 {
 191     // The actual implementation is in queryparser/termgenerator_internal.cc.
 192     return internal->snippet(text, length, stemmer, flags,
 193                              hi_start, hi_end, omit);
 194 }
 195
 196 std::string
 197 MSet::get_description() const
 198 {
 199     return internal->get_description();
 200 }
 201
 202 Document
 203 MSet::Internal::get_document(Xapian::doccount index) const
 204 {
 205     if (index >= items.size()) {
 206         string msg = "Requested index ";
 207         msg += str(index);
 208         msg += " in MSet of size ";
 209         msg += str(items.size());
 210         throw Xapian::RangeError(msg);
 211     }
 212     Assert(enquire.get());
 213     return enquire->get_document(items[index].get_docid());
 214 }
 215
 216 void
 217 MSet::Internal::fetch(Xapian::doccount first_, Xapian::doccount last) const
 218 {
 219     if (items.empty() || enquire.get() == NULL) {
 220         return;
 221     }
 222     if (last > items.size() - 1) {
 223         last = items.size() - 1;
 224     }
 225     if (first_ <= last) {
 226         Xapian::doccount n = last - first_;
 227         for (Xapian::doccount i = 0; i <= n; ++i) {
 228             enquire->request_document(items[i].get_docid());
 229         }
 230     }
 231 }
 232
 233 void
 234 MSet::Internal::set_item_weight(Xapian::doccount i, double weight)
 235 {
 236     // max_attained is updated assuming that set_item_weight is called on every
 237     // MSet item from 0 up. While assigning new weights max_attained is updated
 238     // as the maximum of the new weights set till Xapian::doccount i.
 239     if (i == 0)
 240         max_attained = weight;
 241     else
 242         max_attained = max(max_attained, weight);
 243     // Ideally the max_possible should be the maximum possible weight that
 244     // can be assigned by the reranking algorithm, but since it is not always
 245     // possible to calculate the max possible weight for a reranking algorithm
 246     // we use this approach.
 247     max_possible = max(max_possible, max_attained);
 248     items[i].set_weight(weight);
 249 }
 250
 251 int
 252 MSet::Internal::convert_to_percent(double weight) const
 253 {
 254     int percent;
 255     if (percent_scale_factor == 0.0) {
 256         // For an unweighted search, give all matches 100%.
 257         percent = 100;
 258     } else if (weight <= 0.0) {
 259         // Some weighting schemes can return zero relevance while matching,
 260         // so give such matches 0%.
 261         percent = 0;
 262     } else {
 263         // Adding on 100 * DBL_EPSILON was a hack to work around excess
 264         // precision (e.g. on x86 when not using SSE), but this code seems like
 265         // it's generally asking for problems with floating point rounding
 266         // issues - maybe we ought to carry through the matching and total
 267         // number of subqueries and calculate using those instead.
 268         //
 269         // There are corresponding hacks in matcher/matcher.cc.
 270         percent = int(weight * percent_scale_factor + 100.0 * DBL_EPSILON);
 271         if (percent <= 0) {
 272             // Make any non-zero weight give a non-zero percentage.
 273             percent = 1;
 274         } else if (percent > 100) {
 275             // Make sure we don't ever exceed 100%.
 276             percent = 100;
 277         }
 278         // FIXME: Ideally we should also make sure any non-exact match gives
 279         // < 100%.
 280     }
 281     return percent;
 282 }
 283
 284 void
 285 MSet::Internal::unshard_docids(Xapian::doccount shard,
 286                                Xapian::doccount n_shards)
 287 {
 288     for (auto& result : items) {
 289         result.unshard_docid(shard, n_shards);
 290     }
 291 }
 292
 293 void
 294 MSet::Internal::merge_stats(const Internal* o)
 295 {
 296     if (snippet_bg_relevance.empty()) {
 297         snippet_bg_relevance = o->snippet_bg_relevance;
 298     } else {
 299         Assert(snippet_bg_relevance == o->snippet_bg_relevance);
 300     }
 301     matches_lower_bound += o->matches_lower_bound;
 302     matches_estimated += o->matches_estimated;
 303     matches_upper_bound += o->matches_upper_bound;
 304     uncollapsed_lower_bound += o->uncollapsed_lower_bound;
 305     uncollapsed_estimated += o->uncollapsed_estimated;
 306     uncollapsed_upper_bound += o->uncollapsed_upper_bound;
 307     max_possible = max(max_possible, o->max_possible);
 308     if (o->max_attained > max_attained) {
 309         max_attained = o->max_attained;
 310         percent_scale_factor = o->percent_scale_factor;
 311     }
 312 }
 313
 314 string
 315 MSet::Internal::serialise() const
 316 {
 317     string result;
 318
 319     result += encode_length(first);
 320     // Send back the raw matches_* values.  MSet::get_matches_estimated()
 321     // rounds the estimate lazily, but when we merge MSet objects we really
 322     // want to merge based on the raw estimates.
 323     //
 324     // It is also cleaner that a round-trip through serialisation gives you an
 325     // object which is as close to the original as possible.
 326     result += encode_length(matches_lower_bound);
 327     result += encode_length(matches_estimated);
 328     result += encode_length(matches_upper_bound);
 329     result += encode_length(uncollapsed_lower_bound);
 330     result += encode_length(uncollapsed_estimated);
 331     result += encode_length(uncollapsed_upper_bound);
 332     result += serialise_double(max_possible);
 333     result += serialise_double(max_attained);
 334
 335     result += serialise_double(percent_scale_factor);
 336
 337     result += encode_length(items.size());
 338     for (auto&& item : items) {
 339         result += serialise_double(item.get_weight());
 340         result += encode_length(item.get_docid());
 341         result += encode_length(item.get_sort_key().size());
 342         result += item.get_sort_key();
 343         result += encode_length(item.get_collapse_key().size());
 344         result += item.get_collapse_key();
 345         result += encode_length(item.get_collapse_count());
 346     }
 347
 348     if (stats)
 349         result += serialise_stats(*stats);
 350
 351     return result;
 352 }
 353
 354 void
 355 MSet::Internal::unserialise(const char * p, const char * p_end)
 356 {
 357     items.clear();
 358
 359     decode_length(&p, p_end, first);
 360     decode_length(&p, p_end, matches_lower_bound);
 361     decode_length(&p, p_end, matches_estimated);
 362     decode_length(&p, p_end, matches_upper_bound);
 363     decode_length(&p, p_end, uncollapsed_lower_bound);
 364     decode_length(&p, p_end, uncollapsed_estimated);
 365     decode_length(&p, p_end, uncollapsed_upper_bound);
 366     max_possible = unserialise_double(&p, p_end);
 367     max_attained = unserialise_double(&p, p_end);
 368
 369     percent_scale_factor = unserialise_double(&p, p_end);
 370
 371     size_t msize;
 372     decode_length(&p, p_end, msize);
 373     while (msize-- > 0) {
 374         double wt = unserialise_double(&p, p_end);
 375         Xapian::docid did;
 376         decode_length(&p, p_end, did);
 377         size_t len;
 378         decode_length_and_check(&p, p_end, len);
 379         string sort_key(p, len);
 380         p += len;
 381         decode_length_and_check(&p, p_end, len);
 382         string key(p, len);
 383         p += len;
 384         Xapian::doccount collapse_cnt;
 385         decode_length(&p, p_end, collapse_cnt);
 386         items.emplace_back(wt, did, std::move(key), collapse_cnt,
 387                            std::move(sort_key));
 388     }
 389
 390     if (p != p_end) {
 391         stats.reset(new Xapian::Weight::Internal());
 392         unserialise_stats(string(p, p_end - p), *stats);
 393     }
 394 }
 395
 396 string
 397 MSet::Internal::get_description() const
 398 {
 399     string desc = "MSet(matches_lower_bound=";
 400     desc += str(matches_lower_bound);
 401     desc += ", matches_estimated=";
 402     desc += str(matches_estimated);
 403     desc += ", matches_upper_bound=";
 404     desc += str(matches_upper_bound);
 405     if (uncollapsed_lower_bound != matches_lower_bound) {
 406         desc += ", uncollapsed_lower_bound=";
 407         desc += str(uncollapsed_lower_bound);
 408     }
 409     if (uncollapsed_estimated != matches_estimated) {
 410         desc += ", uncollapsed_estimated=";
 411         desc += str(uncollapsed_estimated);
 412     }
 413     if (uncollapsed_upper_bound != matches_upper_bound) {
 414         desc += ", uncollapsed_upper_bound=";
 415         desc += str(uncollapsed_upper_bound);
 416     }
 417     if (first != 0) {
 418         desc += ", first=";
 419         desc += str(first);
 420     }
 421     if (max_possible > 0) {
 422         desc += ", max_possible=";
 423         desc += str(max_possible);
 424     }
 425     if (max_attained > 0) {
 426         desc += ", max_attained=";
 427         desc += str(max_attained);
 428     }
 429     desc += ", [";
 430     bool comma = false;
 431     for (auto&& item : items) {
 432         if (comma) {
 433             desc += ", ";
 434         } else {
 435             comma = true;
 436         }
 437         desc += item.get_description();
 438     }
 439     desc += "])";
 440     return desc;
 441 }
 442
 443 }