Skip lockfilealreadyopen1 under __CYGWIN__ &__WIN32__
[xapian.git] / xapian-core / api / mset.cc
blob898d624d9102d7d342b93461ada3b8cf6276efbe
1 /** @file mset.cc
2 * @brief Xapian::MSet class
3 */
4 /* Copyright (C) 2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "msetinternal.h"
24 #include "xapian/mset.h"
26 #include "net/length.h"
27 #include "net/serialise.h"
28 #include "matcher/msetcmp.h"
29 #include "roundestimate.h"
30 #include "serialise-double.h"
31 #include "str.h"
32 #include "unicode/description_append.h"
34 #include <algorithm>
35 #include <cfloat>
36 #include <string>
38 using namespace std;
40 namespace Xapian {
42 MSet::MSet(const MSet& o) : internal(o.internal) {}
44 MSet&
45 MSet::operator=(const MSet& o)
47 internal = o.internal;
48 return *this;
51 MSet::MSet() : internal(new MSet::Internal) {}
53 MSet::MSet(Internal* internal_) : internal(internal_) {}
55 MSet::~MSet() {}
57 void
58 MSet::fetch_(Xapian::doccount first, Xapian::doccount last) const
60 internal->fetch(first, last);
63 void
64 MSet::set_item_weight(Xapian::doccount i, double weight)
66 internal->set_item_weight(i, weight);
69 void
70 MSet::sort_by_relevance()
72 std::sort(internal->items.begin(), internal->items.end(),
73 get_msetcmp_function(Enquire::Internal::REL, true, false));
76 int
77 MSet::convert_to_percent(double weight) const
79 return internal->convert_to_percent(weight);
82 Xapian::doccount
83 MSet::get_termfreq(const std::string& term) const
85 // Check the cached data for query terms first.
86 Xapian::doccount termfreq;
87 if (usual(internal->stats && internal->stats->get_stats(term, termfreq))) {
88 return termfreq;
91 if (rare(internal->enquire.get() == NULL)) {
92 // Consistent with get_termfreq() on an empty database which always
93 // returns 0.
94 return 0;
97 // Fall back to asking the database via enquire.
98 return internal->enquire->get_termfreq(term);
101 double
102 MSet::get_termweight(const std::string& term) const
104 // A term not in the query has no termweight, so 0.0 makes sense as the
105 // answer in such cases.
106 double weight = 0.0;
107 if (usual(internal->stats)) {
108 (void)internal->stats->get_termweight(term, weight);
110 return weight;
113 Xapian::doccount
114 MSet::get_firstitem() const
116 return internal->first;
119 Xapian::doccount
120 MSet::get_matches_lower_bound() const
122 return internal->matches_lower_bound;
125 Xapian::doccount
126 MSet::get_matches_estimated() const
128 // Doing this here avoids calculating if the estimate is never looked at,
129 // though does mean we recalculate if this method is called more than once.
130 return round_estimate(internal->matches_lower_bound,
131 internal->matches_upper_bound,
132 internal->matches_estimated);
135 Xapian::doccount
136 MSet::get_matches_upper_bound() const
138 return internal->matches_upper_bound;
141 Xapian::doccount
142 MSet::get_uncollapsed_matches_lower_bound() const
144 return internal->uncollapsed_lower_bound;
147 Xapian::doccount
148 MSet::get_uncollapsed_matches_estimated() const
150 // Doing this here avoids calculating if the estimate is never looked at,
151 // though does mean we recalculate if this method is called more than once.
152 return round_estimate(internal->uncollapsed_lower_bound,
153 internal->uncollapsed_upper_bound,
154 internal->uncollapsed_estimated);
157 Xapian::doccount
158 MSet::get_uncollapsed_matches_upper_bound() const
160 return internal->uncollapsed_upper_bound;
163 double
164 MSet::get_max_attained() const
166 return internal->max_attained;
169 double
170 MSet::get_max_possible() const
172 return internal->max_possible;
175 Xapian::doccount
176 MSet::size() const
178 Assert(internal.get());
179 return internal->items.size();
182 std::string
183 MSet::snippet(const std::string& text,
184 size_t length,
185 const Xapian::Stem& stemmer,
186 unsigned flags,
187 const std::string& hi_start,
188 const std::string& hi_end,
189 const std::string& omit) const
191 // The actual implementation is in queryparser/termgenerator_internal.cc.
192 return internal->snippet(text, length, stemmer, flags,
193 hi_start, hi_end, omit);
196 std::string
197 MSet::get_description() const
199 return internal->get_description();
202 Document
203 MSet::Internal::get_document(Xapian::doccount index) const
205 if (index >= items.size()) {
206 string msg = "Requested index ";
207 msg += str(index);
208 msg += " in MSet of size ";
209 msg += str(items.size());
210 throw Xapian::RangeError(msg);
212 Assert(enquire.get());
213 return enquire->get_document(items[index].get_docid());
216 void
217 MSet::Internal::fetch(Xapian::doccount first_, Xapian::doccount last) const
219 if (items.empty() || enquire.get() == NULL) {
220 return;
222 if (last > items.size() - 1) {
223 last = items.size() - 1;
225 if (first_ <= last) {
226 Xapian::doccount n = last - first_;
227 for (Xapian::doccount i = 0; i <= n; ++i) {
228 enquire->request_document(items[i].get_docid());
233 void
234 MSet::Internal::set_item_weight(Xapian::doccount i, double weight)
236 // max_attained is updated assuming that set_item_weight is called on every
237 // MSet item from 0 up. While assigning new weights max_attained is updated
238 // as the maximum of the new weights set till Xapian::doccount i.
239 if (i == 0)
240 max_attained = weight;
241 else
242 max_attained = max(max_attained, weight);
243 // Ideally the max_possible should be the maximum possible weight that
244 // can be assigned by the reranking algorithm, but since it is not always
245 // possible to calculate the max possible weight for a reranking algorithm
246 // we use this approach.
247 max_possible = max(max_possible, max_attained);
248 items[i].set_weight(weight);
252 MSet::Internal::convert_to_percent(double weight) const
254 int percent;
255 if (percent_scale_factor == 0.0) {
256 // For an unweighted search, give all matches 100%.
257 percent = 100;
258 } else if (weight <= 0.0) {
259 // Some weighting schemes can return zero relevance while matching,
260 // so give such matches 0%.
261 percent = 0;
262 } else {
263 // Adding on 100 * DBL_EPSILON was a hack to work around excess
264 // precision (e.g. on x86 when not using SSE), but this code seems like
265 // it's generally asking for problems with floating point rounding
266 // issues - maybe we ought to carry through the matching and total
267 // number of subqueries and calculate using those instead.
269 // There are corresponding hacks in matcher/matcher.cc.
270 percent = int(weight * percent_scale_factor + 100.0 * DBL_EPSILON);
271 if (percent <= 0) {
272 // Make any non-zero weight give a non-zero percentage.
273 percent = 1;
274 } else if (percent > 100) {
275 // Make sure we don't ever exceed 100%.
276 percent = 100;
278 // FIXME: Ideally we should also make sure any non-exact match gives
279 // < 100%.
281 return percent;
284 void
285 MSet::Internal::unshard_docids(Xapian::doccount shard,
286 Xapian::doccount n_shards)
288 for (auto& result : items) {
289 result.unshard_docid(shard, n_shards);
293 void
294 MSet::Internal::merge_stats(const Internal* o)
296 if (snippet_bg_relevance.empty()) {
297 snippet_bg_relevance = o->snippet_bg_relevance;
298 } else {
299 Assert(snippet_bg_relevance == o->snippet_bg_relevance);
301 matches_lower_bound += o->matches_lower_bound;
302 matches_estimated += o->matches_estimated;
303 matches_upper_bound += o->matches_upper_bound;
304 uncollapsed_lower_bound += o->uncollapsed_lower_bound;
305 uncollapsed_estimated += o->uncollapsed_estimated;
306 uncollapsed_upper_bound += o->uncollapsed_upper_bound;
307 max_possible = max(max_possible, o->max_possible);
308 if (o->max_attained > max_attained) {
309 max_attained = o->max_attained;
310 percent_scale_factor = o->percent_scale_factor;
314 string
315 MSet::Internal::serialise() const
317 string result;
319 result += encode_length(first);
320 // Send back the raw matches_* values. MSet::get_matches_estimated()
321 // rounds the estimate lazily, but when we merge MSet objects we really
322 // want to merge based on the raw estimates.
324 // It is also cleaner that a round-trip through serialisation gives you an
325 // object which is as close to the original as possible.
326 result += encode_length(matches_lower_bound);
327 result += encode_length(matches_estimated);
328 result += encode_length(matches_upper_bound);
329 result += encode_length(uncollapsed_lower_bound);
330 result += encode_length(uncollapsed_estimated);
331 result += encode_length(uncollapsed_upper_bound);
332 result += serialise_double(max_possible);
333 result += serialise_double(max_attained);
335 result += serialise_double(percent_scale_factor);
337 result += encode_length(items.size());
338 for (auto&& item : items) {
339 result += serialise_double(item.get_weight());
340 result += encode_length(item.get_docid());
341 result += encode_length(item.get_sort_key().size());
342 result += item.get_sort_key();
343 result += encode_length(item.get_collapse_key().size());
344 result += item.get_collapse_key();
345 result += encode_length(item.get_collapse_count());
348 if (stats)
349 result += serialise_stats(*stats);
351 return result;
354 void
355 MSet::Internal::unserialise(const char * p, const char * p_end)
357 items.clear();
359 decode_length(&p, p_end, first);
360 decode_length(&p, p_end, matches_lower_bound);
361 decode_length(&p, p_end, matches_estimated);
362 decode_length(&p, p_end, matches_upper_bound);
363 decode_length(&p, p_end, uncollapsed_lower_bound);
364 decode_length(&p, p_end, uncollapsed_estimated);
365 decode_length(&p, p_end, uncollapsed_upper_bound);
366 max_possible = unserialise_double(&p, p_end);
367 max_attained = unserialise_double(&p, p_end);
369 percent_scale_factor = unserialise_double(&p, p_end);
371 size_t msize;
372 decode_length(&p, p_end, msize);
373 while (msize-- > 0) {
374 double wt = unserialise_double(&p, p_end);
375 Xapian::docid did;
376 decode_length(&p, p_end, did);
377 size_t len;
378 decode_length_and_check(&p, p_end, len);
379 string sort_key(p, len);
380 p += len;
381 decode_length_and_check(&p, p_end, len);
382 string key(p, len);
383 p += len;
384 Xapian::doccount collapse_cnt;
385 decode_length(&p, p_end, collapse_cnt);
386 items.emplace_back(wt, did, std::move(key), collapse_cnt,
387 std::move(sort_key));
390 if (p != p_end) {
391 stats.reset(new Xapian::Weight::Internal());
392 unserialise_stats(string(p, p_end - p), *stats);
396 string
397 MSet::Internal::get_description() const
399 string desc = "MSet(matches_lower_bound=";
400 desc += str(matches_lower_bound);
401 desc += ", matches_estimated=";
402 desc += str(matches_estimated);
403 desc += ", matches_upper_bound=";
404 desc += str(matches_upper_bound);
405 if (uncollapsed_lower_bound != matches_lower_bound) {
406 desc += ", uncollapsed_lower_bound=";
407 desc += str(uncollapsed_lower_bound);
409 if (uncollapsed_estimated != matches_estimated) {
410 desc += ", uncollapsed_estimated=";
411 desc += str(uncollapsed_estimated);
413 if (uncollapsed_upper_bound != matches_upper_bound) {
414 desc += ", uncollapsed_upper_bound=";
415 desc += str(uncollapsed_upper_bound);
417 if (first != 0) {
418 desc += ", first=";
419 desc += str(first);
421 if (max_possible > 0) {
422 desc += ", max_possible=";
423 desc += str(max_possible);
425 if (max_attained > 0) {
426 desc += ", max_attained=";
427 desc += str(max_attained);
429 desc += ", [";
430 bool comma = false;
431 for (auto&& item : items) {
432 if (comma) {
433 desc += ", ";
434 } else {
435 comma = true;
437 desc += item.get_description();
439 desc += "])";
440 return desc;