/** @file mset.cc
 * @brief Xapian::MSet class
 */
/* Copyright (C) 2017 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <config.h>

#include "msetinternal.h"
#include "xapian/mset.h"

#include "net/length.h"
#include "net/serialise.h"
#include "matcher/msetcmp.h"
#include "roundestimate.h"
#include "serialise-double.h"
#include "str.h"
#include "unicode/description_append.h"

#include <algorithm>
#include <cfloat>
#include <string>

using namespace std;

namespace Xapian {

MSet::MSet(const MSet& o) : internal(o.internal) {}

MSet&
MSet::operator=(const MSet& o)
{
    internal = o.internal;
    return *this;
}

MSet::MSet() : internal(new MSet::Internal) {}

MSet::~MSet() {}

void
MSet::fetch_(Xapian::doccount first, Xapian::doccount last) const
{
    internal->fetch(first, last);
}

void
MSet::set_item_weight(Xapian::doccount i, double weight)
{
    internal->set_item_weight(i, weight);
}

void
MSet::sort_by_relevance()
{
    std::sort(internal->items.begin(), internal->items.end(),
              get_msetcmp_function(Enquire::Internal::REL, true, false));
}

int
MSet::convert_to_percent(double weight) const
{
    return internal->convert_to_percent(weight);
}

Xapian::doccount
MSet::get_termfreq(const std::string& term) const
{
    // Check the cached data for query terms first.
    Xapian::doccount termfreq;
    if (usual(internal->stats && internal->stats->get_stats(term, termfreq))) {
        return termfreq;
    }

    if (rare(internal->enquire.get() == NULL)) {
        // Consistent with get_termfreq() on an empty database which always
        // returns 0.
        return 0;
    }

    // Fall back to asking the database via enquire.
    return internal->enquire->get_termfreq(term);
}

double
MSet::get_termweight(const std::string& term) const
{
    // A term not in the query has no termweight, so 0.0 makes sense as the
    // answer in such cases.
    double weight = 0.0;
    if (usual(internal->stats)) {
        (void)internal->stats->get_termweight(term, weight);
    }
    return weight;
}

Xapian::doccount
MSet::get_firstitem() const
{
    return internal->first;
}

Xapian::doccount
MSet::get_matches_lower_bound() const
{
    return internal->matches_lower_bound;
}

Xapian::doccount
MSet::get_matches_estimated() const
{
    // Doing this here avoids calculating if the estimate is never looked at,
    // though does mean we recalculate if this method is called more than once.
    return round_estimate(internal->matches_lower_bound,
                          internal->matches_upper_bound,
                          internal->matches_estimated);
}

Xapian::doccount
MSet::get_matches_upper_bound() const
{
    return internal->matches_upper_bound;
}

Xapian::doccount
MSet::get_uncollapsed_matches_lower_bound() const
{
    return internal->uncollapsed_lower_bound;
}

Xapian::doccount
MSet::get_uncollapsed_matches_estimated() const
{
    // Doing this here avoids calculating if the estimate is never looked at,
    // though does mean we recalculate if this method is called more than once.
    return round_estimate(internal->uncollapsed_lower_bound,
                          internal->uncollapsed_upper_bound,
                          internal->uncollapsed_estimated);
}

Xapian::doccount
MSet::get_uncollapsed_matches_upper_bound() const
{
    return internal->uncollapsed_upper_bound;
}

double
MSet::get_max_attained() const
{
    return internal->max_attained;
}

double
MSet::get_max_possible() const
{
    return internal->max_possible;
}

Xapian::doccount
MSet::size() const
{
    Assert(internal.get());
    return internal->items.size();
}

std::string
MSet::snippet(const std::string& text,
              size_t length,
              const Xapian::Stem& stemmer,
              unsigned flags,
              const std::string& hi_start,
              const std::string& hi_end,
              const std::string& omit) const
{
    // The actual implementation is in queryparser/termgenerator_internal.cc.
    return internal->snippet(text, length, stemmer, flags,
                             hi_start, hi_end, omit);
}

std::string
MSet::get_description() const
{
    return internal->get_description();
}
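
// Illustrative usage sketch (not part of this file; `enquire` is assumed to
// be a Xapian::Enquire with a query already set).  Client code normally
// obtains and iterates an MSet along these lines:
//
//   Xapian::MSet mset = enquire.get_mset(0, 10);
//   for (Xapian::MSetIterator it = mset.begin(); it != mset.end(); ++it) {
//       std::cout << *it << ": " << it.get_percent() << "%\n";
//   }
//
// The methods below are the MSet::Internal implementations backing the
// public wrappers above.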

Document
MSet::Internal::get_document(Xapian::doccount index) const
{
    if (index >= items.size()) {
        string msg = "Requested index ";
        msg += str(index);
        msg += " in MSet of size ";
        msg += str(items.size());
        throw Xapian::RangeError(msg);
    }
    Assert(enquire.get());
    return enquire->get_document(items[index].get_docid());
}

void
MSet::Internal::fetch(Xapian::doccount first_, Xapian::doccount last) const
{
    if (items.empty() || enquire.get() == NULL) {
        return;
    }
    if (last > items.size() - 1) {
        last = items.size() - 1;
    }
    if (first_ <= last) {
        Xapian::doccount n = last - first_;
        for (Xapian::doccount i = 0; i <= n; ++i) {
            enquire->request_document(items[i].get_docid());
        }
    }
}

void
MSet::Internal::set_item_weight(Xapian::doccount i, double weight)
{
    // max_attained is updated on the assumption that set_item_weight() is
    // called on every MSet item, starting from index 0.  While new weights
    // are being assigned, max_attained tracks the maximum of the weights set
    // so far (up to item i).
    if (i == 0)
        max_attained = weight;
    else
        max_attained = max(max_attained, weight);
    // Ideally max_possible would be the maximum weight which the reranking
    // algorithm could assign, but that is not always possible to calculate,
    // so we settle for the maximum weight seen so far.
    max_possible = max(max_possible, max_attained);
    items[i].set_weight(weight);
}
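
// Illustrative reranking sketch (not part of this file; `reranker` is a
// hypothetical scorer).  As noted above, MSet::set_item_weight() is expected
// to be called for every item from index 0 upwards, followed by a re-sort:
//
//   for (Xapian::doccount i = 0; i != mset.size(); ++i) {
//       mset.set_item_weight(i, reranker.score(mset[i]));
//   }
//   mset.sort_by_relevance();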

int
MSet::Internal::convert_to_percent(double weight) const
{
    int percent;
    if (percent_scale_factor == 0.0) {
        // For an unweighted search, give all matches 100%.
        percent = 100;
    } else if (weight <= 0.0) {
        // Some weighting schemes can return zero relevance while matching,
        // so give such matches 0%.
        percent = 0;
    } else {
        // Adding on 100 * DBL_EPSILON was a hack to work around excess
        // precision (e.g. on x86 when not using SSE), but this code seems like
        // it's generally asking for problems with floating point rounding
        // issues - maybe we ought to carry through the matching and total
        // number of subqueries and calculate using those instead.
        //
        // There are corresponding hacks in matcher/multimatch.cc.
        percent = int(weight * percent_scale_factor + 100.0 * DBL_EPSILON);
        if (percent <= 0) {
            // Make any non-zero weight give a non-zero percentage.
            percent = 1;
        } else if (percent > 100) {
            // Make sure we don't ever exceed 100%.
            percent = 100;
        }
        // FIXME: Ideally we should also make sure any non-exact match gives
        // < 100%.
    }
    return percent;
}
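
// Worked example of the conversion above (values are illustrative only):
// with percent_scale_factor == 20.0, a weight of 2.5 gives
// int(2.5 * 20.0 + 100 * DBL_EPSILON) == 50, i.e. 50%; a tiny but non-zero
// weight such as 0.001 computes to 0 and is bumped up to 1%.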

string
MSet::Internal::serialise() const
{
    string result;

    result += encode_length(first);
    // Send back the raw matches_* values.  MSet::get_matches_estimated()
    // rounds the estimate lazily, but MSetPostList::get_termfreq_est()
    // returns the estimate, and the raw estimate is better for that.
    //
    // It is also cleaner that a round-trip through serialisation gives you an
    // object which is as close to the original as possible.
    result += encode_length(matches_lower_bound);
    result += encode_length(matches_estimated);
    result += encode_length(matches_upper_bound);
    result += encode_length(uncollapsed_lower_bound);
    result += encode_length(uncollapsed_estimated);
    result += encode_length(uncollapsed_upper_bound);
    result += serialise_double(max_possible);
    result += serialise_double(max_attained);

    result += serialise_double(percent_scale_factor);

    result += encode_length(items.size());
    for (auto&& item : items) {
        result += serialise_double(item.get_weight());
        result += encode_length(item.get_docid());
        result += encode_length(item.get_sort_key().size());
        result += item.get_sort_key();
        result += encode_length(item.get_collapse_key().size());
        result += item.get_collapse_key();
        result += encode_length(item.get_collapse_count());
    }

    if (stats)
        result += serialise_stats(*stats);

    return result;
}
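
// For reference, the serialised layout produced above (and consumed by
// unserialise() below) is, in order: first; the three matches_* values; the
// three uncollapsed_* values; max_possible; max_attained;
// percent_scale_factor; the item count; then per item the weight, docid,
// length-prefixed sort key, length-prefixed collapse key and collapse count;
// followed optionally by the serialised stats.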

void
MSet::Internal::unserialise(const char * p, const char * p_end)
{
    items.clear();

    decode_length(&p, p_end, first);
    decode_length(&p, p_end, matches_lower_bound);
    decode_length(&p, p_end, matches_estimated);
    decode_length(&p, p_end, matches_upper_bound);
    decode_length(&p, p_end, uncollapsed_lower_bound);
    decode_length(&p, p_end, uncollapsed_estimated);
    decode_length(&p, p_end, uncollapsed_upper_bound);
    max_possible = unserialise_double(&p, p_end);
    max_attained = unserialise_double(&p, p_end);

    percent_scale_factor = unserialise_double(&p, p_end);

    size_t msize;
    decode_length(&p, p_end, msize);
    while (msize-- > 0) {
        double wt = unserialise_double(&p, p_end);
        Xapian::docid did;
        decode_length(&p, p_end, did);
        size_t len;
        decode_length_and_check(&p, p_end, len);
        string sort_key(p, len);
        p += len;
        decode_length_and_check(&p, p_end, len);
        string key(p, len);
        p += len;
        Xapian::doccount collapse_cnt;
        decode_length(&p, p_end, collapse_cnt);
        items.emplace_back(wt, did, std::move(key), collapse_cnt,
                           std::move(sort_key));
    }

    if (p != p_end) {
        stats.reset(new Xapian::Weight::Internal());
        unserialise_stats(string(p, p_end - p), *stats);
    }
}
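
// Round-trip sketch (illustrative only): serialise() and unserialise() are
// intended to be inverses, so given an existing MSet::Internal `orig`:
//
//   std::string s = orig.serialise();
//   MSet::Internal copy;
//   copy.unserialise(s.data(), s.data() + s.size());
//
// `copy` should then report the same bounds, weights and keys as `orig`.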

string
MSet::Internal::get_description() const
{
    string desc = "MSet(matches_lower_bound=";
    desc += str(matches_lower_bound);
    desc += ", matches_estimated=";
    desc += str(matches_estimated);
    desc += ", matches_upper_bound=";
    desc += str(matches_upper_bound);
    if (uncollapsed_lower_bound != matches_lower_bound) {
        desc += ", uncollapsed_lower_bound=";
        desc += str(uncollapsed_lower_bound);
    }
    if (uncollapsed_estimated != matches_estimated) {
        desc += ", uncollapsed_estimated=";
        desc += str(uncollapsed_estimated);
    }
    if (uncollapsed_upper_bound != matches_upper_bound) {
        desc += ", uncollapsed_upper_bound=";
        desc += str(uncollapsed_upper_bound);
    }
    if (first != 0) {
        desc += ", first=";
        desc += str(first);
    }
    if (max_possible > 0) {
        desc += ", max_possible=";
        desc += str(max_possible);
    }
    if (max_attained > 0) {
        desc += ", max_attained=";
        desc += str(max_attained);
    }
    desc += ", [";
    bool comma = false;
    for (auto&& item : items) {
        if (comma) {
            desc += ", ";
        } else {
            comma = true;
        }
        desc += item.get_description();
    }
    desc += "])";
    return desc;
}