Reimplement the matcher
[xapian.git] / xapian-core / include / xapian / mset.h
blob9375dc99884775bb8debfc531886afcf4b68a2d7
1 /** @file mset.h
2 * @brief Class representing a list of search results
3 */
4 /* Copyright (C) 2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #ifndef XAPIAN_INCLUDED_MSET_H
23 #define XAPIAN_INCLUDED_MSET_H
25 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
26 # error "Never use <xapian/mset.h> directly; include <xapian.h> instead."
27 #endif
29 #include <iterator>
30 #include <string>
32 #include <xapian/attributes.h>
33 #include <xapian/document.h>
34 #include <xapian/error.h>
35 #include <xapian/intrusive_ptr.h>
36 #include <xapian/stem.h>
37 #include <xapian/types.h>
38 #include <xapian/visibility.h>
40 namespace Xapian {
42 class MSetIterator;
44 /// Class representing a list of search results.
45 class XAPIAN_VISIBILITY_DEFAULT MSet {
46 friend class MSetIterator;
48 // Helper function for fetch() methods.
49 void fetch_(Xapian::doccount first, Xapian::doccount last) const;
51 /** Update the weight corresponding to the document indexed at
52 * position i with wt.
54 * The MSet's max_possible and max_attained are also updated.
56 * This method must be called to update the weight of every document in
57 * the MSet for i = 0 to mset.size() - 1 in ascending order to avoid
58 * miscalculation of max_attained and max_possible.
60 * @param i MSet index to update
61 * @param wt new weight to assign to the document at index @a i
63 void set_item_weight(Xapian::doccount i, double wt);
65 public:
66 /// Class representing the MSet internals.
67 class Internal;
68 /// @private @internal Reference counted internals.
69 Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
71 /** Copying is allowed.
73 * The internals are reference counted, so copying is cheap.
75 MSet(const MSet & o);
77 /** Copying is allowed.
79 * The internals are reference counted, so assignment is cheap.
81 MSet & operator=(const MSet & o);
83 /** Default constructor.
85 * Creates an empty MSet, mostly useful as a placeholder.
87 MSet();
89 /** @private @internal Wrap an existing Internal. */
90 XAPIAN_VISIBILITY_INTERNAL
91 explicit MSet(Internal* internal_);
93 /// Destructor.
94 ~MSet();
96 /** Assigns new weights and updates MSet.
98 * Dereferencing the Iterator should return a double.
100 * The weights returned by the iterator are assigned to elements of
101 * the MSet in rank order.
103 * @param begin Begin iterator.
104 * @param end End iterator.
106 * @exception Xapian::InvalidArgument is thrown if the total number of
107 * elements in the input doesn't match the total number of
108 * documents in MSet.
110 template <typename Iterator>
111 void replace_weights(Iterator first, Iterator last)
113 if (last - first != size()) {
114 throw Xapian::InvalidArgumentError("Number of weights assigned "
115 "doesn't match the number of "
116 "items");
118 Xapian::doccount i = 0;
119 while (first != last) {
120 set_item_weight(i, *first);
121 ++i;
122 ++first;
126 /** Sorts the list of documents in MSet according to their weights.
128 * Use after calling MSet::replace_weights.
130 void sort_by_relevance();
132 /** Convert a weight to a percentage.
134 * The matching document with the highest weight will get 100% if it
135 * matches all the weighted query terms, and proportionally less if it
136 * only matches some, and other weights are scaled by the same factor.
138 * Documents with a non-zero score will always score at least 1%.
140 * Note that these generally aren't percentages of anything meaningful
141 * (unless you use a custom weighting formula where they are!)
143 int convert_to_percent(double weight) const;
145 /** Convert the weight of the current iterator position to a percentage.
147 * The matching document with the highest weight will get 100% if it
148 * matches all the weighted query terms, and proportionally less if it
149 * only matches some, and other weights are scaled by the same factor.
151 * Documents with a non-zero score will always score at least 1%.
153 * Note that these generally aren't percentages of anything meaningful
154 * (unless you use a custom weighting formula where they are!)
156 int convert_to_percent(const MSetIterator & it) const;
158 /** Get the termfreq of a term.
160 * @return The number of documents @a term occurs in.
162 * Since 1.5.0, this method returns 0 if called on an MSet which is
163 * not associated with a database (which is consistent with
164 * Database::get_termfreq() returning 0 when called on a Database
165 * with no sub-databases); in earlier versions,
166 * Xapian::InvalidOperationError was thrown in this case.
168 Xapian::doccount get_termfreq(const std::string & term) const;
170 /** Get the term weight of a term.
172 * @return The maximum weight that @a term could have contributed to a
173 * document.
175 * Since 1.5.0, this method returns 0.0 if called on an MSet which is
176 * not associated with a database, or with a term which wasn't present
177 * in the query (since in both cases the term contributes no weight to any
178 * matching documents); in earlier versions, Xapian::InvalidOperationError
179 * was thrown for the first case, and Xapian::InvalidArgumentError for the
180 * second.
182 double get_termweight(const std::string & term) const;
184 /** Rank of first item in this MSet.
186 * This is the parameter `first` passed to Xapian::Enquire::get_mset().
188 Xapian::doccount get_firstitem() const;
190 /** Lower bound on the total number of matching documents. */
191 Xapian::doccount get_matches_lower_bound() const;
192 /** Estimate of the total number of matching documents. */
193 Xapian::doccount get_matches_estimated() const;
194 /** Upper bound on the total number of matching documents. */
195 Xapian::doccount get_matches_upper_bound() const;
197 /** Lower bound on the total number of matching documents before collapsing.
199 * Conceptually the same as get_matches_lower_bound() for the same query
200 * without any collapse part (though the actual value may differ).
202 Xapian::doccount get_uncollapsed_matches_lower_bound() const;
203 /** Estimate of the total number of matching documents before collapsing.
205 * Conceptually the same as get_matches_estimated() for the same query
206 * without any collapse part (though the actual value may differ).
208 Xapian::doccount get_uncollapsed_matches_estimated() const;
209 /** Upper bound on the total number of matching documents before collapsing.
211 * Conceptually the same as get_matches_upper_bound() for the same query
212 * without any collapse part (though the actual value may differ).
214 Xapian::doccount get_uncollapsed_matches_upper_bound() const;
216 /** The maximum weight attained by any document. */
217 double get_max_attained() const;
218 /** The maximum possible weight any document could achieve. */
219 double get_max_possible() const;
221 enum {
222 /** Model the relevancy of non-query terms in MSet::snippet().
224 * Non-query terms will be assigned a small weight, and the snippet
225 * will tend to prefer snippets which contain a more interesting
226 * background (where the query term content is equivalent).
228 SNIPPET_BACKGROUND_MODEL = 1,
229 /** Exhaustively evaluate candidate snippets in MSet::snippet().
231 * Without this flag, snippet generation will stop once it thinks
232 * it has found a "good enough" snippet, which will generally reduce
233 * the time taken to generate a snippet.
235 SNIPPET_EXHAUSTIVE = 2,
236 /** Return the empty string if no term got matched.
238 * If enabled, snippet() returns an empty string if not a single match
239 * was found in text. If not enabled, snippet() returns a (sub)string
240 * of text without any highlighted terms.
242 SNIPPET_EMPTY_WITHOUT_MATCH = 4
245 /** Generate a snippet.
247 * This method selects a continuous run of words of less than about @a
248 * length bytes from @a text, based mainly on where the query matches
249 * (currently terms, exact phrases and wildcards are taken into account),
250 * but also on the non-query terms in the text.
252 * The returned text can be escaped (by default, it is escaped to make it
253 * suitable for use in HTML), and matches with the query will be
254 * highlighted using @a hi_start and @a hi_end.
256 * If the snippet seems to start or end mid-sentence, then @a omit is
257 * prepended or append (respectively) to indicate this.
259 * The stemmer used to build the query should be specified in @a stemmer.
261 * And @a flags contains flags controlling behaviour.
263 * Added in 1.3.5.
265 std::string snippet(const std::string & text,
266 size_t length = 500,
267 const Xapian::Stem & stemmer = Xapian::Stem(),
268 unsigned flags = SNIPPET_BACKGROUND_MODEL|SNIPPET_EXHAUSTIVE,
269 const std::string & hi_start = "<b>",
270 const std::string & hi_end = "</b>",
271 const std::string & omit = "...") const;
273 /** Prefetch hint a range of items.
275 * For a remote database, this may start a pipelined fetch of the
276 * requested documents from the remote server.
278 * For a disk-based database, this may send prefetch hints to the
279 * operating system such that the disk blocks the requested documents
280 * are stored in are more likely to be in the cache when we come to
281 * actually read them.
283 void fetch(const MSetIterator &begin, const MSetIterator &end) const;
285 /** Prefetch hint a single MSet item.
287 * For a remote database, this may start a pipelined fetch of the
288 * requested documents from the remote server.
290 * For a disk-based database, this may send prefetch hints to the
291 * operating system such that the disk blocks the requested documents
292 * are stored in are more likely to be in the cache when we come to
293 * actually read them.
295 void fetch(const MSetIterator &item) const;
297 /** Prefetch hint the whole MSet.
299 * For a remote database, this may start a pipelined fetch of the
300 * requested documents from the remote server.
302 * For a disk-based database, this may send prefetch hints to the
303 * operating system such that the disk blocks the requested documents
304 * are stored in are more likely to be in the cache when we come to
305 * actually read them.
307 void fetch() const { fetch_(0, Xapian::doccount(-1)); }
309 /** Return number of items in this MSet object. */
310 Xapian::doccount size() const;
312 /** Return true if this MSet object is empty. */
313 bool empty() const { return size() == 0; }
315 /** Efficiently swap this MSet object with another. */
316 void swap(MSet & o) { internal.swap(o.internal); }
318 /** Return iterator pointing to the first item in this MSet. */
319 MSetIterator begin() const;
321 /** Return iterator pointing to just after the last item in this MSet. */
322 MSetIterator end() const;
324 /** Return iterator pointing to the i-th object in this MSet. */
325 MSetIterator operator[](Xapian::doccount i) const;
327 /** Return iterator pointing to the last object in this MSet. */
328 MSetIterator back() const;
330 /// Return a string describing this object.
331 std::string get_description() const;
333 /** @private @internal MSet is what the C++ STL calls a container.
335 * The following typedefs allow the class to be used in templates in the
336 * same way the standard containers can be.
338 * These are deliberately hidden from the Doxygen-generated docs, as the
339 * machinery here isn't interesting to API users. They just need to know
340 * that Xapian container classes are compatible with the STL.
342 * See "The C++ Programming Language", 3rd ed. section 16.3.1:
344 // @{
345 /// @private
346 typedef Xapian::MSetIterator value_type;
347 /// @private
348 typedef Xapian::doccount size_type;
349 /// @private
350 typedef Xapian::doccount_diff difference_type;
351 /// @private
352 typedef Xapian::MSetIterator iterator;
353 /// @private
354 typedef Xapian::MSetIterator const_iterator;
355 /// @private
356 typedef value_type * pointer;
357 /// @private
358 typedef const value_type * const_pointer;
359 /// @private
360 typedef value_type & reference;
361 /// @private
362 typedef const value_type & const_reference;
363 // @}
365 /** @private @internal MSet is what the C++ STL calls a container.
367 * The following methods allow the class to be used in templates in the
368 * same way the standard containers can be.
370 * These are deliberately hidden from the Doxygen-generated docs, as the
371 * machinery here isn't interesting to API users. They just need to know
372 * that Xapian container classes are compatible with the STL.
374 // @{
375 // The size is fixed once created.
376 Xapian::doccount max_size() const { return size(); }
377 // @}
380 /// Iterator over a Xapian::MSet.
381 class XAPIAN_VISIBILITY_DEFAULT MSetIterator {
382 friend class MSet;
384 MSetIterator(const Xapian::MSet & mset_, Xapian::doccount off_from_end_)
385 : mset(mset_), off_from_end(off_from_end_) { }
387 public:
388 /** @private @internal The MSet we are iterating over. */
389 Xapian::MSet mset;
391 /** @private @internal The current position of the iterator.
393 * We store the offset from the end of @a mset, since that means
394 * MSet::end() just needs to set this member to 0.
396 Xapian::MSet::size_type off_from_end;
398 /** Create an unpositioned MSetIterator. */
399 MSetIterator() : off_from_end(0) { }
401 /** Get the numeric document id for the current position. */
402 Xapian::docid operator*() const;
404 /// Advance the iterator to the next position.
405 MSetIterator & operator++() {
406 --off_from_end;
407 return *this;
410 /// Advance the iterator to the next position (postfix version).
411 MSetIterator operator++(int) {
412 MSetIterator retval = *this;
413 --off_from_end;
414 return retval;
417 /// Move the iterator to the previous position.
418 MSetIterator & operator--() {
419 ++off_from_end;
420 return *this;
423 /// Move the iterator to the previous position (postfix version).
424 MSetIterator operator--(int) {
425 MSetIterator retval = *this;
426 ++off_from_end;
427 return retval;
430 /** @private @internal MSetIterator is what the C++ STL calls an
431 * random_access_iterator.
433 * The following typedefs allow std::iterator_traits<> to work so that
434 * this iterator can be used with the STL.
436 * These are deliberately hidden from the Doxygen-generated docs, as the
437 * machinery here isn't interesting to API users. They just need to know
438 * that Xapian iterator classes are compatible with the STL.
440 // @{
441 /// @private
442 typedef std::random_access_iterator_tag iterator_category;
443 /// @private
444 typedef std::string value_type;
445 /// @private
446 typedef Xapian::termcount_diff difference_type;
447 /// @private
448 typedef std::string * pointer;
449 /// @private
450 typedef std::string & reference;
451 // @}
453 /// Move the iterator forwards by n positions.
454 MSetIterator & operator+=(difference_type n) {
455 off_from_end -= n;
456 return *this;
459 /// Move the iterator back by n positions.
460 MSetIterator & operator-=(difference_type n) {
461 off_from_end += n;
462 return *this;
465 /** Return the iterator incremented by @a n positions.
467 * If @a n is negative, decrements by (-n) positions.
469 MSetIterator operator+(difference_type n) const {
470 return MSetIterator(mset, off_from_end - n);
473 /** Return the iterator decremented by @a n positions.
475 * If @a n is negative, increments by (-n) positions.
477 MSetIterator operator-(difference_type n) const {
478 return MSetIterator(mset, off_from_end + n);
481 /** Return the number of positions between @a o and this iterator. */
482 difference_type operator-(const MSetIterator& o) const {
483 return difference_type(o.off_from_end) - difference_type(off_from_end);
486 /** Return the MSet rank for the current position.
488 * The rank of mset[0] is mset.get_firstitem().
490 Xapian::doccount get_rank() const {
491 return mset.get_firstitem() + (mset.size() - off_from_end);
494 /** Get the Document object for the current position. */
495 Xapian::Document get_document() const;
497 /** Get the weight for the current position. */
498 double get_weight() const;
500 /** Return the collapse key for the current position.
502 * If collapsing isn't in use, an empty string will be returned.
504 std::string get_collapse_key() const;
506 /** Return a count of the number of collapses done onto the current key.
508 * This starts at 0, and is incremented each time an item is eliminated
509 * because its key is the same as that of the current item (as returned
510 * by get_collapse_key()).
512 * Note that this is NOT necessarily one less than the total number of
513 * matching documents with this collapse key due to various optimisations
514 * implemented in the matcher - for example, it can skip documents
515 * completely if it can prove their weight wouldn't be enough to make the
516 * result set.
518 * You can say is that if get_collapse_count() > 0 then there are
519 * >= get_collapse_count() other documents with the current collapse
520 * key. But if get_collapse_count() == 0 then there may or may not be
521 * other such documents.
523 Xapian::doccount get_collapse_count() const;
525 /** Convert the weight of the current iterator position to a percentage.
527 * The matching document with the highest weight will get 100% if it
528 * matches all the weighted query terms, and proportionally less if it
529 * only matches some, and other weights are scaled by the same factor.
531 * Documents with a non-zero score will always score at least 1%.
533 * Note that these generally aren't percentages of anything meaningful
534 * (unless you use a custom weighting formula where they are!)
536 int get_percent() const {
537 return mset.convert_to_percent(get_weight());
540 /// Return a string describing this object.
541 std::string get_description() const;
544 bool
545 XAPIAN_NOTHROW(operator==(const MSetIterator &a, const MSetIterator &b));
547 /// Equality test for MSetIterator objects.
548 inline bool
549 operator==(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
551 return a.off_from_end == b.off_from_end;
554 inline bool
555 XAPIAN_NOTHROW(operator!=(const MSetIterator &a, const MSetIterator &b));
557 /// Inequality test for MSetIterator objects.
558 inline bool
559 operator!=(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
561 return !(a == b);
564 bool
565 XAPIAN_NOTHROW(operator<(const MSetIterator &a, const MSetIterator &b));
567 /// Inequality test for MSetIterator objects.
568 inline bool
569 operator<(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
571 return a.off_from_end > b.off_from_end;
574 inline bool
575 XAPIAN_NOTHROW(operator>(const MSetIterator &a, const MSetIterator &b));
577 /// Inequality test for MSetIterator objects.
578 inline bool
579 operator>(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
581 return b < a;
584 inline bool
585 XAPIAN_NOTHROW(operator>=(const MSetIterator &a, const MSetIterator &b));
587 /// Inequality test for MSetIterator objects.
588 inline bool
589 operator>=(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
591 return !(a < b);
594 inline bool
595 XAPIAN_NOTHROW(operator<=(const MSetIterator &a, const MSetIterator &b));
597 /// Inequality test for MSetIterator objects.
598 inline bool
599 operator<=(const MSetIterator &a, const MSetIterator &b) XAPIAN_NOEXCEPT
601 return !(b < a);
604 /** Return MSetIterator @a it incremented by @a n positions.
606 * If @a n is negative, decrements by (-n) positions.
608 inline MSetIterator
609 operator+(MSetIterator::difference_type n, const MSetIterator& it)
611 return it + n;
614 // Inlined methods of MSet which need MSetIterator to have been defined:
616 inline void
617 MSet::fetch(const MSetIterator &begin_it, const MSetIterator &end_it) const
619 fetch_(begin_it.off_from_end, end_it.off_from_end);
622 inline void
623 MSet::fetch(const MSetIterator &item) const
625 fetch_(item.off_from_end, item.off_from_end);
628 inline MSetIterator
629 MSet::begin() const {
630 return MSetIterator(*this, size());
633 inline MSetIterator
634 MSet::end() const {
635 // Decrementing the result of end() needs to work, so we must pass in
636 // *this here.
637 return MSetIterator(*this, 0);
640 inline MSetIterator
641 MSet::operator[](Xapian::doccount i) const {
642 return MSetIterator(*this, size() - i);
645 inline MSetIterator
646 MSet::back() const {
647 return MSetIterator(*this, 1);
650 inline int
651 MSet::convert_to_percent(const MSetIterator & it) const {
652 return convert_to_percent(it.get_weight());
657 #endif // XAPIAN_INCLUDED_MSET_H