Remove some superfluous blank lines
[xapian.git] / xapian-core / include / xapian / postingsource.h
blob93c4baac7d569cf196fee7a915cf56319a91698c
1 /** @file postingsource.h
2 * @brief External sources of posting information
3 */
4 /* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017 Olly Betts
5 * Copyright (C) 2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_POSTINGSOURCE_H
23 #define XAPIAN_INCLUDED_POSTINGSOURCE_H
25 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
26 # error "Never use <xapian/postingsource.h> directly; include <xapian.h> instead."
27 #endif
29 #include <xapian/attributes.h>
30 #include <xapian/database.h>
31 #include <xapian/intrusive_ptr.h>
32 #include <xapian/postingiterator.h>
33 #include <xapian/types.h>
34 #include <xapian/valueiterator.h>
35 #include <xapian/visibility.h>
37 #include <string>
38 #include <map>
40 namespace Xapian {
42 class Registry;
44 /** Base class which provides an "external" source of postings.
46 class XAPIAN_VISIBILITY_DEFAULT PostingSource
47 : public Xapian::Internal::opt_intrusive_base {
48 /// Don't allow assignment.
49 void operator=(const PostingSource &) = delete;
51 /// Don't allow copying.
52 PostingSource(const PostingSource &) = delete;
54 /// The current upper bound on what get_weight() can return.
55 double max_weight_;
57 /** The object to inform of maxweight changes.
59 * We store this as a (void*) to avoid needing to declare an internal
60 * type in an external header. It's actually (PostListTree *).
62 void * matcher_;
64 public:
65 /// Allow subclasses to be instantiated.
66 XAPIAN_NOTHROW(PostingSource())
67 : max_weight_(0), matcher_(NULL) { }
69 /** @private @internal Set the object to inform of maxweight changes.
71 * This method is for internal use only - it would be private except that
72 * would force us to forward declare an internal class in an external API
73 * header just to make it a friend.
75 XAPIAN_VISIBILITY_INTERNAL
76 void register_matcher_(void * matcher) { matcher_ = matcher; }
78 // Destructor.
79 virtual ~PostingSource();
81 /** A lower bound on the number of documents this object can return.
83 * Xapian will always call init() on a PostingSource before calling this
84 * for the first time.
86 virtual Xapian::doccount get_termfreq_min() const = 0;
88 /** An estimate of the number of documents this object can return.
90 * It must always be true that:
92 * get_termfreq_min() <= get_termfreq_est() <= get_termfreq_max()
94 * Xapian will always call init() on a PostingSource before calling this
95 * for the first time.
97 virtual Xapian::doccount get_termfreq_est() const = 0;
99 /** An upper bound on the number of documents this object can return.
101 * Xapian will always call init() on a PostingSource before calling this
102 * for the first time.
104 virtual Xapian::doccount get_termfreq_max() const = 0;
106 /** Specify an upper bound on what get_weight() will return from now on.
108 * This upper bound is used by the matcher to perform various
109 * optimisations, so if you can return a good bound, then matches
110 * will generally run faster.
112 * This method should be called after calling init(), and may be called
113 * during iteration if the upper bound drops. It is probably only useful
114 * to call from subclasses (it was actually a "protected" method prior to
115 * Xapian 1.3.4, but that makes it tricky to wrap for other languages).
117 * It is valid for the posting source to have returned a higher value from
118 * get_weight() earlier in the iteration, but the posting source must not
119 * return a higher value from get_weight() than the currently set upper
120 * bound, and the upper bound must not be increased (until init() has been
121 * called).
123 * If you don't call this method, the upper bound will default to 0, for
124 * convenience when implementing "weight-less" PostingSource subclasses.
126 * @param max_weight The upper bound to set.
128 void set_maxweight(double max_weight);
130 /// Return the currently set upper bound on what get_weight() can return.
131 double XAPIAN_NOTHROW(get_maxweight() const) { return max_weight_; }
133 /** Return the weight contribution for the current document.
135 * This default implementation always returns 0, for convenience when
136 * implementing "weight-less" PostingSource subclasses.
138 * This method may assume that it will only be called when there is a
139 * "current document". In detail: Xapian will always call init() on a
140 * PostingSource before calling this for the first time. It will also
141 * only call this if the PostingSource reports that it is pointing to a
142 * valid document (ie, it will not call it before calling at least one of
143 * next(), skip_to() or check(), and will ensure that the PostingSource is
144 * not at the end by calling at_end()).
146 virtual double get_weight() const;
148 /** Return the current docid.
150 * This method may assume that it will only be called when there is a
151 * "current document". See @a get_weight() for details.
153 * Note: in the case of a multi-database search, the returned docid should
154 * be in the single subdatabase relevant to this posting source. See the
155 * @a init() method for details.
157 virtual Xapian::docid get_docid() const = 0;
159 /** Advance the current position to the next matching document.
161 * The PostingSource starts before the first entry in the list, so next(),
162 * skip_to() or check() must be called before any methods which need the
163 * context of the current position.
165 * Xapian will always call init() on a PostingSource before calling this
166 * for the first time.
168 * @param min_wt The minimum weight contribution that is needed (this is
169 * just a hint which subclasses may ignore).
171 virtual void next(double min_wt) = 0;
173 /** Advance to the specified docid.
175 * If the specified docid isn't in the list, position ourselves on the
176 * first document after it (or at_end() if no greater docids are present).
178 * If the current position is already the specified docid, this method will
179 * leave the position unmodified.
181 * If the specified docid is earlier than the current position, the
182 * behaviour is unspecified. A sensible behaviour would be to leave the
183 * current position unmodified, but it is also reasonable to move to the
184 * specified docid.
186 * The default implementation calls next() repeatedly, which works but
187 * skip_to() can often be implemented much more efficiently.
189 * Xapian will always call init() on a PostingSource before calling this
190 * for the first time.
192 * Note: in the case of a multi-database search, the docid specified is
193 * the docid in the single subdatabase relevant to this posting source.
194 * See the @a init() method for details.
196 * @param did The document id to advance to.
197 * @param min_wt The minimum weight contribution that is needed (this is
198 * just a hint which subclasses may ignore).
200 virtual void skip_to(Xapian::docid did, double min_wt);
202 /** Check if the specified docid occurs.
204 * The caller is required to ensure that the specified document id @a did
205 * actually exists in the database. If @a did is present in this posting
206 * source, this method must move to it and return true. If not, it may either:
208 * - return true, having moved to a definite position (including
209 * "at_end"), which must be the same position as skip_to() would have
210 * moved to.
212 * or
214 * - return false, having moved to an "indeterminate" position, such that
215 * a subsequent call to next() or skip_to() will move to the next
216 * matching position after @a did.
218 * Generally, this method should act like skip_to() and return true if
219 * that can be done at little extra cost.
221 * Otherwise it should simply check if a particular docid is present,
222 * returning true if it is, and false if it isn't.
224 * The default implementation calls skip_to() and always returns true.
226 * Xapian will always call init() on a PostingSource before calling this
227 * for the first time.
229 * Note: in the case of a multi-database search, the docid specified is
230 * the docid in the single subdatabase relevant to this posting source.
231 * See the @a init() method for details.
233 * @param did The document id to check.
234 * @param min_wt The minimum weight contribution that is needed (this is
235 * just a hint which subclasses may ignore).
237 virtual bool check(Xapian::docid did, double min_wt);
239 /** Return true if the current position is past the last entry in this list.
241 * At least one of @a next(), @a skip_to() or @a check() will be called
242 * before this method is first called.
244 virtual bool at_end() const = 0;
246 /** Clone the posting source.
248 * The clone should inherit the configuration of the parent, but need not
249 * inherit the state. ie, the clone does not need to be in the same
250 * iteration position as the original: the matcher will always call
251 * init() on the clone before attempting to move the iterator, or read
252 * the information about the current position of the iterator.
254 * This may return NULL to indicate that cloning is not supported. In
255 * this case, the PostingSource may only be used with a single-database
256 * search.
258 * The default implementation returns NULL.
260 * Note that the returned object will be deallocated by Xapian after use
261 * with "delete". If you want to handle the deletion in a special way
262 * (for example when wrapping the Xapian API for use from another
263 * language) then you can define a static <code>operator delete</code>
264 * method in your subclass as shown here:
265 * https://trac.xapian.org/ticket/554#comment:1
267 virtual PostingSource * clone() const;
269 /** Name of the posting source class.
271 * This is used when serialising and unserialising posting sources; for
272 * example, for performing remote searches.
274 * If the subclass is in a C++ namespace, the namespace should be included
275 * in the name, using "::" as a separator. For example, for a
276 * PostingSource subclass called "FooPostingSource" in the "Xapian"
277 * namespace the result of this call should be "Xapian::FooPostingSource".
279 * This should only be implemented if serialise() and unserialise() are
280 * also implemented. The default implementation returns an empty string.
282 * If this returns an empty string, Xapian will assume that serialise()
283 * and unserialise() are not implemented.
285 virtual std::string name() const;
287 /** Serialise object parameters into a string.
289 * The serialised parameters should represent the configuration of the
290 * posting source, but need not (indeed, should not) represent the current
291 * iteration state.
293 * If you don't want to support the remote backend, you can use the
294 * default implementation which simply throws Xapian::UnimplementedError.
296 virtual std::string serialise() const;
298 /** Create object given string serialisation returned by serialise().
300 * Note that the returned object will be deallocated by Xapian after use
301 * with "delete". If you want to handle the deletion in a special way
302 * (for example when wrapping the Xapian API for use from another
303 * language) then you can define a static <code>operator delete</code>
304 * method in your subclass as shown here:
305 * https://trac.xapian.org/ticket/554#comment:1
307 * If you don't want to support the remote backend, you can use the
308 * default implementation which simply throws Xapian::UnimplementedError.
310 * @param serialised A serialised instance of this PostingSource subclass.
312 virtual PostingSource * unserialise(const std::string &serialised) const;
314 /** Create object given string serialisation returned by serialise().
316 * Note that the returned object will be deallocated by Xapian after use
317 * with "delete". If you want to handle the deletion in a special way
318 * (for example when wrapping the Xapian API for use from another
319 * language) then you can define a static <code>operator delete</code>
320 * method in your subclass as shown here:
321 * https://trac.xapian.org/ticket/554#comment:1
323 * This method is supplied with a Registry object, which can be used when
324 * unserialising objects contained within the posting source. The default
325 * implementation simply calls unserialise() which doesn't take the
326 * Registry object, so you do not need to implement this method unless you
327 * want to take advantage of the Registry object when unserialising.
329 * @param serialised A serialised instance of this PostingSource subclass.
330 * @param registry The Xapian::Registry object to use.
332 virtual PostingSource * unserialise_with_registry(const std::string &serialised,
333 const Registry & registry) const;
335 /** Set this PostingSource to the start of the list of postings.
337 * This is called automatically by the matcher prior to each query being
338 * processed.
340 * If a PostingSource is used for multiple searches, @a init() will
341 * therefore be called multiple times, and must handle this by using the
342 * database passed in the most recent call.
344 * @param db The database which the PostingSource should iterate through.
346 * Note: in the case of a multi-database search, a separate PostingSource
347 * will be used for each database (the separate PostingSources will be
348 * obtained using @a clone()), and each PostingSource will be passed one of
349 * the sub-databases as the @a db parameter here. The @a db parameter
350 * will therefore always refer to a single database. All docids passed
351 * to, or returned from, the PostingSource refer to docids in that single
352 * database, rather than in the multi-database.
354 virtual void init(const Database & db) = 0;
356 /** Return a string describing this object.
358 * This default implementation returns a generic answer. This default
359 * is provided to avoid forcing those deriving their own PostingSource
360 * subclass to implement this (they may not care what
361 * get_description() gives for their subclass).
363 virtual std::string get_description() const;
365 /** Start reference counting this object.
367 * You can hand ownership of a dynamically allocated PostingSource
368 * object to Xapian by calling release() and then passing the object to a
369 * Xapian method. Xapian will arrange to delete the object once it is no
370 * longer required.
372 PostingSource * release() {
373 opt_intrusive_base::release();
374 return this;
377 /** Start reference counting this object.
379 * You can hand ownership of a dynamically allocated PostingSource
380 * object to Xapian by calling release() and then passing the object to a
381 * Xapian method. Xapian will arrange to delete the object once it is no
382 * longer required.
384 const PostingSource * release() const {
385 opt_intrusive_base::release();
386 return this;
391 /** A posting source which generates weights from a value slot.
393 * This is a base class for classes which generate weights using values stored
394 * in the specified slot. For example, ValueWeightPostingSource uses
395 * sortable_unserialise to convert values directly to weights.
397 * The upper bound on the weight returned is set to DBL_MAX. Subclasses
398 * should call set_maxweight() in their init() methods after calling
399 * ValuePostingSource::init() if they know a tighter bound on the weight.
401 class XAPIAN_VISIBILITY_DEFAULT ValuePostingSource : public PostingSource {
402 Xapian::Database db;
404 Xapian::valueno slot;
406 Xapian::ValueIterator value_it;
408 bool started;
410 Xapian::doccount termfreq_min;
412 Xapian::doccount termfreq_est;
414 Xapian::doccount termfreq_max;
416 public:
417 /** Construct a ValuePostingSource.
419 * @param slot_ The value slot to read values from.
421 explicit XAPIAN_NOTHROW(ValuePostingSource(Xapian::valueno slot_))
422 : slot(slot_) {}
424 Xapian::doccount get_termfreq_min() const;
425 Xapian::doccount get_termfreq_est() const;
426 Xapian::doccount get_termfreq_max() const;
428 void next(double min_wt);
429 void skip_to(Xapian::docid min_docid, double min_wt);
430 bool check(Xapian::docid min_docid, double min_wt);
432 bool at_end() const;
434 Xapian::docid get_docid() const;
436 void init(const Database & db_);
438 /** The database we're reading values from.
440 * Added in 1.2.23 and 1.3.5.
442 Xapian::Database get_database() const { return db; }
444 /** The slot we're reading values from.
446 * Added in 1.2.23 and 1.3.5.
448 Xapian::valueno get_slot() const { return slot; }
450 /** Read current value.
452 * Added in 1.2.23 and 1.3.5.
454 std::string get_value() const { return *value_it; }
456 /** End the iteration.
458 * Calls to at_end() will return true after calling this method.
460 * Added in 1.2.23 and 1.3.5.
462 void done() {
463 value_it = db.valuestream_end(slot);
464 started = true;
467 /** Flag indicating if we've started (true if we have).
469 * Added in 1.2.23 and 1.3.5.
471 bool get_started() const { return started; }
473 /** Set a lower bound on the term frequency.
475 * Subclasses should set this if they are overriding the next(), skip_to()
476 * or check() methods to return fewer documents.
478 * Added in 1.2.23 and 1.3.5.
480 void set_termfreq_min(Xapian::doccount termfreq_min_) {
481 termfreq_min = termfreq_min_;
484 /** Set an estimate of the term frequency.
486 * Subclasses should set this if they are overriding the next(), skip_to()
487 * or check() methods.
489 * Added in 1.2.23 and 1.3.5.
491 void set_termfreq_est(Xapian::doccount termfreq_est_) {
492 termfreq_est = termfreq_est_;
495 /** Set an upper bound on the term frequency.
497 * Subclasses should set this if they are overriding the next(), skip_to()
498 * or check() methods.
500 * Added in 1.2.23 and 1.3.5.
502 void set_termfreq_max(Xapian::doccount termfreq_max_) {
503 termfreq_max = termfreq_max_;
506 std::string get_description() const;
510 /** A posting source which reads weights from a value slot.
512 * This returns entries for all documents in the given database which have a
513 * non-empty value in the specified slot. It returns a weight calculated by
514 * applying sortable_unserialise to the value stored in the slot (so the
515 * values stored should probably have been calculated by applying
516 * sortable_serialise to a floating point number at index time).
518 * The upper bound on the weight returned is set using the upper bound on the
519 * values in the specified slot, or DBL_MAX if value bounds aren't supported
520 * by the current backend.
522 * For efficiency, this posting source doesn't check that the stored values
523 * are valid in any way, so it will never raise an exception due to invalid
524 * stored values. In particular, it doesn't ensure that the unserialised
525 * values are non-negative, which is a requirement for weights. The behaviour if
526 * the slot contains values which unserialise to negative values is undefined.
528 class XAPIAN_VISIBILITY_DEFAULT ValueWeightPostingSource
529 : public ValuePostingSource {
530 public:
531 /** Construct a ValueWeightPostingSource.
533 * @param slot_ The value slot to read values from.
535 explicit ValueWeightPostingSource(Xapian::valueno slot_);
537 double get_weight() const;
538 ValueWeightPostingSource * clone() const;
539 std::string name() const;
540 std::string serialise() const;
541 ValueWeightPostingSource * unserialise(const std::string &serialised) const;
542 void init(const Database & db_);
544 std::string get_description() const;
548 /** Read weights from a value which is known to decrease as docid increases.
550 * This posting source can be used, like ValueWeightPostingSource, to add a
551 * weight contribution to a query based on the values stored in a slot. The
552 * values in the slot must be serialised as by @a sortable_serialise().
554 * However, this posting source is additionally given a range of document IDs,
555 * within which the weight is known to be decreasing. ie, for all documents
556 * with ids A and B within this range (including the endpoints), where A is
557 * less than B, the weight of A is greater than or equal to the weight of B.
558 * This can allow the posting source to skip to the end of the range quickly
559 * once the weight at the current position is lower than the minimum weight
560 * contribution that is needed.
562 * By default, the range is assumed to cover all document IDs.
564 * The ordering property can be arranged at index time, or by sorting an
565 * indexed database to produce a new, sorted, database.
567 class XAPIAN_VISIBILITY_DEFAULT DecreasingValueWeightPostingSource
568 : public Xapian::ValueWeightPostingSource {
569 protected:
570 /** Start of range of docids for which weights are known to be decreasing.
572 * 0 => first docid.
574 Xapian::docid range_start;
576 /** End of range of docids for which weights are known to be decreasing.
578 * 0 => last docid.
580 Xapian::docid range_end;
582 /// Weight at current position.
583 double curr_weight;
585 /// Flag, set to true if there are docs after the end of the range.
586 bool items_at_end;
588 /// Skip the iterator forward if in the decreasing range, and weight is low.
589 void skip_if_in_range(double min_wt);
591 public:
592 /** Construct a DecreasingValueWeightPostingSource.
594 * @param slot_ The value slot to read values from.
595 * @param range_start_ Start of range of docids for which weights are
596 * known to be decreasing (default: first docid)
597 * @param range_end_ End of range of docids for which weights are
598 * known to be decreasing (default: last docid)
600 DecreasingValueWeightPostingSource(Xapian::valueno slot_,
601 Xapian::docid range_start_ = 0,
602 Xapian::docid range_end_ = 0);
604 double get_weight() const;
605 DecreasingValueWeightPostingSource * clone() const;
606 std::string name() const;
607 std::string serialise() const;
608 DecreasingValueWeightPostingSource * unserialise(const std::string &serialised) const;
609 void init(const Xapian::Database & db_);
611 void next(double min_wt);
612 void skip_to(Xapian::docid min_docid, double min_wt);
613 bool check(Xapian::docid min_docid, double min_wt);
615 std::string get_description() const;
619 /** A posting source which looks up weights in a map using values as the key.
621 * This will return entries for all documents in the given database
622 * which have a value in the slot specified. The values will be mapped to the
623 * corresponding weight in the weight map. If there is no mapping for a
624 * particular value, the default weight will be returned (which itself
625 * defaults to 0.0).
627 class XAPIAN_VISIBILITY_DEFAULT ValueMapPostingSource
628 : public ValuePostingSource {
629 /// The default weight
630 double default_weight;
632 /// The maximum weight in weight_map.
633 double max_weight_in_map;
635 /// The value -> weight map
636 std::map<std::string, double> weight_map;
638 public:
639 /** Construct a ValueMapPostingSource.
641 * @param slot_ The value slot to read values from.
643 explicit ValueMapPostingSource(Xapian::valueno slot_);
645 /** Add a mapping.
647 * @param key The key looked up from the value slot.
648 * @param wt The weight to give this key.
650 void add_mapping(const std::string &key, double wt);
652 /** Clear all mappings. */
653 void clear_mappings();
655 /** Set a default weight for document values not in the map.
657 * @param wt The weight to set as the default.
659 void set_default_weight(double wt);
661 double get_weight() const;
662 ValueMapPostingSource * clone() const;
663 std::string name() const;
664 std::string serialise() const;
665 ValueMapPostingSource * unserialise(const std::string &serialised) const;
666 void init(const Database & db_);
668 std::string get_description() const;
672 /** A posting source which returns a fixed weight for all documents.
674 * This returns entries for all documents in the given database, with a fixed
675 * weight (specified by a parameter to the constructor).
677 class XAPIAN_VISIBILITY_DEFAULT FixedWeightPostingSource : public PostingSource {
678 /// The database we're reading documents from.
679 Xapian::Database db;
681 /// Number of documents in the posting source.
682 Xapian::doccount termfreq;
684 /// Iterator over all documents.
685 Xapian::PostingIterator it;
687 /// Flag indicating if we've started (true if we have).
688 bool started;
690 /// The docid last passed to check() (0 if check() wasn't the last move).
691 Xapian::docid check_docid;
693 public:
694 /** Construct a FixedWeightPostingSource.
696 * @param wt The fixed weight to return.
698 explicit FixedWeightPostingSource(double wt);
700 Xapian::doccount get_termfreq_min() const;
701 Xapian::doccount get_termfreq_est() const;
702 Xapian::doccount get_termfreq_max() const;
704 double get_weight() const;
706 void next(double min_wt);
707 void skip_to(Xapian::docid min_docid, double min_wt);
708 bool check(Xapian::docid min_docid, double min_wt);
710 bool at_end() const;
712 Xapian::docid get_docid() const;
714 FixedWeightPostingSource * clone() const;
715 std::string name() const;
716 std::string serialise() const;
717 FixedWeightPostingSource * unserialise(const std::string &serialised) const;
718 void init(const Database & db_);
720 std::string get_description() const;
725 #endif // XAPIAN_INCLUDED_POSTINGSOURCE_H