/** @file weight.h
 * @brief Weighting scheme API.
 */
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
6 * Copyright (C) 2013,2014 Aarsh Shah
7 * Copyright (C) 2016 Vivek Pal
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef XAPIAN_INCLUDED_WEIGHT_H
#define XAPIAN_INCLUDED_WEIGHT_H

#include <string>

#include <xapian/types.h>
#include <xapian/visibility.h>
34 /** Abstract base class for weighting schemes. */
35 class XAPIAN_VISIBILITY_DEFAULT Weight
{
37 /// Stats which the weighting scheme can use (see @a need_stat()).
39 /// Number of documents in the collection.
41 /// Number of documents in the RSet.
43 /// Average length of documents in the collection.
45 /// How many documents the current term is in.
47 /// How many documents in the RSet the current term is in.
49 /// Sum of wqf for terms in the query.
51 /// Within-query-frequency of the current term.
53 /// Within-document-frequency of the current term in the current document.
55 /// Length of the current document (sum wdf).
57 /// Lower bound on (non-zero) document lengths.
59 /// Upper bound on document lengths.
60 DOC_LENGTH_MAX
= 1024,
61 /// Upper bound on wdf.
63 /// Sum of wdf over the whole collection for the current term.
64 COLLECTION_FREQ
= 4096,
65 /// Number of unique terms in the current document.
69 /** Tell Xapian that your subclass will want a particular statistic.
71 * Some of the statistics can be costly to fetch or calculate, so
72 * Xapian needs to know which are actually going to be used. You
73 * should call need_stat() from your constructor for each such
76 * @param flag The stat_flags value for a required statistic.
78 void need_stat(stat_flags flag
) {
79 stats_needed
= stat_flags(stats_needed
| flag
);
82 /** Allow the subclass to perform any initialisation it needs to.
84 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
85 * If the Weight object is for the term-independent
86 * weight supplied by get_sumextra()/get_maxextra(),
87 * then init(0.0) is called (starting from Xapian
88 * 1.2.11 and 1.3.1 - earlier versions failed to
89 * call init() for such Weight objects).
91 virtual void init(double factor
) = 0;
94 /// Don't allow assignment.
95 void operator=(const Weight
&);
97 /// A bitmask of the statistics this weighting scheme needs.
98 stat_flags stats_needed
;
100 /// The number of documents in the collection.
101 Xapian::doccount collection_size_
;
103 /// The number of documents marked as relevant.
104 Xapian::doccount rset_size_
;
106 /// The average length of a document in the collection.
107 Xapian::doclength average_length_
;
109 /// The number of documents which this term indexes.
110 Xapian::doccount termfreq_
;
112 // The collection frequency of the term.
113 Xapian::termcount collectionfreq_
;
115 /// The number of relevant documents which this term indexes.
116 Xapian::doccount reltermfreq_
;
118 /// The length of the query.
119 Xapian::termcount query_length_
;
121 /// The within-query-frequency of this term.
122 Xapian::termcount wqf_
;
124 /// A lower bound on the minimum length of any document in the database.
125 Xapian::termcount doclength_lower_bound_
;
127 /// An upper bound on the maximum length of any document in the database.
128 Xapian::termcount doclength_upper_bound_
;
130 /// An upper bound on the wdf of this term.
131 Xapian::termcount wdf_upper_bound_
;
135 /// Default constructor, needed by subclass constructors.
136 Weight() : stats_needed() { }
138 /** Type of smoothing to use with the Language Model Weighting scheme.
140 * Default is TWO_STAGE_SMOOTHING.
143 TWO_STAGE_SMOOTHING
= 1,
144 DIRICHLET_SMOOTHING
= 2,
145 ABSOLUTE_DISCOUNT_SMOOTHING
= 3,
146 JELINEK_MERCER_SMOOTHING
= 4,
147 DIRICHLET_PLUS_SMOOTHING
= 5
152 /** Virtual destructor, because we have virtual methods. */
155 /** Clone this object.
157 * This method allocates and returns a copy of the object it is called on.
159 * If your subclass is called FooWeight and has parameters a and b, then
160 * you would implement FooWeight::clone() like so:
162 * FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
164 * Note that the returned object will be deallocated by Xapian after use
165 * with "delete". If you want to handle the deletion in a special way
166 * (for example when wrapping the Xapian API for use from another
167 * language) then you can define a static <code>operator delete</code>
168 * method in your subclass as shown here:
169 * https://trac.xapian.org/ticket/554#comment:1
171 virtual Weight
* clone() const = 0;
173 /** Return the name of this weighting scheme.
175 * This name is used by the remote backend. It is passed along with the
176 * serialised parameters to the remote server so that it knows which class
179 * Return the full namespace-qualified name of your class here - if
180 * your class is called FooWeight, return "FooWeight" from this method
181 * (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
183 * If you don't want to support the remote backend, you can use the
184 * default implementation which simply returns an empty string.
186 virtual std::string
name() const;
188 /** Return this object's parameters serialised as a single string.
190 * If you don't want to support the remote backend, you can use the
191 * default implementation which simply throws Xapian::UnimplementedError.
193 virtual std::string
serialise() const;
195 /** Unserialise parameters.
197 * This method unserialises parameters serialised by the @a serialise()
198 * method and allocates and returns a new object initialised with them.
200 * If you don't want to support the remote backend, you can use the
201 * default implementation which simply throws Xapian::UnimplementedError.
203 * Note that the returned object will be deallocated by Xapian after use
204 * with "delete". If you want to handle the deletion in a special way
205 * (for example when wrapping the Xapian API for use from another
206 * language) then you can define a static <code>operator delete</code>
207 * method in your subclass as shown here:
208 * https://trac.xapian.org/ticket/554#comment:1
210 * @param serialised A string containing the serialised parameters.
212 virtual Weight
* unserialise(const std::string
& serialised
) const;
214 /** Calculate the weight contribution for this object's term to a document.
216 * The parameters give information about the document which may be used
217 * in the calculations:
219 * @param wdf The within document frequency of the term in the document.
220 * @param doclen The document's length (unnormalised).
221 * @param uniqterms Number of unique terms in the document (used
222 * for absolute smoothing).
224 virtual double get_sumpart(Xapian::termcount wdf
,
225 Xapian::termcount doclen
,
226 Xapian::termcount uniqterms
) const = 0;
228 /** Return an upper bound on what get_sumpart() can return for any document.
230 * This information is used by the matcher to perform various
231 * optimisations, so strive to make the bound as tight as possible.
233 virtual double get_maxpart() const = 0;
235 /** Calculate the term-independent weight component for a document.
237 * The parameter gives information about the document which may be used
238 * in the calculations:
240 * @param doclen The document's length (unnormalised).
241 * @param uniqterms The number of unique terms in the document.
243 virtual double get_sumextra(Xapian::termcount doclen
,
244 Xapian::termcount uniqterms
) const = 0;
246 /** Return an upper bound on what get_sumextra() can return for any
249 * This information is used by the matcher to perform various
250 * optimisations, so strive to make the bound as tight as possible.
252 virtual double get_maxextra() const = 0;
254 /** @private @internal Initialise this object to calculate weights for term
257 * @param stats Source of statistics.
258 * @param query_len_ Query length.
259 * @param term The term for the new object.
260 * @param wqf_ The within-query-frequency of @a term.
261 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
263 void init_(const Internal
& stats
, Xapian::termcount query_len_
,
264 const std::string
& term
, Xapian::termcount wqf_
,
267 /** @private @internal Initialise this object to calculate weights for a
270 * @param stats Source of statistics.
271 * @param query_len_ Query length.
272 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
273 * @param termfreq The termfreq to use.
274 * @param reltermfreq The reltermfreq to use.
275 * @param collection_freq The collection frequency to use.
277 void init_(const Internal
& stats
, Xapian::termcount query_len_
,
278 double factor
, Xapian::doccount termfreq
,
279 Xapian::doccount reltermfreq
, Xapian::termcount collection_freq
);
281 /** @private @internal Initialise this object to calculate the extra weight
284 * @param stats Source of statistics.
285 * @param query_len_ Query length.
287 void init_(const Internal
& stats
, Xapian::termcount query_len_
);
289 /** @private @internal Return true if the document length is needed.
291 * If this method returns true, then the document length will be fetched
292 * and passed to @a get_sumpart(). Otherwise 0 may be passed for the
295 bool get_sumpart_needs_doclength_() const {
296 return stats_needed
& DOC_LENGTH
;
299 /** @private @internal Return true if the WDF is needed.
301 * If this method returns true, then the WDF will be fetched and passed to
302 * @a get_sumpart(). Otherwise 0 may be passed for the wdf.
304 bool get_sumpart_needs_wdf_() const {
305 return stats_needed
& WDF
;
308 /** @private @internal Return true if the number of unique terms is needed.
310 * If this method returns true, then the number of unique terms will be
311 * fetched and passed to @a get_sumpart(). Otherwise 0 may be passed for
312 * the number of unique terms.
314 bool get_sumpart_needs_uniqueterms_() const {
315 return stats_needed
& UNIQUE_TERMS
;
319 /** Don't allow copying.
321 * This would ideally be private, but that causes a compilation error
322 * with GCC 4.1 (which appears to be a bug).
324 Weight(const Weight
&);
326 /// The number of documents in the collection.
327 Xapian::doccount
get_collection_size() const { return collection_size_
; }
329 /// The number of documents marked as relevant.
330 Xapian::doccount
get_rset_size() const { return rset_size_
; }
332 /// The average length of a document in the collection.
333 Xapian::doclength
get_average_length() const { return average_length_
; }
335 /// The number of documents which this term indexes.
336 Xapian::doccount
get_termfreq() const { return termfreq_
; }
338 /// The number of relevant documents which this term indexes.
339 Xapian::doccount
get_reltermfreq() const { return reltermfreq_
; }
341 /// The collection frequency of the term.
342 Xapian::termcount
get_collection_freq() const { return collectionfreq_
; }
344 /// The length of the query.
345 Xapian::termcount
get_query_length() const { return query_length_
; }
347 /// The within-query-frequency of this term.
348 Xapian::termcount
get_wqf() const { return wqf_
; }
350 /** An upper bound on the maximum length of any document in the database.
352 * This should only be used by get_maxpart() and get_maxextra().
354 Xapian::termcount
get_doclength_upper_bound() const {
355 return doclength_upper_bound_
;
358 /** A lower bound on the minimum length of any document in the database.
360 * This bound does not include any zero-length documents.
362 * This should only be used by get_maxpart() and get_maxextra().
364 Xapian::termcount
get_doclength_lower_bound() const {
365 return doclength_lower_bound_
;
368 /** An upper bound on the wdf of this term.
370 * This should only be used by get_maxpart() and get_maxextra().
372 Xapian::termcount
get_wdf_upper_bound() const {
373 return wdf_upper_bound_
;
377 /** Class implementing a "boolean" weighting scheme.
379 * This weighting scheme gives all documents zero weight.
381 class XAPIAN_VISIBILITY_DEFAULT BoolWeight
: public Weight
{
382 BoolWeight
* clone() const;
384 void init(double factor
);
387 /** Construct a BoolWeight. */
390 std::string
name() const;
392 std::string
serialise() const;
393 BoolWeight
* unserialise(const std::string
& serialised
) const;
395 double get_sumpart(Xapian::termcount wdf
,
396 Xapian::termcount doclen
,
397 Xapian::termcount uniqterms
) const;
398 double get_maxpart() const;
400 double get_sumextra(Xapian::termcount doclen
,
401 Xapian::termcount uniqterms
) const;
402 double get_maxextra() const;
405 /// Xapian::Weight subclass implementing the tf-idf weighting scheme.
406 class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight
: public Weight
{
407 /* Three character string indicating the normalizations for tf(wdf), idf and
409 std::string normalizations
;
411 /// The factor to multiply with the weight.
414 TfIdfWeight
* clone() const;
416 void init(double factor
);
418 /* When additional normalizations are implemented in the future, the additional statistics for them
419 should be accessed by these functions. */
420 double get_wdfn(Xapian::termcount wdf
, char c
) const;
421 double get_idfn(Xapian::doccount termfreq
, char c
) const;
422 double get_wtn(double wt
, char c
) const;
425 /** Construct a TfIdfWeight
427 * @param normalizations A three character string indicating the
428 * normalizations to be used for the tf(wdf), idf
429 * and document weight. (default: "ntn")
431 * The @a normalizations string works like so:
433 * @li The first character specifies the normalization for the wdf. The
434 * following normalizations are currently supported:
436 * @li 'n': None. wdfn=wdf
437 * @li 'b': Boolean wdfn=1 if term in document else wdfn=0
438 * @li 's': Square wdfn=wdf*wdf
439 * @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
441 * The Max-wdf and Augmented Max wdf normalizations haven't yet been
444 * @li The second character indicates the normalization for the idf. The
445 * following normalizations are currently supported:
447 * @li 'n': None idfn=1
448 * @li 't': TfIdf idfn=log(N/Termfreq) where N is the number of
449 * documents in collection and Termfreq is the number of documents
450 * which are indexed by the term t.
451 * @li 'p': Prob idfn=log((N-Termfreq)/Termfreq)
452 * @li 'f': Freq idfn=1/Termfreq
453 * @li 's': Squared idfn=log(N/Termfreq)^2
455 * @li The third and the final character indicates the normalization for
456 * the document weight. The following normalizations are currently
459 * @li 'n': None wtn=tfn*idfn
461 * Implementing support for more normalizations of each type would require
462 * extending the backend to track more statistics.
464 explicit TfIdfWeight(const std::string
&normalizations
);
466 /** Construct a TfIdfWeight using the default normalizations ("ntn"). */
468 : normalizations("ntn")
473 need_stat(COLLECTION_SIZE
);
476 std::string
name() const;
478 std::string
serialise() const;
479 TfIdfWeight
* unserialise(const std::string
& serialised
) const;
481 double get_sumpart(Xapian::termcount wdf
,
482 Xapian::termcount doclen
,
483 Xapian::termcount uniqterm
) const;
484 double get_maxpart() const;
486 double get_sumextra(Xapian::termcount doclen
,
487 Xapian::termcount uniqterms
) const;
488 double get_maxextra() const;
492 /// Xapian::Weight subclass implementing the BM25 probabilistic formula.
493 class XAPIAN_VISIBILITY_DEFAULT BM25Weight
: public Weight
{
494 /// Factor to multiply the document length by.
495 mutable Xapian::doclength len_factor
;
497 /// Factor combining all the document independent factors.
498 mutable double termweight
;
500 /// The BM25 parameters.
501 double param_k1
, param_k2
, param_k3
, param_b
;
503 /// The minimum normalised document length value.
504 Xapian::doclength param_min_normlen
;
506 BM25Weight
* clone() const;
508 void init(double factor
);
511 /** Construct a BM25Weight.
513 * @param k1 A non-negative parameter controlling how influential
514 * within-document-frequency (wdf) is. k1=0 means that
515 * wdf doesn't affect the weights. The larger k1 is, the more
516 * wdf influences the weights. (default 1)
518 * @param k2 A non-negative parameter which controls the strength of a
519 * correction factor which depends upon query length and
520 * normalised document length. k2=0 disable this factor; larger
521 * k2 makes it stronger. (default 0)
523 * @param k3 A non-negative parameter controlling how influential
524 * within-query-frequency (wqf) is. k3=0 means that wqf
525 * doesn't affect the weights. The larger k3 is, the more
526 * wqf influences the weights. (default 1)
528 * @param b A parameter between 0 and 1, controlling how strong the
529 * document length normalisation of wdf is. 0 means no
530 * normalisation; 1 means full normalisation. (default 0.5)
532 * @param min_normlen A parameter specifying a minimum value for
533 * normalised document length. Normalised document length
534 * values less than this will be clamped to this value, helping
535 * to prevent very short documents getting large weights.
538 BM25Weight(double k1
, double k2
, double k3
, double b
, double min_normlen
)
539 : param_k1(k1
), param_k2(k2
), param_k3(k3
), param_b(b
),
540 param_min_normlen(min_normlen
)
542 if (param_k1
< 0) param_k1
= 0;
543 if (param_k2
< 0) param_k2
= 0;
544 if (param_k3
< 0) param_k3
= 0;
547 } else if (param_b
> 1) {
550 need_stat(COLLECTION_SIZE
);
551 need_stat(RSET_SIZE
);
553 need_stat(RELTERMFREQ
);
556 if (param_k2
!= 0 || (param_k1
!= 0 && param_b
!= 0)) {
557 need_stat(DOC_LENGTH_MIN
);
558 need_stat(AVERAGE_LENGTH
);
560 if (param_k1
!= 0 && param_b
!= 0) need_stat(DOC_LENGTH
);
561 if (param_k2
!= 0) need_stat(QUERY_LENGTH
);
562 if (param_k3
!= 0) need_stat(WQF
);
566 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
567 param_min_normlen(0.5)
569 need_stat(COLLECTION_SIZE
);
570 need_stat(RSET_SIZE
);
572 need_stat(RELTERMFREQ
);
575 need_stat(DOC_LENGTH_MIN
);
576 need_stat(AVERAGE_LENGTH
);
577 need_stat(DOC_LENGTH
);
581 std::string
name() const;
583 std::string
serialise() const;
584 BM25Weight
* unserialise(const std::string
& serialised
) const;
586 double get_sumpart(Xapian::termcount wdf
,
587 Xapian::termcount doclen
,
588 Xapian::termcount uniqterm
) const;
589 double get_maxpart() const;
591 double get_sumextra(Xapian::termcount doclen
,
592 Xapian::termcount uniqterms
) const;
593 double get_maxextra() const;
596 /// Xapian::Weight subclass implementing the BM25+ probabilistic formula.
597 class XAPIAN_VISIBILITY_DEFAULT BM25PlusWeight
: public Weight
{
598 /// Factor to multiply the document length by.
599 mutable Xapian::doclength len_factor
;
601 /// Factor combining all the document independent factors.
602 mutable double termweight
;
604 /// The BM25+ parameters.
605 double param_k1
, param_k2
, param_k3
, param_b
;
607 /// The minimum normalised document length value.
608 Xapian::doclength param_min_normlen
;
610 /// Additional parameter delta in the BM25+ formula.
613 BM25PlusWeight
* clone() const;
615 void init(double factor
);
618 /** Construct a BM25PlusWeight.
620 * @param k1 A non-negative parameter controlling how influential
621 * within-document-frequency (wdf) is. k1=0 means that
622 * wdf doesn't affect the weights. The larger k1 is, the more
623 * wdf influences the weights. (default 1)
625 * @param k2 A non-negative parameter which controls the strength of a
626 * correction factor which depends upon query length and
627 * normalised document length. k2=0 disable this factor; larger
628 * k2 makes it stronger. The paper which describes BM25+
629 * ignores BM25's document-independent component (so implicitly
630 * k2=0), but we support non-zero k2 too. (default 0)
632 * @param k3 A non-negative parameter controlling how influential
633 * within-query-frequency (wqf) is. k3=0 means that wqf
634 * doesn't affect the weights. The larger k3 is, the more
635 * wqf influences the weights. (default 1)
637 * @param b A parameter between 0 and 1, controlling how strong the
638 * document length normalisation of wdf is. 0 means no
639 * normalisation; 1 means full normalisation. (default 0.5)
641 * @param min_normlen A parameter specifying a minimum value for
642 * normalised document length. Normalised document length
643 * values less than this will be clamped to this value, helping
644 * to prevent very short documents getting large weights.
647 * @param delta A parameter for pseudo tf value to control the scale
648 * of the tf lower bound. Delta(δ) can be tuned for example
649 * from 0.0 to 1.5 but BM25+ can still work effectively
650 * across collections with a fixed δ = 1.0. (default 1.0)
652 BM25PlusWeight(double k1
, double k2
, double k3
, double b
,
653 double min_normlen
, double delta
)
654 : param_k1(k1
), param_k2(k2
), param_k3(k3
), param_b(b
),
655 param_min_normlen(min_normlen
), param_delta(delta
)
657 if (param_k1
< 0) param_k1
= 0;
658 if (param_k2
< 0) param_k2
= 0;
659 if (param_k3
< 0) param_k3
= 0;
660 if (param_delta
< 0) param_delta
= 0;
663 } else if (param_b
> 1) {
666 need_stat(COLLECTION_SIZE
);
667 need_stat(RSET_SIZE
);
669 need_stat(RELTERMFREQ
);
672 if (param_k2
!= 0 || (param_k1
!= 0 && param_b
!= 0)) {
673 need_stat(DOC_LENGTH_MIN
);
674 need_stat(AVERAGE_LENGTH
);
676 if (param_k1
!= 0 && param_b
!= 0) need_stat(DOC_LENGTH
);
677 if (param_k2
!= 0) need_stat(QUERY_LENGTH
);
678 if (param_k3
!= 0) need_stat(WQF
);
679 if (param_delta
!= 0) {
680 need_stat(AVERAGE_LENGTH
);
681 need_stat(DOC_LENGTH
);
687 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
688 param_min_normlen(0.5), param_delta(1)
690 need_stat(COLLECTION_SIZE
);
691 need_stat(RSET_SIZE
);
693 need_stat(RELTERMFREQ
);
696 need_stat(DOC_LENGTH_MIN
);
697 need_stat(AVERAGE_LENGTH
);
698 need_stat(DOC_LENGTH
);
702 std::string
name() const;
704 std::string
serialise() const;
705 BM25PlusWeight
* unserialise(const std::string
& serialised
) const;
707 double get_sumpart(Xapian::termcount wdf
,
708 Xapian::termcount doclen
,
709 Xapian::termcount uniqterm
) const;
710 double get_maxpart() const;
712 double get_sumextra(Xapian::termcount doclen
,
713 Xapian::termcount uniqterms
) const;
714 double get_maxextra() const;
717 /** Xapian::Weight subclass implementing the traditional probabilistic formula.
719 * This class implements the "traditional" Probabilistic Weighting scheme, as
720 * described by the early papers on Probabilistic Retrieval. BM25 generally
721 * gives better results.
723 * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
724 * the latter returns weights (k+1) times larger.
726 class XAPIAN_VISIBILITY_DEFAULT TradWeight
: public Weight
{
727 /// Factor to multiply the document length by.
728 mutable Xapian::doclength len_factor
;
730 /// Factor combining all the document independent factors.
731 mutable double termweight
;
733 /// The parameter in the formula.
736 TradWeight
* clone() const;
738 void init(double factor
);
741 /** Construct a TradWeight.
743 * @param k A non-negative parameter controlling how influential
744 * within-document-frequency (wdf) and document length are.
745 * k=0 means that wdf and document length don't affect the
746 * weights. The larger k is, the more they do. (default 1)
748 explicit TradWeight(double k
= 1.0) : param_k(k
) {
749 if (param_k
< 0) param_k
= 0;
750 if (param_k
!= 0.0) {
751 need_stat(AVERAGE_LENGTH
);
752 need_stat(DOC_LENGTH
);
754 need_stat(COLLECTION_SIZE
);
755 need_stat(RSET_SIZE
);
757 need_stat(RELTERMFREQ
);
758 need_stat(DOC_LENGTH_MIN
);
763 std::string
name() const;
765 std::string
serialise() const;
766 TradWeight
* unserialise(const std::string
& serialised
) const;
768 double get_sumpart(Xapian::termcount wdf
,
769 Xapian::termcount doclen
,
770 Xapian::termcount uniqueterms
) const;
771 double get_maxpart() const;
773 double get_sumextra(Xapian::termcount doclen
,
774 Xapian::termcount uniqterms
) const;
775 double get_maxextra() const;
778 /** This class implements the InL2 weighting scheme.
780 * InL2 is a representative scheme of the Divergence from Randomness Framework
783 * This weighting scheme is useful for tasks that require early precision.
785 * It uses the Inverse document frequency model (In), the Laplace method to
786 * find the aftereffect of sampling (L) and the second wdf normalization
787 * proposed by Amati to normalize the wdf in the document to the length of the
790 * For more information about the DFR Framework and the InL2 scheme, please
791 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
792 * models of information retrieval based on measuring the divergence from
793 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
796 class XAPIAN_VISIBILITY_DEFAULT InL2Weight
: public Weight
{
797 /// The wdf normalization parameter in the formula.
800 /// The upper bound on the weight a term can give to a document.
803 /// The constant values which are used on every call to get_sumpart().
804 double wqf_product_idf
;
805 double c_product_avlen
;
807 InL2Weight
* clone() const;
809 void init(double factor
);
812 /** Construct an InL2Weight.
814 * @param c A non-negative and non zero parameter controlling the extent
815 * of the normalization of the wdf to the document length. The
816 * default value of 1 is suitable for longer queries but it may
817 * need to be changed for shorter queries. For more information,
818 * please refer to Gianni Amati's PHD thesis.
820 explicit InL2Weight(double c
);
825 need_stat(AVERAGE_LENGTH
);
826 need_stat(DOC_LENGTH
);
827 need_stat(DOC_LENGTH_MIN
);
828 need_stat(DOC_LENGTH_MAX
);
829 need_stat(COLLECTION_SIZE
);
836 std::string
name() const;
838 std::string
serialise() const;
839 InL2Weight
* unserialise(const std::string
& serialised
) const;
841 double get_sumpart(Xapian::termcount wdf
,
842 Xapian::termcount doclen
,
843 Xapian::termcount uniqterms
) const;
844 double get_maxpart() const;
846 double get_sumextra(Xapian::termcount doclen
,
847 Xapian::termcount uniqterms
) const;
848 double get_maxextra() const;
851 /** This class implements the IfB2 weighting scheme.
853 * IfB2 is a representative scheme of the Divergence from Randomness Framework
856 * It uses the Inverse term frequency model (If), the Bernoulli method to find
857 * the aftereffect of sampling (B) and the second wdf normalization proposed
858 * by Amati to normalize the wdf in the document to the length of the document
861 * For more information about the DFR Framework and the IfB2 scheme, please
862 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
863 * models of information retrieval based on measuring the divergence from
864 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
867 class XAPIAN_VISIBILITY_DEFAULT IfB2Weight
: public Weight
{
868 /// The wdf normalization parameter in the formula.
871 /// The upper bound on the weight.
874 /// The constant values which are used for calculations in get_sumpart().
875 double wqf_product_idf
;
876 double c_product_avlen
;
879 IfB2Weight
* clone() const;
881 void init(double factor
);
884 /** Construct an IfB2Weight.
886 * @param c A non-negative and non zero parameter controlling the extent
887 * of the normalization of the wdf to the document length. The
888 * default value of 1 is suitable for longer queries but it may
889 * need to be changed for shorter queries. For more information,
890 * please refer to Gianni Amati's PHD thesis titled
891 * Probabilistic Models for Information Retrieval based on
892 * Divergence from Randomness.
894 explicit IfB2Weight(double c
);
896 IfB2Weight() : param_c(1.0) {
897 need_stat(AVERAGE_LENGTH
);
898 need_stat(DOC_LENGTH
);
899 need_stat(DOC_LENGTH_MIN
);
900 need_stat(DOC_LENGTH_MAX
);
901 need_stat(COLLECTION_SIZE
);
902 need_stat(COLLECTION_FREQ
);
909 std::string
name() const;
911 std::string
serialise() const;
912 IfB2Weight
* unserialise(const std::string
& serialised
) const;
914 double get_sumpart(Xapian::termcount wdf
,
915 Xapian::termcount doclen
,
916 Xapian::termcount uniqterm
) const;
917 double get_maxpart() const;
919 double get_sumextra(Xapian::termcount doclen
,
920 Xapian::termcount uniqterms
) const;
921 double get_maxextra() const;
924 /** This class implements the IneB2 weighting scheme.
926 * IneB2 is a representative scheme of the Divergence from Randomness
927 * Framework by Gianni Amati.
929 * It uses the Inverse expected document frequency model (Ine), the Bernoulli
930 * method to find the aftereffect of sampling (B) and the second wdf
931 * normalization proposed by Amati to normalize the wdf in the document to the
932 * length of the document (H2).
934 * For more information about the DFR Framework and the IneB2 scheme, please
935 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
936 * models of information retrieval based on measuring the divergence from
937 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
940 class XAPIAN_VISIBILITY_DEFAULT IneB2Weight
: public Weight
{
941 /// The wdf normalization parameter in the formula.
944 /// The upper bound of the weight.
947 /// Constant values used in get_sumpart().
948 double wqf_product_idf
;
949 double c_product_avlen
;
952 IneB2Weight
* clone() const;
954 void init(double factor
);
957 /** Construct an IneB2Weight.
959 * @param c A non-negative and non zero parameter controlling the extent
960 * of the normalization of the wdf to the document length. The
961 * default value of 1 is suitable for longer queries but it may
962 * need to be changed for shorter queries. For more information,
963 * please refer to Gianni Amati's PHD thesis.
965 explicit IneB2Weight(double c
);
967 IneB2Weight() : param_c(1.0) {
968 need_stat(AVERAGE_LENGTH
);
969 need_stat(DOC_LENGTH
);
970 need_stat(DOC_LENGTH_MIN
);
971 need_stat(DOC_LENGTH_MAX
);
972 need_stat(COLLECTION_SIZE
);
976 need_stat(COLLECTION_FREQ
);
980 std::string
name() const;
982 std::string
serialise() const;
983 IneB2Weight
* unserialise(const std::string
& serialised
) const;
985 double get_sumpart(Xapian::termcount wdf
,
986 Xapian::termcount doclen
,
987 Xapian::termcount uniqterms
) const;
988 double get_maxpart() const;
990 double get_sumextra(Xapian::termcount doclen
,
991 Xapian::termcount uniqterms
) const;
992 double get_maxextra() const;
/** This class implements the BB2 weighting scheme.
 *
 *  BB2 is a representative scheme of the Divergence from Randomness Framework.
 *
 *  It uses the Bose-Einstein probabilistic distribution (B) along with
 *  Stirling's power approximation, the Bernoulli method to find the
 *  aftereffect of sampling (B) and the second wdf normalization proposed by
 *  Amati to normalize the wdf in the document to the length of the document.
 *
 *  For more information about the DFR Framework and the BB2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002.
 */
class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations under the comments below, the 'public:' access
    // specifier, some need_stat() calls, and closing braces).  Gaps are
    // flagged below; restore from the canonical xapian/weight.h.

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// The constant values to be used in get_sumpart().
    double c_product_avlen;
    // [gap: further constant member declaration(s) lost from this copy]
    double stirling_constant_1;
    double stirling_constant_2;

    BB2Weight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a BB2Weight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  A
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     */
    explicit BB2Weight(double c);

    /** Construct a BB2Weight using the default parameter (c = 1.0).
     *
     *  Registers the statistics this scheme needs so that init() can
     *  precompute its per-term constants.
     */
    BB2Weight() : param_c(1.0) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(TERMFREQ);
        // [gap: this constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    BB2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the DLH weighting scheme, which is a representative
 *  scheme of the Divergence from Randomness Framework by Gianni Amati.
 *
 *  This is a parameter-free weighting scheme and it should be used with query
 *  expansion to obtain better results.  It uses the HyperGeometric
 *  Probabilistic model and Laplace's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DLH scheme, please
 *  refer to:
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations, the 'public:' access specifier, the default
    // constructor's opening line, and closing braces).  Gaps are flagged
    // below; restore from the canonical xapian/weight.h.

    /// Now unused but left in place in 1.4.x for ABI compatibility.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// The constant value to be used in get_sumpart().
    double log_constant;
    double wqf_product_factor;

    DLHWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier and the default constructor's
    //  opening line lost from this copy; the statement list below is the
    //  default constructor's body]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    DLHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the PL2 weighting scheme.
 *
 *  PL2 is a representative scheme of the Divergence from Randomness Framework.
 *
 *  This weighting scheme is useful for tasks that require early precision.
 *
 *  It uses the Poisson approximation of the Binomial Probabilistic
 *  distribution (P) along with Stirling's approximation for the factorial
 *  value, the Laplace method to find the aftereffect of sampling (L) and the
 *  second wdf normalization proposed by Amati to normalize the wdf in the
 *  document to the length of the document (H2).
 *
 *  For more information about the DFR Framework and the PL2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (all the data member declarations under the comments below, the
    // 'public:' access specifier, some need_stat() calls, and closing
    // braces).  Gaps are flagged below; restore from the canonical
    // xapian/weight.h.

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /** The factor to multiply weights by.
     *
     *  The misleading name is due to this having been used to store a lower
     *  bound in 1.4.0.  We no longer need to store that, and so this member
     *  has been repurposed in 1.4.1 and later (but the name left the same to
     *  ensure ABI compatibility with 1.4.0).
     */
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Constants for a given term in a given query.
    // [gap: member declaration(s) lost from this copy]

    /// Set by init() to (param_c * get_average_length())
    // [gap: member declaration lost from this copy]

    PL2Weight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a PL2Weight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  The
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     */
    explicit PL2Weight(double c);

    /** Construct a PL2Weight using the default parameter (c = 1.0).
     *
     *  Registers the statistics this scheme needs so that init() can
     *  precompute its per-term constants.
     */
    PL2Weight() : param_c(1.0) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: remaining need_stat() call(s) and this constructor's
        //  closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    PL2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/// Xapian::Weight subclass implementing the PL2+ probabilistic formula.
class XAPIAN_VISIBILITY_DEFAULT PL2PlusWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (all the data member declarations under the comments below, the
    // 'public:' access specifier, the default constructor's name line, and
    // closing braces).  Gaps are flagged below; restore from the canonical
    // xapian/weight.h.

    /// The factor to multiply weights by.
    // [gap: member declaration lost from this copy]

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /// Additional parameter delta in the PL2+ weighting formula.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Constants for a given term in a given query.
    // [gap: member declaration(s) lost from this copy]

    /// Set by init() to (param_c * get_average_length())
    // [gap: member declaration lost from this copy]

    /// Set by init() to get_collection_freq() / get_collection_size()
    // [gap: member declaration lost from this copy]

    /// Weight contribution of delta term in the PL2+ function
    // [gap: member declaration lost from this copy]

    PL2PlusWeight * clone() const;

    void init(double factor_);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a PL2PlusWeight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  The
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     *
     * @param delta A parameter for pseudo tf value to control the scale
     *          of the tf lower bound.  Delta(δ) should be a positive
     *          real number.  It can be tuned for example from 0.1 to 1.5
     *          in increments of 0.1 or so.  Experiments have shown that
     *          PL2+ works effectively across collections with a fixed
     *          δ = 0.8.
     */
    PL2PlusWeight(double c, double delta);

    // [gap: the default constructor's declaration line lost from this copy;
    //  its initializer list and body follow]
        : param_c(1.0), param_delta(0.8) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: remaining need_stat() call(s) and the constructor's
        //  closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    PL2PlusWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the DPH weighting scheme.
 *
 *  DPH is a representative scheme of the Divergence from Randomness Framework.
 *
 *  This is a parameter-free weighting scheme and it should be used with query
 *  expansion to obtain better results.  It uses the HyperGeometric
 *  Probabilistic model and Popper's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DPH scheme, please
 *  refer to:
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen,
 *  "Probabilistic models of information retrieval based on measuring the
 *  divergence from randomness", ACM Transactions on Information Systems
 *  (TOIS) 20, (4), 2002, pp. 357-389.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations, the 'public:' access specifier, the default
    // constructor's opening line, and closing braces).  Gaps are flagged
    // below; restore from the canonical xapian/weight.h.

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Now unused but left in place in 1.4.x for ABI compatibility.
    // [gap: member declaration lost from this copy]

    /// The constant value used in get_sumpart().
    double log_constant;
    double wqf_product_factor;

    DPHWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a DPHWeight. */
    // [gap: the default constructor's opening line lost from this copy;
    //  the statement list below is its body]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    DPHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** Xapian::Weight subclass implementing the Language Model formula.
 *
 *  This class implements the "Language Model" Weighting scheme, as
 *  described by the early papers on LM by Bruce Croft.
 *
 *  LM works by comparing the query to a Language Model of the document.
 *  The language model itself is parameter-free, though LMWeight takes
 *  parameters which specify the smoothing used.
 */
class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (the 'public:' access specifier, the constructor body's opening '{',
    // an 'else' line, and closing braces).  Gaps are flagged below; restore
    // from the canonical xapian/weight.h.

    /** The type of smoothing to use. */
    type_smoothing select_smoothing;

    // Parameters for handling negative value of log, and for smoothing.
    double param_log, param_smoothing1, param_smoothing2;

    /** The factor to multiply weights by.
     *
     *  The misleading name is due to this having been used to store some
     *  other value in 1.4.0.  However, that value only takes one
     *  multiplication and one division to calculate, so for 1.4.x we can just
     *  recalculate it each time we need it, and so this member has been
     *  repurposed in 1.4.1 and later (but the name left the same to ensure ABI
     *  compatibility with 1.4.0).
     */
    double weight_collection;

    LMWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a LMWeight.
     *
     * @param param_log_ A non-negative parameter controlling how much
     *			to clamp negative values returned by the log.
     *			The log is calculated by multiplying the
     *			actual weight by param_log.  If param_log is
     *			0.0, then the document length upper bound will
     *			be used (default: document length upper bound).
     *
     * @param select_smoothing_ A parameter of type enum
     *			type_smoothing.  This parameter
     *			controls which smoothing type to use.
     *			(default: TWO_STAGE_SMOOTHING)
     *
     * @param param_smoothing1_ A non-negative parameter for smoothing
     *			whose meaning depends on
     *			select_smoothing_.  In
     *			JELINEK_MERCER_SMOOTHING, it plays the
     *			role of estimation and in
     *			DIRICHLET_SMOOTHING the role of query
     *			modelling.  (default JELINEK_MERCER,
     *			ABSOLUTE, TWOSTAGE(0.7))
     *
     * @param param_smoothing2_ A non-negative parameter which is used
     *			with TWO_STAGE_SMOOTHING as parameter for Dirichlet's
     *			smoothing (default: 2000) and as parameter delta to
     *			control the scale of the tf lower bound in the
     *			DIRICHLET_PLUS_SMOOTHING (default 0.05).
     */
    // Unigram LM Constructor to specifically mention all parameters for
    // handling negative log value and smoothing.
    explicit LMWeight(double param_log_ = 0.0,
                      type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
                      double param_smoothing1_ = -1.0,
                      double param_smoothing2_ = -1.0)
        : select_smoothing(select_smoothing_), param_log(param_log_),
          param_smoothing1(param_smoothing1_),
          param_smoothing2(param_smoothing2_)
    // [gap: the constructor body's opening '{' lost from this copy]
        // A negative smoothing parameter means "use the default for the
        // selected smoothing type".
        if (param_smoothing1 < 0) param_smoothing1 = 0.7;
        if (param_smoothing2 < 0) {
            if (select_smoothing == TWO_STAGE_SMOOTHING)
                param_smoothing2 = 2000.0;
            // [gap: 'else' line lost from this copy]
                param_smoothing2 = 0.05;
        // [gap: closing '}' of this 'if' lost from this copy]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(RSET_SIZE);
        need_stat(TERMFREQ);
        need_stat(RELTERMFREQ);
        need_stat(DOC_LENGTH_MAX);
        // [gap: line(s) lost from this copy]
        need_stat(COLLECTION_FREQ);
        if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
            need_stat(UNIQUE_TERMS);
        if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
            need_stat(DOC_LENGTH_MIN);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    LMWeight * unserialise(const std::string & s) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** Xapian::Weight subclass implementing Coordinate Matching.
 *
 *  Each matching term scores one point.  See Managing Gigabytes, Second
 *  Edition.
 */
class XAPIAN_VISIBILITY_DEFAULT CoordWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (the data member declaration, the 'public:' access specifier, the
    // default constructor's definition, and the closing '};').  Gaps are
    // flagged below; restore from the canonical xapian/weight.h.

    /// The factor to multiply weights by.
    // [gap: member declaration lost from this copy]

    // [gap: 'public:' access specifier lost from this copy]

    CoordWeight * clone() const;

    void init(double factor_);

    /** Construct a CoordWeight. */
    // [gap: the default constructor's definition lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    CoordWeight * unserialise(const std::string &) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount, Xapian::termcount) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
#endif // XAPIAN_INCLUDED_WEIGHT_H