Fix whitespace irregularities in code
[xapian.git] / xapian-core / include / xapian / weight.h
blobac8ccf2f9839ff508b4f07671ac5f088dbe6498d
1 /** @file weight.h
2 * @brief Weighting scheme API.
3 */
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
6 * Copyright (C) 2013,2014 Aarsh Shah
7 * Copyright (C) 2016 Vivek Pal
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #ifndef XAPIAN_INCLUDED_WEIGHT_H
25 #define XAPIAN_INCLUDED_WEIGHT_H
27 #include <string>
29 #include <xapian/types.h>
30 #include <xapian/visibility.h>
32 namespace Xapian {
34 /** Abstract base class for weighting schemes. */
35 class XAPIAN_VISIBILITY_DEFAULT Weight {
36 protected:
37 /// Stats which the weighting scheme can use (see @a need_stat()).
38 typedef enum {
39 /// Number of documents in the collection.
40 COLLECTION_SIZE = 1,
41 /// Number of documents in the RSet.
42 RSET_SIZE = 2,
43 /// Average length of documents in the collection.
44 AVERAGE_LENGTH = 4,
45 /// How many documents the current term is in.
46 TERMFREQ = 8,
47 /// How many documents in the RSet the current term is in.
48 RELTERMFREQ = 16,
49 /// Sum of wqf for terms in the query.
50 QUERY_LENGTH = 32,
51 /// Within-query-frequency of the current term.
52 WQF = 64,
53 /// Within-document-frequency of the current term in the current document.
54 WDF = 128,
55 /// Length of the current document (sum wdf).
56 DOC_LENGTH = 256,
57 /// Lower bound on (non-zero) document lengths.
58 DOC_LENGTH_MIN = 512,
59 /// Upper bound on document lengths.
60 DOC_LENGTH_MAX = 1024,
61 /// Upper bound on wdf.
62 WDF_MAX = 2048,
63 /// Sum of wdf over the whole collection for the current term.
64 COLLECTION_FREQ = 4096,
65 /// Number of unique terms in the current document.
66 UNIQUE_TERMS = 8192
67 } stat_flags;
69 /** Tell Xapian that your subclass will want a particular statistic.
71 * Some of the statistics can be costly to fetch or calculate, so
72 * Xapian needs to know which are actually going to be used. You
73 * should call need_stat() from your constructor for each such
74 * statistic.
76 * @param flag The stat_flags value for a required statistic.
78 void need_stat(stat_flags flag) {
79 stats_needed = stat_flags(stats_needed | flag);
82 /** Allow the subclass to perform any initialisation it needs to.
84 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
85 * If the Weight object is for the term-independent
86 * weight supplied by get_sumextra()/get_maxextra(),
87 * then init(0.0) is called (starting from Xapian
88 * 1.2.11 and 1.3.1 - earlier versions failed to
89 * call init() for such Weight objects).
91 virtual void init(double factor) = 0;
93 private:
94 /// Don't allow assignment.
95 void operator=(const Weight &);
97 /// A bitmask of the statistics this weighting scheme needs.
98 stat_flags stats_needed;
100 /// The number of documents in the collection.
101 Xapian::doccount collection_size_;
103 /// The number of documents marked as relevant.
104 Xapian::doccount rset_size_;
106 /// The average length of a document in the collection.
107 Xapian::doclength average_length_;
109 /// The number of documents which this term indexes.
110 Xapian::doccount termfreq_;
112 // The collection frequency of the term.
113 Xapian::termcount collectionfreq_;
115 /// The number of relevant documents which this term indexes.
116 Xapian::doccount reltermfreq_;
118 /// The length of the query.
119 Xapian::termcount query_length_;
121 /// The within-query-frequency of this term.
122 Xapian::termcount wqf_;
124 /// A lower bound on the minimum length of any document in the database.
125 Xapian::termcount doclength_lower_bound_;
127 /// An upper bound on the maximum length of any document in the database.
128 Xapian::termcount doclength_upper_bound_;
130 /// An upper bound on the wdf of this term.
131 Xapian::termcount wdf_upper_bound_;
133 public:
135 /// Default constructor, needed by subclass constructors.
136 Weight() : stats_needed() { }
138 /** Type of smoothing to use with the Language Model Weighting scheme.
140 * Default is TWO_STAGE_SMOOTHING.
142 typedef enum {
143 TWO_STAGE_SMOOTHING = 1,
144 DIRICHLET_SMOOTHING = 2,
145 ABSOLUTE_DISCOUNT_SMOOTHING = 3,
146 JELINEK_MERCER_SMOOTHING = 4,
147 DIRICHLET_PLUS_SMOOTHING = 5
148 } type_smoothing;
150 class Internal;
152 /** Virtual destructor, because we have virtual methods. */
153 virtual ~Weight();
155 /** Clone this object.
157 * This method allocates and returns a copy of the object it is called on.
159 * If your subclass is called FooWeight and has parameters a and b, then
160 * you would implement FooWeight::clone() like so:
162 * FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
164 * Note that the returned object will be deallocated by Xapian after use
165 * with "delete". If you want to handle the deletion in a special way
166 * (for example when wrapping the Xapian API for use from another
167 * language) then you can define a static <code>operator delete</code>
168 * method in your subclass as shown here:
169 * https://trac.xapian.org/ticket/554#comment:1
171 virtual Weight * clone() const = 0;
173 /** Return the name of this weighting scheme.
175 * This name is used by the remote backend. It is passed along with the
176 * serialised parameters to the remote server so that it knows which class
177 * to create.
179 * Return the full namespace-qualified name of your class here - if
180 * your class is called FooWeight, return "FooWeight" from this method
181 * (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
183 * If you don't want to support the remote backend, you can use the
184 * default implementation which simply returns an empty string.
186 virtual std::string name() const;
188 /** Return this object's parameters serialised as a single string.
190 * If you don't want to support the remote backend, you can use the
191 * default implementation which simply throws Xapian::UnimplementedError.
193 virtual std::string serialise() const;
195 /** Unserialise parameters.
197 * This method unserialises parameters serialised by the @a serialise()
198 * method and allocates and returns a new object initialised with them.
200 * If you don't want to support the remote backend, you can use the
201 * default implementation which simply throws Xapian::UnimplementedError.
203 * Note that the returned object will be deallocated by Xapian after use
204 * with "delete". If you want to handle the deletion in a special way
205 * (for example when wrapping the Xapian API for use from another
206 * language) then you can define a static <code>operator delete</code>
207 * method in your subclass as shown here:
208 * https://trac.xapian.org/ticket/554#comment:1
210 * @param serialised A string containing the serialised parameters.
212 virtual Weight * unserialise(const std::string & serialised) const;
214 /** Calculate the weight contribution for this object's term to a document.
216 * The parameters give information about the document which may be used
217 * in the calculations:
219 * @param wdf The within document frequency of the term in the document.
220 * @param doclen The document's length (unnormalised).
221 * @param uniqterms Number of unique terms in the document (used
222 * for absolute smoothing).
224 virtual double get_sumpart(Xapian::termcount wdf,
225 Xapian::termcount doclen,
226 Xapian::termcount uniqterms) const = 0;
228 /** Return an upper bound on what get_sumpart() can return for any document.
230 * This information is used by the matcher to perform various
231 * optimisations, so strive to make the bound as tight as possible.
233 virtual double get_maxpart() const = 0;
235 /** Calculate the term-independent weight component for a document.
237 * The parameter gives information about the document which may be used
238 * in the calculations:
240 * @param doclen The document's length (unnormalised).
241 * @param uniqterms The number of unique terms in the document.
243 virtual double get_sumextra(Xapian::termcount doclen,
244 Xapian::termcount uniqterms) const = 0;
246 /** Return an upper bound on what get_sumextra() can return for any
247 * document.
249 * This information is used by the matcher to perform various
250 * optimisations, so strive to make the bound as tight as possible.
252 virtual double get_maxextra() const = 0;
254 /** @private @internal Initialise this object to calculate weights for term
255 * @a term.
257 * @param stats Source of statistics.
258 * @param query_len_ Query length.
259 * @param term The term for the new object.
260 * @param wqf_ The within-query-frequency of @a term.
261 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
263 void init_(const Internal & stats, Xapian::termcount query_len_,
264 const std::string & term, Xapian::termcount wqf_,
265 double factor);
267 /** @private @internal Initialise this object to calculate weights for a
268 * synonym.
270 * @param stats Source of statistics.
271 * @param query_len_ Query length.
272 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
273 * @param termfreq The termfreq to use.
274 * @param reltermfreq The reltermfreq to use.
275 * @param collection_freq The collection frequency to use.
277 void init_(const Internal & stats, Xapian::termcount query_len_,
278 double factor, Xapian::doccount termfreq,
279 Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
281 /** @private @internal Initialise this object to calculate the extra weight
282 * component.
284 * @param stats Source of statistics.
285 * @param query_len_ Query length.
287 void init_(const Internal & stats, Xapian::termcount query_len_);
289 /** @private @internal Return true if the document length is needed.
291 * If this method returns true, then the document length will be fetched
292 * and passed to @a get_sumpart(). Otherwise 0 may be passed for the
293 * document length.
295 bool get_sumpart_needs_doclength_() const {
296 return stats_needed & DOC_LENGTH;
299 /** @private @internal Return true if the WDF is needed.
301 * If this method returns true, then the WDF will be fetched and passed to
302 * @a get_sumpart(). Otherwise 0 may be passed for the wdf.
304 bool get_sumpart_needs_wdf_() const {
305 return stats_needed & WDF;
308 /** @private @internal Return true if the number of unique terms is needed.
310 * If this method returns true, then the number of unique terms will be
311 * fetched and passed to @a get_sumpart(). Otherwise 0 may be passed for
312 * the number of unique terms.
314 bool get_sumpart_needs_uniqueterms_() const {
315 return stats_needed & UNIQUE_TERMS;
318 protected:
319 /** Don't allow copying.
321 * This would ideally be private, but that causes a compilation error
322 * with GCC 4.1 (which appears to be a bug).
324 Weight(const Weight &);
326 /// The number of documents in the collection.
327 Xapian::doccount get_collection_size() const { return collection_size_; }
329 /// The number of documents marked as relevant.
330 Xapian::doccount get_rset_size() const { return rset_size_; }
332 /// The average length of a document in the collection.
333 Xapian::doclength get_average_length() const { return average_length_; }
335 /// The number of documents which this term indexes.
336 Xapian::doccount get_termfreq() const { return termfreq_; }
338 /// The number of relevant documents which this term indexes.
339 Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
341 /// The collection frequency of the term.
342 Xapian::termcount get_collection_freq() const { return collectionfreq_; }
344 /// The length of the query.
345 Xapian::termcount get_query_length() const { return query_length_; }
347 /// The within-query-frequency of this term.
348 Xapian::termcount get_wqf() const { return wqf_; }
350 /** An upper bound on the maximum length of any document in the database.
352 * This should only be used by get_maxpart() and get_maxextra().
354 Xapian::termcount get_doclength_upper_bound() const {
355 return doclength_upper_bound_;
358 /** A lower bound on the minimum length of any document in the database.
360 * This bound does not include any zero-length documents.
362 * This should only be used by get_maxpart() and get_maxextra().
364 Xapian::termcount get_doclength_lower_bound() const {
365 return doclength_lower_bound_;
368 /** An upper bound on the wdf of this term.
370 * This should only be used by get_maxpart() and get_maxextra().
372 Xapian::termcount get_wdf_upper_bound() const {
373 return wdf_upper_bound_;
377 /** Class implementing a "boolean" weighting scheme.
379 * This weighting scheme gives all documents zero weight.
381 class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
382 BoolWeight * clone() const;
384 void init(double factor);
386 public:
387 /** Construct a BoolWeight. */
388 BoolWeight() { }
390 std::string name() const;
392 std::string serialise() const;
393 BoolWeight * unserialise(const std::string & serialised) const;
395 double get_sumpart(Xapian::termcount wdf,
396 Xapian::termcount doclen,
397 Xapian::termcount uniqterms) const;
398 double get_maxpart() const;
400 double get_sumextra(Xapian::termcount doclen,
401 Xapian::termcount uniqterms) const;
402 double get_maxextra() const;
405 /// Xapian::Weight subclass implementing the tf-idf weighting scheme.
406 class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
407 /* Three character string indicating the normalizations for tf(wdf), idf and
408 tfidf weight. */
409 std::string normalizations;
411 /// The factor to multiply with the weight.
412 double factor;
414 TfIdfWeight * clone() const;
416 void init(double factor);
418 /* When additional normalizations are implemented in the future, the additional statistics for them
419 should be accessed by these functions. */
420 double get_wdfn(Xapian::termcount wdf, char c) const;
421 double get_idfn(Xapian::doccount termfreq, char c) const;
422 double get_wtn(double wt, char c) const;
424 public:
425 /** Construct a TfIdfWeight
427 * @param normalizations A three character string indicating the
428 * normalizations to be used for the tf(wdf), idf
429 * and document weight. (default: "ntn")
431 * The @a normalizations string works like so:
433 * @li The first character specifies the normalization for the wdf. The
434 * following normalizations are currently supported:
436 * @li 'n': None. wdfn=wdf
437 * @li 'b': Boolean wdfn=1 if term in document else wdfn=0
438 * @li 's': Square wdfn=wdf*wdf
439 * @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
441 * The Max-wdf and Augmented Max wdf normalizations haven't yet been
442 * implemented.
444 * @li The second character indicates the normalization for the idf. The
445 * following normalizations are currently supported:
447 * @li 'n': None idfn=1
448 * @li 't': TfIdf idfn=log(N/Termfreq) where N is the number of
449 * documents in collection and Termfreq is the number of documents
450 * which are indexed by the term t.
451 * @li 'p': Prob idfn=log((N-Termfreq)/Termfreq)
452 * @li 'f': Freq idfn=1/Termfreq
453 * @li 's': Squared idfn=log(N/Termfreq)^2
455 * @li The third and the final character indicates the normalization for
456 * the document weight. The following normalizations are currently
457 * supported:
459 * @li 'n': None wtn=tfn*idfn
461 * Implementing support for more normalizations of each type would require
462 * extending the backend to track more statistics.
464 explicit TfIdfWeight(const std::string &normalizations);
466 /** Construct a TfIdfWeight using the default normalizations ("ntn"). */
467 TfIdfWeight()
468 : normalizations("ntn")
470 need_stat(TERMFREQ);
471 need_stat(WDF);
472 need_stat(WDF_MAX);
473 need_stat(COLLECTION_SIZE);
476 std::string name() const;
478 std::string serialise() const;
479 TfIdfWeight * unserialise(const std::string & serialised) const;
481 double get_sumpart(Xapian::termcount wdf,
482 Xapian::termcount doclen,
483 Xapian::termcount uniqterm) const;
484 double get_maxpart() const;
486 double get_sumextra(Xapian::termcount doclen,
487 Xapian::termcount uniqterms) const;
488 double get_maxextra() const;
492 /// Xapian::Weight subclass implementing the BM25 probabilistic formula.
493 class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
494 /// Factor to multiply the document length by.
495 mutable Xapian::doclength len_factor;
497 /// Factor combining all the document independent factors.
498 mutable double termweight;
500 /// The BM25 parameters.
501 double param_k1, param_k2, param_k3, param_b;
503 /// The minimum normalised document length value.
504 Xapian::doclength param_min_normlen;
506 BM25Weight * clone() const;
508 void init(double factor);
510 public:
511 /** Construct a BM25Weight.
513 * @param k1 A non-negative parameter controlling how influential
514 * within-document-frequency (wdf) is. k1=0 means that
515 * wdf doesn't affect the weights. The larger k1 is, the more
516 * wdf influences the weights. (default 1)
518 * @param k2 A non-negative parameter which controls the strength of a
519 * correction factor which depends upon query length and
520 * normalised document length. k2=0 disable this factor; larger
521 * k2 makes it stronger. (default 0)
523 * @param k3 A non-negative parameter controlling how influential
524 * within-query-frequency (wqf) is. k3=0 means that wqf
525 * doesn't affect the weights. The larger k3 is, the more
526 * wqf influences the weights. (default 1)
528 * @param b A parameter between 0 and 1, controlling how strong the
529 * document length normalisation of wdf is. 0 means no
530 * normalisation; 1 means full normalisation. (default 0.5)
532 * @param min_normlen A parameter specifying a minimum value for
533 * normalised document length. Normalised document length
534 * values less than this will be clamped to this value, helping
535 * to prevent very short documents getting large weights.
536 * (default 0.5)
538 BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
539 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
540 param_min_normlen(min_normlen)
542 if (param_k1 < 0) param_k1 = 0;
543 if (param_k2 < 0) param_k2 = 0;
544 if (param_k3 < 0) param_k3 = 0;
545 if (param_b < 0) {
546 param_b = 0;
547 } else if (param_b > 1) {
548 param_b = 1;
550 need_stat(COLLECTION_SIZE);
551 need_stat(RSET_SIZE);
552 need_stat(TERMFREQ);
553 need_stat(RELTERMFREQ);
554 need_stat(WDF);
555 need_stat(WDF_MAX);
556 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
557 need_stat(DOC_LENGTH_MIN);
558 need_stat(AVERAGE_LENGTH);
560 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
561 if (param_k2 != 0) need_stat(QUERY_LENGTH);
562 if (param_k3 != 0) need_stat(WQF);
565 BM25Weight()
566 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
567 param_min_normlen(0.5)
569 need_stat(COLLECTION_SIZE);
570 need_stat(RSET_SIZE);
571 need_stat(TERMFREQ);
572 need_stat(RELTERMFREQ);
573 need_stat(WDF);
574 need_stat(WDF_MAX);
575 need_stat(DOC_LENGTH_MIN);
576 need_stat(AVERAGE_LENGTH);
577 need_stat(DOC_LENGTH);
578 need_stat(WQF);
581 std::string name() const;
583 std::string serialise() const;
584 BM25Weight * unserialise(const std::string & serialised) const;
586 double get_sumpart(Xapian::termcount wdf,
587 Xapian::termcount doclen,
588 Xapian::termcount uniqterm) const;
589 double get_maxpart() const;
591 double get_sumextra(Xapian::termcount doclen,
592 Xapian::termcount uniqterms) const;
593 double get_maxextra() const;
596 /// Xapian::Weight subclass implementing the BM25+ probabilistic formula.
597 class XAPIAN_VISIBILITY_DEFAULT BM25PlusWeight : public Weight {
598 /// Factor to multiply the document length by.
599 mutable Xapian::doclength len_factor;
601 /// Factor combining all the document independent factors.
602 mutable double termweight;
604 /// The BM25+ parameters.
605 double param_k1, param_k2, param_k3, param_b;
607 /// The minimum normalised document length value.
608 Xapian::doclength param_min_normlen;
610 /// Additional parameter delta in the BM25+ formula.
611 double param_delta;
613 BM25PlusWeight * clone() const;
615 void init(double factor);
617 public:
618 /** Construct a BM25PlusWeight.
620 * @param k1 A non-negative parameter controlling how influential
621 * within-document-frequency (wdf) is. k1=0 means that
622 * wdf doesn't affect the weights. The larger k1 is, the more
623 * wdf influences the weights. (default 1)
625 * @param k2 A non-negative parameter which controls the strength of a
626 * correction factor which depends upon query length and
627 * normalised document length. k2=0 disable this factor; larger
628 * k2 makes it stronger. The paper which describes BM25+
629 * ignores BM25's document-independent component (so implicitly
630 * k2=0), but we support non-zero k2 too. (default 0)
632 * @param k3 A non-negative parameter controlling how influential
633 * within-query-frequency (wqf) is. k3=0 means that wqf
634 * doesn't affect the weights. The larger k3 is, the more
635 * wqf influences the weights. (default 1)
637 * @param b A parameter between 0 and 1, controlling how strong the
638 * document length normalisation of wdf is. 0 means no
639 * normalisation; 1 means full normalisation. (default 0.5)
641 * @param min_normlen A parameter specifying a minimum value for
642 * normalised document length. Normalised document length
643 * values less than this will be clamped to this value, helping
644 * to prevent very short documents getting large weights.
645 * (default 0.5)
647 * @param delta A parameter for pseudo tf value to control the scale
648 * of the tf lower bound. Delta(δ) can be tuned for example
649 * from 0.0 to 1.5 but BM25+ can still work effectively
650 * across collections with a fixed δ = 1.0. (default 1.0)
652 BM25PlusWeight(double k1, double k2, double k3, double b,
653 double min_normlen, double delta)
654 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
655 param_min_normlen(min_normlen), param_delta(delta)
657 if (param_k1 < 0) param_k1 = 0;
658 if (param_k2 < 0) param_k2 = 0;
659 if (param_k3 < 0) param_k3 = 0;
660 if (param_delta < 0) param_delta = 0;
661 if (param_b < 0) {
662 param_b = 0;
663 } else if (param_b > 1) {
664 param_b = 1;
666 need_stat(COLLECTION_SIZE);
667 need_stat(RSET_SIZE);
668 need_stat(TERMFREQ);
669 need_stat(RELTERMFREQ);
670 need_stat(WDF);
671 need_stat(WDF_MAX);
672 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
673 need_stat(DOC_LENGTH_MIN);
674 need_stat(AVERAGE_LENGTH);
676 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
677 if (param_k2 != 0) need_stat(QUERY_LENGTH);
678 if (param_k3 != 0) need_stat(WQF);
679 if (param_delta != 0) {
680 need_stat(AVERAGE_LENGTH);
681 need_stat(DOC_LENGTH);
682 need_stat(WQF);
686 BM25PlusWeight()
687 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
688 param_min_normlen(0.5), param_delta(1)
690 need_stat(COLLECTION_SIZE);
691 need_stat(RSET_SIZE);
692 need_stat(TERMFREQ);
693 need_stat(RELTERMFREQ);
694 need_stat(WDF);
695 need_stat(WDF_MAX);
696 need_stat(DOC_LENGTH_MIN);
697 need_stat(AVERAGE_LENGTH);
698 need_stat(DOC_LENGTH);
699 need_stat(WQF);
702 std::string name() const;
704 std::string serialise() const;
705 BM25PlusWeight * unserialise(const std::string & serialised) const;
707 double get_sumpart(Xapian::termcount wdf,
708 Xapian::termcount doclen,
709 Xapian::termcount uniqterm) const;
710 double get_maxpart() const;
712 double get_sumextra(Xapian::termcount doclen,
713 Xapian::termcount uniqterms) const;
714 double get_maxextra() const;
717 /** Xapian::Weight subclass implementing the traditional probabilistic formula.
719 * This class implements the "traditional" Probabilistic Weighting scheme, as
720 * described by the early papers on Probabilistic Retrieval. BM25 generally
721 * gives better results.
723 * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
724 * the latter returns weights (k+1) times larger.
726 class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
727 /// Factor to multiply the document length by.
728 mutable Xapian::doclength len_factor;
730 /// Factor combining all the document independent factors.
731 mutable double termweight;
733 /// The parameter in the formula.
734 double param_k;
736 TradWeight * clone() const;
738 void init(double factor);
740 public:
741 /** Construct a TradWeight.
743 * @param k A non-negative parameter controlling how influential
744 * within-document-frequency (wdf) and document length are.
745 * k=0 means that wdf and document length don't affect the
746 * weights. The larger k is, the more they do. (default 1)
748 explicit TradWeight(double k = 1.0) : param_k(k) {
749 if (param_k < 0) param_k = 0;
750 if (param_k != 0.0) {
751 need_stat(AVERAGE_LENGTH);
752 need_stat(DOC_LENGTH);
754 need_stat(COLLECTION_SIZE);
755 need_stat(RSET_SIZE);
756 need_stat(TERMFREQ);
757 need_stat(RELTERMFREQ);
758 need_stat(DOC_LENGTH_MIN);
759 need_stat(WDF);
760 need_stat(WDF_MAX);
763 std::string name() const;
765 std::string serialise() const;
766 TradWeight * unserialise(const std::string & serialised) const;
768 double get_sumpart(Xapian::termcount wdf,
769 Xapian::termcount doclen,
770 Xapian::termcount uniqueterms) const;
771 double get_maxpart() const;
773 double get_sumextra(Xapian::termcount doclen,
774 Xapian::termcount uniqterms) const;
775 double get_maxextra() const;
778 /** This class implements the InL2 weighting scheme.
780 * InL2 is a representative scheme of the Divergence from Randomness Framework
781 * by Gianni Amati.
783 * This weighting scheme is useful for tasks that require early precision.
785 * It uses the Inverse document frequency model (In), the Laplace method to
786 * find the aftereffect of sampling (L) and the second wdf normalization
787 * proposed by Amati to normalize the wdf in the document to the length of the
788 * document (H2).
790 * For more information about the DFR Framework and the InL2 scheme, please
791 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
792 * models of information retrieval based on measuring the divergence from
793 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
794 * pp. 357-389.
796 class XAPIAN_VISIBILITY_DEFAULT InL2Weight : public Weight {
797 /// The wdf normalization parameter in the formula.
798 double param_c;
800 /// The upper bound on the weight a term can give to a document.
801 double upper_bound;
803 /// The constant values which are used on every call to get_sumpart().
804 double wqf_product_idf;
805 double c_product_avlen;
807 InL2Weight * clone() const;
809 void init(double factor);
811 public:
812 /** Construct an InL2Weight.
814 * @param c A non-negative and non zero parameter controlling the extent
815 * of the normalization of the wdf to the document length. The
816 * default value of 1 is suitable for longer queries but it may
817 * need to be changed for shorter queries. For more information,
818 * please refer to Gianni Amati's PHD thesis.
820 explicit InL2Weight(double c);
822 InL2Weight()
823 : param_c(1.0)
825 need_stat(AVERAGE_LENGTH);
826 need_stat(DOC_LENGTH);
827 need_stat(DOC_LENGTH_MIN);
828 need_stat(DOC_LENGTH_MAX);
829 need_stat(COLLECTION_SIZE);
830 need_stat(WDF);
831 need_stat(WDF_MAX);
832 need_stat(WQF);
833 need_stat(TERMFREQ);
836 std::string name() const;
838 std::string serialise() const;
839 InL2Weight * unserialise(const std::string & serialised) const;
841 double get_sumpart(Xapian::termcount wdf,
842 Xapian::termcount doclen,
843 Xapian::termcount uniqterms) const;
844 double get_maxpart() const;
846 double get_sumextra(Xapian::termcount doclen,
847 Xapian::termcount uniqterms) const;
848 double get_maxextra() const;
851 /** This class implements the IfB2 weighting scheme.
853 * IfB2 is a representative scheme of the Divergence from Randomness Framework
854 * by Gianni Amati.
856 * It uses the Inverse term frequency model (If), the Bernoulli method to find
857 * the aftereffect of sampling (B) and the second wdf normalization proposed
858 * by Amati to normalize the wdf in the document to the length of the document
859 * (H2).
861 * For more information about the DFR Framework and the IfB2 scheme, please
862 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
863 * models of information retrieval based on measuring the divergence from
864 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
865 * pp. 357-389.
867 class XAPIAN_VISIBILITY_DEFAULT IfB2Weight : public Weight {
868 /// The wdf normalization parameter in the formula.
869 double param_c;
871 /// The upper bound on the weight.
872 double upper_bound;
874 /// The constant values which are used for calculations in get_sumpart().
875 double wqf_product_idf;
876 double c_product_avlen;
877 double B_constant;
879 IfB2Weight * clone() const;
881 void init(double factor);
883 public:
884 /** Construct an IfB2Weight.
886 * @param c A non-negative and non zero parameter controlling the extent
887 * of the normalization of the wdf to the document length. The
888 * default value of 1 is suitable for longer queries but it may
889 * need to be changed for shorter queries. For more information,
890 * please refer to Gianni Amati's PHD thesis titled
891 * Probabilistic Models for Information Retrieval based on
892 * Divergence from Randomness.
894 explicit IfB2Weight(double c);
896 IfB2Weight() : param_c(1.0) {
897 need_stat(AVERAGE_LENGTH);
898 need_stat(DOC_LENGTH);
899 need_stat(DOC_LENGTH_MIN);
900 need_stat(DOC_LENGTH_MAX);
901 need_stat(COLLECTION_SIZE);
902 need_stat(COLLECTION_FREQ);
903 need_stat(WDF);
904 need_stat(WDF_MAX);
905 need_stat(WQF);
906 need_stat(TERMFREQ);
909 std::string name() const;
911 std::string serialise() const;
912 IfB2Weight * unserialise(const std::string & serialised) const;
914 double get_sumpart(Xapian::termcount wdf,
915 Xapian::termcount doclen,
916 Xapian::termcount uniqterm) const;
917 double get_maxpart() const;
919 double get_sumextra(Xapian::termcount doclen,
920 Xapian::termcount uniqterms) const;
921 double get_maxextra() const;
924 /** This class implements the IneB2 weighting scheme.
926 * IneB2 is a representative scheme of the Divergence from Randomness
927 * Framework by Gianni Amati.
929 * It uses the Inverse expected document frequency model (Ine), the Bernoulli
930 * method to find the aftereffect of sampling (B) and the second wdf
931 * normalization proposed by Amati to normalize the wdf in the document to the
932 * length of the document (H2).
934 * For more information about the DFR Framework and the IneB2 scheme, please
935 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
936 * models of information retrieval based on measuring the divergence from
937 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
938 * pp. 357-389.
940 class XAPIAN_VISIBILITY_DEFAULT IneB2Weight : public Weight {
941 /// The wdf normalization parameter in the formula.
942 double param_c;
944 /// The upper bound of the weight.
945 double upper_bound;
947 /// Constant values used in get_sumpart().
948 double wqf_product_idf;
949 double c_product_avlen;
950 double B_constant;
952 IneB2Weight * clone() const;
954 void init(double factor);
956 public:
957 /** Construct an IneB2Weight.
959 * @param c A non-negative and non zero parameter controlling the extent
960 * of the normalization of the wdf to the document length. The
961 * default value of 1 is suitable for longer queries but it may
962 * need to be changed for shorter queries. For more information,
963 * please refer to Gianni Amati's PHD thesis.
965 explicit IneB2Weight(double c);
967 IneB2Weight() : param_c(1.0) {
968 need_stat(AVERAGE_LENGTH);
969 need_stat(DOC_LENGTH);
970 need_stat(DOC_LENGTH_MIN);
971 need_stat(DOC_LENGTH_MAX);
972 need_stat(COLLECTION_SIZE);
973 need_stat(WDF);
974 need_stat(WDF_MAX);
975 need_stat(WQF);
976 need_stat(COLLECTION_FREQ);
977 need_stat(TERMFREQ);
980 std::string name() const;
982 std::string serialise() const;
983 IneB2Weight * unserialise(const std::string & serialised) const;
985 double get_sumpart(Xapian::termcount wdf,
986 Xapian::termcount doclen,
987 Xapian::termcount uniqterms) const;
988 double get_maxpart() const;
990 double get_sumextra(Xapian::termcount doclen,
991 Xapian::termcount uniqterms) const;
992 double get_maxextra() const;
995 /** This class implements the BB2 weighting scheme.
997 * BB2 is a representative scheme of the Divergence from Randomness Framework
998 * by Gianni Amati.
1000 * It uses the Bose-Einstein probabilistic distribution (B) along with
1001 * Stirling's power approximation, the Bernoulli method to find the
1002 * aftereffect of sampling (B) and the second wdf normalization proposed by
1003 * Amati to normalize the wdf in the document to the length of the document
1004 * (H2).
1006 * For more information about the DFR Framework and the BB2 scheme, please
1007 * refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
1008 * models of information retrieval based on measuring the divergence from
1009 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
1010 * pp. 357-389.
1012 class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
1013 /// The wdf normalization parameter in the formula.
1014 double param_c;
1016 /// The upper bound on the weight.
1017 double upper_bound;
1019 /// The constant values to be used in get_sumpart().
1020 double c_product_avlen;
1021 double B_constant;
1022 double wt;
1023 double stirling_constant_1;
1024 double stirling_constant_2;
1026 BB2Weight * clone() const;
1028 void init(double factor);
1030 public:
1031 /** Construct a BB2Weight.
1033 * @param c A non-negative and non zero parameter controlling the extent
1034 * of the normalization of the wdf to the document length. A
1035 * default value of 1 is suitable for longer queries but it may
1036 * need to be changed for shorter queries. For more information,
1037 * please refer to Gianni Amati's PHD thesis titled
1038 * Probabilistic Models for Information Retrieval based on
1039 * Divergence from Randomness.
1041 explicit BB2Weight(double c);
1043 BB2Weight() : param_c(1.0) {
1044 need_stat(AVERAGE_LENGTH);
1045 need_stat(DOC_LENGTH);
1046 need_stat(DOC_LENGTH_MIN);
1047 need_stat(DOC_LENGTH_MAX);
1048 need_stat(COLLECTION_SIZE);
1049 need_stat(COLLECTION_FREQ);
1050 need_stat(WDF);
1051 need_stat(WDF_MAX);
1052 need_stat(WQF);
1053 need_stat(TERMFREQ);
1056 std::string name() const;
1058 std::string serialise() const;
1059 BB2Weight * unserialise(const std::string & serialised) const;
1061 double get_sumpart(Xapian::termcount wdf,
1062 Xapian::termcount doclen,
1063 Xapian::termcount uniqterms) const;
1064 double get_maxpart() const;
1066 double get_sumextra(Xapian::termcount doclen,
1067 Xapian::termcount uniqterms) const;
1068 double get_maxextra() const;
1071 /** This class implements the DLH weighting scheme, which is a representative
1072 * scheme of the Divergence from Randomness Framework by Gianni Amati.
1074 * This is a parameter free weighting scheme and it should be used with query
1075 * expansion to obtain better results. It uses the HyperGeometric Probabilistic
1076 * model and Laplace's normalization to calculate the risk gain.
1078 * For more information about the DFR Framework and the DLH scheme, please
1079 * refer to :
1080 * a.) Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
1081 * models of information retrieval based on measuring the divergence from
1082 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp.
1083 * 357-389.
1084 * b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
1085 * G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
1086 * Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
1088 class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
1089 /// Now unused but left in place in 1.4.x for ABI compatibility.
1090 double lower_bound;
1092 /// The upper bound on the weight.
1093 double upper_bound;
1095 /// The constant value to be used in get_sumpart().
1096 double log_constant;
1097 double wqf_product_factor;
1099 DLHWeight * clone() const;
1101 void init(double factor);
1103 public:
1104 DLHWeight() {
1105 need_stat(AVERAGE_LENGTH);
1106 need_stat(DOC_LENGTH);
1107 need_stat(COLLECTION_SIZE);
1108 need_stat(COLLECTION_FREQ);
1109 need_stat(WDF);
1110 need_stat(WQF);
1111 need_stat(WDF_MAX);
1112 need_stat(DOC_LENGTH_MIN);
1113 need_stat(DOC_LENGTH_MAX);
1116 std::string name() const;
1118 std::string serialise() const;
1119 DLHWeight * unserialise(const std::string & serialised) const;
1121 double get_sumpart(Xapian::termcount wdf,
1122 Xapian::termcount doclen,
1123 Xapian::termcount uniqterms) const;
1124 double get_maxpart() const;
1126 double get_sumextra(Xapian::termcount doclen,
1127 Xapian::termcount uniqterms) const;
1128 double get_maxextra() const;
1131 /** This class implements the PL2 weighting scheme.
1133 * PL2 is a representative scheme of the Divergence from Randomness Framework
1134 * by Gianni Amati.
1136 * This weighting scheme is useful for tasks that require early precision.
1138 * It uses the Poisson approximation of the Binomial Probabilistic distribution
1139 * (P) along with Stirling's approximation for the factorial value, the Laplace
1140 * method to find the aftereffect of sampling (L) and the second wdf
1141 * normalization proposed by Amati to normalize the wdf in the document to the
1142 * length of the document (H2).
1144 * For more information about the DFR Framework and the PL2 scheme, please
1145 * refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic models
1146 * of information retrieval based on measuring the divergence from randomness
1147 * ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp. 357-389.
1149 class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
1150 /// The wdf normalization parameter in the formula.
1151 double param_c;
1153 /** The factor to multiply weights by.
1155 * The misleading name is due to this having been used to store a lower
1156 * bound in 1.4.0. We no longer need to store that, and so this member
1157 * has been repurposed in 1.4.1 and later (but the name left the same to
1158 * ensure ABI compatibility with 1.4.0).
1160 double lower_bound;
1162 /// The upper bound on the weight.
1163 double upper_bound;
1165 /// Constants for a given term in a given query.
1166 double P1, P2;
1168 /// Set by init() to (param_c * get_average_length())
1169 double cl;
1171 PL2Weight * clone() const;
1173 void init(double factor);
1175 public:
1176 /** Construct a PL2Weight.
1178 * @param c A non-negative and non zero parameter controlling the extent
1179 * of the normalization of the wdf to the document length. The
1180 * default value of 1 is suitable for longer queries but it may
1181 * need to be changed for shorter queries. For more information,
1182 * please refer to Gianni Amati's PHD thesis titled
1183 * Probabilistic Models for Information Retrieval based on
1184 * Divergence from Randomness.
1186 explicit PL2Weight(double c);
1188 PL2Weight() : param_c(1.0) {
1189 need_stat(AVERAGE_LENGTH);
1190 need_stat(DOC_LENGTH);
1191 need_stat(DOC_LENGTH_MIN);
1192 need_stat(DOC_LENGTH_MAX);
1193 need_stat(COLLECTION_SIZE);
1194 need_stat(COLLECTION_FREQ);
1195 need_stat(WDF);
1196 need_stat(WDF_MAX);
1197 need_stat(WQF);
1200 std::string name() const;
1202 std::string serialise() const;
1203 PL2Weight * unserialise(const std::string & serialised) const;
1205 double get_sumpart(Xapian::termcount wdf,
1206 Xapian::termcount doclen,
1207 Xapian::termcount uniqterms) const;
1208 double get_maxpart() const;
1210 double get_sumextra(Xapian::termcount doclen,
1211 Xapian::termcount uniqterms) const;
1212 double get_maxextra() const;
1215 /// Xapian::Weight subclass implementing the PL2+ probabilistic formula.
1216 class XAPIAN_VISIBILITY_DEFAULT PL2PlusWeight : public Weight {
1217 /// The factor to multiply weights by.
1218 double factor;
1220 /// The wdf normalization parameter in the formula.
1221 double param_c;
1223 /// Additional parameter delta in the PL2+ weighting formula.
1224 double param_delta;
1226 /// The upper bound on the weight.
1227 double upper_bound;
1229 /// Constants for a given term in a given query.
1230 double P1, P2;
1232 /// Set by init() to (param_c * get_average_length())
1233 double cl;
1235 /// Set by init() to get_collection_freq()) / get_collection_size()
1236 double mean;
1238 /// Weight contribution of delta term in the PL2+ function
1239 double dw;
1241 PL2PlusWeight * clone() const;
1243 void init(double factor_);
1245 public:
1246 /** Construct a PL2PlusWeight.
1248 * @param c A non-negative and non zero parameter controlling the extent
1249 * of the normalization of the wdf to the document length. The
1250 * default value of 1 is suitable for longer queries but it may
1251 * need to be changed for shorter queries. For more information,
1252 * please refer to Gianni Amati's PHD thesis titled
1253 * Probabilistic Models for Information Retrieval based on
1254 * Divergence from Randomness.
1256 * @param delta A parameter for pseudo tf value to control the scale
1257 * of the tf lower bound. Delta(δ) should be a positive
1258 * real number. It can be tuned for example from 0.1 to 1.5
1259 * in increments of 0.1 or so. Experiments have shown that
1260 * PL2+ works effectively across collections with a fixed δ = 0.8
1261 * (default 0.8)
1263 PL2PlusWeight(double c, double delta);
1265 PL2PlusWeight()
1266 : param_c(1.0), param_delta(0.8) {
1267 need_stat(AVERAGE_LENGTH);
1268 need_stat(DOC_LENGTH);
1269 need_stat(DOC_LENGTH_MIN);
1270 need_stat(DOC_LENGTH_MAX);
1271 need_stat(COLLECTION_SIZE);
1272 need_stat(COLLECTION_FREQ);
1273 need_stat(WDF);
1274 need_stat(WDF_MAX);
1275 need_stat(WQF);
1278 std::string name() const;
1280 std::string serialise() const;
1281 PL2PlusWeight * unserialise(const std::string & serialised) const;
1283 double get_sumpart(Xapian::termcount wdf,
1284 Xapian::termcount doclen,
1285 Xapian::termcount uniqterms) const;
1286 double get_maxpart() const;
1288 double get_sumextra(Xapian::termcount doclen,
1289 Xapian::termcount uniqterms) const;
1290 double get_maxextra() const;
1293 /** This class implements the DPH weighting scheme.
1295 * DPH is a representative scheme of the Divergence from Randomness Framework
1296 * by Gianni Amati.
1298 * This is a parameter free weighting scheme and it should be used with query
1299 * expansion to obtain better results. It uses the HyperGeometric Probabilistic
1300 * model and Popper's normalization to calculate the risk gain.
1302 * For more information about the DFR Framework and the DPH scheme, please
1303 * refer to :
1304 * a.) Gianni Amati and Cornelis Joost Van Rijsbergen
1305 * Probabilistic models of information retrieval based on measuring the
1306 * divergence from randomness ACM Transactions on Information Systems (TOIS) 20,
1307 * (4), 2002, pp. 357-389.
1308 * b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
1309 * G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
1310 * Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
1312 class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
1313 /// The upper bound on the weight.
1314 double upper_bound;
1316 /// Now unused but left in place in 1.4.x for ABI compatibility.
1317 double lower_bound;
1319 /// The constant value used in get_sumpart() .
1320 double log_constant;
1321 double wqf_product_factor;
1323 DPHWeight * clone() const;
1325 void init(double factor);
1327 public:
1328 /** Construct a DPHWeight. */
1329 DPHWeight() {
1330 need_stat(AVERAGE_LENGTH);
1331 need_stat(DOC_LENGTH);
1332 need_stat(COLLECTION_SIZE);
1333 need_stat(COLLECTION_FREQ);
1334 need_stat(WDF);
1335 need_stat(WQF);
1336 need_stat(WDF_MAX);
1337 need_stat(DOC_LENGTH_MIN);
1338 need_stat(DOC_LENGTH_MAX);
1341 std::string name() const;
1343 std::string serialise() const;
1344 DPHWeight * unserialise(const std::string & serialised) const;
1346 double get_sumpart(Xapian::termcount wdf,
1347 Xapian::termcount doclen,
1348 Xapian::termcount uniqterms) const;
1349 double get_maxpart() const;
1351 double get_sumextra(Xapian::termcount doclen,
1352 Xapian::termcount uniqterms) const;
1353 double get_maxextra() const;
1357 /** Xapian::Weight subclass implementing the Language Model formula.
1359 * This class implements the "Language Model" Weighting scheme, as
1360 * described by the early papers on LM by Bruce Croft.
1362 * LM works by comparing the query to a Language Model of the document.
1363 * The language model itself is parameter-free, though LMWeight takes
1364 * parameters which specify the smoothing used.
1366 class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
1367 /** The type of smoothing to use. */
1368 type_smoothing select_smoothing;
1370 // Parameters for handling negative value of log, and for smoothing.
1371 double param_log, param_smoothing1, param_smoothing2;
1373 /** The factor to multiply weights by.
1375 * The misleading name is due to this having been used to store some
1376 * other value in 1.4.0. However, that value only takes one
1377 * multiplication and one division to calculate, so for 1.4.x we can just
1378 * recalculate it each time we need it, and so this member has been
1379 * repurposed in 1.4.1 and later (but the name left the same to ensure ABI
1380 * compatibility with 1.4.0).
1382 double weight_collection;
1384 LMWeight * clone() const;
1386 void init(double factor);
1388 public:
1389 /** Construct a LMWeight.
1391 * @param param_log_ A non-negative parameter controlling how much
1392 * to clamp negative values returned by the log.
1393 * The log is calculated by multiplying the
1394 * actual weight by param_log. If param_log is
1395 * 0.0, then the document length upper bound will
1396 * be used (default: document length upper bound)
1398 * @param select_smoothing_ A parameter of type enum
1399 * type_smoothing. This parameter
1400 * controls which smoothing type to use.
1401 * (default: TWO_STAGE_SMOOTHING)
1403 * @param param_smoothing1_ A non-negative parameter for smoothing
1404 * whose meaning depends on
1405 * select_smoothing_. In
1406 * JELINEK_MERCER_SMOOTHING, it plays the
1407 * role of estimation and in
1408 * DIRICHLET_SMOOTHING the role of query
1409 * modelling. (default JELINEK_MERCER,
1410 * ABSOLUTE, TWOSTAGE(0.7),
1411 * DIRCHLET(2000))
1413 * @param param_smoothing2_ A non-negative parameter which is used
1414 * with TWO_STAGE_SMOOTHING as parameter for Dirichlet's
1415 * smoothing (default: 2000) and as parameter delta to
1416 * control the scale of the tf lower bound in the
1417 * DIRICHLET_PLUS_SMOOTHING (default 0.05).
1420 // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
1421 explicit LMWeight(double param_log_ = 0.0,
1422 type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1423 double param_smoothing1_ = -1.0,
1424 double param_smoothing2_ = -1.0)
1425 : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1426 param_smoothing2(param_smoothing2_)
1428 if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1429 if (param_smoothing2 < 0) {
1430 if (select_smoothing == TWO_STAGE_SMOOTHING)
1431 param_smoothing2 = 2000.0;
1432 else
1433 param_smoothing2 = 0.05;
1435 need_stat(AVERAGE_LENGTH);
1436 need_stat(DOC_LENGTH);
1437 need_stat(COLLECTION_SIZE);
1438 need_stat(RSET_SIZE);
1439 need_stat(TERMFREQ);
1440 need_stat(RELTERMFREQ);
1441 need_stat(DOC_LENGTH_MAX);
1442 need_stat(WDF);
1443 need_stat(WDF_MAX);
1444 need_stat(COLLECTION_FREQ);
1445 if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1446 need_stat(UNIQUE_TERMS);
1447 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1448 need_stat(DOC_LENGTH_MIN);
1451 std::string name() const;
1453 std::string serialise() const;
1454 LMWeight * unserialise(const std::string & s) const;
1456 double get_sumpart(Xapian::termcount wdf,
1457 Xapian::termcount doclen,
1458 Xapian::termcount uniqterm) const;
1459 double get_maxpart() const;
1461 double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
1462 double get_maxextra() const;
1465 /** Xapian::Weight subclass implementing Coordinate Matching.
1467 * Each matching term score one point. See Managing Gigabytes, Second Edition
1468 * p181.
1470 class XAPIAN_VISIBILITY_DEFAULT CoordWeight : public Weight {
1471 /// The factor to multiply weights by.
1472 double factor;
1474 public:
1475 CoordWeight * clone() const ;
1477 void init(double factor_);
1479 /** Construct a CoordWeight. */
1480 CoordWeight() { }
1482 std::string name() const;
1484 std::string serialise() const;
1485 CoordWeight * unserialise(const std::string &) const;
1487 double get_sumpart(Xapian::termcount wdf,
1488 Xapian::termcount doclen,
1489 Xapian::termcount uniqterm) const;
1490 double get_maxpart() const;
1492 double get_sumextra(Xapian::termcount, Xapian::termcount) const;
1493 double get_maxextra() const;
1498 #endif // XAPIAN_INCLUDED_WEIGHT_H