/** @file weight.h
 * @brief Weighting scheme API.
 */
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
6 * Copyright (C) 2013,2014 Aarsh Shah
7 * Copyright (C) 2016 Vivek Pal
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef XAPIAN_INCLUDED_WEIGHT_H
#define XAPIAN_INCLUDED_WEIGHT_H

#include <string>

#include <xapian/types.h>
#include <xapian/visibility.h>
34 /** Abstract base class for weighting schemes. */
35 class XAPIAN_VISIBILITY_DEFAULT Weight
{
37 /// Stats which the weighting scheme can use (see @a need_stat()).
39 /// Number of documents in the collection.
41 /// Number of documents in the RSet.
43 /// Average length of documents in the collection.
45 /// How many documents the current term is in.
47 /// How many documents in the RSet the current term is in.
49 /// Sum of wqf for terms in the query.
51 /// Within-query-frequency of the current term.
53 /// Within-document-frequency of the current term in the current document.
55 /// Length of the current document (sum wdf).
57 /// Lower bound on (non-zero) document lengths.
59 /// Upper bound on document lengths.
60 DOC_LENGTH_MAX
= 1024,
61 /// Upper bound on wdf.
63 /// Sum of wdf over the whole collection for the current term.
64 COLLECTION_FREQ
= 4096,
65 /// Number of unique terms in the current document.
69 /** Tell Xapian that your subclass will want a particular statistic.
71 * Some of the statistics can be costly to fetch or calculate, so
72 * Xapian needs to know which are actually going to be used. You
73 * should call need_stat() from your constructor for each such
76 * @param flag The stat_flags value for a required statistic.
78 void need_stat(stat_flags flag
) {
79 stats_needed
= stat_flags(stats_needed
| flag
);
82 /** Allow the subclass to perform any initialisation it needs to.
84 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
85 * If the Weight object is for the term-independent
86 * weight supplied by get_sumextra()/get_maxextra(),
87 * then init(0.0) is called (starting from Xapian
88 * 1.2.11 and 1.3.1 - earlier versions failed to
89 * call init() for such Weight objects).
91 virtual void init(double factor
) = 0;
94 /// Don't allow assignment.
95 void operator=(const Weight
&);
97 /// A bitmask of the statistics this weighting scheme needs.
98 stat_flags stats_needed
;
100 /// The number of documents in the collection.
101 Xapian::doccount collection_size_
;
103 /// The number of documents marked as relevant.
104 Xapian::doccount rset_size_
;
106 /// The average length of a document in the collection.
107 Xapian::doclength average_length_
;
109 /// The number of documents which this term indexes.
110 Xapian::doccount termfreq_
;
112 // The collection frequency of the term.
113 Xapian::termcount collectionfreq_
;
115 /// The number of relevant documents which this term indexes.
116 Xapian::doccount reltermfreq_
;
118 /// The length of the query.
119 Xapian::termcount query_length_
;
121 /// The within-query-frequency of this term.
122 Xapian::termcount wqf_
;
124 /// A lower bound on the minimum length of any document in the database.
125 Xapian::termcount doclength_lower_bound_
;
127 /// An upper bound on the maximum length of any document in the database.
128 Xapian::termcount doclength_upper_bound_
;
130 /// An upper bound on the wdf of this term.
131 Xapian::termcount wdf_upper_bound_
;
135 /// Default constructor, needed by subclass constructors.
136 Weight() : stats_needed() { }
138 /** Type of smoothing to use with the Language Model Weighting scheme.
140 * Default is TWO_STAGE_SMOOTHING.
143 TWO_STAGE_SMOOTHING
= 1,
144 DIRICHLET_SMOOTHING
= 2,
145 ABSOLUTE_DISCOUNT_SMOOTHING
= 3,
146 JELINEK_MERCER_SMOOTHING
= 4,
147 DIRICHLET_PLUS_SMOOTHING
= 5
152 /** Virtual destructor, because we have virtual methods. */
155 /** Clone this object.
157 * This method allocates and returns a copy of the object it is called on.
159 * If your subclass is called FooWeight and has parameters a and b, then
160 * you would implement FooWeight::clone() like so:
162 * FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
164 * Note that the returned object will be deallocated by Xapian after use
165 * with "delete". If you want to handle the deletion in a special way
166 * (for example when wrapping the Xapian API for use from another
167 * language) then you can define a static <code>operator delete</code>
168 * method in your subclass as shown here:
169 * https://trac.xapian.org/ticket/554#comment:1
171 virtual Weight
* clone() const = 0;
173 /** Return the name of this weighting scheme.
175 * This name is used by the remote backend. It is passed along with the
176 * serialised parameters to the remote server so that it knows which class
179 * Return the full namespace-qualified name of your class here - if
180 * your class is called FooWeight, return "FooWeight" from this method
181 * (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
183 * If you don't want to support the remote backend, you can use the
184 * default implementation which simply returns an empty string.
186 virtual std::string
name() const;
188 /** Return this object's parameters serialised as a single string.
190 * If you don't want to support the remote backend, you can use the
191 * default implementation which simply throws Xapian::UnimplementedError.
193 virtual std::string
serialise() const;
195 /** Unserialise parameters.
197 * This method unserialises parameters serialised by the @a serialise()
198 * method and allocates and returns a new object initialised with them.
200 * If you don't want to support the remote backend, you can use the
201 * default implementation which simply throws Xapian::UnimplementedError.
203 * Note that the returned object will be deallocated by Xapian after use
204 * with "delete". If you want to handle the deletion in a special way
205 * (for example when wrapping the Xapian API for use from another
206 * language) then you can define a static <code>operator delete</code>
207 * method in your subclass as shown here:
208 * https://trac.xapian.org/ticket/554#comment:1
210 * @param serialised A string containing the serialised parameters.
212 virtual Weight
* unserialise(const std::string
& serialised
) const;
214 /** Calculate the weight contribution for this object's term to a document.
216 * The parameters give information about the document which may be used
217 * in the calculations:
219 * @param wdf The within document frequency of the term in the document.
220 * @param doclen The document's length (unnormalised).
221 * @param uniqterms Number of unique terms in the document (used
222 * for absolute smoothing).
224 virtual double get_sumpart(Xapian::termcount wdf
,
225 Xapian::termcount doclen
,
226 Xapian::termcount uniqterms
) const = 0;
228 /** Return an upper bound on what get_sumpart() can return for any document.
230 * This information is used by the matcher to perform various
231 * optimisations, so strive to make the bound as tight as possible.
233 virtual double get_maxpart() const = 0;
235 /** Calculate the term-independent weight component for a document.
237 * The parameter gives information about the document which may be used
238 * in the calculations:
240 * @param doclen The document's length (unnormalised).
241 * @param uniqterms The number of unique terms in the document.
243 virtual double get_sumextra(Xapian::termcount doclen
,
244 Xapian::termcount uniqterms
) const = 0;
246 /** Return an upper bound on what get_sumextra() can return for any
249 * This information is used by the matcher to perform various
250 * optimisations, so strive to make the bound as tight as possible.
252 virtual double get_maxextra() const = 0;
254 /** @private @internal Initialise this object to calculate weights for term
257 * @param stats Source of statistics.
258 * @param query_len_ Query length.
259 * @param term The term for the new object.
260 * @param wqf_ The within-query-frequency of @a term.
261 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
263 void init_(const Internal
& stats
, Xapian::termcount query_len_
,
264 const std::string
& term
, Xapian::termcount wqf_
,
267 /** @private @internal Initialise this object to calculate weights for a
270 * @param stats Source of statistics.
271 * @param query_len_ Query length.
272 * @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
273 * @param termfreq The termfreq to use.
274 * @param reltermfreq The reltermfreq to use.
275 * @param collection_freq The collection frequency to use.
277 void init_(const Internal
& stats
, Xapian::termcount query_len_
,
278 double factor
, Xapian::doccount termfreq
,
279 Xapian::doccount reltermfreq
, Xapian::termcount collection_freq
);
281 /** @private @internal Initialise this object to calculate the extra weight
284 * @param stats Source of statistics.
285 * @param query_len_ Query length.
287 void init_(const Internal
& stats
, Xapian::termcount query_len_
);
289 /** @private @internal Return true if the document length is needed.
291 * If this method returns true, then the document length will be fetched
292 * and passed to @a get_sumpart(). Otherwise 0 may be passed for the
295 bool get_sumpart_needs_doclength_() const {
296 return stats_needed
& DOC_LENGTH
;
299 /** @private @internal Return true if the WDF is needed.
301 * If this method returns true, then the WDF will be fetched and passed to
302 * @a get_sumpart(). Otherwise 0 may be passed for the wdf.
304 bool get_sumpart_needs_wdf_() const {
305 return stats_needed
& WDF
;
308 /** @private @internal Return true if the number of unique terms is needed.
310 * If this method returns true, then the number of unique terms will be
311 * fetched and passed to @a get_sumpart(). Otherwise 0 may be passed for
312 * the number of unique terms.
314 bool get_sumpart_needs_uniqueterms_() const {
315 return stats_needed
& UNIQUE_TERMS
;
319 /** Don't allow copying.
321 * This would ideally be private, but that causes a compilation error
322 * with GCC 4.1 (which appears to be a bug).
324 Weight(const Weight
&);
326 /// The number of documents in the collection.
327 Xapian::doccount
get_collection_size() const { return collection_size_
; }
329 /// The number of documents marked as relevant.
330 Xapian::doccount
get_rset_size() const { return rset_size_
; }
332 /// The average length of a document in the collection.
333 Xapian::doclength
get_average_length() const { return average_length_
; }
335 /// The number of documents which this term indexes.
336 Xapian::doccount
get_termfreq() const { return termfreq_
; }
338 /// The number of relevant documents which this term indexes.
339 Xapian::doccount
get_reltermfreq() const { return reltermfreq_
; }
341 /// The collection frequency of the term.
342 Xapian::termcount
get_collection_freq() const { return collectionfreq_
; }
344 /// The length of the query.
345 Xapian::termcount
get_query_length() const { return query_length_
; }
347 /// The within-query-frequency of this term.
348 Xapian::termcount
get_wqf() const { return wqf_
; }
350 /** An upper bound on the maximum length of any document in the database.
352 * This should only be used by get_maxpart() and get_maxextra().
354 Xapian::termcount
get_doclength_upper_bound() const {
355 return doclength_upper_bound_
;
358 /** A lower bound on the minimum length of any document in the database.
360 * This bound does not include any zero-length documents.
362 * This should only be used by get_maxpart() and get_maxextra().
364 Xapian::termcount
get_doclength_lower_bound() const {
365 return doclength_lower_bound_
;
368 /** An upper bound on the wdf of this term.
370 * This should only be used by get_maxpart() and get_maxextra().
372 Xapian::termcount
get_wdf_upper_bound() const {
373 return wdf_upper_bound_
;
377 /** Class implementing a "boolean" weighting scheme.
379 * This weighting scheme gives all documents zero weight.
381 class XAPIAN_VISIBILITY_DEFAULT BoolWeight
: public Weight
{
382 BoolWeight
* clone() const;
384 void init(double factor
);
387 /** Construct a BoolWeight. */
390 std::string
name() const;
392 std::string
serialise() const;
393 BoolWeight
* unserialise(const std::string
& serialised
) const;
395 double get_sumpart(Xapian::termcount wdf
,
396 Xapian::termcount doclen
,
397 Xapian::termcount uniqterms
) const;
398 double get_maxpart() const;
400 double get_sumextra(Xapian::termcount doclen
,
401 Xapian::termcount uniqterms
) const;
402 double get_maxextra() const;
405 /// Xapian::Weight subclass implementing the tf-idf weighting scheme.
406 class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight
: public Weight
{
407 /* Three character string indicating the normalizations for tf(wdf), idf and
409 std::string normalizations
;
411 /// The factor to multiply with the weight.
414 TfIdfWeight
* clone() const;
416 void init(double factor
);
418 /* When additional normalizations are implemented in the future, the additional statistics for them
419 should be accessed by these functions. */
420 double get_wdfn(Xapian::termcount wdf
, char c
) const;
421 double get_idfn(Xapian::doccount termfreq
, char c
) const;
422 double get_wtn(double wt
, char c
) const;
425 /** Construct a TfIdfWeight
427 * @param normalizations A three character string indicating the
428 * normalizations to be used for the tf(wdf), idf
429 * and document weight. (default: "ntn")
431 * The @a normalizations string works like so:
433 * @li The first character specifies the normalization for the wdf. The
434 * following normalizations are currently supported:
436 * @li 'n': None. wdfn=wdf
437 * @li 'b': Boolean wdfn=1 if term in document else wdfn=0
438 * @li 's': Square wdfn=wdf*wdf
439 * @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
441 * The Max-wdf and Augmented Max wdf normalizations haven't yet been
444 * @li The second character indicates the normalization for the idf. The
445 * following normalizations are currently supported:
447 * @li 'n': None idfn=1
448 * @li 't': TfIdf idfn=log(N/Termfreq) where N is the number of
449 * documents in collection and Termfreq is the number of documents
450 * which are indexed by the term t.
451 * @li 'p': Prob idfn=log((N-Termfreq)/Termfreq)
452 * @li 'f': Freq idfn=1/Termfreq
453 * @li 's': Squared idfn=log(N/Termfreq)^2
455 * @li The third and the final character indicates the normalization for
456 * the document weight. The following normalizations are currently
459 * @li 'n': None wtn=tfn*idfn
461 * Implementing support for more normalizations of each type would require
462 * extending the backend to track more statistics.
464 explicit TfIdfWeight(const std::string
&normalizations
);
466 /** Construct a TfIdfWeight using the default normalizations ("ntn"). */
468 : normalizations("ntn")
473 need_stat(COLLECTION_SIZE
);
476 std::string
name() const;
478 std::string
serialise() const;
479 TfIdfWeight
* unserialise(const std::string
& serialised
) const;
481 double get_sumpart(Xapian::termcount wdf
,
482 Xapian::termcount doclen
,
483 Xapian::termcount uniqterm
) const;
484 double get_maxpart() const;
486 double get_sumextra(Xapian::termcount doclen
,
487 Xapian::termcount uniqterms
) const;
488 double get_maxextra() const;
492 /// Xapian::Weight subclass implementing the BM25 probabilistic formula.
493 class XAPIAN_VISIBILITY_DEFAULT BM25Weight
: public Weight
{
494 /// Factor to multiply the document length by.
495 mutable Xapian::doclength len_factor
;
497 /// Factor combining all the document independent factors.
498 mutable double termweight
;
500 /// The BM25 parameters.
501 double param_k1
, param_k2
, param_k3
, param_b
;
503 /// The minimum normalised document length value.
504 Xapian::doclength param_min_normlen
;
506 BM25Weight
* clone() const;
508 void init(double factor
);
511 /** Construct a BM25Weight.
513 * @param k1 A non-negative parameter controlling how influential
514 * within-document-frequency (wdf) is. k1=0 means that
515 * wdf doesn't affect the weights. The larger k1 is, the more
516 * wdf influences the weights. (default 1)
518 * @param k2 A non-negative parameter which controls the strength of a
519 * correction factor which depends upon query length and
520 * normalised document length. k2=0 disable this factor; larger
521 * k2 makes it stronger. (default 0)
523 * @param k3 A non-negative parameter controlling how influential
524 * within-query-frequency (wqf) is. k3=0 means that wqf
525 * doesn't affect the weights. The larger k3 is, the more
526 * wqf influences the weights. (default 1)
528 * @param b A parameter between 0 and 1, controlling how strong the
529 * document length normalisation of wdf is. 0 means no
530 * normalisation; 1 means full normalisation. (default 0.5)
532 * @param min_normlen A parameter specifying a minimum value for
533 * normalised document length. Normalised document length
534 * values less than this will be clamped to this value, helping
535 * to prevent very short documents getting large weights.
538 BM25Weight(double k1
, double k2
, double k3
, double b
, double min_normlen
)
539 : param_k1(k1
), param_k2(k2
), param_k3(k3
), param_b(b
),
540 param_min_normlen(min_normlen
)
542 if (param_k1
< 0) param_k1
= 0;
543 if (param_k2
< 0) param_k2
= 0;
544 if (param_k3
< 0) param_k3
= 0;
547 } else if (param_b
> 1) {
550 need_stat(COLLECTION_SIZE
);
551 need_stat(RSET_SIZE
);
553 need_stat(RELTERMFREQ
);
556 if (param_k2
!= 0 || (param_k1
!= 0 && param_b
!= 0)) {
557 need_stat(DOC_LENGTH_MIN
);
558 need_stat(AVERAGE_LENGTH
);
560 if (param_k1
!= 0 && param_b
!= 0) need_stat(DOC_LENGTH
);
561 if (param_k2
!= 0) need_stat(QUERY_LENGTH
);
562 if (param_k3
!= 0) need_stat(WQF
);
566 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
567 param_min_normlen(0.5)
569 need_stat(COLLECTION_SIZE
);
570 need_stat(RSET_SIZE
);
572 need_stat(RELTERMFREQ
);
575 need_stat(DOC_LENGTH_MIN
);
576 need_stat(AVERAGE_LENGTH
);
577 need_stat(DOC_LENGTH
);
581 std::string
name() const;
583 std::string
serialise() const;
584 BM25Weight
* unserialise(const std::string
& serialised
) const;
586 double get_sumpart(Xapian::termcount wdf
,
587 Xapian::termcount doclen
,
588 Xapian::termcount uniqterm
) const;
589 double get_maxpart() const;
591 double get_sumextra(Xapian::termcount doclen
,
592 Xapian::termcount uniqterms
) const;
593 double get_maxextra() const;
596 /// Xapian::Weight subclass implementing the BM25+ probabilistic formula.
597 class XAPIAN_VISIBILITY_DEFAULT BM25PlusWeight
: public Weight
{
598 /// Factor to multiply the document length by.
599 mutable Xapian::doclength len_factor
;
601 /// Factor combining all the document independent factors.
602 mutable double termweight
;
604 /// The BM25+ parameters.
605 double param_k1
, param_k2
, param_k3
, param_b
;
607 /// The minimum normalised document length value.
608 Xapian::doclength param_min_normlen
;
610 /// Additional parameter delta in the BM25+ formula.
613 BM25PlusWeight
* clone() const;
615 void init(double factor
);
618 /** Construct a BM25PlusWeight.
620 * @param k1 A non-negative parameter controlling how influential
621 * within-document-frequency (wdf) is. k1=0 means that
622 * wdf doesn't affect the weights. The larger k1 is, the more
623 * wdf influences the weights. (default 1)
625 * @param k2 A non-negative parameter which controls the strength of a
626 * correction factor which depends upon query length and
627 * normalised document length. k2=0 disable this factor; larger
628 * k2 makes it stronger. The paper which describes BM25+
629 * ignores BM25's document-independent component (so implicitly
630 * k2=0), but we support non-zero k2 too. (default 0)
632 * @param k3 A non-negative parameter controlling how influential
633 * within-query-frequency (wqf) is. k3=0 means that wqf
634 * doesn't affect the weights. The larger k3 is, the more
635 * wqf influences the weights. (default 1)
637 * @param b A parameter between 0 and 1, controlling how strong the
638 * document length normalisation of wdf is. 0 means no
639 * normalisation; 1 means full normalisation. (default 0.5)
641 * @param min_normlen A parameter specifying a minimum value for
642 * normalised document length. Normalised document length
643 * values less than this will be clamped to this value, helping
644 * to prevent very short documents getting large weights.
647 * @param delta A parameter for pseudo tf value to control the scale
648 * of the tf lower bound. Delta(δ) can be tuned for example
649 * from 0.0 to 1.5 but BM25+ can still work effectively
650 * across collections with a fixed δ = 1.0. (default 1.0)
652 BM25PlusWeight(double k1
, double k2
, double k3
, double b
,
653 double min_normlen
, double delta
)
654 : param_k1(k1
), param_k2(k2
), param_k3(k3
), param_b(b
),
655 param_min_normlen(min_normlen
), param_delta(delta
)
657 if (param_k1
< 0) param_k1
= 0;
658 if (param_k2
< 0) param_k2
= 0;
659 if (param_k3
< 0) param_k3
= 0;
660 if (param_delta
< 0) param_delta
= 0;
663 } else if (param_b
> 1) {
666 need_stat(COLLECTION_SIZE
);
667 need_stat(RSET_SIZE
);
669 need_stat(RELTERMFREQ
);
672 if (param_k2
!= 0 || (param_k1
!= 0 && param_b
!= 0)) {
673 need_stat(DOC_LENGTH_MIN
);
674 need_stat(AVERAGE_LENGTH
);
676 if (param_k1
!= 0 && param_b
!= 0) need_stat(DOC_LENGTH
);
677 if (param_k2
!= 0) need_stat(QUERY_LENGTH
);
678 if (param_k3
!= 0) need_stat(WQF
);
679 if (param_delta
!= 0) {
680 need_stat(AVERAGE_LENGTH
);
681 need_stat(DOC_LENGTH
);
687 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
688 param_min_normlen(0.5), param_delta(1)
690 need_stat(COLLECTION_SIZE
);
691 need_stat(RSET_SIZE
);
693 need_stat(RELTERMFREQ
);
696 need_stat(DOC_LENGTH_MIN
);
697 need_stat(AVERAGE_LENGTH
);
698 need_stat(DOC_LENGTH
);
702 std::string
name() const;
704 std::string
serialise() const;
705 BM25PlusWeight
* unserialise(const std::string
& serialised
) const;
707 double get_sumpart(Xapian::termcount wdf
,
708 Xapian::termcount doclen
,
709 Xapian::termcount uniqterm
) const;
710 double get_maxpart() const;
712 double get_sumextra(Xapian::termcount doclen
,
713 Xapian::termcount uniqterms
) const;
714 double get_maxextra() const;
717 /** Xapian::Weight subclass implementing the traditional probabilistic formula.
719 * This class implements the "traditional" Probabilistic Weighting scheme, as
720 * described by the early papers on Probabilistic Retrieval. BM25 generally
721 * gives better results.
723 * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
724 * the latter returns weights (k+1) times larger.
726 class XAPIAN_VISIBILITY_DEFAULT TradWeight
: public Weight
{
727 /// Factor to multiply the document length by.
728 mutable Xapian::doclength len_factor
;
730 /// Factor combining all the document independent factors.
731 mutable double termweight
;
733 /// The parameter in the formula.
736 TradWeight
* clone() const;
738 void init(double factor
);
741 /** Construct a TradWeight.
743 * @param k A non-negative parameter controlling how influential
744 * within-document-frequency (wdf) and document length are.
745 * k=0 means that wdf and document length don't affect the
746 * weights. The larger k is, the more they do. (default 1)
748 explicit TradWeight(double k
= 1.0) : param_k(k
) {
749 if (param_k
< 0) param_k
= 0;
750 if (param_k
!= 0.0) {
751 need_stat(AVERAGE_LENGTH
);
752 need_stat(DOC_LENGTH
);
754 need_stat(COLLECTION_SIZE
);
755 need_stat(RSET_SIZE
);
757 need_stat(RELTERMFREQ
);
758 need_stat(DOC_LENGTH_MIN
);
763 std::string
name() const;
765 std::string
serialise() const;
766 TradWeight
* unserialise(const std::string
& serialised
) const;
768 double get_sumpart(Xapian::termcount wdf
,
769 Xapian::termcount doclen
,
770 Xapian::termcount uniqueterms
) const;
771 double get_maxpart() const;
773 double get_sumextra(Xapian::termcount doclen
,
774 Xapian::termcount uniqterms
) const;
775 double get_maxextra() const;
778 /** This class implements the InL2 weighting scheme.
780 * InL2 is a representative scheme of the Divergence from Randomness Framework
783 * This weighting scheme is useful for tasks that require early precision.
785 * It uses the Inverse document frequency model (In), the Laplace method to
786 * find the aftereffect of sampling (L) and the second wdf normalization
787 * proposed by Amati to normalize the wdf in the document to the length of the
790 * For more information about the DFR Framework and the InL2 scheme, please
791 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
792 * models of information retrieval based on measuring the divergence from
793 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
796 class XAPIAN_VISIBILITY_DEFAULT InL2Weight
: public Weight
{
797 /// The wdf normalization parameter in the formula.
800 /// The upper bound on the weight a term can give to a document.
803 /// The constant values which are used on every call to get_sumpart().
804 double wqf_product_idf
;
805 double c_product_avlen
;
807 InL2Weight
* clone() const;
809 void init(double factor
);
812 /** Construct an InL2Weight.
814 * @param c A non-negative and non zero parameter controlling the extent
815 * of the normalization of the wdf to the document length. The
816 * default value of 1 is suitable for longer queries but it may
817 * need to be changed for shorter queries. For more information,
818 * please refer to Gianni Amati's PHD thesis.
820 explicit InL2Weight(double c
);
825 need_stat(AVERAGE_LENGTH
);
826 need_stat(DOC_LENGTH
);
827 need_stat(DOC_LENGTH_MIN
);
828 need_stat(DOC_LENGTH_MAX
);
829 need_stat(COLLECTION_SIZE
);
836 std::string
name() const;
838 std::string
serialise() const;
839 InL2Weight
* unserialise(const std::string
& serialised
) const;
841 double get_sumpart(Xapian::termcount wdf
,
842 Xapian::termcount doclen
,
843 Xapian::termcount uniqterms
) const;
844 double get_maxpart() const;
846 double get_sumextra(Xapian::termcount doclen
,
847 Xapian::termcount uniqterms
) const;
848 double get_maxextra() const;
851 /** This class implements the IfB2 weighting scheme.
853 * IfB2 is a representative scheme of the Divergence from Randomness Framework
856 * It uses the Inverse term frequency model (If), the Bernoulli method to find
857 * the aftereffect of sampling (B) and the second wdf normalization proposed
858 * by Amati to normalize the wdf in the document to the length of the document
861 * For more information about the DFR Framework and the IfB2 scheme, please
862 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
863 * models of information retrieval based on measuring the divergence from
864 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
867 class XAPIAN_VISIBILITY_DEFAULT IfB2Weight
: public Weight
{
868 /// The wdf normalization parameter in the formula.
871 /// The upper bound on the weight.
874 /// The constant values which are used for calculations in get_sumpart().
875 double wqf_product_idf
;
876 double c_product_avlen
;
879 IfB2Weight
* clone() const;
881 void init(double factor
);
884 /** Construct an IfB2Weight.
886 * @param c A non-negative and non zero parameter controlling the extent
887 * of the normalization of the wdf to the document length. The
888 * default value of 1 is suitable for longer queries but it may
889 * need to be changed for shorter queries. For more information,
890 * please refer to Gianni Amati's PHD thesis titled
891 * Probabilistic Models for Information Retrieval based on
892 * Divergence from Randomness.
894 explicit IfB2Weight(double c
);
896 IfB2Weight() : param_c(1.0) {
897 need_stat(AVERAGE_LENGTH
);
898 need_stat(DOC_LENGTH
);
899 need_stat(DOC_LENGTH_MIN
);
900 need_stat(DOC_LENGTH_MAX
);
901 need_stat(COLLECTION_SIZE
);
902 need_stat(COLLECTION_FREQ
);
909 std::string
name() const;
911 std::string
serialise() const;
912 IfB2Weight
* unserialise(const std::string
& serialised
) const;
914 double get_sumpart(Xapian::termcount wdf
,
915 Xapian::termcount doclen
,
916 Xapian::termcount uniqterm
) const;
917 double get_maxpart() const;
919 double get_sumextra(Xapian::termcount doclen
,
920 Xapian::termcount uniqterms
) const;
921 double get_maxextra() const;
924 /** This class implements the IneB2 weighting scheme.
926 * IneB2 is a representative scheme of the Divergence from Randomness
927 * Framework by Gianni Amati.
929 * It uses the Inverse expected document frequency model (Ine), the Bernoulli
930 * method to find the aftereffect of sampling (B) and the second wdf
931 * normalization proposed by Amati to normalize the wdf in the document to the
932 * length of the document (H2).
934 * For more information about the DFR Framework and the IneB2 scheme, please
935 * refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
936 * models of information retrieval based on measuring the divergence from
937 * randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
940 class XAPIAN_VISIBILITY_DEFAULT IneB2Weight
: public Weight
{
941 /// The wdf normalization parameter in the formula.
944 /// The upper bound of the weight.
947 /// Constant values used in get_sumpart().
948 double wqf_product_idf
;
949 double c_product_avlen
;
952 IneB2Weight
* clone() const;
954 void init(double factor
);
957 /** Construct an IneB2Weight.
959 * @param c A non-negative and non zero parameter controlling the extent
960 * of the normalization of the wdf to the document length. The
961 * default value of 1 is suitable for longer queries but it may
962 * need to be changed for shorter queries. For more information,
963 * please refer to Gianni Amati's PHD thesis.
965 explicit IneB2Weight(double c
);
967 IneB2Weight() : param_c(1.0) {
968 need_stat(AVERAGE_LENGTH
);
969 need_stat(DOC_LENGTH
);
970 need_stat(DOC_LENGTH_MIN
);
971 need_stat(DOC_LENGTH_MAX
);
972 need_stat(COLLECTION_SIZE
);
976 need_stat(COLLECTION_FREQ
);
980 std::string
name() const;
982 std::string
serialise() const;
983 IneB2Weight
* unserialise(const std::string
& serialised
) const;
985 double get_sumpart(Xapian::termcount wdf
,
986 Xapian::termcount doclen
,
987 Xapian::termcount uniqterms
) const;
988 double get_maxpart() const;
990 double get_sumextra(Xapian::termcount doclen
,
991 Xapian::termcount uniqterms
) const;
992 double get_maxextra() const;
/** This class implements the BB2 weighting scheme.
 *
 *  BB2 is a representative scheme of the Divergence from Randomness Framework.
 *
 *  It uses the Bose-Einstein probabilistic distribution (B) along with
 *  Stirling's power approximation, the Bernoulli method to find the
 *  aftereffect of sampling (B) and the second wdf normalization proposed by
 *  Amati to normalize the wdf in the document to the length of the document.
 *
 *  For more information about the DFR Framework and the BB2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002.
 */
class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations under the comments below, the 'public:' access
    // specifier, some need_stat() calls, and closing braces).  Gaps are
    // flagged below; restore from the canonical xapian/weight.h.

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// The constant values to be used in get_sumpart().
    double c_product_avlen;
    // [gap: further constant member declaration(s) lost from this copy]
    double stirling_constant_1;
    double stirling_constant_2;

    BB2Weight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a BB2Weight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  A
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     */
    explicit BB2Weight(double c);

    /** Construct a BB2Weight using the default parameter (c = 1.0).
     *
     *  Registers the statistics this scheme needs so that init() can
     *  precompute its per-term constants.
     */
    BB2Weight() : param_c(1.0) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(TERMFREQ);
        // [gap: this constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    BB2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the DLH weighting scheme, which is a representative
 *  scheme of the Divergence from Randomness Framework by Gianni Amati.
 *
 *  This is a parameter-free weighting scheme and it should be used with query
 *  expansion to obtain better results.  It uses the HyperGeometric
 *  Probabilistic model and Laplace's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DLH scheme, please
 *  refer to:
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations, the 'public:' access specifier, the default
    // constructor's opening line, and closing braces).  Gaps are flagged
    // below; restore from the canonical xapian/weight.h.

    /// Now unused but left in place in 1.4.x for ABI compatibility.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// The constant value to be used in get_sumpart().
    double log_constant;
    double wqf_product_factor;

    DLHWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier and the default constructor's
    //  opening line lost from this copy; the statement list below is the
    //  default constructor's body]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    DLHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the PL2 weighting scheme.
 *
 *  PL2 is a representative scheme of the Divergence from Randomness Framework.
 *
 *  This weighting scheme is useful for tasks that require early precision.
 *
 *  It uses the Poisson approximation of the Binomial Probabilistic
 *  distribution (P) along with Stirling's approximation for the factorial
 *  value, the Laplace method to find the aftereffect of sampling (L) and the
 *  second wdf normalization proposed by Amati to normalize the wdf in the
 *  document to the length of the document (H2).
 *
 *  For more information about the DFR Framework and the PL2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness", ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (all the data member declarations under the comments below, the
    // 'public:' access specifier, some need_stat() calls, and closing
    // braces).  Gaps are flagged below; restore from the canonical
    // xapian/weight.h.

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /** The factor to multiply weights by.
     *
     *  The misleading name is due to this having been used to store a lower
     *  bound in 1.4.0.  We no longer need to store that, and so this member
     *  has been repurposed in 1.4.1 and later (but the name left the same to
     *  ensure ABI compatibility with 1.4.0).
     */
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Constants for a given term in a given query.
    // [gap: member declaration(s) lost from this copy]

    /// Set by init() to (param_c * get_average_length())
    // [gap: member declaration lost from this copy]

    PL2Weight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a PL2Weight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  The
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     */
    explicit PL2Weight(double c);

    /** Construct a PL2Weight using the default parameter (c = 1.0).
     *
     *  Registers the statistics this scheme needs so that init() can
     *  precompute its per-term constants.
     */
    PL2Weight() : param_c(1.0) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: remaining need_stat() call(s) and this constructor's
        //  closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    PL2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/// Xapian::Weight subclass implementing the PL2+ probabilistic formula.
class XAPIAN_VISIBILITY_DEFAULT PL2PlusWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (all the data member declarations under the comments below, the
    // 'public:' access specifier, the default constructor's name line, and
    // closing braces).  Gaps are flagged below; restore from the canonical
    // xapian/weight.h.

    /// The factor to multiply weights by.
    // [gap: member declaration lost from this copy]

    /// The wdf normalization parameter in the formula.
    // [gap: member declaration lost from this copy]

    /// Additional parameter delta in the PL2+ weighting formula.
    // [gap: member declaration lost from this copy]

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Constants for a given term in a given query.
    // [gap: member declaration(s) lost from this copy]

    /// Set by init() to (param_c * get_average_length())
    // [gap: member declaration lost from this copy]

    /// Set by init() to get_collection_freq() / get_collection_size()
    // [gap: member declaration lost from this copy]

    /// Weight contribution of delta term in the PL2+ function
    // [gap: member declaration lost from this copy]

    PL2PlusWeight * clone() const;

    void init(double factor_);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a PL2PlusWeight.
     *
     * @param c A non-negative and non-zero parameter controlling the extent
     *          of the normalization of the wdf to the document length.  The
     *          default value of 1 is suitable for longer queries but it may
     *          need to be changed for shorter queries.  For more information,
     *          please refer to Gianni Amati's PhD thesis titled
     *          Probabilistic Models for Information Retrieval based on
     *          Divergence from Randomness.
     *
     * @param delta A parameter for pseudo tf value to control the scale
     *          of the tf lower bound.  Delta(δ) should be a positive
     *          real number.  It can be tuned for example from 0.1 to 1.5
     *          in increments of 0.1 or so.  Experiments have shown that
     *          PL2+ works effectively across collections with a fixed
     *          δ = 0.8.
     */
    PL2PlusWeight(double c, double delta);

    // [gap: the default constructor's declaration line lost from this copy;
    //  its initializer list and body follow]
        : param_c(1.0), param_delta(0.8) {
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: remaining need_stat() call(s) and the constructor's
        //  closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    PL2PlusWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** This class implements the DPH weighting scheme.
 *
 *  DPH is a representative scheme of the Divergence from Randomness Framework.
 *
 *  This is a parameter-free weighting scheme and it should be used with query
 *  expansion to obtain better results.  It uses the HyperGeometric
 *  Probabilistic model and Popper's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DPH scheme, please
 *  refer to:
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen,
 *  "Probabilistic models of information retrieval based on measuring the
 *  divergence from randomness", ACM Transactions on Information Systems
 *  (TOIS) 20, (4), 2002, pp. 357-389.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (member declarations, the 'public:' access specifier, the default
    // constructor's opening line, and closing braces).  Gaps are flagged
    // below; restore from the canonical xapian/weight.h.

    /// The upper bound on the weight.
    // [gap: member declaration lost from this copy]

    /// Now unused but left in place in 1.4.x for ABI compatibility.
    // [gap: member declaration lost from this copy]

    /// The constant value used in get_sumpart().
    double log_constant;
    double wqf_product_factor;

    DPHWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a DPHWeight. */
    // [gap: the default constructor's opening line lost from this copy;
    //  the statement list below is its body]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(COLLECTION_FREQ);
        // [gap: need_stat() line(s) lost from this copy]
        need_stat(DOC_LENGTH_MIN);
        need_stat(DOC_LENGTH_MAX);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    DPHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
                        Xapian::termcount uniqterms) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** Xapian::Weight subclass implementing the Language Model formula.
 *
 *  This class implements the "Language Model" Weighting scheme, as
 *  described by the early papers on LM by Bruce Croft.
 *
 *  LM works by comparing the query to a Language Model of the document.
 *  The language model itself is parameter-free, though LMWeight takes
 *  parameters which specify the smoothing used.
 */
class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (the 'public:' access specifier, the constructor body's opening '{',
    // an 'else' line, and closing braces).  Gaps are flagged below; restore
    // from the canonical xapian/weight.h.

    /** The type of smoothing to use. */
    type_smoothing select_smoothing;

    // Parameters for handling negative value of log, and for smoothing.
    double param_log, param_smoothing1, param_smoothing2;

    /** The factor to multiply weights by.
     *
     *  The misleading name is due to this having been used to store some
     *  other value in 1.4.0.  However, that value only takes one
     *  multiplication and one division to calculate, so for 1.4.x we can just
     *  recalculate it each time we need it, and so this member has been
     *  repurposed in 1.4.1 and later (but the name left the same to ensure ABI
     *  compatibility with 1.4.0).
     */
    double weight_collection;

    LMWeight * clone() const;

    void init(double factor);

    // [gap: 'public:' access specifier lost from this copy]

    /** Construct a LMWeight.
     *
     * @param param_log_ A non-negative parameter controlling how much
     *			to clamp negative values returned by the log.
     *			The log is calculated by multiplying the
     *			actual weight by param_log.  If param_log is
     *			0.0, then the document length upper bound will
     *			be used (default: document length upper bound).
     *
     * @param select_smoothing_ A parameter of type enum
     *			type_smoothing.  This parameter
     *			controls which smoothing type to use.
     *			(default: TWO_STAGE_SMOOTHING)
     *
     * @param param_smoothing1_ A non-negative parameter for smoothing
     *			whose meaning depends on
     *			select_smoothing_.  In
     *			JELINEK_MERCER_SMOOTHING, it plays the
     *			role of estimation and in
     *			DIRICHLET_SMOOTHING the role of query
     *			modelling.  (default JELINEK_MERCER,
     *			ABSOLUTE, TWOSTAGE(0.7))
     *
     * @param param_smoothing2_ A non-negative parameter which is used
     *			with TWO_STAGE_SMOOTHING as parameter for Dirichlet's
     *			smoothing (default: 2000) and as parameter delta to
     *			control the scale of the tf lower bound in the
     *			DIRICHLET_PLUS_SMOOTHING (default 0.05).
     */
    // Unigram LM Constructor to specifically mention all parameters for
    // handling negative log value and smoothing.
    explicit LMWeight(double param_log_ = 0.0,
                      type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
                      double param_smoothing1_ = -1.0,
                      double param_smoothing2_ = -1.0)
        : select_smoothing(select_smoothing_), param_log(param_log_),
          param_smoothing1(param_smoothing1_),
          param_smoothing2(param_smoothing2_)
    // [gap: the constructor body's opening '{' lost from this copy]
        // A negative smoothing parameter means "use the default for the
        // selected smoothing type".
        if (param_smoothing1 < 0) param_smoothing1 = 0.7;
        if (param_smoothing2 < 0) {
            if (select_smoothing == TWO_STAGE_SMOOTHING)
                param_smoothing2 = 2000.0;
            // [gap: 'else' line lost from this copy]
                param_smoothing2 = 0.05;
        // [gap: closing '}' of this 'if' lost from this copy]
        need_stat(AVERAGE_LENGTH);
        need_stat(DOC_LENGTH);
        need_stat(COLLECTION_SIZE);
        need_stat(RSET_SIZE);
        need_stat(TERMFREQ);
        need_stat(RELTERMFREQ);
        need_stat(DOC_LENGTH_MAX);
        // [gap: line(s) lost from this copy]
        need_stat(COLLECTION_FREQ);
        if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
            need_stat(UNIQUE_TERMS);
        if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
            need_stat(DOC_LENGTH_MIN);
        // [gap: the constructor's closing '}' lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    LMWeight * unserialise(const std::string & s) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
/** Xapian::Weight subclass implementing Coordinate Matching.
 *
 *  Each matching term scores one point.  See Managing Gigabytes, Second
 *  Edition.
 */
class XAPIAN_VISIBILITY_DEFAULT CoordWeight : public Weight {
    // NOTE(review): several lines were lost from this copy of this class
    // (the data member declaration, the 'public:' access specifier, the
    // default constructor's definition, and the closing '};').  Gaps are
    // flagged below; restore from the canonical xapian/weight.h.

    /// The factor to multiply weights by.
    // [gap: member declaration lost from this copy]

    // [gap: 'public:' access specifier lost from this copy]

    CoordWeight * clone() const;

    void init(double factor_);

    /** Construct a CoordWeight. */
    // [gap: the default constructor's definition lost from this copy]

    std::string name() const;

    std::string serialise() const;
    /// Unserialise parameters serialised by serialise().
    CoordWeight * unserialise(const std::string &) const;

    double get_sumpart(Xapian::termcount wdf,
                       Xapian::termcount doclen,
                       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount, Xapian::termcount) const;
    double get_maxextra() const;
    // [gap: this class's closing '};' lost from this copy]
#endif // XAPIAN_INCLUDED_WEIGHT_H