xapian-core/weight/dlhweight.cc

   1 /** @file dlhweight.cc
   2  * @brief Xapian::DLHWeight class - The DLH weighting scheme of the DFR framework.
   3  */
   4 /* Copyright (C) 2013, 2014 Aarsh Shah
   5  * Copyright (C) 2016 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "xapian/weight.h"
  25
  26 #include "xapian/error.h"
  27 #include "common/log2.h"
  28 #include <algorithm>
  29
  30 using namespace std;
  31
  32 namespace Xapian {
  33
  34 DLHWeight *
  35 DLHWeight::clone() const
  36 {
  37     return new DLHWeight();
  38 }
  39
  40 void
  41 DLHWeight::init(double factor)
  42 {
  43     double wdf_upper = get_wdf_upper_bound();
  44     if (wdf_upper == 0) {
  45         upper_bound = 0.0;
  46         return;
  47     }
  48
  49     const double wdf_lower = 1.0;
  50     double len_upper = get_doclength_upper_bound();
  51     double len_lower = get_doclength_lower_bound();
  52
  53     double N = get_collection_size();
  54     double F = get_collection_freq();
  55
  56     // Calculate constant values to be used in get_sumpart().
  57     log_constant = get_average_length() * N / F;
  58     wqf_product_factor = get_wqf() * factor;
  59
  60     // Calculate values for the upper bound.
  61
  62     // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
  63     double max_wdf_over_l = wdf_upper < len_lower ? wdf_upper / len_lower : 1.0;
  64
  65     // First term A: w/(w+.5)*log2(w/l*L) where L=l_avg.N/coll_freq
  66     // Assume log >= 0:
  67     //   w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
  68     //   log2(w/l*L) maximised when w/l maximised
  69     //   so max A at w=w_max, l=max(l_min, w_max)
  70     // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
  71     // factor outside the log.
  72     double logged_expr = max_wdf_over_l * log_constant;
  73     double w_for_A = logged_expr > 1.0 ? wdf_upper : wdf_lower;
  74     double A = w_for_A / (w_for_A + 0.5) * log2(logged_expr);
  75
  76     // Second term B:
  77     //
  78     // (l-w)*log2(1-w/l)
  79     //
  80     // The log is negative, and w <= l so B <= 0 and its maximum is the value
  81     // as close to zero as possible.  So smaller (l-w) is better and smaller
  82     // w/l is better.
  83     //
  84     // This function is ill defined at w=l, but -> 0 as w -> l.
  85     //
  86     // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
  87     double B = 0;
  88     if (len_lower > wdf_upper) {
  89         // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
  90         // and l=len_lower).
  91         //
  92         // The function is also 0 at w = 0 (there must be a local mimina at
  93         // some value of w between 0 and l), so the other candidate is at
  94         // w=wdf_lower.
  95         //
  96         // We need to find the optimum value of l in this case, so
  97         // differentiate the formula by l:
  98         //
  99         // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
 100         //     = (log(1-w/l) + (1-w/l)²)/log(2)
 101         //     = (log(x) + x²)/log(2)     [x=1-w/l]
 102         //
 103         // which is 0 at approx x=0.65291864
 104         //
 105         // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
 106         //
 107         // but l >= w so we can't attain that (and the log isn't valid there).
 108         //
 109         // Gradient at (without loss of generality) l=2*w is:
 110         //       (log(0.5) + 0.25)/log(2)
 111         // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
 112         // candidate is w=wdf_lower and l=len_lower.
 113         //
 114         // So evaluate both candidates and pick the larger:
 115         double B1 = (len_lower - wdf_lower) * log2(1.0 - wdf_lower / len_lower);
 116         double B2 = (len_lower - wdf_upper) * log2(1.0 - wdf_upper / len_lower);
 117         B = max(B1, B2);
 118     }
 119
 120     /* An upper bound of the term used in the third log can be obtained by:
 121      *
 122      * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
 123      *
 124      * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
 125      * log is a monotonically increasing function on positive numbers) can
 126      * be obtained by plugging in the upper bound of the length and
 127      * differentiating the term w.r.t wdf which gives the value of wdf at which
 128      * the function attains maximum value - at wdf = len_upper / 2 (or if the
 129      * wdf can't be that large, at wdf = wdf_upper): */
 130     double wdf_var = min(wdf_upper, len_upper / 2.0);
 131     double max_product = wdf_var * (1.0 - wdf_var / len_upper);
 132 #if 0
 133     /* An upper bound can also be obtained by taking the minimum and maximum
 134      * wdf value in the formula as shown (which isn't useful now as it's always
 135      * >= the bound above, but could be useful if we tracked bounds on wdf/len):
 136      */
 137     double min_wdf_to_len = wdf_lower / len_upper;
 138     double max_product_2 = wdf_upper * (1.0 - min_wdf_to_len);
 139     /* Take the minimum of the two upper bounds. */
 140     max_product = min(max_product, max_product_2);
 141 #endif
 142     double C = 0.5 * log2(2.0 * M_PI * max_product) / (wdf_lower + 0.5);
 143     upper_bound = A + B + C;
 144
 145     if (rare(upper_bound < 0.0))
 146         upper_bound = 0.0;
 147     else
 148         upper_bound *= wqf_product_factor;
 149 }
 150
 151 string
 152 DLHWeight::name() const
 153 {
 154     return "Xapian::DLHWeight";
 155 }
 156
 157 string
 158 DLHWeight::short_name() const
 159 {
 160     return "dlh";
 161 }
 162
 163 string
 164 DLHWeight::serialise() const
 165 {
 166     return string();
 167 }
 168
 169 DLHWeight *
 170 DLHWeight::unserialise(const string& s) const
 171 {
 172     if (rare(!s.empty()))
 173         throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
 174     return new DLHWeight();
 175 }
 176
 177 double
 178 DLHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
 179                        Xapian::termcount) const
 180 {
 181     if (wdf == 0 || wdf == len) return 0.0;
 182
 183     double wdf_to_len = double(wdf) / len;
 184     double one_minus_wdf_to_len = 1.0 - wdf_to_len;
 185
 186     double wt = wdf * log2(wdf_to_len * log_constant) +
 187                 (len - wdf) * log2(one_minus_wdf_to_len) +
 188                 0.5 * log2(2.0 * M_PI * wdf * one_minus_wdf_to_len);
 189     if (rare(wt <= 0.0)) return 0.0;
 190
 191     return wqf_product_factor * wt / (wdf + 0.5);
 192 }
 193
 194 double
 195 DLHWeight::get_maxpart() const
 196 {
 197     return upper_bound;
 198 }
 199
 200 double
 201 DLHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
 202 {
 203     return 0;
 204 }
 205
 206 double
 207 DLHWeight::get_maxextra() const
 208 {
 209     return 0;
 210 }
 211
 212 DLHWeight *
 213 DLHWeight::create_from_parameters(const char * p) const
 214 {
 215     if (*p != '\0')
 216         throw InvalidArgumentError("No parameters are required for DLHWeight");
 217     return new Xapian::DLHWeight();
 218 }
 219
 220 }