Support: quest -f cjk_ngram
[xapian.git] / xapian-core / weight / tfidfweight.cc
blob87c0ecf7e86acb0a312d15096354597cdc0d31ba
1 /** @file tfidfweight.cc
2 * @brief Xapian::TfIdfWeight class - The TfIdf weighting scheme
3 */
/* Copyright (C) 2013 Aarsh Shah
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
21 #include <config.h>
23 #include "xapian/weight.h"
24 #include <cmath>
25 #include <cstring>
27 #include "debuglog.h"
28 #include "omassert.h"
30 #include "xapian/error.h"
32 using namespace std;
34 namespace Xapian {
36 TfIdfWeight::TfIdfWeight(const std::string &normals)
37 : normalizations(normals)
39 if (normalizations.length() != 3 ||
40 !strchr("nbsl", normalizations[0]) ||
41 !strchr("ntp", normalizations[1]) ||
42 !strchr("n", normalizations[2]))
43 throw Xapian::InvalidArgumentError("Normalization string is invalid");
44 if (normalizations[1] != 'n') {
45 need_stat(TERMFREQ);
46 need_stat(COLLECTION_SIZE);
48 need_stat(WDF);
49 need_stat(WDF_MAX);
52 TfIdfWeight *
53 TfIdfWeight::clone() const
55 return new TfIdfWeight(normalizations);
58 void
59 TfIdfWeight::init(double factor_)
61 factor = factor_;
64 string
65 TfIdfWeight::name() const
67 return "Xapian::TfIdfWeight";
70 string
71 TfIdfWeight::serialise() const
73 return normalizations;
76 TfIdfWeight *
77 TfIdfWeight::unserialise(const string & s) const
79 if (s.length() != 3)
80 throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
81 return new TfIdfWeight(s);
84 double
85 TfIdfWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount,
86 Xapian::termcount) const
88 Xapian::doccount termfreq = 1;
89 if (normalizations[1] != 'n') termfreq = get_termfreq();
90 double wt = get_wdfn(wdf, normalizations[0]) *
91 get_idfn(termfreq, normalizations[1]);
92 return get_wtn(wt, normalizations[2]) * factor;
95 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
96 // and N are constants.
97 double
98 TfIdfWeight::get_maxpart() const
100 Xapian::doccount termfreq = 1;
101 if (normalizations[1] != 'n') termfreq = get_termfreq();
102 Xapian::termcount wdf_max = get_wdf_upper_bound();
103 double wt = get_wdfn(wdf_max, normalizations[0]) *
104 get_idfn(termfreq, normalizations[1]);
105 return get_wtn(wt, normalizations[2]) * factor;
108 // There is no extra per document component in the TfIdfWeighting scheme.
109 double
110 TfIdfWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
112 return 0;
115 double
116 TfIdfWeight::get_maxextra() const
118 return 0;
121 // Return normalized wdf, idf and weight depending on the normalization string.
122 double
123 TfIdfWeight::get_wdfn(Xapian::termcount wdf, char c) const
125 switch (c) {
126 case 'b':
127 if (wdf == 0) return 0;
128 return 1.0;
129 case 's':
130 return (wdf * wdf);
131 case 'l':
132 if (wdf == 0) return 0;
133 return (1 + log(double(wdf)));
134 default:
135 AssertEq(c, 'n');
136 return wdf;
140 double
141 TfIdfWeight::get_idfn(Xapian::doccount termfreq, char c) const
143 double N = 1.0;
144 if (c != 'n') N = get_collection_size();
145 switch (c) {
146 case 'n':
147 return 1.0;
148 case 'p':
149 // All documents are indexed by the term
150 if (N == termfreq) return 0;
151 return log((N - termfreq) / termfreq);
152 default:
153 AssertEq(c, 't');
154 return (log(N / termfreq));
158 double
159 TfIdfWeight::get_wtn(double wt, char c) const
161 (void)c;
162 AssertEq(c, 'n');
163 return wt;