HoneyTable::read_item(): Read tag in one go
[xapian.git] / xapian-core / expand / expandweight.h
blob25556eb3d7de4aefd03567b37b1d3eaf01ea6e8e
1 /** @file expandweight.h
2 * @brief Collate statistics and calculate the term weights for the ESet.
3 */
4 /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
25 #include <xapian/database.h>
27 #include "api/termlist.h"
28 #include "internaltypes.h"
30 #include <string>
31 #include <vector>
33 namespace Xapian {
34 namespace Internal {
36 /// Collates statistics while calculating term weight in an ESet.
37 class ExpandStats {
38 /// Which databases in a multidb are included in termfreq.
39 std::vector<bool> dbs_seen;
41 /// Average document length in the whole database.
42 Xapian::doclength avlen;
44 /// The parameter k to be used for TradWeight query expansion.
45 double expand_k;
47 public:
48 /// Size of the subset of a multidb to which the value in termfreq applies.
49 Xapian::doccount dbsize;
51 /// Term frequency (for a multidb, may be for a subset of the databases).
52 Xapian::doccount termfreq;
54 /// The number of times the term occurs in the rset.
55 Xapian::termcount rcollection_freq;
57 /// The number of documents from the RSet indexed by the current term (r).
58 Xapian::doccount rtermfreq;
60 /// The multiplier to be used in TradWeight query expansion.
61 double multiplier;
63 /// Keeps track of the index of the sub-database we're accumulating for.
64 size_t db_index;
66 /// Constructor for expansion schemes which do not require the "expand_k"
67 /// parameter.
68 explicit ExpandStats(Xapian::doclength avlen_)
69 : avlen(avlen_), expand_k(0), dbsize(0), termfreq(0),
70 rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
73 /// Constructor for expansion schemes which require the "expand_k" parameter.
74 ExpandStats(Xapian::doclength avlen_, double expand_k_)
75 : avlen(avlen_), expand_k(expand_k_), dbsize(0), termfreq(0),
76 rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
80 void accumulate(Xapian::termcount wdf, Xapian::termcount doclen,
81 Xapian::doccount subtf, Xapian::doccount subdbsize)
83 // Boolean terms may have wdf == 0, but treat that as 1 so such terms
84 // get a non-zero weight.
85 if (wdf == 0) wdf = 1;
86 ++rtermfreq;
87 rcollection_freq += wdf;
89 multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
91 // If we've not seen this sub-database before, then update dbsize and
92 // termfreq and note that we have seen it.
93 if (db_index >= dbs_seen.size() || !dbs_seen[db_index]) {
94 if (db_index >= dbs_seen.size()) dbs_seen.resize(db_index + 1);
95 dbs_seen[db_index] = true;
96 dbsize += subdbsize;
97 termfreq += subtf;
101 /* Clear the statistics collected in the ExpandStats object before using it
102 * for a new term. */
103 void clear_stats()
105 dbs_seen.clear();
106 dbsize = 0;
107 termfreq = 0;
108 rcollection_freq = 0;
109 rtermfreq = 0;
110 multiplier = 0;
111 db_index = 0;
115 /// Class for calculating probabilistic ESet term weights.
116 class ExpandWeight {
117 /// The combined database.
118 const Xapian::Database db;
120 /// The number of documents in the whole database.
121 Xapian::doccount dbsize;
123 /// Average document length in the whole database.
124 Xapian::doclength avlen;
126 /// The number of documents in the RSet.
127 Xapian::doccount rsize;
129 /// The collection frequency of the term.
130 Xapian::termcount collection_freq;
132 /// The total length of the databse.
133 Xapian::totallength collection_len;
135 /** Should we calculate the exact term frequency when generating an ESet?
137 * This only has any effect if we're using a combined database.
139 * If this member is true, the exact term frequency will be obtained from
140 * the Database object. If this member is false, then an approximation is
141 * used to estimate the term frequency based on the term frequencies in
142 * the sub-databases which we see while collating term statistics, and the
143 * relative sizes of the sub-databases.
145 bool use_exact_termfreq;
147 public:
148 /** Constructor.
150 * @param db_ The database.
151 * @param rsize_ The number of documents in the RSet.
152 * @param use_exact_termfreq_ When expanding over a combined database,
153 * should we use the exact termfreq (if false
154 * a cheaper approximation is used).
156 ExpandWeight(const Xapian::Database &db_,
157 Xapian::doccount rsize_,
158 bool use_exact_termfreq_)
159 : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
160 rsize(rsize_), collection_freq(0),
161 collection_len(avlen * dbsize + .5),
162 use_exact_termfreq(use_exact_termfreq_), stats(avlen) {}
164 /** Constructor.
166 * @param db_ The database.
167 * @param rsize_ The number of documents in the RSet.
168 * @param use_exact_termfreq_ When expanding over a combined database,
169 * should we use the exact termfreq (if false
170 * a cheaper approximation is used).
171 * @param expand_k_ The parameter for TradWeight query expansion.
173 ExpandWeight(const Xapian::Database &db_,
174 Xapian::doccount rsize_,
175 bool use_exact_termfreq_,
176 double expand_k_)
177 : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
178 rsize(rsize_), collection_freq(0),
179 collection_len(avlen * dbsize + .5),
180 use_exact_termfreq(use_exact_termfreq_), stats(avlen, expand_k_) {}
182 /** Get the term statistics.
183 * @param merger The tree of TermList objects.
184 * @param term The current term name.
186 void collect_stats(TermList * merger, const std::string & term);
188 /// Calculate the weight.
189 virtual double get_weight() const = 0;
191 protected:
192 /// An ExpandStats object to accumulate statistics.
193 ExpandStats stats;
195 /// Return the average length of the databse.
196 double get_avlen() const { return avlen; }
198 /// Return the number of documents in the RSet.
199 Xapian::doccount get_rsize() const { return rsize; }
201 /// Return the collection frequency of the term.
202 Xapian::termcount get_collection_freq() const { return collection_freq; }
204 /// Return the length of the collection.
205 Xapian::totallength get_collection_len() const { return collection_len; }
207 /// Return the size of the database.
208 Xapian::doccount get_dbsize() const { return dbsize; }
211 /** This class implements the TradWeight scheme for query expansion.
213 * It is the default scheme for query expansion.
215 class TradEWeight : public ExpandWeight {
216 public:
217 /** Constructor.
219 * @param db_ The database.
220 * @param rsize_ The number of documents in the RSet.
221 * @param use_exact_termfreq_ When expanding over a combined database,
222 * should we use the exact termfreq (if false
223 * a cheaper approximation is used).
224 * @param expand_k_ The parameter for TradWeight query expansion.
226 * All the parameters are passed to the parent ExpandWeight object.
228 TradEWeight(const Xapian::Database &db_,
229 Xapian::doccount rsize_,
230 bool use_exact_termfreq_,
231 double expand_k_)
232 : ExpandWeight(db_, rsize_, use_exact_termfreq_, expand_k_) { }
234 double get_weight() const;
237 /** This class implements the Bo1 scheme for query expansion.
239 * Bo1 is a representative scheme of the Divergence from Randomness Framework
240 * by Gianni Amati.
242 * This is a parameter free weighting scheme for query expansion and it uses
243 * the Bose-Einstein probabilistic distribution.
245 * For more information about the DFR Framework and the Bo1 scheme, please
246 * refer to Gianni Amati's PHD thesis.
248 class Bo1EWeight : public ExpandWeight {
249 public:
250 /** Constructor.
252 * @param db_ The database.
253 * @param rsize_ The number of documents in the RSet.
254 * @param use_exact_termfreq_ When expanding over a combined database,
255 * should we use the exact termfreq (if false
256 * a cheaper approximation is used).
258 * All the parameters are passed to the parent ExpandWeight object.
260 Bo1EWeight(const Xapian::Database &db_,
261 Xapian::doccount rsize_,
262 bool use_exact_termfreq_)
263 : ExpandWeight(db_, rsize_, use_exact_termfreq_) {}
265 double get_weight() const;
271 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H