1 /** @file expandweight.h
2 * @brief Collate statistics and calculate the term weights for the ESet.
4 /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
25 #include <xapian/database.h>
27 #include "api/termlist.h"
28 #include "internaltypes.h"
36 /// Collates statistics while calculating term weight in an ESet.
38 /// Which databases in a multidb are included in termfreq.
39 std::vector
<bool> dbs_seen
;
41 /// Average document length in the whole database.
42 Xapian::doclength avlen
;
44 /// The parameter k to be used for TradWeight query expansion.
48 /// Size of the subset of a multidb to which the value in termfreq applies.
49 Xapian::doccount dbsize
;
51 /// Term frequency (for a multidb, may be for a subset of the databases).
52 Xapian::doccount termfreq
;
54 /// The number of times the term occurs in the rset.
55 Xapian::termcount rcollection_freq
;
57 /// The number of documents from the RSet indexed by the current term (r).
58 Xapian::doccount rtermfreq
;
60 /// The multiplier to be used in TradWeight query expansion.
63 /// Keeps track of the index of the sub-database we're accumulating for.
66 /// Constructor for expansion schemes which do not require the "expand_k"
/// parameter.
///
/// Stores @a avlen_ in avlen and initialises expand_k and every accumulated
/// statistic (dbsize, termfreq, rcollection_freq, rtermfreq, multiplier,
/// db_index) to zero.
///
/// @param avlen_  Average document length, copied into avlen.
68 explicit ExpandStats(Xapian::doclength avlen_
)
69 : avlen(avlen_
), expand_k(0), dbsize(0), termfreq(0),
70 rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
73 /// Constructor for expansion schemes which require the "expand_k" parameter.
///
/// Same initialisation as the single-argument constructor, except that
/// expand_k is set from @a expand_k_ instead of zero.
///
/// @param avlen_     Average document length, copied into avlen.
/// @param expand_k_  Value stored in expand_k, which accumulate() uses when
///                   updating multiplier.
74 ExpandStats(Xapian::doclength avlen_
, double expand_k_
)
75 : avlen(avlen_
), expand_k(expand_k_
), dbsize(0), termfreq(0),
76 rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
/// Fold one occurrence of the current term into the running statistics.
///
/// The statements visible here: treat a wdf of 0 as 1, add wdf to
/// rcollection_freq, add
///   (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf)
/// to multiplier, and mark db_index as seen in dbs_seen (growing the vector
/// first if db_index is past its end).
///
/// @param wdf        wdf value for the term; 0 is replaced by 1.
/// @param doclen     Document length, used relative to avlen in the
///                   multiplier update.
/// @param subtf      Not referenced in the statements visible here.
///                   NOTE(review): presumably added to termfreq the first
///                   time a sub-database is seen - confirm.
/// @param subdbsize  Not referenced in the statements visible here.
///                   NOTE(review): presumably added to dbsize the first
///                   time a sub-database is seen - confirm.
80 void accumulate(Xapian::termcount wdf
, Xapian::termcount doclen
,
81 Xapian::doccount subtf
, Xapian::doccount subdbsize
)
83 // Boolean terms may have wdf == 0, but treat that as 1 so such terms
84 // get a non-zero weight.
85 if (wdf
== 0) wdf
= 1;
87 rcollection_freq
+= wdf
;
// With expand_k == 0 (as set by the single-argument constructor) the
// expression below reduces to wdf / wdf.
// NOTE(review): this divides by avlen; nothing visible here guards against
// avlen == 0.
89 multiplier
+= (expand_k
+ 1) * wdf
/ (expand_k
* doclen
/ avlen
+ wdf
);
91 // If we've not seen this sub-database before, then update dbsize and
92 // termfreq and note that we have seen it.
93 if (db_index
>= dbs_seen
.size() || !dbs_seen
[db_index
]) {
// Grow dbs_seen so that db_index is a valid index before setting the flag.
94 if (db_index
>= dbs_seen
.size()) dbs_seen
.resize(db_index
+ 1);
95 dbs_seen
[db_index
] = true;
101 /* Clear the statistics collected in the ExpandStats object before using it
108 rcollection_freq
= 0;
115 /// Class for calculating probabilistic ESet term weights.
117 /// The combined database.
118 const Xapian::Database db
;
120 /// The number of documents in the whole database.
121 Xapian::doccount dbsize
;
123 /// Average document length in the whole database.
124 Xapian::doclength avlen
;
126 /// The number of documents in the RSet.
127 Xapian::doccount rsize
;
129 /// The collection frequency of the term.
130 Xapian::termcount collection_freq
;
132 /// The total length of the database.
133 Xapian::totallength collection_len
;
135 /** Should we calculate the exact term frequency when generating an ESet?
137 * This only has any effect if we're using a combined database.
139 * If this member is true, the exact term frequency will be obtained from
140 * the Database object. If this member is false, then an approximation is
141 * used to estimate the term frequency based on the term frequencies in
142 * the sub-databases which we see while collating term statistics, and the
143 * relative sizes of the sub-databases.
145 bool use_exact_termfreq
;
150 * @param db_ The database.
151 * @param rsize_ The number of documents in the RSet.
152 * @param use_exact_termfreq_ When expanding over a combined database,
153 * should we use the exact termfreq (if false
154 * a cheaper approximation is used).
// Constructor for schemes that take no expand_k parameter: stats is built
// from avlen alone.
//
// The initialisers for dbsize and avlen query the db member (not db_), and
// collection_len is computed from the avlen and dbsize members, so this
// relies on db, dbsize and avlen being declared before the members that
// read them. collection_freq starts at 0.
//
// NOTE(review): the "+ .5" in collection_len presumably rounds to nearest
// when the product is converted to Xapian::totallength - confirm that type
// is integral.
156 ExpandWeight(const Xapian::Database
&db_
,
157 Xapian::doccount rsize_
,
158 bool use_exact_termfreq_
)
159 : db(db_
), dbsize(db
.get_doccount()), avlen(db
.get_avlength()),
160 rsize(rsize_
), collection_freq(0),
161 collection_len(avlen
* dbsize
+ .5),
162 use_exact_termfreq(use_exact_termfreq_
), stats(avlen
) {}
166 * @param db_ The database.
167 * @param rsize_ The number of documents in the RSet.
168 * @param use_exact_termfreq_ When expanding over a combined database,
169 * should we use the exact termfreq (if false
170 * a cheaper approximation is used).
171 * @param expand_k_ The parameter for TradWeight query expansion.
// Constructor for schemes that need expand_k: initialises the members the
// same way as the three-argument constructor, but builds stats from both
// avlen and expand_k_.
//
// The initialisers for dbsize and avlen query the db member (not db_), and
// collection_len is computed from the avlen and dbsize members, so this
// relies on db, dbsize and avlen being declared before the members that
// read them.
//
// NOTE(review): expand_k_ is documented above and forwarded to stats below,
// but no declaration of it appears in the parameter list as shown here
// (the list ends with a trailing comma) - check the signature.
173 ExpandWeight(const Xapian::Database
&db_
,
174 Xapian::doccount rsize_
,
175 bool use_exact_termfreq_
,
177 : db(db_
), dbsize(db
.get_doccount()), avlen(db
.get_avlength()),
178 rsize(rsize_
), collection_freq(0),
179 collection_len(avlen
* dbsize
+ .5),
180 use_exact_termfreq(use_exact_termfreq_
), stats(avlen
, expand_k_
) {}
182 /** Get the term statistics.
183 * @param merger The tree of TermList objects.
184 * @param term The current term name.
186 void collect_stats(TermList
* merger
, const std::string
& term
);
188 /// Calculate the weight.
189 virtual double get_weight() const = 0;
192 /// An ExpandStats object to accumulate statistics.
195 /// Return the average document length of the database.
///
/// @return The avlen member (declared as Xapian::doclength), returned as a
///         double; the constructors set it from db.get_avlength().
196 double get_avlen() const { return avlen
; }
198 /// Return the number of documents in the RSet.
///
/// @return The rsize member, which the constructors set from rsize_.
199 Xapian::doccount
get_rsize() const { return rsize
; }
201 /// Return the collection frequency of the term.
///
/// @return The collection_freq member. The constructors initialise it to 0.
///         NOTE(review): presumably collect_stats() updates it per term -
///         its definition is not visible here, so confirm.
202 Xapian::termcount
get_collection_freq() const { return collection_freq
; }
204 /// Return the length of the collection.
///
/// @return The collection_len member, which the constructors compute as
///         avlen * dbsize + .5.
205 Xapian::totallength
get_collection_len() const { return collection_len
; }
207 /// Return the size of the database.
///
/// @return The dbsize member, i.e. the document count the constructors read
///         from db.get_doccount().
208 Xapian::doccount
get_dbsize() const { return dbsize
; }
211 /** This class implements the TradWeight scheme for query expansion.
213 * It is the default scheme for query expansion.
215 class TradEWeight
: public ExpandWeight
{
219 * @param db_ The database.
220 * @param rsize_ The number of documents in the RSet.
221 * @param use_exact_termfreq_ When expanding over a combined database,
222 * should we use the exact termfreq (if false
223 * a cheaper approximation is used).
224 * @param expand_k_ The parameter for TradWeight query expansion.
226 * All the parameters are passed to the parent ExpandWeight object.
// Constructor: forwards db_, rsize_, use_exact_termfreq_ and expand_k_ to
// the ExpandWeight constructor unchanged; the body is empty.
// NOTE(review): expand_k_ is forwarded below, but its declaration does not
// appear in the parameter list as shown here (the list ends with a trailing
// comma) - check the signature.
228 TradEWeight(const Xapian::Database
&db_
,
229 Xapian::doccount rsize_
,
230 bool use_exact_termfreq_
,
232 : ExpandWeight(db_
, rsize_
, use_exact_termfreq_
, expand_k_
) { }
// Implements the pure virtual ExpandWeight::get_weight(). Only declared
// here; the definition is not in this header.
234 double get_weight() const;
237 /** This class implements the Bo1 scheme for query expansion.
239 * Bo1 is a representative scheme of the Divergence from Randomness Framework
242 * This is a parameter free weighting scheme for query expansion and it uses
243 * the Bose-Einstein probabilistic distribution.
245 * For more information about the DFR Framework and the Bo1 scheme, please
246 * refer to Gianni Amati's PhD thesis.
248 class Bo1EWeight
: public ExpandWeight
{
252 * @param db_ The database.
253 * @param rsize_ The number of documents in the RSet.
254 * @param use_exact_termfreq_ When expanding over a combined database,
255 * should we use the exact termfreq (if false
256 * a cheaper approximation is used).
258 * All the parameters are passed to the parent ExpandWeight object.
// Constructor: forwards db_, rsize_ and use_exact_termfreq_ to the
// three-argument ExpandWeight constructor (the one that takes no expand_k);
// the body is empty.
260 Bo1EWeight(const Xapian::Database
&db_
,
261 Xapian::doccount rsize_
,
262 bool use_exact_termfreq_
)
263 : ExpandWeight(db_
, rsize_
, use_exact_termfreq_
) {}
// Implements the pure virtual ExpandWeight::get_weight(). Only declared
// here; the definition is not in this header.
265 double get_weight() const;
271 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H