Add Weight::create() and Weight::create_from_parameters()
[xapian.git] / xapian-core / weight / weightinternal.h
blob4195998abaec48c5b43365eb1c0d1b10ed546f54
/** @file weightinternal.h
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/error.h"
29 #include "xapian/query.h"
31 #include "backends/database.h"
32 #include "internaltypes.h"
33 #include "omassert.h"
35 #include <cerrno>
36 #include <cstdlib>
37 #include <map>
38 #include <string>
40 /// The frequencies for a term.
41 struct TermFreqs {
42 Xapian::doccount termfreq;
43 Xapian::doccount reltermfreq;
44 Xapian::termcount collfreq;
45 double max_part;
47 TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
48 TermFreqs(Xapian::doccount termfreq_,
49 Xapian::doccount reltermfreq_,
50 Xapian::termcount collfreq_,
51 double max_part_ = 0.0)
52 : termfreq(termfreq_),
53 reltermfreq(reltermfreq_),
54 collfreq(collfreq_),
55 max_part(max_part_) {}
57 void operator+=(const TermFreqs & other) {
58 termfreq += other.termfreq;
59 reltermfreq += other.reltermfreq;
60 collfreq += other.collfreq;
61 max_part += other.max_part;
64 /// Return a std::string describing this object.
65 std::string get_description() const;
68 namespace Xapian {
70 class RSet;
72 /** Class to hold statistics for a given collection. */
73 class Weight::Internal {
74 #ifdef XAPIAN_ASSERTIONS
75 /** Number of sub-databases. */
76 size_t subdbs;
78 /** True if we've finalised the stats.
80 * Used for assertions.
82 mutable bool finalised;
83 #endif
85 public:
86 /** Total length of all documents in the collection. */
87 totlen_t total_length;
89 /** Number of documents in the collection. */
90 Xapian::doccount collection_size;
92 /** Number of relevant documents in the collection. */
93 Xapian::doccount rset_size;
95 /** Number of terms in the collection. */
96 Xapian::termcount total_term_count;
98 /** Has max_part been set for any term?
100 * If not, we can avoid having to serialise max_part.
102 bool have_max_part;
104 /** Database to get the bounds on doclength and wdf from. */
105 Xapian::Database db;
107 /** The query. */
108 Xapian::Query query;
110 /** Map of term frequencies and relevant term frequencies for the
111 * collection. */
112 std::map<std::string, TermFreqs> termfreqs;
114 Internal()
116 #ifdef XAPIAN_ASSERTIONS
117 subdbs(0), finalised(false),
118 #endif
119 total_length(0), collection_size(0), rset_size(0),
120 total_term_count(0), have_max_part(false) { }
122 /** Add in the supplied statistics from a sub-database.
124 * Used for remote databases, where we pass across a serialised stats
125 * object, unserialise it, and add it to our total.
127 Internal & operator+=(const Internal & inc);
129 void set_query(const Xapian::Query &query_) {
130 AssertEq(subdbs, 0);
131 query = query_;
134 /// Accumulate the rtermfreqs for terms in the query.
135 void accumulate_stats(const Xapian::Database::Internal &sub_db,
136 const Xapian::RSet &rset);
138 /** Get the frequencies for the given term.
140 * termfreq is "n_t", the number of documents in the collection indexed by
141 * the given term.
143 * reltermfreq is "r_t", the number of relevant documents in the
144 * collection indexed by the given term.
146 * collfreq is the total number of occurrences of the term in all
147 * documents.
149 bool get_stats(const std::string & term,
150 Xapian::doccount & termfreq,
151 Xapian::doccount & reltermfreq,
152 Xapian::termcount & collfreq) const {
153 #ifdef XAPIAN_ASSERTIONS
154 finalised = true;
155 #endif
156 // We pass an empty std::string for term when calculating the extra
157 // weight.
158 if (term.empty()) {
159 termfreq = collection_size;
160 collfreq = collection_size;
161 reltermfreq = rset_size;
162 return true;
165 auto i = termfreqs.find(term);
166 if (i == termfreqs.end()) {
167 termfreq = reltermfreq = collfreq = 0;
168 return false;
171 termfreq = i->second.termfreq;
172 reltermfreq = i->second.reltermfreq;
173 collfreq = i->second.collfreq;
174 return true;
177 /// Get just the termfreq.
178 bool get_stats(const std::string & term,
179 Xapian::doccount & termfreq) const {
180 Xapian::doccount dummy1;
181 Xapian::termcount dummy2;
182 return get_stats(term, termfreq, dummy1, dummy2);
185 /// Get the termweight.
186 bool get_termweight(const std::string & term, double & termweight) const {
187 #ifdef XAPIAN_ASSERTIONS
188 finalised = true;
189 #endif
190 termweight = 0.0;
191 if (term.empty()) {
192 return false;
195 auto i = termfreqs.find(term);
196 if (i == termfreqs.end()) {
197 return false;
200 termweight = i->second.max_part;
201 return true;
204 /** Get the minimum and maximum termweights.
206 * Used by the snippet code.
208 void get_max_termweight(double & min_tw, double & max_tw) {
209 auto i = termfreqs.begin();
210 while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
211 if (rare(i == termfreqs.end())) {
212 min_tw = max_tw = 0.0;
213 return;
215 min_tw = max_tw = i->second.max_part;
216 while (++i != termfreqs.end()) {
217 double max_part = i->second.max_part;
218 if (max_part > max_tw) {
219 max_tw = max_part;
220 } else if (max_part < min_tw && max_part != 0.0) {
221 min_tw = max_part;
226 /// Set max_part for a term.
227 void set_max_part(const std::string & term, double max_part) {
228 have_max_part = true;
229 Assert(!term.empty());
230 auto i = termfreqs.find(term);
231 if (i != termfreqs.end())
232 i->second.max_part += max_part;
235 Xapian::doclength get_average_length() const {
236 #ifdef XAPIAN_ASSERTIONS
237 finalised = true;
238 #endif
239 if (rare(collection_size == 0)) return 0;
240 return Xapian::doclength(total_length) / collection_size;
243 /** Set the "bounds" stats from Database @a db. */
244 void set_bounds_from_db(const Xapian::Database &db_) {
245 Assert(!finalised);
246 db = db_;
249 /// Return a std::string describing this object.
250 std::string get_description() const;
252 static bool double_param(const char ** p, double * ptr_val) {
253 char *end;
254 errno = 0;
255 double v = strtod(*p, &end);
256 if (*p == end || errno) return false;
257 *p = end;
258 *ptr_val = v;
259 return true;
262 static void parameter_error(const char * msg,
263 const std::string & scheme) {
264 string m(msg);
265 m += ": '";
266 m += scheme;
267 m += "'";
268 throw InvalidArgumentError(m);
274 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H