Fix whitespace irregularities in code
[xapian.git] / xapian-core / weight / weightinternal.h
blob2c62a396a3891e25491ae5d2c8ac8ab898a4bd9f
/** @file weightinternal.h
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/query.h"
30 #include "backends/database.h"
31 #include "internaltypes.h"
32 #include "omassert.h"
34 #include <map>
35 #include <string>
37 /// The frequencies for a term.
38 struct TermFreqs {
39 Xapian::doccount termfreq;
40 Xapian::doccount reltermfreq;
41 Xapian::termcount collfreq;
42 double max_part;
44 TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
45 TermFreqs(Xapian::doccount termfreq_,
46 Xapian::doccount reltermfreq_,
47 Xapian::termcount collfreq_,
48 double max_part_ = 0.0)
49 : termfreq(termfreq_),
50 reltermfreq(reltermfreq_),
51 collfreq(collfreq_),
52 max_part(max_part_) {}
54 void operator+=(const TermFreqs & other) {
55 termfreq += other.termfreq;
56 reltermfreq += other.reltermfreq;
57 collfreq += other.collfreq;
58 max_part += other.max_part;
61 /// Return a std::string describing this object.
62 std::string get_description() const;
65 namespace Xapian {
67 class RSet;
69 /** Class to hold statistics for a given collection. */
70 class Weight::Internal {
71 #ifdef XAPIAN_ASSERTIONS
72 /** Number of sub-databases. */
73 size_t subdbs;
75 /** True if we've finalised the stats.
77 * Used for assertions.
79 mutable bool finalised;
80 #endif
82 public:
83 /** Total length of all documents in the collection. */
84 totlen_t total_length;
86 /** Number of documents in the collection. */
87 Xapian::doccount collection_size;
89 /** Number of relevant documents in the collection. */
90 Xapian::doccount rset_size;
92 /** Number of terms in the collection. */
93 Xapian::termcount total_term_count;
95 /** Has max_part been set for any term?
97 * If not, we can avoid having to serialise max_part.
99 bool have_max_part;
101 /** Database to get the bounds on doclength and wdf from. */
102 Xapian::Database db;
104 /** The query. */
105 Xapian::Query query;
107 /** Map of term frequencies and relevant term frequencies for the
108 * collection. */
109 std::map<std::string, TermFreqs> termfreqs;
111 Internal()
113 #ifdef XAPIAN_ASSERTIONS
114 subdbs(0), finalised(false),
115 #endif
116 total_length(0), collection_size(0), rset_size(0),
117 total_term_count(0), have_max_part(false) { }
119 /** Add in the supplied statistics from a sub-database.
121 * Used for remote databases, where we pass across a serialised stats
122 * object, unserialise it, and add it to our total.
124 Internal & operator+=(const Internal & inc);
126 void set_query(const Xapian::Query &query_) {
127 AssertEq(subdbs, 0);
128 query = query_;
131 /// Accumulate the rtermfreqs for terms in the query.
132 void accumulate_stats(const Xapian::Database::Internal &sub_db,
133 const Xapian::RSet &rset);
135 /** Get the frequencies for the given term.
137 * termfreq is "n_t", the number of documents in the collection indexed by
138 * the given term.
140 * reltermfreq is "r_t", the number of relevant documents in the
141 * collection indexed by the given term.
143 * collfreq is the total number of occurrences of the term in all
144 * documents.
146 bool get_stats(const std::string & term,
147 Xapian::doccount & termfreq,
148 Xapian::doccount & reltermfreq,
149 Xapian::termcount & collfreq) const {
150 #ifdef XAPIAN_ASSERTIONS
151 finalised = true;
152 #endif
153 // We pass an empty std::string for term when calculating the extra
154 // weight.
155 if (term.empty()) {
156 termfreq = collection_size;
157 collfreq = collection_size;
158 reltermfreq = rset_size;
159 return true;
162 auto i = termfreqs.find(term);
163 if (i == termfreqs.end()) {
164 termfreq = reltermfreq = collfreq = 0;
165 return false;
168 termfreq = i->second.termfreq;
169 reltermfreq = i->second.reltermfreq;
170 collfreq = i->second.collfreq;
171 return true;
174 /// Get just the termfreq.
175 bool get_stats(const std::string & term,
176 Xapian::doccount & termfreq) const {
177 Xapian::doccount dummy1;
178 Xapian::termcount dummy2;
179 return get_stats(term, termfreq, dummy1, dummy2);
182 /// Get the termweight.
183 bool get_termweight(const std::string & term, double & termweight) const {
184 #ifdef XAPIAN_ASSERTIONS
185 finalised = true;
186 #endif
187 termweight = 0.0;
188 if (term.empty()) {
189 return false;
192 auto i = termfreqs.find(term);
193 if (i == termfreqs.end()) {
194 return false;
197 termweight = i->second.max_part;
198 return true;
201 /** Get the minimum and maximum termweights.
203 * Used by the snippet code.
205 void get_max_termweight(double & min_tw, double & max_tw) {
206 auto i = termfreqs.begin();
207 while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
208 if (rare(i == termfreqs.end())) {
209 min_tw = max_tw = 0.0;
210 return;
212 min_tw = max_tw = i->second.max_part;
213 while (++i != termfreqs.end()) {
214 double max_part = i->second.max_part;
215 if (max_part > max_tw) {
216 max_tw = max_part;
217 } else if (max_part < min_tw && max_part != 0.0) {
218 min_tw = max_part;
223 /// Set max_part for a term.
224 void set_max_part(const std::string & term, double max_part) {
225 have_max_part = true;
226 Assert(!term.empty());
227 auto i = termfreqs.find(term);
228 if (i != termfreqs.end())
229 i->second.max_part += max_part;
232 Xapian::doclength get_average_length() const {
233 #ifdef XAPIAN_ASSERTIONS
234 finalised = true;
235 #endif
236 if (rare(collection_size == 0)) return 0;
237 return Xapian::doclength(total_length) / collection_size;
240 /** Set the "bounds" stats from Database @a db. */
241 void set_bounds_from_db(const Xapian::Database &db_) {
242 Assert(!finalised);
243 db = db_;
246 /// Return a std::string describing this object.
247 std::string get_description() const;
252 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H