1 /** @file weightinternal.h
2 * @brief Xapian::Weight::Internal class, holding database and term statistics.
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5 * Copyright (C) 2009,2010,2011,2013,2014,2015 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/query.h"
30 #include "backends/database.h"
31 #include "internaltypes.h"
37 /// The frequencies for a term.
// NOTE(review): the embedded listing numbers jump from 37 to 39, so the
// header line of the enclosing type is not visible in this listing.
// Number of documents in the collection indexed by the term (n_t).
39 Xapian::doccount termfreq
;
// Number of relevant documents in the collection indexed by the term (r_t).
40 Xapian::doccount reltermfreq
;
// Total number of occurrences of the term (collection frequency).
41 Xapian::termcount collfreq
;
// Default constructor: all three frequencies start at 0 and max_part at 0.0.
44 TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
// Construct from explicit values. max_part_ may be omitted, in which case it
// defaults to 0.0.
45 TermFreqs(Xapian::doccount termfreq_
,
46 Xapian::doccount reltermfreq_
,
47 Xapian::termcount collfreq_
,
48 double max_part_
= 0.0)
49 : termfreq(termfreq_
),
50 reltermfreq(reltermfreq_
),
// NOTE(review): no initialiser copying collfreq_ into collfreq is visible
// here (the embedded numbering skips 51) - confirm against the original file.
52 max_part(max_part_
) {}
// Accumulate the statistics held in other into this object, member by
// member. Returns nothing, so calls cannot be chained.
54 void operator+=(const TermFreqs
& other
) {
55 termfreq
+= other
.termfreq
;
56 reltermfreq
+= other
.reltermfreq
;
57 collfreq
+= other
.collfreq
;
// max_part is summed in the same way as the integer counters.
58 max_part
+= other
.max_part
;
// NOTE(review): the closing brace of this function is not visible in this
// listing.
61 /// Return a std::string describing this object.
// Declaration only: const member taking no arguments. The definition is not
// part of this listing.
62 std::string
get_description() const;
// NOTE(review): this listing appears to have dropped lines. Members that are
// referenced further down (subdbs, have_max_part, and the database handle
// described below) have no visible declaration here.
69 /** Class to hold statistics for a given collection. */
70 class Weight::Internal
{
// The members in this first group are only compiled when XAPIAN_ASSERTIONS
// is defined. NOTE(review): the matching #endif is not visible.
71 #ifdef XAPIAN_ASSERTIONS
72 /** Number of sub-databases. */
75 /** True if we've finalised the stats.
77 * Used for assertions.
// Declared mutable, so it can be changed through a const object.
79 mutable bool finalised
;
83 /** Total length of all documents in the collection. */
84 totlen_t total_length
;
86 /** Number of documents in the collection. */
87 Xapian::doccount collection_size
;
89 /** Number of relevant documents in the collection. */
90 Xapian::doccount rset_size
;
92 /** Number of terms in the collection. */
93 Xapian::termcount total_term_count
;
95 /** Has max_part been set for any term?
97 * If not, we can avoid having to serialise max_part.
101 /** Database to get the bounds on doclength and wdf from. */
107 /** Map of term frequencies and relevant term frequencies for the
// Keyed by the term string; each mapped value is a TermFreqs record.
109 std::map
<std::string
, TermFreqs
> termfreqs
;
// Member initialiser list that zeroes every counter and clears both flags.
// NOTE(review): the line that introduces this constructor is not visible in
// this listing; the trailing empty braces show the constructor body is empty.
// subdbs and finalised are initialised only when XAPIAN_ASSERTIONS is
// defined. NOTE(review): the matching #endif is not visible.
113 #ifdef XAPIAN_ASSERTIONS
114 subdbs(0), finalised(false),
116 total_length(0), collection_size(0), rset_size(0),
117 total_term_count(0), have_max_part(false) { }
119 /** Add in the supplied statistics from a sub-database.
121 * Used for remote databases, where we pass across a serialised stats
122 * object, unserialise it, and add it to our total.
// Declaration only; the definition is not part of this listing. inc is taken
// by const reference and the result is a reference to Internal (presumably
// this object, allowing chaining - confirm against the definition).
124 Internal
& operator+=(const Internal
& inc
);
// Takes the query by const reference.
// NOTE(review): the body of this function is not visible in this listing
// (the embedded numbering jumps from 126 to 131). Presumably it records
// query_ in a member - confirm against the original file.
126 void set_query(const Xapian::Query
&query_
) {
131 /// Accumulate the rtermfreqs for terms in the query.
// Declaration only; the definition is not part of this listing. Both the
// sub-database and the relevance set are taken by const reference.
132 void accumulate_stats(const Xapian::Database::Internal
&sub_db
,
133 const Xapian::RSet
&rset
);
135 /** Get the frequencies for the given term.
137 * termfreq is "n_t", the number of documents in the collection indexed by
140 * reltermfreq is "r_t", the number of relevant documents in the
141 * collection indexed by the given term.
143 * collfreq is the total number of occurrences of the term in all
// The three statistics are written through the reference parameters; the
// bool result is separate from them.
// NOTE(review): none of the return statements are visible in this listing,
// so the meaning of the bool result cannot be confirmed from here.
146 bool get_stats(const std::string
& term
,
147 Xapian::doccount
& termfreq
,
148 Xapian::doccount
& reltermfreq
,
149 Xapian::termcount
& collfreq
) const {
// NOTE(review): the statement guarded by this #ifdef and its #endif are not
// visible in this listing.
150 #ifdef XAPIAN_ASSERTIONS
153 // We pass an empty std::string for term when calculating the extra
// Collection-wide values: termfreq and collfreq both receive
// collection_size, and reltermfreq receives rset_size.
// NOTE(review): the condition guarding these assignments is not visible; the
// comment above suggests it is the empty-term case - confirm.
156 termfreq
= collection_size
;
157 collfreq
= collection_size
;
158 reltermfreq
= rset_size
;
// Look the term up in the per-term map.
162 auto i
= termfreqs
.find(term
);
163 if (i
== termfreqs
.end()) {
// Term not present in the map: all three outputs are set to zero.
164 termfreq
= reltermfreq
= collfreq
= 0;
// Term present: copy the stored statistics into the outputs.
168 termfreq
= i
->second
.termfreq
;
169 reltermfreq
= i
->second
.reltermfreq
;
170 collfreq
= i
->second
.collfreq
;
174 /// Get just the termfreq.
// Convenience overload: forwards to the overload with three output
// parameters and returns its result unchanged.
175 bool get_stats(const std::string
& term
,
176 Xapian::doccount
& termfreq
) const {
// Throwaway locals that receive the two statistics the caller did not ask
// for; their values are discarded.
177 Xapian::doccount dummy1
;
178 Xapian::termcount dummy2
;
179 return get_stats(term
, termfreq
, dummy1
, dummy2
);
// NOTE(review): the closing brace of this function is not visible in this
// listing.
182 /// Get the termweight.
// The weight is written through the termweight reference parameter; the
// bool result is separate from it.
// NOTE(review): the return statements are not visible in this listing.
183 bool get_termweight(const std::string
& term
, double & termweight
) const {
// NOTE(review): the statements guarded by this #ifdef and its #endif are not
// visible in this listing (the embedded numbering jumps from 184 to 192).
184 #ifdef XAPIAN_ASSERTIONS
// Look the term up in the per-term map.
192 auto i
= termfreqs
.find(term
);
193 if (i
== termfreqs
.end()) {
// NOTE(review): the handling of a missing term is not visible here.
// Term present: the reported weight is the max_part stored for it.
197 termweight
= i
->second
.max_part
;
201 /** Get the minimum and maximum termweights.
203 * Used by the snippet code.
// Both results are written through the reference parameters. Entries whose
// max_part is exactly 0.0 are skipped when choosing the starting value and
// are excluded from the minimum test.
205 void get_max_termweight(double & min_tw
, double & max_tw
) {
// Skip leading entries whose max_part is zero.
206 auto i
= termfreqs
.begin();
207 while (i
!= termfreqs
.end() && i
->second
.max_part
== 0.0) ++i
;
// No entry with a non-zero max_part: report 0.0 for both bounds.
208 if (rare(i
== termfreqs
.end())) {
209 min_tw
= max_tw
= 0.0;
// Seed both bounds from the first non-zero entry, then scan the remainder.
212 min_tw
= max_tw
= i
->second
.max_part
;
213 while (++i
!= termfreqs
.end()) {
// This local has the same name as the TermFreqs member it is copied from.
214 double max_part
= i
->second
.max_part
;
// NOTE(review): the assignments inside the two branches below are not
// visible in this listing; presumably they raise max_tw and lower min_tw
// respectively - confirm against the original file.
215 if (max_part
> max_tw
) {
// A zero max_part never counts as a candidate for the minimum.
217 } else if (max_part
< min_tw
&& max_part
!= 0.0) {
223 /// Set max_part for a term.
// Despite the name, the supplied value is added onto the stored max_part
// with += rather than replacing it.
224 void set_max_part(const std::string
& term
, double max_part
) {
// The flag is raised unconditionally, even when the term turns out not to be
// present in termfreqs.
225 have_max_part
= true;
// An empty term is treated as a programming error.
226 Assert(!term
.empty());
// A term that is not already in the map is silently ignored; no new entry is
// created.
227 auto i
= termfreqs
.find(term
);
228 if (i
!= termfreqs
.end())
229 i
->second
.max_part
+= max_part
;
// NOTE(review): the closing brace of this function is not visible in this
// listing.
// Average document length: total_length divided by collection_size.
232 Xapian::doclength
get_average_length() const {
// NOTE(review): the statement guarded by this #ifdef and its #endif are not
// visible in this listing.
233 #ifdef XAPIAN_ASSERTIONS
// An empty collection yields 0, which also avoids dividing by zero.
236 if (rare(collection_size
== 0)) return 0;
// total_length is converted to Xapian::doclength before the division.
237 return Xapian::doclength(total_length
) / collection_size
;
240 /** Set the "bounds" stats from Database @a db. */
// NOTE(review): the doc comment names the parameter db but the signature
// calls it db_. The body of this function is not visible in this listing
// (the embedded numbering jumps from 241 to 246).
241 void set_bounds_from_db(const Xapian::Database
&db_
) {
246 /// Return a std::string describing this object.
// Declaration only: const member taking no arguments. The definition is not
// part of this listing.
247 std::string
get_description() const;
252 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H