/** @file weightinternal.h
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/error.h"
29 #include "xapian/query.h"
31 #include "backends/databaseinternal.h"
32 #include "internaltypes.h"
40 /// The frequencies for a term.
42 Xapian::doccount termfreq
;
43 Xapian::doccount reltermfreq
;
44 Xapian::termcount collfreq
;
47 TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
48 TermFreqs(Xapian::doccount termfreq_
,
49 Xapian::doccount reltermfreq_
,
50 Xapian::termcount collfreq_
,
51 double max_part_
= 0.0)
52 : termfreq(termfreq_
),
53 reltermfreq(reltermfreq_
),
55 max_part(max_part_
) {}
57 void operator+=(const TermFreqs
& other
) {
58 termfreq
+= other
.termfreq
;
59 reltermfreq
+= other
.reltermfreq
;
60 collfreq
+= other
.collfreq
;
61 max_part
+= other
.max_part
;
64 /// Return a std::string describing this object.
65 std::string
get_description() const;
72 /** Class to hold statistics for a given collection. */
73 class Weight::Internal
{
74 #ifdef XAPIAN_ASSERTIONS
75 /** Number of sub-databases. */
78 /** True if we've finalised the stats.
80 * Used for assertions.
82 mutable bool finalised
;
86 /** Total length of all documents in the collection. */
87 Xapian::totallength total_length
;
89 /** Number of documents in the collection. */
90 Xapian::doccount collection_size
;
92 /** Number of relevant documents in the collection. */
93 Xapian::doccount rset_size
;
95 /** Number of terms in the collection. */
96 Xapian::termcount total_term_count
;
98 /** Has max_part been set for any term?
100 * If not, we can avoid having to serialise max_part.
104 /** Database to get the bounds on doclength and wdf from. */
110 /** Map of term frequencies and relevant term frequencies for the
112 std::map
<std::string
, TermFreqs
> termfreqs
;
116 #ifdef XAPIAN_ASSERTIONS
117 subdbs(0), finalised(false),
119 total_length(0), collection_size(0), rset_size(0),
120 total_term_count(0), have_max_part(false) { }
122 /** Add in the supplied statistics from a sub-database.
124 * Used for remote databases, where we pass across a serialised stats
125 * object, unserialise it, and add it to our total.
127 Internal
& operator+=(const Internal
& inc
);
129 void set_query(const Xapian::Query
&query_
) {
134 /// Accumulate the rtermfreqs for terms in the query.
135 void accumulate_stats(const Xapian::Database::Internal
&sub_db
,
136 const Xapian::RSet
&rset
);
138 /** Get the frequencies for the given term.
140 * termfreq is "n_t", the number of documents in the collection indexed by
143 * reltermfreq is "r_t", the number of relevant documents in the
144 * collection indexed by the given term.
146 * collfreq is the total number of occurrences of the term in all
149 bool get_stats(const std::string
& term
,
150 Xapian::doccount
& termfreq
,
151 Xapian::doccount
& reltermfreq
,
152 Xapian::termcount
& collfreq
) const {
153 #ifdef XAPIAN_ASSERTIONS
156 // We pass an empty std::string for term when calculating the extra
159 termfreq
= collection_size
;
160 collfreq
= collection_size
;
161 reltermfreq
= rset_size
;
165 auto i
= termfreqs
.find(term
);
166 if (i
== termfreqs
.end()) {
167 termfreq
= reltermfreq
= collfreq
= 0;
171 termfreq
= i
->second
.termfreq
;
172 reltermfreq
= i
->second
.reltermfreq
;
173 collfreq
= i
->second
.collfreq
;
177 /// Get just the termfreq.
178 bool get_stats(const std::string
& term
,
179 Xapian::doccount
& termfreq
) const {
180 Xapian::doccount dummy1
;
181 Xapian::termcount dummy2
;
182 return get_stats(term
, termfreq
, dummy1
, dummy2
);
185 /// Get the termweight.
186 bool get_termweight(const std::string
& term
, double & termweight
) const {
187 #ifdef XAPIAN_ASSERTIONS
195 auto i
= termfreqs
.find(term
);
196 if (i
== termfreqs
.end()) {
200 termweight
= i
->second
.max_part
;
204 /** Get the minimum and maximum termweights.
206 * Used by the snippet code.
208 void get_max_termweight(double & min_tw
, double & max_tw
) {
209 auto i
= termfreqs
.begin();
210 while (i
!= termfreqs
.end() && i
->second
.max_part
== 0.0) ++i
;
211 if (rare(i
== termfreqs
.end())) {
212 min_tw
= max_tw
= 0.0;
215 min_tw
= max_tw
= i
->second
.max_part
;
216 while (++i
!= termfreqs
.end()) {
217 double max_part
= i
->second
.max_part
;
218 if (max_part
> max_tw
) {
220 } else if (max_part
< min_tw
&& max_part
!= 0.0) {
226 /// Set max_part for a term.
227 void set_max_part(const std::string
& term
, double max_part
) {
228 have_max_part
= true;
229 Assert(!term
.empty());
230 auto i
= termfreqs
.find(term
);
231 if (i
!= termfreqs
.end())
232 i
->second
.max_part
+= max_part
;
235 Xapian::doclength
get_average_length() const {
236 #ifdef XAPIAN_ASSERTIONS
239 if (rare(collection_size
== 0)) return 0;
240 return Xapian::doclength(total_length
) / collection_size
;
243 /** Set the "bounds" stats from Database @a db. */
244 void set_bounds_from_db(const Xapian::Database
&db_
) {
249 /// Return a std::string describing this object.
250 std::string
get_description() const;
/** Parse a double from the start of the string at @a *p.
 *
 *  @param p        In/out: points to the text to parse.  On success it is
 *                  advanced past the characters strtod() consumed; on
 *                  failure it is left unchanged.
 *  @param ptr_val  Out: receives the parsed value on success; untouched on
 *                  failure.
 *
 *  @return  false if strtod() consumed no characters or set errno, true
 *           otherwise.
 */
static bool double_param(const char ** p, double * ptr_val) {
    // NOTE(review): the declaration of end, the errno reset and the success
    // path were absent from the text this was restored from; they are
    // reconstructed from the strtod() call and the check below - confirm.
    char * end;
    // Clear errno so the check below only sees errors from this strtod().
    errno = 0;
    double v = strtod(*p, &end);
    if (*p == end || errno) return false;
    *p = end;
    *ptr_val = v;
    return true;
}
262 static void parameter_error(const char * msg
,
263 const std::string
& scheme
) {
268 throw InvalidArgumentError(m
);
274 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H