Fix termfreq used in weight calcs for repeated terms
[xapian.git] / xapian-core / weight / weightinternal.cc
bloba51ff8cdce71f56275953af1a4fc8c95b7fa5781
1 /** @file weightinternal.cc
2 * @brief Xapian::Weight::Internal class, holding database and term statistics.
3 */
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5 * Copyright (C) 2009,2010,2011,2012,2013,2014,2015,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "weightinternal.h"
26 #include "xapian/enquire.h"
28 #include "omassert.h"
29 #include "api/rsetinternal.h"
30 #include "str.h"
31 #include "api/termlist.h"
33 #include <memory>
34 #include <set>
36 using namespace std;
38 string
39 TermFreqs::get_description() const {
40 string desc("TermFreqs(termfreq=");
41 desc += str(termfreq);
42 desc += ", reltermfreq=";
43 desc += str(reltermfreq);
44 desc += ", collfreq=";
45 desc += str(collfreq);
46 desc += ", max_part=";
47 desc += str(max_part);
48 desc += ")";
49 return desc;
52 namespace Xapian {
54 Weight::Internal &
55 Weight::Internal::operator+=(const Weight::Internal & inc)
57 #ifdef XAPIAN_ASSERTIONS
58 Assert(!finalised);
59 subdbs += inc.subdbs;
60 #endif
61 total_length += inc.total_length;
62 collection_size += inc.collection_size;
63 rset_size += inc.rset_size;
64 total_term_count += inc.total_term_count;
66 // Add termfreqs and reltermfreqs
67 map<string, TermFreqs>::const_iterator i;
68 for (i = inc.termfreqs.begin(); i != inc.termfreqs.end(); ++i) {
69 termfreqs[i->first] += i->second;
71 return *this;
74 void
75 Weight::Internal::accumulate_stats(const Xapian::Database::Internal &subdb,
76 const Xapian::RSet &rset)
78 #ifdef XAPIAN_ASSERTIONS
79 Assert(!finalised);
80 ++subdbs;
81 #endif
82 total_length += subdb.get_total_length();
83 collection_size += subdb.get_doccount();
84 rset_size += rset.size();
86 total_term_count += subdb.get_doccount() * subdb.get_total_length();
87 Xapian::TermIterator t;
88 for (t = query.get_unique_terms_begin(); t != Xapian::TermIterator(); ++t) {
89 const string & term = *t;
91 Xapian::doccount sub_tf;
92 Xapian::termcount sub_cf;
93 subdb.get_freqs(term, &sub_tf, &sub_cf);
94 TermFreqs & tf = termfreqs[term];
95 tf.termfreq += sub_tf;
96 tf.collfreq += sub_cf;
99 if (!rset.internal.get())
100 return;
102 for (Xapian::docid did : rset.internal->docs) {
103 Assert(did);
104 // The query is likely to contain far fewer terms than the documents,
105 // and we can skip the document's termlist, so look for each query term
106 // in the document.
107 unique_ptr<TermList> tl(subdb.open_term_list(did));
108 map<string, TermFreqs>::iterator i;
109 for (i = termfreqs.begin(); i != termfreqs.end(); ++i) {
110 const string & term = i->first;
111 TermList * ret = tl->skip_to(term);
112 Assert(ret == NULL);
113 (void)ret;
114 if (tl->at_end())
115 break;
116 if (term == tl->get_termname())
117 ++i->second.reltermfreq;
122 string
123 Weight::Internal::get_description() const
125 string desc = "Weight::Internal(totlen=";
126 desc += str(total_length);
127 desc += ", collection_size=";
128 desc += str(collection_size);
129 desc += ", rset_size=";
130 desc += str(rset_size);
131 desc += ", total_term_count=";
132 desc += str(total_term_count);
133 #ifdef XAPIAN_ASSERTIONS
134 desc += ", subdbs=";
135 desc += str(subdbs);
136 desc += ", finalised=";
137 desc += str(finalised);
138 #endif
139 desc += ", termfreqs={";
140 map<string, TermFreqs>::const_iterator i;
141 for (i = termfreqs.begin(); i != termfreqs.end(); ++i) {
142 if (i != termfreqs.begin())
143 desc += ", ";
144 desc += i->first;
145 desc += " => ";
146 desc += i->second.get_description();
148 desc += "})";
149 return desc;