Qualify std::size_t and include <cstddef> for it
[xapian.git] / xapian-core / tests / dbcheck.cc
blob46c974a3b2268c948f12f73a7e9d5fd2f5e12c37
/* dbcheck.cc: test database contents and consistency.
 *
 * Copyright 2009 Richard Boulton
 * Copyright 2010,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
22 #include <config.h>
24 #include "dbcheck.h"
26 #include "str.h"
27 #include "testsuite.h"
29 using namespace std;
31 string
32 positions_to_string(Xapian::PositionIterator & it,
33 const Xapian::PositionIterator & end,
34 Xapian::termcount * count)
36 string result;
37 bool need_comma = false;
38 Xapian::termcount c = 0;
39 while (it != end) {
40 if (need_comma)
41 result += ", ";
42 result += str(*it);
43 need_comma = true;
44 ++it;
45 ++c;
47 if (count) {
48 *count = c;
50 return result;
53 string
54 postlist_to_string(const Xapian::Database & db, const string & tname)
56 string result;
57 bool need_comma = false;
59 for (Xapian::PostingIterator p = db.postlist_begin(tname);
60 p != db.postlist_end(tname);
61 ++p) {
62 if (need_comma)
63 result += ", ";
65 Xapian::PositionIterator it(p.positionlist_begin());
66 string posrepr = positions_to_string(it, p.positionlist_end());
67 if (!posrepr.empty()) {
68 posrepr = ", pos=[" + posrepr + "]";
71 result += "(" + str(*p) +
72 ", doclen=" + str(p.get_doclength()) +
73 ", wdf=" + str(p.get_wdf()) +
74 posrepr + ")";
75 need_comma = true;
77 return result;
80 string
81 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
83 string result;
84 bool need_comma = false;
86 for (Xapian::TermIterator t = db.termlist_begin(did);
87 t != db.termlist_end(did);
88 ++t) {
89 Xapian::PositionIterator it(t.positionlist_begin());
90 string posrepr = positions_to_string(it, t.positionlist_end());
91 if (!posrepr.empty()) {
92 posrepr = ", pos=[" + posrepr + "]";
94 if (need_comma)
95 result += ", ";
96 result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
97 result += ")";
98 need_comma = true;
100 return result;
103 string
104 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
106 string result;
108 result += "len=" + str(db.get_doclength(did));
110 return result;
113 string
114 termstats_to_string(const Xapian::Database & db, const string & term)
116 string result;
118 result += "tf=" + str(db.get_termfreq(term));
119 result += ",cf=" + str(db.get_collection_freq(term));
121 return result;
124 void
125 dbcheck(const Xapian::Database & db,
126 Xapian::doccount expected_doccount,
127 Xapian::docid expected_lastdocid)
129 TEST_EQUAL(db.get_doccount(), expected_doccount);
130 TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
132 // Note - may not be a very big type, but we're only expecting to use this
133 // for small databases, so should be fine.
134 unsigned long totlen = 0;
136 // A map from term to a representation of the posting list for that term.
137 // We build this up from the documents, and then check it against the
138 // equivalent built up from the posting lists.
139 map<string, string> posting_reprs;
140 map<Xapian::valueno, string> value_reprs;
142 Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
143 Xapian::termcount doclen_upper_bound = 0;
145 for (Xapian::PostingIterator dociter = db.postlist_begin(string());
146 dociter != db.postlist_end(string());
147 ++dociter) {
148 Xapian::docid did = *dociter;
149 TEST_EQUAL(dociter.get_wdf(), 1);
150 Xapian::Document doc(db.get_document(did));
151 Xapian::termcount doclen(db.get_doclength(did));
152 Xapian::termcount unique_terms(db.get_unique_terms(did));
153 if (doclen < doclen_lower_bound)
154 doclen_lower_bound = doclen;
155 if (doclen > doclen_upper_bound)
156 doclen_upper_bound = doclen;
157 totlen += doclen;
159 Xapian::termcount found_termcount = 0;
160 Xapian::termcount found_unique_terms = 0;
161 Xapian::termcount wdf_sum = 0;
162 Xapian::TermIterator t, t2;
163 for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
164 t != doc.termlist_end();
165 ++t, ++t2) {
166 TEST(t2 != db.termlist_end(did));
168 ++found_termcount;
169 auto wdf = t.get_wdf();
170 if (wdf) ++found_unique_terms;
171 wdf_sum += wdf;
173 TEST_EQUAL(*t, *t2);
174 TEST_EQUAL(t.get_wdf(), t2.get_wdf());
175 TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
176 TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
178 // Check the position lists are equal.
179 Xapian::termcount tc1, tc2;
180 Xapian::PositionIterator it1(t.positionlist_begin());
181 string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
182 Xapian::PositionIterator it2(t2.positionlist_begin());
183 string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
184 TEST_EQUAL(posrepr, posrepr2);
185 TEST_EQUAL(tc1, tc2);
186 TEST_EQUAL(tc1, t.positionlist_count());
188 // Make a representation of the posting.
189 if (!posrepr.empty()) {
190 posrepr = ",[" + posrepr + "]";
192 string posting_repr = "(" + str(did) + "," +
193 str(t.get_wdf()) + "/" + str(doclen) +
194 posrepr + ")";
196 // Append the representation to the list for the term.
197 map<string, string>::iterator i = posting_reprs.find(*t);
198 if (i == posting_reprs.end()) {
199 posting_reprs[*t] = posting_repr;
200 } else {
201 i->second += "," + posting_repr;
205 Xapian::termcount vcount = 0;
206 for (Xapian::ValueIterator v = doc.values_begin();
207 v != doc.values_end();
208 ++v, ++vcount) {
209 TEST((*v).size() != 0);
210 string value_repr = "(" + str(did) + "," + *v + ")";
212 // Append the values to the value lists.
213 map<Xapian::valueno, string>::iterator i;
214 i = value_reprs.find(v.get_valueno());
215 if (i == value_reprs.end()) {
216 value_reprs[v.get_valueno()] = value_repr;
217 } else {
218 i->second += "," + value_repr;
221 TEST_EQUAL(vcount, doc.values_count());
222 TEST(t2 == db.termlist_end(did));
223 Xapian::termcount expected_termcount = doc.termlist_count();
224 TEST_EQUAL(expected_termcount, found_termcount);
225 // Ideally this would be equal, but currently we don't store the
226 // unique_terms values but calculate them, and scanning the termlist
227 // of each document would be slow, so instead get_unique_terms(did)
228 // returns min(doclen, termcount) at present.
229 TEST_REL(unique_terms, >=, found_unique_terms);
230 TEST_REL(unique_terms, <=, found_termcount);
231 TEST_REL(unique_terms, <=, doclen);
232 TEST_EQUAL(doclen, wdf_sum);
235 TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
236 TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
238 Xapian::TermIterator t;
239 map<string, string>::const_iterator i;
240 for (t = db.allterms_begin(), i = posting_reprs.begin();
241 t != db.allterms_end();
242 ++t, ++i) {
243 TEST(db.term_exists(*t));
244 TEST(i != posting_reprs.end());
245 TEST_EQUAL(i->first, *t);
247 Xapian::doccount tf_count = 0;
248 Xapian::termcount cf_count = 0;
249 Xapian::termcount wdf_upper_bound = 0;
250 string posting_repr;
251 bool need_comma = false;
252 for (Xapian::PostingIterator p = db.postlist_begin(*t);
253 p != db.postlist_end(*t);
254 ++p) {
255 if (need_comma) {
256 posting_repr += ",";
259 ++tf_count;
260 cf_count += p.get_wdf();
262 Xapian::PositionIterator it(p.positionlist_begin());
263 string posrepr = positions_to_string(it, p.positionlist_end());
264 if (!posrepr.empty()) {
265 posrepr = ",[" + posrepr + "]";
267 posting_repr += "(" + str(*p) + "," +
268 str(p.get_wdf()) + "/" +
269 str(p.get_doclength()) + posrepr + ")";
270 if (wdf_upper_bound < p.get_wdf())
271 wdf_upper_bound = p.get_wdf();
272 need_comma = true;
275 TEST_EQUAL(posting_repr, i->second);
276 TEST_EQUAL(tf_count, t.get_termfreq());
277 TEST_EQUAL(tf_count, db.get_termfreq(*t));
278 TEST_EQUAL(cf_count, db.get_collection_freq(*t));
279 TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
281 TEST(i == posting_reprs.end());
283 map<Xapian::valueno, string>::const_iterator j;
284 for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
285 string value_repr;
286 string value_lower_bound;
287 string value_upper_bound;
288 bool first = true;
289 for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
290 v != db.valuestream_end(j->first); ++v) {
291 if (first) {
292 value_lower_bound = *v;
293 value_upper_bound = *v;
294 first = false;
295 } else {
296 value_repr += ",";
297 if (*v > value_upper_bound) {
298 value_upper_bound = *v;
300 if (*v < value_lower_bound) {
301 value_lower_bound = *v;
304 value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
306 TEST_EQUAL(value_repr, j->second);
307 try {
308 TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
309 TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
310 } catch (const Xapian::UnimplementedError &) {
311 // Skip the checks if the methods to get the bounds aren't
312 // implemented for this backend.
316 if (expected_doccount == 0) {
317 TEST_EQUAL(0, db.get_avlength());
318 } else {
319 TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
320 db.get_avlength());