Document xapian-compact --blocksize takes an argument
[xapian.git] / xapian-core / tests / dbcheck.cc
blob20ce2c653fe00a17a497132af0a2e3f55d68a0b1
1 /* dbcheck.cc: test database contents and consistency.
3 * Copyright 2009 Richard Boulton
4 * Copyright 2010,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "dbcheck.h"
26 #include "str.h"
27 #include "testsuite.h"
29 using namespace std;
31 string
32 positions_to_string(Xapian::PositionIterator & it,
33 const Xapian::PositionIterator & end,
34 Xapian::termcount * count)
36 string result;
37 bool need_comma = false;
38 Xapian::termcount c = 0;
39 while (it != end) {
40 if (need_comma)
41 result += ", ";
42 result += str(*it);
43 need_comma = true;
44 ++it;
45 ++c;
47 if (count) {
48 *count = c;
50 return result;
53 string
54 postlist_to_string(const Xapian::Database & db, const string & tname)
56 string result;
57 bool need_comma = false;
59 for (Xapian::PostingIterator p = db.postlist_begin(tname);
60 p != db.postlist_end(tname);
61 ++p) {
62 if (need_comma)
63 result += ", ";
65 Xapian::PositionIterator it(p.positionlist_begin());
66 string posrepr = positions_to_string(it, p.positionlist_end());
67 if (!posrepr.empty()) {
68 posrepr = ", pos=[" + posrepr + "]";
71 result += "(" + str(*p) +
72 ", doclen=" + str(p.get_doclength()) +
73 ", wdf=" + str(p.get_wdf()) +
74 posrepr + ")";
75 need_comma = true;
77 return result;
80 string
81 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
83 string result;
84 bool need_comma = false;
86 for (Xapian::TermIterator t = db.termlist_begin(did);
87 t != db.termlist_end(did);
88 ++t) {
89 Xapian::PositionIterator it(t.positionlist_begin());
90 string posrepr = positions_to_string(it, t.positionlist_end());
91 if (!posrepr.empty()) {
92 posrepr = ", pos=[" + posrepr + "]";
94 if (need_comma)
95 result += ", ";
96 result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
97 result += ")";
98 need_comma = true;
100 return result;
103 string
104 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
106 string result;
108 result += "len=" + str(db.get_doclength(did));
110 return result;
113 string
114 termstats_to_string(const Xapian::Database & db, const string & term)
116 string result;
118 result += "tf=" + str(db.get_termfreq(term));
119 result += ",cf=" + str(db.get_collection_freq(term));
121 return result;
124 void
125 dbcheck(const Xapian::Database & db,
126 Xapian::doccount expected_doccount,
127 Xapian::docid expected_lastdocid)
129 TEST_EQUAL(db.get_doccount(), expected_doccount);
130 TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
132 // Note - may not be a very big type, but we're only expecting to use this
133 // for small databases, so should be fine.
134 unsigned long totlen = 0;
136 // A map from term to a representation of the posting list for that term.
137 // We build this up from the documents, and then check it against the
138 // equivalent built up from the posting lists.
139 map<string, string> posting_reprs;
140 map<Xapian::valueno, string> value_reprs;
142 Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
143 Xapian::termcount doclen_upper_bound = 0;
145 for (Xapian::PostingIterator dociter = db.postlist_begin(string());
146 dociter != db.postlist_end(string());
147 ++dociter) {
148 Xapian::docid did = *dociter;
149 TEST_EQUAL(dociter.get_wdf(), 1);
150 Xapian::Document doc(db.get_document(did));
151 Xapian::termcount doclen(db.get_doclength(did));
152 Xapian::termcount unique_terms(db.get_unique_terms(did));
153 if (doclen < doclen_lower_bound)
154 doclen_lower_bound = doclen;
155 if (doclen > doclen_upper_bound)
156 doclen_upper_bound = doclen;
157 totlen += doclen;
159 Xapian::termcount found_termcount = 0;
160 Xapian::termcount wdf_sum = 0;
161 Xapian::TermIterator t, t2;
162 for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
163 t != doc.termlist_end();
164 ++t, ++t2) {
165 TEST(t2 != db.termlist_end(did));
167 ++found_termcount;
168 wdf_sum += t.get_wdf();
170 TEST_EQUAL(*t, *t2);
171 TEST_EQUAL(t.get_wdf(), t2.get_wdf());
172 TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
173 TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
175 // Check the position lists are equal.
176 Xapian::termcount tc1, tc2;
177 Xapian::PositionIterator it1(t.positionlist_begin());
178 string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
179 Xapian::PositionIterator it2(t2.positionlist_begin());
180 string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
181 TEST_EQUAL(posrepr, posrepr2);
182 TEST_EQUAL(tc1, tc2);
183 try {
184 TEST_EQUAL(tc1, t.positionlist_count());
185 } catch (const Xapian::UnimplementedError &) {
186 // positionlist_count() isn't implemented for remote databases.
189 // Make a representation of the posting.
190 if (!posrepr.empty()) {
191 posrepr = ",[" + posrepr + "]";
193 string posting_repr = "(" + str(did) + "," +
194 str(t.get_wdf()) + "/" + str(doclen) +
195 posrepr + ")";
197 // Append the representation to the list for the term.
198 map<string, string>::iterator i = posting_reprs.find(*t);
199 if (i == posting_reprs.end()) {
200 posting_reprs[*t] = posting_repr;
201 } else {
202 i->second += "," + posting_repr;
206 Xapian::termcount vcount = 0;
207 for (Xapian::ValueIterator v = doc.values_begin();
208 v != doc.values_end();
209 ++v, ++vcount) {
210 TEST((*v).size() != 0);
211 string value_repr = "(" + str(did) + "," + *v + ")";
213 // Append the values to the value lists.
214 map<Xapian::valueno, string>::iterator i;
215 i = value_reprs.find(v.get_valueno());
216 if (i == value_reprs.end()) {
217 value_reprs[v.get_valueno()] = value_repr;
218 } else {
219 i->second += "," + value_repr;
222 TEST_EQUAL(vcount, doc.values_count());
223 TEST(t2 == db.termlist_end(did));
224 Xapian::termcount expected_termcount = doc.termlist_count();
225 TEST_EQUAL(expected_termcount, found_termcount);
226 TEST_EQUAL(unique_terms, found_termcount);
227 TEST_EQUAL(doclen, wdf_sum);
230 TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
231 TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
233 Xapian::TermIterator t;
234 map<string, string>::const_iterator i;
235 for (t = db.allterms_begin(), i = posting_reprs.begin();
236 t != db.allterms_end();
237 ++t, ++i) {
238 TEST(db.term_exists(*t));
239 TEST(i != posting_reprs.end());
240 TEST_EQUAL(i->first, *t);
242 Xapian::doccount tf_count = 0;
243 Xapian::termcount cf_count = 0;
244 Xapian::termcount wdf_upper_bound = 0;
245 string posting_repr;
246 bool need_comma = false;
247 for (Xapian::PostingIterator p = db.postlist_begin(*t);
248 p != db.postlist_end(*t);
249 ++p) {
250 if (need_comma) {
251 posting_repr += ",";
254 ++tf_count;
255 cf_count += p.get_wdf();
257 Xapian::PositionIterator it(p.positionlist_begin());
258 string posrepr = positions_to_string(it, p.positionlist_end());
259 if (!posrepr.empty()) {
260 posrepr = ",[" + posrepr + "]";
262 posting_repr += "(" + str(*p) + "," +
263 str(p.get_wdf()) + "/" +
264 str(p.get_doclength()) + posrepr + ")";
265 if (wdf_upper_bound < p.get_wdf())
266 wdf_upper_bound = p.get_wdf();
267 need_comma = true;
270 TEST_EQUAL(posting_repr, i->second);
271 TEST_EQUAL(tf_count, t.get_termfreq());
272 TEST_EQUAL(tf_count, db.get_termfreq(*t));
273 TEST_EQUAL(cf_count, db.get_collection_freq(*t));
274 TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
276 TEST(i == posting_reprs.end());
278 map<Xapian::valueno, string>::const_iterator j;
279 for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
280 string value_repr;
281 string value_lower_bound;
282 string value_upper_bound;
283 bool first = true;
284 for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
285 v != db.valuestream_end(j->first); ++v) {
286 if (first) {
287 value_lower_bound = *v;
288 value_upper_bound = *v;
289 first = false;
290 } else {
291 value_repr += ",";
292 if (*v > value_upper_bound) {
293 value_upper_bound = *v;
295 if (*v < value_lower_bound) {
296 value_lower_bound = *v;
299 value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
301 TEST_EQUAL(value_repr, j->second);
302 try {
303 TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
304 TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
305 } catch (const Xapian::UnimplementedError &) {
306 // Skip the checks if the methods to get the bounds aren't
307 // implemented for this backend.
311 if (expected_doccount == 0) {
312 TEST_EQUAL(0, db.get_avlength());
313 } else {
314 TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
315 db.get_avlength());