1 /* dbcheck.cc: test database contents and consistency.
3 * Copyright 2009 Richard Boulton
4 * Copyright 2010,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "testsuite.h"
// positions_to_string(): render a position list as a string.
//
// it    - iterator at the first position to render; taken by non-const
//         reference.
// end   - iterator marking the end of the position list.
// count - pointer to a Xapian::termcount.  NOTE(review): presumably receives
//         the number of positions seen (dbcheck() compares the value it gets
//         back against positionlist_count()), and presumably optional since
//         other callers in this file pass only two arguments - confirm
//         against the declaration.
//
// NOTE(review): only the signature and the first two locals are visible in
// this excerpt; the loop and the return statement are not shown.
32 positions_to_string(Xapian::PositionIterator
& it
,
33 const Xapian::PositionIterator
& end
,
34 Xapian::termcount
* count
)
// NOTE(review): presumably tracks whether a separator is needed before the
// next entry - its use is not visible here.
37 bool need_comma
= false;
// NOTE(review): presumably the running total reported through 'count' -
// confirm.
38 Xapian::termcount c
= 0;
// postlist_to_string(): describe the posting list of a term as a string.
//
// db    - database to read the posting list from.
// tname - term whose postings are walked, from db.postlist_begin(tname) to
//         db.postlist_end(tname).
//
// Each posting contributes an entry starting "(" followed by the docid, then
// ", doclen=" and ", wdf=" with the corresponding values.
// NOTE(review): the loop increment, the separator handling driven by
// need_comma, the tail of the appended expression and the return statement
// are not visible in this excerpt.
54 postlist_to_string(const Xapian::Database
& db
, const string
& tname
)
57 bool need_comma
= false;
59 for (Xapian::PostingIterator p
= db
.postlist_begin(tname
);
60 p
!= db
.postlist_end(tname
);
// Render this posting's positions; the ", pos=[...]" wrapper is only added
// when the rendered position string is non-empty.
65 Xapian::PositionIterator
it(p
.positionlist_begin());
66 string posrepr
= positions_to_string(it
, p
.positionlist_end());
67 if (!posrepr
.empty()) {
68 posrepr
= ", pos=[" + posrepr
+ "]";
// Append the docid, document length and wdf of this posting.
71 result
+= "(" + str(*p
) +
72 ", doclen=" + str(p
.get_doclength()) +
73 ", wdf=" + str(p
.get_wdf()) +
// docterms_to_string(): describe the terms of one document as a string.
//
// db  - database to read the term list from.
// did - document whose terms are walked, from db.termlist_begin(did) to
//       db.termlist_end(did).
//
// Each term contributes "Term(" + the term + ", wdf=" + its wdf, followed by
// the optional position representation.
// NOTE(review): the loop increment, the separator handling driven by
// need_comma, whatever closes each "Term(" entry and the return statement
// are not visible in this excerpt.
81 docterms_to_string(const Xapian::Database
& db
, Xapian::docid did
)
84 bool need_comma
= false;
86 for (Xapian::TermIterator t
= db
.termlist_begin(did
);
87 t
!= db
.termlist_end(did
);
// Render this term's positions; the ", pos=[...]" wrapper is only added when
// the rendered position string is non-empty.
89 Xapian::PositionIterator
it(t
.positionlist_begin());
90 string posrepr
= positions_to_string(it
, t
.positionlist_end());
91 if (!posrepr
.empty()) {
92 posrepr
= ", pos=[" + posrepr
+ "]";
// Append the term name, its wdf and the (possibly empty) position text.
96 result
+= "Term(" + *t
+ ", wdf=" + str(t
.get_wdf()) + posrepr
;
// docstats_to_string(): describe the statistics of one document as a string.
//
// db  - database to query.
// did - document id whose length is reported.
//
// Appends "len=" followed by db.get_doclength(did) to result.
// NOTE(review): the declaration of 'result' and the return statement are not
// visible in this excerpt.
104 docstats_to_string(const Xapian::Database
& db
, Xapian::docid did
)
108 result
+= "len=" + str(db
.get_doclength(did
));
// termstats_to_string(): describe the statistics of one term as a string.
//
// db   - database to query.
// term - term whose frequencies are reported.
//
// Appends "tf=" followed by db.get_termfreq(term), then ",cf=" followed by
// db.get_collection_freq(term), to result.
// NOTE(review): the declaration of 'result' and the return statement are not
// visible in this excerpt.
114 termstats_to_string(const Xapian::Database
& db
, const string
& term
)
118 result
+= "tf=" + str(db
.get_termfreq(term
));
119 result
+= ",cf=" + str(db
.get_collection_freq(term
));
125 dbcheck(const Xapian::Database
& db
,
126 Xapian::doccount expected_doccount
,
127 Xapian::docid expected_lastdocid
)
129 TEST_EQUAL(db
.get_doccount(), expected_doccount
);
130 TEST_EQUAL(db
.get_lastdocid(), expected_lastdocid
);
132 // Note - may not be a very big type, but we're only expecting to use this
133 // for small databases, so should be fine.
134 unsigned long totlen
= 0;
136 // A map from term to a representation of the posting list for that term.
137 // We build this up from the documents, and then check it against the
138 // equivalent built up from the posting lists.
139 map
<string
, string
> posting_reprs
;
140 map
<Xapian::valueno
, string
> value_reprs
;
142 Xapian::termcount doclen_lower_bound
= Xapian::termcount(-1);
143 Xapian::termcount doclen_upper_bound
= 0;
145 for (Xapian::PostingIterator dociter
= db
.postlist_begin(string());
146 dociter
!= db
.postlist_end(string());
148 Xapian::docid did
= *dociter
;
149 TEST_EQUAL(dociter
.get_wdf(), 1);
150 Xapian::Document
doc(db
.get_document(did
));
151 Xapian::termcount
doclen(db
.get_doclength(did
));
152 Xapian::termcount
unique_terms(db
.get_unique_terms(did
));
153 if (doclen
< doclen_lower_bound
)
154 doclen_lower_bound
= doclen
;
155 if (doclen
> doclen_upper_bound
)
156 doclen_upper_bound
= doclen
;
159 Xapian::termcount found_termcount
= 0;
160 Xapian::termcount found_unique_terms
= 0;
161 Xapian::termcount wdf_sum
= 0;
162 Xapian::TermIterator t
, t2
;
163 for (t
= doc
.termlist_begin(), t2
= db
.termlist_begin(did
);
164 t
!= doc
.termlist_end();
166 TEST(t2
!= db
.termlist_end(did
));
169 auto wdf
= t
.get_wdf();
170 if (wdf
) ++found_unique_terms
;
174 TEST_EQUAL(t
.get_wdf(), t2
.get_wdf());
175 TEST_EQUAL(db
.get_termfreq(*t
), t
.get_termfreq());
176 TEST_EQUAL(db
.get_termfreq(*t
), t2
.get_termfreq());
178 // Check the position lists are equal.
179 Xapian::termcount tc1
, tc2
;
180 Xapian::PositionIterator
it1(t
.positionlist_begin());
181 string posrepr
= positions_to_string(it1
, t
.positionlist_end(), &tc1
);
182 Xapian::PositionIterator
it2(t2
.positionlist_begin());
183 string posrepr2
= positions_to_string(it2
, t2
.positionlist_end(), &tc2
);
184 TEST_EQUAL(posrepr
, posrepr2
);
185 TEST_EQUAL(tc1
, tc2
);
186 TEST_EQUAL(tc1
, t
.positionlist_count());
188 // Make a representation of the posting.
189 if (!posrepr
.empty()) {
190 posrepr
= ",[" + posrepr
+ "]";
192 string posting_repr
= "(" + str(did
) + "," +
193 str(t
.get_wdf()) + "/" + str(doclen
) +
196 // Append the representation to the list for the term.
197 map
<string
, string
>::iterator i
= posting_reprs
.find(*t
);
198 if (i
== posting_reprs
.end()) {
199 posting_reprs
[*t
] = posting_repr
;
201 i
->second
+= "," + posting_repr
;
205 Xapian::termcount vcount
= 0;
206 for (Xapian::ValueIterator v
= doc
.values_begin();
207 v
!= doc
.values_end();
209 TEST((*v
).size() != 0);
210 string value_repr
= "(" + str(did
) + "," + *v
+ ")";
212 // Append the values to the value lists.
213 map
<Xapian::valueno
, string
>::iterator i
;
214 i
= value_reprs
.find(v
.get_valueno());
215 if (i
== value_reprs
.end()) {
216 value_reprs
[v
.get_valueno()] = value_repr
;
218 i
->second
+= "," + value_repr
;
221 TEST_EQUAL(vcount
, doc
.values_count());
222 TEST(t2
== db
.termlist_end(did
));
223 Xapian::termcount expected_termcount
= doc
.termlist_count();
224 TEST_EQUAL(expected_termcount
, found_termcount
);
225 // Ideally this would be equal, but currently we don't store the
226 // unique_terms values but calculate them, and scanning the termlist
227 // of each document would be slow, so instead get_unique_terms(did)
228 // returns min(doclen, termcount) at present.
229 TEST_REL(unique_terms
, >=, found_unique_terms
);
230 TEST_REL(unique_terms
, <=, found_termcount
);
231 TEST_REL(unique_terms
, <=, doclen
);
232 TEST_EQUAL(doclen
, wdf_sum
);
235 TEST_REL(doclen_lower_bound
, >=, db
.get_doclength_lower_bound());
236 TEST_REL(doclen_upper_bound
, <=, db
.get_doclength_upper_bound());
238 Xapian::TermIterator t
;
239 map
<string
, string
>::const_iterator i
;
240 for (t
= db
.allterms_begin(), i
= posting_reprs
.begin();
241 t
!= db
.allterms_end();
243 TEST(db
.term_exists(*t
));
244 TEST(i
!= posting_reprs
.end());
245 TEST_EQUAL(i
->first
, *t
);
247 Xapian::doccount tf_count
= 0;
248 Xapian::termcount cf_count
= 0;
249 Xapian::termcount wdf_upper_bound
= 0;
251 bool need_comma
= false;
252 for (Xapian::PostingIterator p
= db
.postlist_begin(*t
);
253 p
!= db
.postlist_end(*t
);
260 cf_count
+= p
.get_wdf();
262 Xapian::PositionIterator
it(p
.positionlist_begin());
263 string posrepr
= positions_to_string(it
, p
.positionlist_end());
264 if (!posrepr
.empty()) {
265 posrepr
= ",[" + posrepr
+ "]";
267 posting_repr
+= "(" + str(*p
) + "," +
268 str(p
.get_wdf()) + "/" +
269 str(p
.get_doclength()) + posrepr
+ ")";
270 if (wdf_upper_bound
< p
.get_wdf())
271 wdf_upper_bound
= p
.get_wdf();
275 TEST_EQUAL(posting_repr
, i
->second
);
276 TEST_EQUAL(tf_count
, t
.get_termfreq());
277 TEST_EQUAL(tf_count
, db
.get_termfreq(*t
));
278 TEST_EQUAL(cf_count
, db
.get_collection_freq(*t
));
279 TEST_REL(wdf_upper_bound
, <=, db
.get_wdf_upper_bound(*t
));
281 TEST(i
== posting_reprs
.end());
283 map
<Xapian::valueno
, string
>::const_iterator j
;
284 for (j
= value_reprs
.begin(); j
!= value_reprs
.end(); ++j
) {
286 string value_lower_bound
;
287 string value_upper_bound
;
289 for (Xapian::ValueIterator v
= db
.valuestream_begin(j
->first
);
290 v
!= db
.valuestream_end(j
->first
); ++v
) {
292 value_lower_bound
= *v
;
293 value_upper_bound
= *v
;
297 if (*v
> value_upper_bound
) {
298 value_upper_bound
= *v
;
300 if (*v
< value_lower_bound
) {
301 value_lower_bound
= *v
;
304 value_repr
+= "(" + str(v
.get_docid()) + "," + *v
+ ")";
306 TEST_EQUAL(value_repr
, j
->second
);
308 TEST_REL(value_upper_bound
, <=, db
.get_value_upper_bound(j
->first
));
309 TEST_REL(value_lower_bound
, >=, db
.get_value_lower_bound(j
->first
));
310 } catch (const Xapian::UnimplementedError
&) {
311 // Skip the checks if the methods to get the bounds aren't
312 // implemented for this backend.
316 if (expected_doccount
== 0) {
317 TEST_EQUAL(0, db
.get_avlength());
319 TEST_EQUAL_DOUBLE(double(totlen
) / expected_doccount
,