1 /* dbcheck.cc: test database contents and consistency.
3 * Copyright 2009 Richard Boulton
4 * Copyright 2010,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 */
27 #include "testsuite.h"
// positions_to_string(): helper whose result callers in this file assign to a
// `string` describing a position list.
//
// Parameters:
//   it    - position iterator, taken by non-const reference.
//   end   - position iterator taken by const reference; callers in this file
//           pass the matching positionlist_end().
//   count - pointer to a Xapian::termcount. Callers in this file pass either
//           two or three arguments, so a default for it is presumably
//           declared elsewhere -- TODO confirm.
//
// NOTE(review): only the parameter list and two local initialisations are
// visible here; the embedded numbering jumps from 38 to 54, so the rest of
// the body cannot be described from this excerpt.
32 positions_to_string(Xapian::PositionIterator
& it
,
33 const Xapian::PositionIterator
& end
,
34 Xapian::termcount
* count
)
// Starts false; presumably flipped once the first item has been emitted so
// later items get a separator -- the code that uses it is not visible.
37 bool need_comma
= false;
// Starts at zero; presumably the running total reported through `count` --
// TODO confirm against the missing lines.
38 Xapian::termcount c
= 0;
// postlist_to_string(): walks the posting list of term `tname` in `db` and
// appends one textual entry per posting to `result`.
//
// Parameters:
//   db    - database to read from (const reference).
//   tname - term whose posting list is iterated.
//
// NOTE(review): the declaration of `result`, the loop increment, the use of
// need_comma and the tail of the concatenation are on lines not visible
// here (the embedded numbering has gaps), so they are not described.
54 postlist_to_string(const Xapian::Database
& db
, const string
& tname
)
57 bool need_comma
= false;
// Iterate from postlist_begin(tname) until the iterator equals
// postlist_end(tname).
59 for (Xapian::PostingIterator p
= db
.postlist_begin(tname
);
60 p
!= db
.postlist_end(tname
);
// positions_to_string() takes its first argument by non-const reference, so
// a named iterator is created rather than passing the temporary directly.
65 Xapian::PositionIterator
it(p
.positionlist_begin());
66 string posrepr
= positions_to_string(it
, p
.positionlist_end());
// Only wrap the positions in ", pos=[...]" when there are any; otherwise
// posrepr stays empty.
67 if (!posrepr
.empty()) {
68 posrepr
= ", pos=[" + posrepr
+ "]";
// Entry format visible here: "(" <*p> ", doclen=" <doclength> ", wdf=" <wdf>
// followed by further text on lines that are not visible.
71 result
+= "(" + str(*p
) +
72 ", doclen=" + str(p
.get_doclength()) +
73 ", wdf=" + str(p
.get_wdf()) +
// docterms_to_string(): walks the term list of document `did` in `db` and
// appends one "Term(...)" entry per term to `result`.
//
// Parameters:
//   db  - database to read from (const reference).
//   did - id of the document whose term list is iterated.
//
// NOTE(review): the declaration of `result`, the loop increment, the use of
// need_comma and whatever closes each "Term(" entry are on lines not visible
// here (the embedded numbering has gaps), so they are not described.
81 docterms_to_string(const Xapian::Database
& db
, Xapian::docid did
)
84 bool need_comma
= false;
// Iterate from termlist_begin(did) until the iterator equals
// termlist_end(did).
86 for (Xapian::TermIterator t
= db
.termlist_begin(did
);
87 t
!= db
.termlist_end(did
);
// positions_to_string() takes its first argument by non-const reference, so
// a named iterator is created rather than passing the temporary directly.
89 Xapian::PositionIterator
it(t
.positionlist_begin());
90 string posrepr
= positions_to_string(it
, t
.positionlist_end());
// Only wrap the positions in ", pos=[...]" when there are any; otherwise
// posrepr stays empty.
91 if (!posrepr
.empty()) {
92 posrepr
= ", pos=[" + posrepr
+ "]";
// Entry format visible here: "Term(" <term> ", wdf=" <wdf> <posrepr>.
96 result
+= "Term(" + *t
+ ", wdf=" + str(t
.get_wdf()) + posrepr
;
// docstats_to_string(): appends "len=" followed by the stringified document
// length of document `did` in `db` to `result`.
//
// Parameters:
//   db  - database to read from (const reference).
//   did - id of the document whose length is reported.
//
// NOTE(review): the declaration of `result` and anything else in the body
// are on lines not visible here (the embedded numbering has gaps).
104 docstats_to_string(const Xapian::Database
& db
, Xapian::docid did
)
108 result
+= "len=" + str(db
.get_doclength(did
));
// termstats_to_string(): appends the statistics of `term` in `db` to `result`
// in the form "tf=<termfreq>,cf=<collection_freq>".
//
// Parameters:
//   db   - database to read from (const reference).
//   term - term whose statistics are reported.
//
// NOTE(review): the declaration of `result` and anything else in the body
// are on lines not visible here (the embedded numbering has gaps).
114 termstats_to_string(const Xapian::Database
& db
, const string
& term
)
// Term frequency as reported by get_termfreq().
118 result
+= "tf=" + str(db
.get_termfreq(term
));
// Collection frequency as reported by get_collection_freq(); note the
// separator is a bare comma with no space.
119 result
+= ",cf=" + str(db
.get_collection_freq(term
));
125 dbcheck(const Xapian::Database
& db
,
126 Xapian::doccount expected_doccount
,
127 Xapian::docid expected_lastdocid
)
129 TEST_EQUAL(db
.get_doccount(), expected_doccount
);
130 TEST_EQUAL(db
.get_lastdocid(), expected_lastdocid
);
132 // Note - may not be a very big type, but we're only expecting to use this
133 // for small databases, so should be fine.
134 unsigned long totlen
= 0;
136 // A map from term to a representation of the posting list for that term.
137 // We build this up from the documents, and then check it against the
138 // equivalent built up from the posting lists.
139 map
<string
, string
> posting_reprs
;
140 map
<Xapian::valueno
, string
> value_reprs
;
142 Xapian::termcount doclen_lower_bound
= Xapian::termcount(-1);
143 Xapian::termcount doclen_upper_bound
= 0;
145 for (Xapian::PostingIterator dociter
= db
.postlist_begin(string());
146 dociter
!= db
.postlist_end(string());
148 Xapian::docid did
= *dociter
;
149 TEST_EQUAL(dociter
.get_wdf(), 1);
150 Xapian::Document
doc(db
.get_document(did
));
151 Xapian::termcount
doclen(db
.get_doclength(did
));
152 Xapian::termcount
unique_terms(db
.get_unique_terms(did
));
153 if (doclen
< doclen_lower_bound
)
154 doclen_lower_bound
= doclen
;
155 if (doclen
> doclen_upper_bound
)
156 doclen_upper_bound
= doclen
;
159 Xapian::termcount found_termcount
= 0;
160 Xapian::termcount wdf_sum
= 0;
161 Xapian::TermIterator t
, t2
;
162 for (t
= doc
.termlist_begin(), t2
= db
.termlist_begin(did
);
163 t
!= doc
.termlist_end();
165 TEST(t2
!= db
.termlist_end(did
));
168 wdf_sum
+= t
.get_wdf();
171 TEST_EQUAL(t
.get_wdf(), t2
.get_wdf());
172 TEST_EQUAL(db
.get_termfreq(*t
), t
.get_termfreq());
173 TEST_EQUAL(db
.get_termfreq(*t
), t2
.get_termfreq());
175 // Check the position lists are equal.
176 Xapian::termcount tc1
, tc2
;
177 Xapian::PositionIterator
it1(t
.positionlist_begin());
178 string posrepr
= positions_to_string(it1
, t
.positionlist_end(), &tc1
);
179 Xapian::PositionIterator
it2(t2
.positionlist_begin());
180 string posrepr2
= positions_to_string(it2
, t2
.positionlist_end(), &tc2
);
181 TEST_EQUAL(posrepr
, posrepr2
);
182 TEST_EQUAL(tc1
, tc2
);
184 TEST_EQUAL(tc1
, t
.positionlist_count());
185 } catch (const Xapian::UnimplementedError
&) {
186 // positionlist_count() isn't implemented for remote databases.
189 // Make a representation of the posting.
190 if (!posrepr
.empty()) {
191 posrepr
= ",[" + posrepr
+ "]";
193 string posting_repr
= "(" + str(did
) + "," +
194 str(t
.get_wdf()) + "/" + str(doclen
) +
197 // Append the representation to the list for the term.
198 map
<string
, string
>::iterator i
= posting_reprs
.find(*t
);
199 if (i
== posting_reprs
.end()) {
200 posting_reprs
[*t
] = posting_repr
;
202 i
->second
+= "," + posting_repr
;
206 Xapian::termcount vcount
= 0;
207 for (Xapian::ValueIterator v
= doc
.values_begin();
208 v
!= doc
.values_end();
210 TEST((*v
).size() != 0);
211 string value_repr
= "(" + str(did
) + "," + *v
+ ")";
213 // Append the values to the value lists.
214 map
<Xapian::valueno
, string
>::iterator i
;
215 i
= value_reprs
.find(v
.get_valueno());
216 if (i
== value_reprs
.end()) {
217 value_reprs
[v
.get_valueno()] = value_repr
;
219 i
->second
+= "," + value_repr
;
222 TEST_EQUAL(vcount
, doc
.values_count());
223 TEST(t2
== db
.termlist_end(did
));
224 Xapian::termcount expected_termcount
= doc
.termlist_count();
225 TEST_EQUAL(expected_termcount
, found_termcount
);
226 TEST_EQUAL(unique_terms
, found_termcount
);
227 TEST_EQUAL(doclen
, wdf_sum
);
230 TEST_REL(doclen_lower_bound
, >=, db
.get_doclength_lower_bound());
231 TEST_REL(doclen_upper_bound
, <=, db
.get_doclength_upper_bound());
233 Xapian::TermIterator t
;
234 map
<string
, string
>::const_iterator i
;
235 for (t
= db
.allterms_begin(), i
= posting_reprs
.begin();
236 t
!= db
.allterms_end();
238 TEST(db
.term_exists(*t
));
239 TEST(i
!= posting_reprs
.end());
240 TEST_EQUAL(i
->first
, *t
);
242 Xapian::doccount tf_count
= 0;
243 Xapian::termcount cf_count
= 0;
244 Xapian::termcount wdf_upper_bound
= 0;
246 bool need_comma
= false;
247 for (Xapian::PostingIterator p
= db
.postlist_begin(*t
);
248 p
!= db
.postlist_end(*t
);
255 cf_count
+= p
.get_wdf();
257 Xapian::PositionIterator
it(p
.positionlist_begin());
258 string posrepr
= positions_to_string(it
, p
.positionlist_end());
259 if (!posrepr
.empty()) {
260 posrepr
= ",[" + posrepr
+ "]";
262 posting_repr
+= "(" + str(*p
) + "," +
263 str(p
.get_wdf()) + "/" +
264 str(p
.get_doclength()) + posrepr
+ ")";
265 if (wdf_upper_bound
< p
.get_wdf())
266 wdf_upper_bound
= p
.get_wdf();
270 TEST_EQUAL(posting_repr
, i
->second
);
271 TEST_EQUAL(tf_count
, t
.get_termfreq());
272 TEST_EQUAL(tf_count
, db
.get_termfreq(*t
));
273 TEST_EQUAL(cf_count
, db
.get_collection_freq(*t
));
274 TEST_REL(wdf_upper_bound
, <=, db
.get_wdf_upper_bound(*t
));
276 TEST(i
== posting_reprs
.end());
278 map
<Xapian::valueno
, string
>::const_iterator j
;
279 for (j
= value_reprs
.begin(); j
!= value_reprs
.end(); ++j
) {
281 string value_lower_bound
;
282 string value_upper_bound
;
284 for (Xapian::ValueIterator v
= db
.valuestream_begin(j
->first
);
285 v
!= db
.valuestream_end(j
->first
); ++v
) {
287 value_lower_bound
= *v
;
288 value_upper_bound
= *v
;
292 if (*v
> value_upper_bound
) {
293 value_upper_bound
= *v
;
295 if (*v
< value_lower_bound
) {
296 value_lower_bound
= *v
;
299 value_repr
+= "(" + str(v
.get_docid()) + "," + *v
+ ")";
301 TEST_EQUAL(value_repr
, j
->second
);
303 TEST_REL(value_upper_bound
, <=, db
.get_value_upper_bound(j
->first
));
304 TEST_REL(value_lower_bound
, >=, db
.get_value_lower_bound(j
->first
));
305 } catch (const Xapian::UnimplementedError
&) {
306 // Skip the checks if the methods to get the bounds aren't
307 // implemented for this backend.
311 if (expected_doccount
== 0) {
312 TEST_EQUAL(0, db
.get_avlength());
314 TEST_EQUAL_DOUBLE(double(totlen
) / expected_doccount
,