Fix termfreq used in weight calcs for repeated terms
[xapian.git] / xapian-core / tests / api_weight.cc
blob23545e89f5384fdf8c204c453c3252398f4463bd
1 /** @file api_weight.cc
2 * @brief tests of Xapian::Weight subclasses
3 */
4 /* Copyright (C) 2004,2012,2013,2016,2017 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "api_weight.h"
26 #include <cmath>
28 #include <xapian.h>
30 #include "apitest.h"
31 #include "testutils.h"
33 using namespace std;
35 // Test exception for junk after serialised weight.
36 DEFINE_TESTCASE(tradweight3, !backend) {
37 Xapian::TradWeight wt(42);
38 try {
39 Xapian::TradWeight t;
40 Xapian::TradWeight * t2 = t.unserialise(wt.serialise() + "X");
41 // Make sure we actually use the weight.
42 bool empty = t2->name().empty();
43 delete t2;
44 if (empty)
45 FAIL_TEST("Serialised TradWeight with junk appended unserialised to empty name!");
46 FAIL_TEST("Serialised TradWeight with junk appended unserialised OK");
47 } catch (const Xapian::SerialisationError &e) {
48 // Regression test for error in exception message fixed in 1.2.11 and
49 // 1.3.1.
50 TEST(e.get_msg().find("BM25") == string::npos);
51 TEST(e.get_msg().find("Trad") != string::npos);
53 return true;
56 // Test Exception for junk after serialised weight.
57 DEFINE_TESTCASE(unigramlmweight3, !backend) {
58 Xapian::LMWeight wt(79898.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0.5, 1.0);
59 try {
60 Xapian::LMWeight t;
61 Xapian::LMWeight * t2 = t.unserialise(wt.serialise() + "X");
62 // Make sure we actually use the weight.
63 bool empty = t2->name().empty();
64 delete t2;
65 if (empty)
66 FAIL_TEST("Serialised LMWeight with junk appended unserialised to empty name!");
67 FAIL_TEST("Serialised LMWeight with junk appended unserialised OK");
68 } catch (const Xapian::SerialisationError &e) {
69 TEST(e.get_msg().find("LM") != string::npos);
71 return true;
74 // Test exception for junk after serialised weight.
75 DEFINE_TESTCASE(bm25weight3, !backend) {
76 Xapian::BM25Weight wt(2.0, 0.5, 1.3, 0.6, 0.01);
77 try {
78 Xapian::BM25Weight b;
79 Xapian::BM25Weight * b2 = b.unserialise(wt.serialise() + "X");
80 // Make sure we actually use the weight.
81 bool empty = b2->name().empty();
82 delete b2;
83 if (empty)
84 FAIL_TEST("Serialised BM25Weight with junk appended unserialised to empty name!");
85 FAIL_TEST("Serialised BM25Weight with junk appended unserialised OK");
86 } catch (const Xapian::SerialisationError &e) {
87 TEST(e.get_msg().find("BM25") != string::npos);
89 return true;
92 // Test parameter combinations which should be unaffected by doclength.
93 DEFINE_TESTCASE(bm25weight4, backend) {
94 Xapian::Database db = get_database("apitest_simpledata");
95 Xapian::Enquire enquire(db);
96 enquire.set_query(Xapian::Query("paragraph"));
97 Xapian::MSet mset;
99 enquire.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
100 mset = enquire.get_mset(0, 10);
101 TEST_EQUAL(mset.size(), 5);
102 // Expect: wdf has an effect on weight, but doclen doesn't.
103 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
104 TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
105 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
106 TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
108 enquire.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
109 mset = enquire.get_mset(0, 10);
110 TEST_EQUAL(mset.size(), 5);
111 // Expect: neither wdf nor doclen affects weight.
112 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
114 return true;
117 /// Test non-zero k2 with zero k1.
118 // Regression test for bug fixed in 1.2.17 and 1.3.2.
119 DEFINE_TESTCASE(bm25weight5, backend) {
120 Xapian::Database db = get_database("apitest_simpledata");
121 Xapian::Enquire enquire(db);
122 enquire.set_query(Xapian::Query("paragraph"));
123 Xapian::MSet mset;
125 enquire.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
126 mset = enquire.get_mset(0, 10);
127 TEST_EQUAL(mset.size(), 5);
128 // Expect: wdf has no effect on weight; shorter docs rank higher.
129 mset_expect_order(mset, 3, 5, 1, 4, 2);
130 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[1].get_weight());
131 TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
132 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
133 TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
135 return true;
138 // Test exception for junk after serialised weight.
139 DEFINE_TESTCASE(bm25plusweight1, !backend) {
140 Xapian::BM25PlusWeight wt(2.0, 0.1, 1.3, 0.6, 0.01, 0.5);
141 try {
142 Xapian::BM25PlusWeight b;
143 Xapian::BM25PlusWeight * b2 = b.unserialise(wt.serialise() + "X");
144 // Make sure we actually use the weight.
145 bool empty = b2->name().empty();
146 delete b2;
147 if (empty)
148 FAIL_TEST("Serialised BM25PlusWeight with junk appended unserialised to empty name!");
149 FAIL_TEST("Serialised BM25PlusWeight with junk appended unserialised OK");
150 } catch (const Xapian::SerialisationError &e) {
151 TEST(e.get_msg().find("BM25Plus") != string::npos);
153 return true;
156 // Test parameter combinations which should be unaffected by doclength.
157 DEFINE_TESTCASE(bm25plusweight2, backend) {
158 Xapian::Database db = get_database("apitest_simpledata");
159 Xapian::Enquire enquire(db);
160 enquire.set_query(Xapian::Query("paragraph"));
161 Xapian::MSet mset;
163 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
164 mset = enquire.get_mset(0, 10);
165 TEST_EQUAL(mset.size(), 5);
166 // Expect: wdf has an effect on weight, but doclen doesn't.
167 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
168 TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
169 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
170 TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
172 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
173 mset = enquire.get_mset(0, 10);
174 TEST_EQUAL(mset.size(), 5);
175 // Expect: neither wdf nor doclen affects weight.
176 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
178 return true;
181 // Regression test for a mistake corrected in the BM25+ implementation.
182 DEFINE_TESTCASE(bm25plusweight3, backend) {
183 Xapian::Database db = get_database("apitest_simpledata");
184 Xapian::Enquire enquire(db);
185 enquire.set_query(Xapian::Query("paragraph"));
186 Xapian::MSet mset;
188 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
189 mset = enquire.get_mset(0, 10);
190 TEST_EQUAL(mset.size(), 5);
192 // The value of each doc weight calculated manually from the BM25+ formulae
193 // by using the respective document statistics.
194 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.7920796567487473);
195 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.7846980783848447);
196 TEST_EQUAL_DOUBLE(mset[2].get_weight(), 0.7558817623365934);
197 TEST_EQUAL_DOUBLE(mset[3].get_weight(), 0.7210119356168847);
198 TEST_EQUAL_DOUBLE(mset[4].get_weight(), 0.7210119356168847);
200 return true;
203 // Test exception for junk after serialised weight.
204 DEFINE_TESTCASE(inl2weight1, !backend) {
205 Xapian::InL2Weight wt(2.0);
206 try {
207 Xapian::InL2Weight b;
208 Xapian::InL2Weight * b2 = b.unserialise(wt.serialise() + "X");
209 // Make sure we actually use the weight.
210 bool empty = b2->name().empty();
211 delete b2;
212 if (empty)
213 FAIL_TEST("Serialised inl2weight with junk appended unserialised to empty name!");
214 FAIL_TEST("Serialised inl2weight with junk appended unserialised OK");
215 } catch (const Xapian::SerialisationError &e) {
216 TEST(e.get_msg().find("InL2") != string::npos);
219 return true;
222 // Test for invalid values of c.
223 DEFINE_TESTCASE(inl2weight2, !backend) {
224 // InvalidArgumentError should be thrown if the parameter c is invalid.
225 TEST_EXCEPTION(Xapian::InvalidArgumentError,
226 Xapian::InL2Weight wt(-2.0));
228 TEST_EXCEPTION(Xapian::InvalidArgumentError,
229 Xapian::InL2Weight wt2(0.0));
231 /* Parameter c should be set to 1.0 by constructor if none is given. */
232 Xapian::InL2Weight weight2;
233 TEST_EQUAL(weight2.serialise(), Xapian::InL2Weight(1.0).serialise());
235 return true;
238 // Feature tests for Inl2Weight
239 DEFINE_TESTCASE(inl2weight3, backend) {
240 Xapian::Database db = get_database("apitest_simpledata");
241 Xapian::Enquire enquire(db);
242 Xapian::Query query("banana");
244 enquire.set_query(query);
245 enquire.set_weighting_scheme(Xapian::InL2Weight(2.0));
247 Xapian::MSet mset1;
248 mset1 = enquire.get_mset(0, 10);
249 TEST_EQUAL(mset1.size(), 1);
250 mset_expect_order(mset1, 6);
252 /* The value has been calculated in the python interpreter by looking at the
253 * database statistics. */
254 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.559711143842063);
256 // Test with OP_SCALE_WEIGHT.
257 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
258 enquire.set_weighting_scheme(Xapian::InL2Weight(2.0));
260 Xapian::MSet mset2;
261 mset2 = enquire.get_mset(0, 10);
262 TEST_EQUAL(mset2.size(), 1);
263 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
264 TEST_EQUAL_DOUBLE(15.0 * mset1[0].get_weight(), mset2[0].get_weight());
266 return true;
269 // Test exception for junk after serialised weight.
270 DEFINE_TESTCASE(ifb2weight1, !backend) {
271 Xapian::IfB2Weight wt(2.0);
272 try {
273 Xapian::IfB2Weight b;
274 Xapian::IfB2Weight * b2 = b.unserialise(wt.serialise() + "X");
275 // Make sure we actually use the weight.
276 bool empty = b2->name().empty();
277 delete b2;
278 if (empty)
279 FAIL_TEST("Serialised IfB2Weight with junk appended unserialised to empty name!");
280 FAIL_TEST("Serialised IfB2Weight with junk appended unserialised OK");
281 } catch (const Xapian::SerialisationError &e) {
282 TEST(e.get_msg().find("IfB2") != string::npos);
284 return true;
287 // Test for invalid values of c.
288 DEFINE_TESTCASE(ifb2weight2, !backend) {
289 // InvalidArgumentError should be thrown if the parameter c is invalid.
290 TEST_EXCEPTION(Xapian::InvalidArgumentError,
291 Xapian::IfB2Weight wt(-2.0));
293 TEST_EXCEPTION(Xapian::InvalidArgumentError,
294 Xapian::IfB2Weight wt2(0.0));
296 /* Parameter c should be set to 1.0 by constructor if none is given. */
297 Xapian::IfB2Weight weight2;
298 TEST_EQUAL(weight2.serialise(), Xapian::IfB2Weight(1.0).serialise());
300 return true;
303 // Feature test
304 DEFINE_TESTCASE(ifb2weight3, backend) {
305 Xapian::Database db = get_database("apitest_simpledata");
306 Xapian::Enquire enquire(db);
307 Xapian::Query query("banana");
309 enquire.set_query(query);
310 enquire.set_weighting_scheme(Xapian::IfB2Weight(2.0));
312 Xapian::MSet mset1;
313 mset1 = enquire.get_mset(0, 10);
314 TEST_EQUAL(mset1.size(), 1);
316 /* The value of the weight has been manually calculated using the statistics
317 * of the test database. */
318 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.119422287684126);
320 // Test with OP_SCALE_WEIGHT.
321 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
322 enquire.set_weighting_scheme(Xapian::IfB2Weight(2.0));
324 Xapian::MSet mset2;
325 mset2 = enquire.get_mset(0, 10);
326 TEST_EQUAL(mset2.size(), 1);
327 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
328 TEST_EQUAL_DOUBLE(15.0 * mset1[0].get_weight(), mset2[0].get_weight());
330 return true;
333 // Test exception for junk after serialised weight.
334 DEFINE_TESTCASE(ineb2weight1, !backend) {
335 Xapian::IneB2Weight wt(2.0);
336 try {
337 Xapian::IneB2Weight b;
338 Xapian::IneB2Weight * b2 = b.unserialise(wt.serialise() + "X");
339 // Make sure we actually use the weight.
340 bool empty = b2->name().empty();
341 delete b2;
342 if (empty)
343 FAIL_TEST("Serialised ineb2weight with junk appended unserialised to empty name!");
344 FAIL_TEST("Serialised ineb2weight with junk appended unserialised OK");
345 } catch (const Xapian::SerialisationError &e) {
346 TEST(e.get_msg().find("IneB2") != string::npos);
349 return true;
352 // Test for invalid values of c.
353 DEFINE_TESTCASE(ineb2weight2, !backend) {
354 // InvalidArgumentError should be thrown if parameter c is invalid.
355 TEST_EXCEPTION(Xapian::InvalidArgumentError,
356 Xapian::IneB2Weight wt(-2.0));
358 TEST_EXCEPTION(Xapian::InvalidArgumentError,
359 Xapian::IneB2Weight wt2(0.0));
361 /* Parameter c should be set to 1.0 by constructor if none is given. */
362 Xapian::IneB2Weight weight2;
363 TEST_EQUAL(weight2.serialise(), Xapian::IneB2Weight(1.0).serialise());
365 return true;
368 // Feature test.
369 DEFINE_TESTCASE(ineb2weight3, backend) {
370 Xapian::Database db = get_database("apitest_simpledata");
371 Xapian::Enquire enquire(db);
372 Xapian::Query query("paragraph");
373 enquire.set_query(query);
374 enquire.set_weighting_scheme(Xapian::IneB2Weight(2.0));
376 Xapian::MSet mset1;
377 mset1 = enquire.get_mset(0, 10);
378 TEST_EQUAL(mset1.size(), 5);
380 // The third document in the database is 4th in the ranking.
381 /* The weight value has been manually calculated by using the statistics
382 * of the test database. */
383 TEST_EQUAL_DOUBLE(mset1[4].get_weight(), 0.61709730297692400036);
385 // Test with OP_SCALE_WEIGHT.
386 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
387 enquire.set_weighting_scheme(Xapian::IneB2Weight(2.0));
389 Xapian::MSet mset2;
390 mset2 = enquire.get_mset(0, 10);
391 TEST_EQUAL(mset2.size(), 5);
393 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
394 for (int i = 0; i < 5; ++i) {
395 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
398 return true;
401 // Test exception for junk after serialised weight.
402 DEFINE_TESTCASE(bb2weight1, !backend) {
403 Xapian::BB2Weight wt(2.0);
404 try {
405 Xapian::BB2Weight b;
406 Xapian::BB2Weight * b2 = b.unserialise(wt.serialise() + "X");
407 // Make sure we actually use the weight.
408 bool empty = b2->name().empty();
409 delete b2;
410 if (empty)
411 FAIL_TEST("Serialised BB2Weight with junk appended unserialised to empty name!");
412 FAIL_TEST("Serialised BB2Weight with junk appended unserialised OK");
413 } catch (const Xapian::SerialisationError &e) {
414 TEST(e.get_msg().find("BB2") != string::npos);
416 return true;
419 // Test for invalid values of c.
420 DEFINE_TESTCASE(bb2weight2, !backend) {
421 // InvalidArgumentError should be thrown if the parameter c is invalid.
422 TEST_EXCEPTION(Xapian::InvalidArgumentError,
423 Xapian::BB2Weight wt(-2.0));
425 TEST_EXCEPTION(Xapian::InvalidArgumentError,
426 Xapian::BB2Weight wt2(0.0));
428 /* Parameter c should be set to 1.0 by constructor if none is given. */
429 Xapian::BB2Weight weight2;
430 TEST_EQUAL(weight2.serialise(), Xapian::BB2Weight(1.0).serialise());
432 return true;
435 // Feature test
436 DEFINE_TESTCASE(bb2weight3, backend) {
437 Xapian::Database db = get_database("apitest_simpledata");
438 Xapian::Enquire enquire(db);
439 Xapian::Query query("paragraph");
441 enquire.set_query(query);
442 enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
444 Xapian::MSet mset1;
445 mset1 = enquire.get_mset(0, 10);
446 TEST_EQUAL(mset1.size(), 5);
447 /* The third document in the database has the highest weight and is the
448 * first in the mset. */
449 // Value calculated manually by using the statistics of the test database.
450 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.6823696969784483);
452 // Test with OP_SCALE_WEIGHT.
453 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
454 enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
456 Xapian::MSet mset2;
457 mset2 = enquire.get_mset(0, 10);
458 TEST_EQUAL(mset2.size(), 5);
460 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
461 for (int i = 0; i < 5; ++i) {
462 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
465 // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
466 // were applying the factor to the upper bound twice).
467 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 1.0 / 1024));
468 enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
470 Xapian::MSet mset3;
471 mset3 = enquire.get_mset(0, 10);
472 TEST_EQUAL(mset3.size(), 5);
474 for (int i = 0; i < 5; ++i) {
475 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight() * 1024);
478 return true;
481 // Regression test: we used to calculate log2(0) when there was only one doc.
482 DEFINE_TESTCASE(bb2weight4, backend) {
483 Xapian::Database db = get_database("apitest_onedoc");
484 Xapian::Enquire enquire(db);
485 Xapian::Query query("word");
487 enquire.set_query(query);
488 enquire.set_weighting_scheme(Xapian::BB2Weight());
490 Xapian::MSet mset1;
491 mset1 = enquire.get_mset(0, 10);
492 TEST_EQUAL(mset1.size(), 1);
493 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.431020621347435);
495 return true;
498 // Feature test.
499 DEFINE_TESTCASE(dlhweight1, backend) {
500 Xapian::Database db = get_database("apitest_simpledata");
501 Xapian::Enquire enquire(db);
502 Xapian::Query query("a");
504 enquire.set_query(query);
505 enquire.set_weighting_scheme(Xapian::DLHWeight());
507 Xapian::MSet mset1;
508 mset1 = enquire.get_mset(0, 10);
509 TEST_EQUAL(mset1.size(), 3);
510 mset_expect_order(mset1, 3, 1, 2);
511 // Weights calculated manually using stats from the database.
512 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.0046477754371292362);
513 TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.97621929514640352757);
514 // The following weight would be negative but gets clamped to 0.
515 TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.0);
517 // Test with OP_SCALE_WEIGHT.
518 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
519 enquire.set_weighting_scheme(Xapian::DLHWeight());
521 Xapian::MSet mset2;
522 mset2 = enquire.get_mset(0, 10);
523 TEST_EQUAL(mset2.size(), 3);
525 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
526 for (Xapian::doccount i = 0; i < mset2.size(); ++i) {
527 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
530 return true;
533 // Test exception for junk after serialised weight.
534 DEFINE_TESTCASE(dlhweight2, !backend) {
535 Xapian::DLHWeight wt;
536 try {
537 Xapian::DLHWeight t;
538 Xapian::DLHWeight * t2 = t.unserialise(wt.serialise() + "X");
539 // Make sure we actually use the weight.
540 bool empty = t2->name().empty();
541 delete t2;
542 if (empty)
543 FAIL_TEST("Serialised DLHWeight with junk appended unserialised to empty name!");
544 FAIL_TEST("Serialised DLHWeight with junk appended unserialised OK");
545 } catch (const Xapian::SerialisationError &e) {
546 TEST(e.get_msg().find("DLH") != string::npos);
548 return true;
551 static void
552 gen_wdf_eq_doclen_db(Xapian::WritableDatabase& db, const string&)
554 Xapian::Document doc;
555 doc.add_term("solo", 37);
556 db.add_document(doc);
559 // Test wdf == doclen.
560 DEFINE_TESTCASE(dlhweight3, generated) {
561 Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
562 Xapian::Enquire enquire(db);
563 Xapian::Query query("solo");
565 enquire.set_query(query);
566 enquire.set_weighting_scheme(Xapian::DLHWeight());
568 Xapian::MSet mset1;
569 mset1 = enquire.get_mset(0, 10);
570 TEST_EQUAL(mset1.size(), 1);
571 // Weight gets clamped to zero.
572 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
574 return true;
577 // Test exception for junk after serialised weight.
578 DEFINE_TESTCASE(pl2weight1, !backend) {
579 Xapian::PL2Weight wt(2.0);
580 try {
581 Xapian::PL2Weight b;
582 Xapian::PL2Weight * b2 = b.unserialise(wt.serialise() + "X");
583 // Make sure we actually use the weight.
584 bool empty = b2->name().empty();
585 delete b2;
586 if (empty)
587 FAIL_TEST("Serialised PL2Weight with junk appended unserialised to empty name!");
588 FAIL_TEST("Serialised PL2Weight with junk appended unserialised OK");
589 } catch (const Xapian::SerialisationError &e) {
590 TEST(e.get_msg().find("PL2") != string::npos);
592 return true;
595 // Test for invalid values of c.
596 DEFINE_TESTCASE(pl2weight2, !backend) {
597 // InvalidArgumentError should be thrown if parameter c is invalid.
598 TEST_EXCEPTION(Xapian::InvalidArgumentError,
599 Xapian::PL2Weight wt(-2.0));
601 /* Parameter c should be set to 1.0 by constructor if none is given. */
602 Xapian::PL2Weight weight2;
603 TEST_EQUAL(weight2.serialise(), Xapian::PL2Weight(1.0).serialise());
605 return true;
608 // Feature Test.
609 DEFINE_TESTCASE(pl2weight3, backend) {
610 Xapian::Database db = get_database("apitest_simpledata");
611 Xapian::Enquire enquire(db);
612 Xapian::Query query("paragraph");
613 enquire.set_query(query);
614 Xapian::MSet mset;
616 enquire.set_weighting_scheme(Xapian::PL2Weight(2.0));
617 mset = enquire.get_mset(0, 10);
618 TEST_EQUAL(mset.size(), 5);
619 // Expected weight difference calculated in extended precision using stats
620 // from the test database.
621 TEST_EQUAL_DOUBLE(mset[2].get_weight(),
622 mset[3].get_weight() + 0.0086861771701328694);
624 // Test with OP_SCALE_WEIGHT.
625 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
626 enquire.set_weighting_scheme(Xapian::PL2Weight(2.0));
628 Xapian::MSet mset2;
629 mset2 = enquire.get_mset(0, 10);
630 TEST_EQUAL(mset2.size(), 5);
631 TEST_NOT_EQUAL_DOUBLE(mset[0].get_weight(), 0.0);
632 for (int i = 0; i < 5; ++i) {
633 TEST_EQUAL_DOUBLE(15.0 * mset[i].get_weight(), mset2[i].get_weight());
636 return true;
639 // Test exception for junk after serialised weight.
640 DEFINE_TESTCASE(pl2plusweight1, !backend) {
641 Xapian::PL2PlusWeight wt(2.0, 0.9);
642 try {
643 Xapian::PL2PlusWeight b;
644 Xapian::PL2PlusWeight * b2 = b.unserialise(wt.serialise() + "X");
645 // Make sure we actually use the weight.
646 bool empty = b2->name().empty();
647 delete b2;
648 if (empty)
649 FAIL_TEST("Serialised PL2PlusWeight with junk appended unserialised to empty name!");
650 FAIL_TEST("Serialised PL2PlusWeight with junk appended unserialised OK");
651 } catch (const Xapian::SerialisationError &e) {
652 TEST(e.get_msg().find("PL2Plus") != string::npos);
654 return true;
657 // Test for invalid values of parameters, c and delta.
658 DEFINE_TESTCASE(pl2plusweight2, !backend) {
659 // InvalidArgumentError should be thrown if parameter c is invalid.
660 TEST_EXCEPTION(Xapian::InvalidArgumentError,
661 Xapian::PL2PlusWeight wt(-2.0, 0.9));
663 // InvalidArgumentError should be thrown if parameter delta is invalid.
664 TEST_EXCEPTION(Xapian::InvalidArgumentError,
665 Xapian::PL2PlusWeight wt(1.0, -1.9));
667 return true;
670 // Test for default values of parameters, c and delta.
671 DEFINE_TESTCASE(pl2plusweight3, !backend) {
672 Xapian::PL2PlusWeight weight2;
674 /* Parameter c should be set to 1.0 by constructor if none is given. */
675 TEST_EQUAL(weight2.serialise(), Xapian::PL2PlusWeight(1.0, 0.8).serialise());
677 /* Parameter delta should be set to 0.8 by constructor if none is given. */
678 TEST_EQUAL(weight2.serialise(), Xapian::PL2PlusWeight(1.0, 0.8).serialise());
680 return true;
683 // Feature Test 1 for PL2PlusWeight.
684 DEFINE_TESTCASE(pl2plusweight4, backend) {
685 Xapian::Database db = get_database("apitest_simpledata");
686 Xapian::Enquire enquire(db);
687 enquire.set_query(Xapian::Query("paragraph"));
688 Xapian::MSet mset;
690 enquire.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
691 mset = enquire.get_mset(0, 10);
692 TEST_EQUAL(mset.size(), 5);
693 // Expected weight difference calculated in extended precision using stats
694 // from the test database.
695 TEST_EQUAL_DOUBLE(mset[2].get_weight(),
696 mset[3].get_weight() + 0.0086861771701328694);
698 return true;
701 // Feature Test 2 for PL2PlusWeight
702 DEFINE_TESTCASE(pl2plusweight5, backend) {
703 Xapian::Database db = get_database("apitest_simpledata");
704 Xapian::Enquire enquire(db);
705 Xapian::Query query("word");
706 enquire.set_query(query);
707 Xapian::MSet mset;
709 enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
710 mset = enquire.get_mset(0, 10);
711 // Expect MSet contains two documents having query "word".
712 TEST_EQUAL(mset.size(), 2);
713 // Expect Document 2 has higher weight than document 4 because
714 // "word" appears more no. of times in document 2 than document 4.
715 mset_expect_order(mset, 2, 4);
717 // Test with OP_SCALE_WEIGHT.
718 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
719 enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
721 Xapian::MSet mset2;
722 mset2 = enquire.get_mset(0, 10);
723 TEST_EQUAL(mset2.size(), mset.size());
724 TEST_NOT_EQUAL_DOUBLE(mset[0].get_weight(), 0.0);
725 for (Xapian::doccount i = 0; i < mset.size(); ++i) {
726 TEST_EQUAL_DOUBLE(15.0 * mset[i].get_weight(), mset2[i].get_weight());
729 return true;
732 // Feature test
733 DEFINE_TESTCASE(dphweight1, backend) {
734 Xapian::Database db = get_database("apitest_simpledata");
735 Xapian::Enquire enquire(db);
736 Xapian::Query query("paragraph");
738 enquire.set_query(query);
739 enquire.set_weighting_scheme(Xapian::DPHWeight());
741 Xapian::MSet mset1;
742 mset1 = enquire.get_mset(0, 10);
743 TEST_EQUAL(mset1.size(), 5);
744 /* The weight has been calculated manually by using the statistics of the
745 * test database. */
746 TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
748 // Test with OP_SCALE_WEIGHT.
749 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
750 enquire.set_weighting_scheme(Xapian::DPHWeight());
752 Xapian::MSet mset2;
753 mset2 = enquire.get_mset(0, 10);
754 TEST_EQUAL(mset2.size(), 5);
755 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
756 for (int i = 0; i < 5; ++i) {
757 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
760 return true;
763 // Test exception for junk after serialised weight.
764 DEFINE_TESTCASE(dphweight2, !backend) {
765 Xapian::DPHWeight wt;
766 try {
767 Xapian::DPHWeight t;
768 Xapian::DPHWeight * t2 = t.unserialise(wt.serialise() + "X");
769 // Make sure we actually use the weight.
770 bool empty = t2->name().empty();
771 delete t2;
772 if (empty)
773 FAIL_TEST("Serialised DPHWeight with junk appended unserialised to empty name!");
774 FAIL_TEST("Serialised DPHWeight with junk appended unserialised OK");
775 } catch (const Xapian::SerialisationError &e) {
776 TEST(e.get_msg().find("DPH") != string::npos);
778 return true;
781 // Test wdf == doclen.
782 DEFINE_TESTCASE(dphweight3, generated) {
783 Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
784 Xapian::Enquire enquire(db);
785 Xapian::Query query("solo");
787 enquire.set_query(query);
788 enquire.set_weighting_scheme(Xapian::DPHWeight());
790 Xapian::MSet mset1;
791 mset1 = enquire.get_mset(0, 10);
792 TEST_EQUAL(mset1.size(), 1);
793 // Weight gets clamped to zero.
794 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
796 return true;
799 // Test for various cases of normalization string.
800 DEFINE_TESTCASE(tfidfweight1, !backend) {
801 // InvalidArgumentError should be thrown if normalization string is invalid
802 TEST_EXCEPTION(Xapian::InvalidArgumentError,
803 Xapian::TfIdfWeight b("JOHN_LENNON"));
805 TEST_EXCEPTION(Xapian::InvalidArgumentError,
806 Xapian::TfIdfWeight b("LOL"));
808 /* Normalization string should be set to "ntn" by constructor if none is
809 given. */
810 Xapian::TfIdfWeight weight2;
811 TEST_EQUAL(weight2.serialise(), Xapian::TfIdfWeight("ntn").serialise());
813 return true;
816 // Test exception for junk after serialised weight.
817 DEFINE_TESTCASE(tfidfweight2, !backend) {
818 Xapian::TfIdfWeight wt("ntn");
819 try {
820 Xapian::TfIdfWeight b;
821 Xapian::TfIdfWeight * b2 = b.unserialise(wt.serialise() + "X");
822 // Make sure we actually use the weight.
823 bool empty = b2->name().empty();
824 delete b2;
825 if (empty)
826 FAIL_TEST("Serialised TfIdfWeight with junk appended unserialised to empty name!");
827 FAIL_TEST("Serialised TfIdfWeight with junk appended unserialised OK");
828 } catch (const Xapian::SerialisationError &e) {
829 TEST(e.get_msg().find("TfIdf") != string::npos);
831 return true;
834 // Feature tests for various normalization functions.
835 DEFINE_TESTCASE(tfidfweight3, backend) {
836 Xapian::Database db = get_database("apitest_simpledata");
837 Xapian::Enquire enquire(db);
838 Xapian::Query query("word");
839 Xapian::MSet mset;
841 // Check for "ntn" when termfreq != N
842 enquire.set_query(query);
843 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
844 mset = enquire.get_mset(0, 10);
845 TEST_EQUAL(mset.size(), 2);
846 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
847 mset_expect_order(mset, 2, 4);
848 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
850 // Check that wqf is taken into account.
851 enquire.set_query(Xapian::Query("word", 2));
852 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
853 Xapian::MSet mset2 = enquire.get_mset(0, 10);
854 TEST_EQUAL(mset2.size(), 2);
855 // wqf is 2, so weights should be doubled.
856 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
857 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
859 // Test with OP_SCALE_WEIGHT.
860 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
861 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
862 mset2 = enquire.get_mset(0, 10);
863 TEST_EQUAL(mset2.size(), 2);
864 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
865 mset_expect_order(mset2, 2, 4);
866 TEST_NOT_EQUAL_DOUBLE(mset[0].get_weight(), 0.0);
867 TEST_EQUAL_DOUBLE(15 * mset[0].get_weight(), mset2[0].get_weight());
869 // check for "nfn" when termfreq != N
870 enquire.set_query(query);
871 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nfn"));
872 mset = enquire.get_mset(0, 10);
873 TEST_EQUAL(mset.size(), 2);
874 mset_expect_order(mset, 2, 4);
875 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
877 // check for "nsn" when termfreq != N
878 enquire.set_query(query);
879 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nsn"));
880 mset = enquire.get_mset(0, 10);
881 TEST_EQUAL(mset.size(), 2);
882 mset_expect_order(mset, 2, 4);
883 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
885 // Check for "bnn" and for both branches of 'b'.
886 enquire.set_query(Xapian::Query("test"));
887 enquire.set_weighting_scheme(Xapian::TfIdfWeight("bnn"));
888 mset = enquire.get_mset(0, 10);
889 TEST_EQUAL(mset.size(), 1);
890 mset_expect_order(mset, 1);
891 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
893 // Check for "lnn" and for both branches of 'l'.
894 enquire.set_query(Xapian::Query("word"));
895 enquire.set_weighting_scheme(Xapian::TfIdfWeight("lnn"));
896 mset = enquire.get_mset(0, 10);
897 TEST_EQUAL(mset.size(), 2);
898 mset_expect_order(mset, 2, 4);
899 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
900 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0); // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
902 // Check for "snn"
903 enquire.set_query(Xapian::Query("paragraph"));
904 enquire.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
905 mset = enquire.get_mset(0, 10);
906 TEST_EQUAL(mset.size(), 5);
907 mset_expect_order(mset, 2, 1, 4, 3, 5);
908 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
909 TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
911 // Check for "ntn" when termfreq=N
912 enquire.set_query(Xapian::Query("this")); // N=termfreq amd so idfn=0 for "t"
913 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
914 mset = enquire.get_mset(0, 10);
915 TEST_EQUAL(mset.size(), 6);
916 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
917 for (int i = 0; i < 6; ++i) {
918 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
921 // Check for "npn" and for both branches of 'p'
922 enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "p"
923 enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
924 mset = enquire.get_mset(0, 10);
925 TEST_EQUAL(mset.size(), 6);
926 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
927 for (int i = 0; i < 6; ++i) {
928 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
931 // Check for "Lnn".
932 enquire.set_query(Xapian::Query("word"));
933 enquire.set_weighting_scheme(Xapian::TfIdfWeight("Lnn"));
934 mset = enquire.get_mset(0, 10);
935 TEST_EQUAL(mset.size(), 2);
936 mset_expect_order(mset, 2, 4);
937 TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
938 TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
940 enquire.set_query(Xapian::Query("word"));
941 enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
942 mset = enquire.get_mset(0, 10);
943 TEST_EQUAL(mset.size(), 2);
944 mset_expect_order(mset, 2, 4);
945 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
946 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
948 return true;
951 // Feature tests for pivoted normalization functions.
952 DEFINE_TESTCASE(tfidfweight4, backend) {
953 Xapian::Database db = get_database("apitest_simpledata");
954 Xapian::Enquire enquire(db);
955 Xapian::Query query("paragraph");
956 Xapian::MSet mset;
958 // Check for "PPn" normalization string.
959 enquire.set_query(query);
960 enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
961 mset = enquire.get_mset(0, 10);
962 TEST_EQUAL(mset.size(), 5);
963 // Shorter docs should ranker higher if wqf is equal among all the docs.
964 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
965 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
967 // Check that wqf is taken into account.
968 enquire.set_query(Xapian::Query("paragraph", 2));
969 enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
970 Xapian::MSet mset2 = enquire.get_mset(0, 10);
971 TEST_EQUAL(mset2.size(), 5);
972 // wqf is 2, so weights should be doubled.
973 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
974 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
976 // check for "nPn" which represents "xPx"
977 enquire.set_query(Xapian::Query("word"));
978 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nPn", 0.2, 1.0));
979 mset = enquire.get_mset(0, 10);
980 TEST_EQUAL(mset.size(), 2);
981 // Expect doc 2 with query "word" to have higher weight than doc 4.
982 mset_expect_order(mset, 2, 4);
984 // check for "Ptn" which represents "Pxx"
985 enquire.set_query(Xapian::Query("word"));
986 enquire.set_weighting_scheme(Xapian::TfIdfWeight("Ptn", 0.2, 1.0));
987 mset = enquire.get_mset(0, 10);
988 TEST_EQUAL(mset.size(), 2);
989 // Expect doc 2 with query "word" to have higher weight than doc 4.
990 mset_expect_order(mset, 2, 4);
992 return true;
995 class CheckInitWeight : public Xapian::Weight {
996 public:
997 double factor;
999 unsigned & zero_inits, & non_zero_inits;
1001 CheckInitWeight(unsigned &z, unsigned &n)
1002 : factor(-1.0), zero_inits(z), non_zero_inits(n) { }
1004 void init(double factor_) {
1005 factor = factor_;
1006 if (factor == 0.0)
1007 ++zero_inits;
1008 else
1009 ++non_zero_inits;
1012 Weight * clone() const {
1013 return new CheckInitWeight(zero_inits, non_zero_inits);
1016 double get_sumpart(Xapian::termcount, Xapian::termcount,
1017 Xapian::termcount) const {
1018 return 1.0;
1021 double get_maxpart() const { return 1.0; }
1023 double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const {
1024 return 1.0 / doclen;
1027 double get_maxextra() const { return 1.0; }
1030 /// Regression test - check init() is called for the term-indep Weight obj.
1031 DEFINE_TESTCASE(checkinitweight1, backend && !multi && !remote) {
1032 Xapian::Database db = get_database("apitest_simpledata");
1033 Xapian::Enquire enquire(db);
1034 Xapian::Query q(Xapian::Query::OP_AND,
1035 Xapian::Query("this"), Xapian::Query("paragraph"));
1036 enquire.set_query(q);
1037 unsigned zero_inits = 0, non_zero_inits = 0;
1038 CheckInitWeight wt(zero_inits, non_zero_inits);
1039 enquire.set_weighting_scheme(wt);
1040 Xapian::MSet mset = enquire.get_mset(0, 3);
1041 TEST_EQUAL(zero_inits, 1);
1042 TEST_EQUAL(non_zero_inits, 2);
1043 return true;
1046 class CheckStatsWeight : public Xapian::Weight {
1047 public:
1048 double factor;
1050 Xapian::Database db;
1052 string term1;
1054 // When testing OP_SYNONYM, term2 is also set.
1055 // When testing OP_WILDCARD, term2 == "*".
1056 // When testing a repeated term, term2 == "=" for the first occurrence and
1057 // "_" for subsequent occurrences.
1058 mutable string term2;
1060 Xapian::termcount & sum;
1061 Xapian::termcount & sum_squares;
1063 mutable Xapian::termcount len_upper;
1064 mutable Xapian::termcount len_lower;
1065 mutable Xapian::termcount wdf_upper;
1067 CheckStatsWeight(const Xapian::Database & db_,
1068 const string & term1_,
1069 const string & term2_,
1070 Xapian::termcount & sum_,
1071 Xapian::termcount & sum_squares_)
1072 : factor(-1.0), db(db_), term1(term1_), term2(term2_),
1073 sum(sum_), sum_squares(sum_squares_),
1074 len_upper(0), len_lower(Xapian::termcount(-1)), wdf_upper(0)
1076 need_stat(COLLECTION_SIZE);
1077 need_stat(RSET_SIZE);
1078 need_stat(AVERAGE_LENGTH);
1079 need_stat(TERMFREQ);
1080 need_stat(RELTERMFREQ);
1081 need_stat(QUERY_LENGTH);
1082 need_stat(WQF);
1083 need_stat(WDF);
1084 need_stat(DOC_LENGTH);
1085 need_stat(DOC_LENGTH_MIN);
1086 need_stat(DOC_LENGTH_MAX);
1087 need_stat(WDF_MAX);
1088 need_stat(COLLECTION_FREQ);
1089 need_stat(UNIQUE_TERMS);
1092 CheckStatsWeight(const Xapian::Database & db_,
1093 const string & term_,
1094 Xapian::termcount & sum_,
1095 Xapian::termcount & sum_squares_)
1096 : CheckStatsWeight(db_, term_, string(), sum_, sum_squares_) { }
1098 void init(double factor_) {
1099 factor = factor_;
1102 Weight * clone() const {
1103 auto res = new CheckStatsWeight(db, term1, term2, sum, sum_squares);
1104 if (term2 == "=") {
1105 // The object passed to Enquire::set_weighting_scheme() is cloned
1106 // right away, and then cloned again for each term, and then
1107 // potentially once more for the term-independent weight
1108 // contribution. In the repeated case, we want to handle the first
1109 // actual term specially, so we arrange for that to have "=" for
1110 // term2, and subsequent clones to have "_", so that we accumulate
1111 // sum and sum_squares on the first occurrence only.
1112 term2 = "_";
1114 return res;
1117 double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen,
1118 Xapian::termcount uniqueterms) const {
1119 Xapian::doccount num_docs = db.get_doccount();
1120 TEST_EQUAL(get_collection_size(), num_docs);
1121 TEST_EQUAL(get_rset_size(), 0);
1122 TEST_EQUAL(get_average_length(), db.get_avlength());
1123 if (term2.empty() || term2 == "=" || term2 == "_") {
1124 TEST_EQUAL(get_termfreq(), db.get_termfreq(term1));
1125 TEST_EQUAL(get_collection_freq(), db.get_collection_freq(term1));
1126 if (term2.empty()) {
1127 TEST_EQUAL(get_query_length(), 1);
1128 } else {
1129 TEST_EQUAL(get_query_length(), 2);
1131 } else {
1132 Xapian::doccount tfmax = 0, tfsum = 0;
1133 Xapian::termcount cfmax = 0, cfsum = 0;
1134 if (term2 == "*") {
1135 // OP_WILDCARD case.
1136 for (auto&& t = db.allterms_begin(term1);
1137 t != db.allterms_end(term1); ++t) {
1138 Xapian::doccount tf = t.get_termfreq();
1139 tout << "->" << *t << " " << tf << endl;
1140 tfsum += tf;
1141 tfmax = max(tfmax, tf);
1142 Xapian::termcount cf = db.get_collection_freq(*t);
1143 cfsum += cf;
1144 cfmax = max(cfmax, cf);
1146 TEST_EQUAL(get_query_length(), 1);
1147 } else {
1148 // OP_SYNONYM case.
1149 Xapian::doccount tf1 = db.get_termfreq(term1);
1150 Xapian::doccount tf2 = db.get_termfreq(term2);
1151 tfsum = tf1 + tf2;
1152 tfmax = max(tf1, tf2);
1153 Xapian::termcount cf1 = db.get_collection_freq(term1);
1154 Xapian::termcount cf2 = db.get_collection_freq(term2);
1155 cfsum = cf1 + cf2;
1156 cfmax = max(cf1, cf2);
1157 TEST_EQUAL(get_query_length(), 2);
1159 // Synonym occurs at least as many times as any term.
1160 TEST_REL(get_termfreq(), >=, tfmax);
1161 TEST_REL(get_collection_freq(), >=, cfmax);
1162 // Synonym can't occur more times than the terms do.
1163 TEST_REL(get_termfreq(), <=, tfsum);
1164 TEST_REL(get_collection_freq(), <=, cfsum);
1165 // Synonym can't occur more times than there are documents/terms.
1166 TEST_REL(get_termfreq(), <=, num_docs);
1167 double total_term_occurences = get_average_length() * num_docs;
1168 TEST_REL(get_collection_freq(), <=, total_term_occurences);
1170 TEST_EQUAL(get_reltermfreq(), 0);
1171 TEST_EQUAL(get_wqf(), 1);
1172 TEST_REL(doclen,>=,len_lower);
1173 TEST_REL(doclen,<=,len_upper);
1174 TEST_REL(uniqueterms,>=,1);
1175 TEST_REL(uniqueterms,<=,doclen);
1176 TEST_REL(wdf,<=,wdf_upper);
1177 if (term2 != "_") {
1178 sum += wdf;
1179 sum_squares += wdf * wdf;
1181 return 1.0;
1184 double get_maxpart() const {
1185 if (len_upper == 0) {
1186 len_lower = get_doclength_lower_bound();
1187 len_upper = get_doclength_upper_bound();
1188 wdf_upper = get_wdf_upper_bound();
1190 return 1.0;
1193 double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const {
1194 return 1.0 / doclen;
1197 double get_maxextra() const { return 1.0; }
1200 /// Check the weight subclass gets the correct stats.
1201 DEFINE_TESTCASE(checkstatsweight1, backend && !remote) {
1202 Xapian::Database db = get_database("apitest_simpledata");
1203 Xapian::Enquire enquire(db);
1204 Xapian::TermIterator a;
1205 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1206 const string & term = *a;
1207 enquire.set_query(Xapian::Query(term));
1208 Xapian::termcount sum = 0;
1209 Xapian::termcount sum_squares = 0;
1210 CheckStatsWeight wt(db, term, sum, sum_squares);
1211 enquire.set_weighting_scheme(wt);
1212 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1214 // The document order in the multi-db case isn't the same as the
1215 // postlist order on the combined DB, so it's hard to compare the
1216 // wdf for each document in the Weight objects, so we can sum
1217 // the wdfs and the squares of the wdfs which provides a decent
1218 // check that we're not getting the wrong wdf values (it ensures
1219 // they have the right mean and standard deviation).
1220 Xapian::termcount expected_sum = 0;
1221 Xapian::termcount expected_sum_squares = 0;
1222 Xapian::PostingIterator i;
1223 for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1224 Xapian::termcount wdf = i.get_wdf();
1225 expected_sum += wdf;
1226 expected_sum_squares += wdf * wdf;
1228 TEST_EQUAL(sum, expected_sum);
1229 TEST_EQUAL(sum_squares, expected_sum_squares);
1231 return true;
1234 /// Check the weight subclass gets the correct stats with OP_SYNONYM.
1235 // Regression test for bugs fixed in 1.4.1.
1236 DEFINE_TESTCASE(checkstatsweight2, backend && !remote) {
1237 Xapian::Database db = get_database("apitest_simpledata");
1238 Xapian::Enquire enquire(db);
1239 Xapian::TermIterator a;
1240 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1241 const string & term1 = *a;
1242 if (++a == db.allterms_end()) break;
1243 const string & term2 = *a;
1244 Xapian::Query q(Xapian::Query::OP_SYNONYM,
1245 Xapian::Query(term1), Xapian::Query(term2));
1246 tout << q.get_description() << endl;
1247 enquire.set_query(q);
1248 Xapian::termcount sum = 0;
1249 Xapian::termcount sum_squares = 0;
1250 CheckStatsWeight wt(db, term1, term2, sum, sum_squares);
1251 enquire.set_weighting_scheme(wt);
1252 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1254 // The document order in the multi-db case isn't the same as the
1255 // postlist order on the combined DB, so it's hard to compare the
1256 // wdf for each document in the Weight objects, so we can sum
1257 // the wdfs and the squares of the wdfs which provides a decent
1258 // check that we're not getting the wrong wdf values (it ensures
1259 // they have the right mean and standard deviation).
1260 Xapian::termcount expected_sum = 0;
1261 Xapian::termcount expected_sum_squares = 0;
1262 Xapian::PostingIterator i = db.postlist_begin(term1);
1263 Xapian::PostingIterator j = db.postlist_begin(term2);
1264 Xapian::docid did1 = *i, did2 = *j;
1265 while (true) {
1266 // To calculate expected_sum_squares correctly we need to square
1267 // the sum per document.
1268 Xapian::termcount wdf;
1269 if (did1 == did2) {
1270 wdf = i.get_wdf() + j.get_wdf();
1271 did1 = did2 = 0;
1272 } else if (did1 < did2) {
1273 wdf = i.get_wdf();
1274 did1 = 0;
1275 } else {
1276 wdf = j.get_wdf();
1277 did2 = 0;
1279 expected_sum += wdf;
1280 expected_sum_squares += wdf * wdf;
1282 if (did1 == 0) {
1283 if (++i != db.postlist_end(term1)) {
1284 did1 = *i;
1285 } else {
1286 if (did2 == Xapian::docid(-1)) break;
1287 did1 = Xapian::docid(-1);
1290 if (did2 == 0) {
1291 if (++j != db.postlist_end(term2)) {
1292 did2 = *j;
1293 } else {
1294 if (did1 == Xapian::docid(-1)) break;
1295 did2 = Xapian::docid(-1);
1299 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1300 // the individual terms.
1301 TEST_EQUAL(sum, expected_sum);
1302 TEST_REL(sum_squares, >=, expected_sum_squares);
1304 return true;
1307 /// Check the weight subclass gets the correct stats with OP_WILDCARD.
1308 // Regression test for bug fixed in 1.4.1.
1309 // Don't run with multi-database, as the termfreq checks don't work
1310 // there - FIXME: Investigate this - it smells like a bug.
1311 DEFINE_TESTCASE(checkstatsweight3, backend && !remote && !multi) {
1312 struct PlCmp {
1313 bool operator()(const Xapian::PostingIterator& a,
1314 const Xapian::PostingIterator& b) {
1315 return *a < *b;
1319 Xapian::Database db = get_database("apitest_simpledata");
1320 Xapian::Enquire enquire(db);
1321 Xapian::TermIterator a;
1322 static const char * const testcases[] = {
1323 "a", // a* matches all documents, but no term matches all.
1324 "pa", // Expands to only "paragraph", matching 5.
1325 "zulu", // No matches.
1326 "th", // Term "this" matches all documents.
1328 for (auto pattern : testcases) {
1329 Xapian::Query q(Xapian::Query::OP_WILDCARD, pattern);
1330 tout << q.get_description() << endl;
1331 enquire.set_query(q);
1332 Xapian::termcount sum = 0;
1333 Xapian::termcount sum_squares = 0;
1334 CheckStatsWeight wt(db, pattern, "*", sum, sum_squares);
1335 enquire.set_weighting_scheme(wt);
1336 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1338 // The document order in the multi-db case isn't the same as the
1339 // postlist order on the combined DB, so it's hard to compare the
1340 // wdf for each document in the Weight objects, so we can sum
1341 // the wdfs and the squares of the wdfs which provides a decent
1342 // check that we're not getting the wrong wdf values (it ensures
1343 // they have the right mean and standard deviation).
1344 Xapian::termcount expected_sum = 0;
1345 Xapian::termcount expected_sum_squares = 0;
1346 vector<Xapian::PostingIterator> postlists;
1347 for (auto&& t = db.allterms_begin(pattern);
1348 t != db.allterms_end(pattern); ++t) {
1349 postlists.emplace_back(db.postlist_begin(*t));
1351 make_heap(postlists.begin(), postlists.end(), PlCmp());
1352 Xapian::docid did = 0;
1353 Xapian::termcount wdf = 0;
1354 while (!postlists.empty()) {
1355 pop_heap(postlists.begin(), postlists.end(), PlCmp());
1356 Xapian::docid did_new = *postlists.back();
1357 Xapian::termcount wdf_new = postlists.back().get_wdf();
1358 if (++(postlists.back()) == Xapian::PostingIterator()) {
1359 postlists.pop_back();
1360 } else {
1361 push_heap(postlists.begin(), postlists.end(), PlCmp());
1363 if (did_new != did) {
1364 expected_sum += wdf;
1365 expected_sum_squares += wdf * wdf;
1366 wdf = 0;
1367 did = did_new;
1369 wdf += wdf_new;
1371 expected_sum += wdf;
1372 expected_sum_squares += wdf * wdf;
1373 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1374 // the individual terms.
1375 TEST_EQUAL(sum, expected_sum);
1376 TEST_REL(sum_squares, >=, expected_sum_squares);
1378 return true;
1381 /// Check the stats for a repeated term are correct.
1382 // Regression test for bug fixed in 1.4.6. Doesn't work with
1383 // multi as the weight object is cloned more times.
1384 DEFINE_TESTCASE(checkstatsweight4, backend && !remote && !multi) {
1385 Xapian::Database db = get_database("apitest_simpledata");
1386 Xapian::Enquire enquire(db);
1387 Xapian::TermIterator a;
1388 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1389 const string & term = *a;
1390 enquire.set_query(Xapian::Query(term, 1, 1) |
1391 Xapian::Query(term, 1, 2));
1392 Xapian::termcount sum = 0;
1393 Xapian::termcount sum_squares = 0;
1394 CheckStatsWeight wt(db, term, "=", sum, sum_squares);
1395 enquire.set_weighting_scheme(wt);
1396 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1398 // The document order in the multi-db case isn't the same as the
1399 // postlist order on the combined DB, so it's hard to compare the
1400 // wdf for each document in the Weight objects, so we can sum
1401 // the wdfs and the squares of the wdfs which provides a decent
1402 // check that we're not getting the wrong wdf values (it ensures
1403 // they have the right mean and standard deviation).
1404 Xapian::termcount expected_sum = 0;
1405 Xapian::termcount expected_sum_squares = 0;
1406 Xapian::PostingIterator i;
1407 for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1408 Xapian::termcount wdf = i.get_wdf();
1409 expected_sum += wdf;
1410 expected_sum_squares += wdf * wdf;
1412 TEST_EQUAL(sum, expected_sum);
1413 TEST_EQUAL(sum_squares, expected_sum_squares);
1415 return true;
1418 // Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both.
1419 DEFINE_TESTCASE(unigramlmweight4, backend) {
1420 Xapian::Database db = get_database("apitest_simpledata");
1421 Xapian::Enquire enquire1(db);
1422 Xapian::Enquire enquire2(db);
1423 enquire1.set_query(Xapian::Query("paragraph"));
1424 Xapian::MSet mset1;
1425 enquire2.set_query(Xapian::Query("paragraph"));
1426 Xapian::MSet mset2;
1427 // 5 documents available with term paragraph so mset size should be 5
1428 enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING, 1, 0));
1429 enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 1, 0));
1430 mset1 = enquire1.get_mset(0, 10);
1431 mset2 = enquire2.get_mset(0, 10);
1433 TEST_EQUAL(mset1.size(), 5);
1434 TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight());
1435 return true;
1438 /* Test for checking if we don't use smoothing all
1439 * of them should give same result i.e wdf_double/len_double */
1440 DEFINE_TESTCASE(unigramlmweight5, backend) {
1441 Xapian::Database db = get_database("apitest_simpledata");
1442 Xapian::Enquire enquire1(db);
1443 Xapian::Enquire enquire2(db);
1444 Xapian::Enquire enquire3(db);
1445 Xapian::Enquire enquire4(db);
1446 enquire1.set_query(Xapian::Query("paragraph"));
1447 Xapian::MSet mset1;
1448 enquire2.set_query(Xapian::Query("paragraph"));
1449 Xapian::MSet mset2;
1450 enquire3.set_query(Xapian::Query("paragraph"));
1451 Xapian::MSet mset3;
1452 enquire4.set_query(Xapian::Query("paragraph"));
1453 Xapian::MSet mset4;
1454 // 5 documents available with term paragraph so mset size should be 5
1455 enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0, 0));
1456 enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0, 0));
1457 enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, 0, 0));
1458 enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING, 0, 0));
1460 mset1 = enquire1.get_mset(0, 10);
1461 mset2 = enquire2.get_mset(0, 10);
1462 mset3 = enquire3.get_mset(0, 10);
1463 mset4 = enquire4.get_mset(0, 10);
1465 TEST_EQUAL(mset1.size(), 5);
1466 TEST_EQUAL(mset2.size(), 5);
1467 TEST_EQUAL(mset3.size(), 5);
1468 TEST_EQUAL(mset4.size(), 5);
1469 for (size_t i = 0; i < 5; ++i) {
1470 TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight());
1471 TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight());
1472 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight());
1473 TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight());
1474 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight());
1475 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight());
1477 return true;
1480 // Test Exception for junk after serialised weight (with Dir+ enabled).
1481 DEFINE_TESTCASE(unigramlmweight6, !backend) {
1482 Xapian::LMWeight wt(0, Xapian::Weight::DIRICHLET_SMOOTHING, 0.5, 1.0);
1483 try {
1484 Xapian::LMWeight d;
1485 Xapian::LMWeight * d2 = d.unserialise(wt.serialise() + "X");
1486 // Make sure we actually use the weight.
1487 bool empty = d2->name().empty();
1488 delete d2;
1489 if (empty)
1490 FAIL_TEST("Serialised LMWeight with junk appended unserialised to empty name!");
1491 FAIL_TEST("Serialised LMWeight with junk appended unserialised OK");
1492 } catch (const Xapian::SerialisationError &e) {
1493 TEST(e.get_msg().find("LM") != string::npos);
1495 return true;
1498 // Feature test for Dir+ function.
1499 DEFINE_TESTCASE(unigramlmweight7, backend) {
1500 Xapian::Database db = get_database("apitest_simpledata");
1501 Xapian::Enquire enquire1(db);
1502 Xapian::Enquire enquire2(db);
1503 enquire1.set_query(Xapian::Query("paragraph"));
1504 enquire2.set_query(Xapian::Query("paragraph"));
1505 Xapian::MSet mset1;
1506 Xapian::MSet mset2;
1508 enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1509 enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING, 2000, 0.05));
1511 mset1 = enquire1.get_mset(0, 10);
1512 mset2 = enquire2.get_mset(0, 10);
1514 // mset size should be 5
1515 TEST_EQUAL(mset1.size(), 5);
1516 TEST_EQUAL(mset2.size(), 5);
1518 // Expect mset weights associated with Dir+ more than mset weights by Dir
1519 // because of the presence of extra weight component in Dir+ function.
1520 TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight());
1521 TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight());
1522 TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight());
1523 TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight());
1524 TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight());
1526 return true;
1529 // Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1).
1530 DEFINE_TESTCASE(unigramlmweight8, backend) {
1531 Xapian::Database db = get_database("apitest_simpledata");
1532 Xapian::Enquire enquire(db);
1533 Xapian::Query query("paragraph");
1535 enquire.set_query(query);
1536 enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1538 Xapian::MSet mset1;
1539 mset1 = enquire.get_mset(0, 10);
1540 TEST_EQUAL(mset1.size(), 5);
1542 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
1543 enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1545 Xapian::MSet mset2;
1546 mset2 = enquire.get_mset(0, 10);
1547 TEST_EQUAL(mset2.size(), mset1.size());
1548 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
1549 for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
1550 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
1553 return true;
1556 // Feature test for BoolWeight.
1557 // Test exception for junk after serialised weight.
1558 DEFINE_TESTCASE(boolweight1, !backend) {
1559 Xapian::BoolWeight wt;
1560 try {
1561 Xapian::BoolWeight t;
1562 Xapian::BoolWeight * t2 = t.unserialise(wt.serialise() + "X");
1563 // Make sure we actually use the weight.
1564 bool empty = t2->name().empty();
1565 delete t2;
1566 if (empty)
1567 FAIL_TEST("Serialised BoolWeight with junk appended unserialised to empty name!");
1568 FAIL_TEST("Serialised BoolWeight with junk appended unserialised OK");
1569 } catch (const Xapian::SerialisationError &e) {
1570 TEST(e.get_msg().find("Bool") != string::npos);
1572 return true;
1575 // Feature test for CoordWeight.
1576 DEFINE_TESTCASE(coordweight1, backend) {
1577 Xapian::Enquire enquire(get_database("apitest_simpledata"));
1578 enquire.set_weighting_scheme(Xapian::CoordWeight());
1579 static const char * const terms[] = {
1580 "this", "line", "paragraph", "rubbish"
1582 Xapian::Query query(Xapian::Query::OP_OR,
1583 terms, terms + sizeof(terms) / sizeof(terms[0]));
1584 enquire.set_query(query);
1585 Xapian::MSet mymset1 = enquire.get_mset(0, 100);
1586 // CoordWeight scores 1 for each matching term, so the weight should equal
1587 // the number of matching terms.
1588 for (Xapian::MSetIterator i = mymset1.begin(); i != mymset1.end(); ++i) {
1589 Xapian::termcount matching_terms = 0;
1590 Xapian::TermIterator t = enquire.get_matching_terms_begin(i);
1591 while (t != enquire.get_matching_terms_end(i)) {
1592 ++matching_terms;
1593 ++t;
1595 TEST_EQUAL(i.get_weight(), matching_terms);
1598 // Test with OP_SCALE_WEIGHT.
1599 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
1600 Xapian::MSet mymset2 = enquire.get_mset(0, 100);
1601 TEST_EQUAL(mymset1.size(), mymset2.size());
1602 for (Xapian::doccount i = 0; i != mymset1.size(); ++i) {
1603 TEST_EQUAL(15.0 * mymset1[i].get_weight(), mymset2[i].get_weight());
1606 return true;
1609 // Test exception for junk after serialised weight.
1610 DEFINE_TESTCASE(coordweight2, !backend) {
1611 Xapian::CoordWeight wt;
1612 try {
1613 Xapian::CoordWeight t;
1614 Xapian::CoordWeight * t2 = t.unserialise(wt.serialise() + "X");
1615 // Make sure we actually use the weight.
1616 bool empty = t2->name().empty();
1617 delete t2;
1618 if (empty)
1619 FAIL_TEST("Serialised CoordWeight with junk appended unserialised to empty name!");
1620 FAIL_TEST("Serialised CoordWeight with junk appended unserialised OK");
1621 } catch (const Xapian::SerialisationError &e) {
1622 TEST(e.get_msg().find("Coord") != string::npos);
1624 return true;