xapian-core/tests/api_percentages.cc

   1 /** @file api_percentages.cc
   2  * @brief Tests of percentage calculations.
   3  */
   4 /* Copyright (C) 2008,2009 Lemur Consulting Ltd
   5  * Copyright (C) 2008,2009,2010,2011,2012,2014 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "api_percentages.h"
  25
  26 #include <xapian.h>
  27
  28 #include "apitest.h"
  29 #include "backendmanager_local.h"
  30 #include "str.h"
  31 #include "testutils.h"
  32
  33 #include <cfloat>
  34
  35 using namespace std;
  36
  37 // Test that percentages reported are the same regardless of which part of the
  38 // mset is returned, for sort-by-value search.  Regression test for bug#216 in
  39 // 1.0.10 and earlier with returned percentages.
  40 DEFINE_TESTCASE(consistency3, backend) {
  41     Xapian::Database db(get_database("apitest_sortconsist"));
  42     Xapian::Enquire enquire(db);
  43     enquire.set_query(Xapian::Query("foo"));
  44     enquire.set_sort_by_value(1, 0);
  45     Xapian::doccount lots = 3;
  46     Xapian::MSet bigmset = enquire.get_mset(0, lots);
  47     TEST_EQUAL(bigmset.size(), lots);
  48     for (Xapian::doccount start = 0; start < lots; ++start) {
  49         tout << *bigmset[start] << ":" << bigmset[start].get_weight() << ":"
  50              << bigmset[start].get_percent() << "%" << endl;
  51         for (Xapian::doccount size = 0; size < lots - start; ++size) {
  52             Xapian::MSet mset = enquire.get_mset(start, size);
  53             if (mset.size()) {
  54                 TEST_EQUAL(start + mset.size(),
  55                            min(start + size, bigmset.size()));
  56             } else if (size) {
  57                 TEST(start >= bigmset.size());
  58             }
  59             for (Xapian::doccount i = 0; i < mset.size(); ++i) {
  60                 TEST_EQUAL(*mset[i], *bigmset[start + i]);
  61                 TEST_EQUAL_DOUBLE(mset[i].get_weight(),
  62                                   bigmset[start + i].get_weight());
  63                 TEST_EQUAL_DOUBLE(mset[i].get_percent(),
  64                                   bigmset[start + i].get_percent());
  65             }
  66         }
  67     }
  68     return true;
  69 }
  70
  71 class MyPostingSource : public Xapian::PostingSource {
  72     vector<pair<Xapian::docid, double> > weights;
  73     vector<pair<Xapian::docid, double> >::const_iterator i;
  74     bool started;
  75
  76     MyPostingSource(const vector<pair<Xapian::docid, double> > &weights_,
  77                     double max_wt)
  78         : weights(weights_), started(false)
  79     {
  80         set_maxweight(max_wt);
  81     }
  82
  83   public:
  84     MyPostingSource() : started(false) { }
  85
  86     PostingSource * clone() const
  87     {
  88         return new MyPostingSource(weights, get_maxweight());
  89     }
  90
  91     void append_docweight(Xapian::docid did, double wt) {
  92         weights.push_back(make_pair(did, wt));
  93         if (wt > get_maxweight()) set_maxweight(wt);
  94     }
  95
  96     void init(const Xapian::Database &) { started = false; }
  97
  98     double get_weight() const { return i->second; }
  99
 100     Xapian::doccount get_termfreq_min() const { return weights.size(); }
 101     Xapian::doccount get_termfreq_est() const { return weights.size(); }
 102     Xapian::doccount get_termfreq_max() const { return weights.size(); }
 103
 104     void next(double /*wt*/) {
 105         if (!started) {
 106             i = weights.begin();
 107             started = true;
 108         } else {
 109             ++i;
 110         }
 111     }
 112
 113     bool at_end() const {
 114         return (i == weights.end());
 115     }
 116
 117     Xapian::docid get_docid() const { return i->first; }
 118
 119     string get_description() const {
 120         return "MyPostingSource";
 121     }
 122 };
 123
 124
 125 /// Test for rounding errors in percentage weight calculations and cutoffs.
 126 DEFINE_TESTCASE(pctcutoff4, backend && !remote && !multi) {
 127     // Find the number of DBL_EPSILONs to subtract which result in the
 128     // percentage of the second hit being 49% instead of 50%.
 129     int epsilons = 0;
 130     Xapian::Database db(get_database("apitest_simpledata"));
 131     Xapian::Enquire enquire(db);
 132     while (true) {
 133         MyPostingSource source;
 134         source.append_docweight(1, 100);
 135         source.append_docweight(2, 50 - epsilons * DBL_EPSILON);
 136         enquire.set_query(Xapian::Query(&source));
 137         Xapian::MSet mset = enquire.get_mset(0, 10);
 138         TEST_EQUAL(mset.size(), 2);
 139         if (mset[1].get_percent() != 50) break;
 140         ++epsilons;
 141     }
 142
 143     // Make a set of document weights including ones on either side of the
 144     // 49% / 50% boundary.
 145     MyPostingSource source;
 146     source.append_docweight(1, 100);
 147     source.append_docweight(2, 50);
 148     source.append_docweight(3, 50 - (epsilons - 1) * DBL_EPSILON);
 149     source.append_docweight(4, 50 - epsilons * DBL_EPSILON);
 150     source.append_docweight(5, 25);
 151
 152     enquire.set_query(Xapian::Query(&source));
 153     Xapian::MSet mset1 = enquire.get_mset(0, 10);
 154     TEST_EQUAL(mset1.size(), 5);
 155     TEST_EQUAL(mset1[2].get_percent(), 50);
 156     TEST_EQUAL(mset1[3].get_percent(), 49);
 157
 158     // Use various different percentage cutoffs, and check that the values
 159     // returned are as expected.
 160     int percent = 100;
 161     for (Xapian::MSetIterator i = mset1.begin(); i != mset1.end(); ++i) {
 162         int new_percent = mset1.convert_to_percent(i);
 163         tout << "mset1 item = " << i.get_percent() << "%\n";
 164         if (new_percent != percent) {
 165             enquire.set_cutoff(percent);
 166             Xapian::MSet mset2 = enquire.get_mset(0, 10);
 167             tout << "cutoff = " << percent << "%, "
 168                     "mset size = " << mset2.size() << "\n";
 169             TEST_EQUAL(mset2.size(), i.get_rank());
 170             percent = new_percent;
 171         }
 172     }
 173
 174     return true;
 175 }
 176
 177 /// Check we throw for a percentage cutoff while sorting primarily by value.
 178 DEFINE_TESTCASE(pctcutoff5, backend) {
 179     Xapian::Database db(get_database("apitest_simpledata"));
 180     Xapian::Enquire enquire(db);
 181     enquire.set_query(Xapian::Query("test"));
 182     enquire.set_cutoff(42);
 183     Xapian::MSet mset;
 184
 185     enquire.set_sort_by_value(0, false);
 186     TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
 187
 188     enquire.set_sort_by_value(0, true);
 189     TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
 190
 191     enquire.set_sort_by_value_then_relevance(0, false);
 192     TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
 193
 194     enquire.set_sort_by_value_then_relevance(0, true);
 195     TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
 196
 197     return true;
 198 }
 199
 200 // Regression test for bug fixed in 1.0.14.
 201 DEFINE_TESTCASE(topercent3, remote) {
 202     BackendManagerLocal local_manager;
 203     local_manager.set_datadir(test_driver::get_srcdir() + "/testdata/");
 204     Xapian::Database db;
 205     db.add_database(get_database("apitest_simpledata"));
 206     db.add_database(local_manager.get_database("apitest_simpledata"));
 207
 208     Xapian::Enquire enquire(db);
 209     enquire.set_sort_by_value(1, false);
 210
 211     const char * terms[] = { "paragraph", "banana" };
 212     enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 2));
 213
 214     Xapian::MSet mset = enquire.get_mset(0, 20);
 215
 216     Xapian::MSetIterator i;
 217     for (i = mset.begin(); i != mset.end(); ++i) {
 218         // We should never achieve 100%.
 219         TEST_REL(i.get_percent(),<,100);
 220     }
 221
 222     return true;
 223 }
 224
 225 // Regression test for bug introduced temporarily by the "percent without
 226 // termlist" patch.
 227 DEFINE_TESTCASE(topercent4, backend) {
 228     Xapian::Enquire enquire(get_database("apitest_simpledata"));
 229
 230     Xapian::Query query(Xapian::Query::OP_FILTER,
 231                         Xapian::Query("paragraph"),
 232                         Xapian::Query("queri"));
 233     query = Xapian::Query(Xapian::Query::OP_XOR,
 234                           query, Xapian::Query("rubbish"));
 235
 236     enquire.set_query(query);
 237     Xapian::MSet mset = enquire.get_mset(0, 10);
 238
 239     // We should get 50% not 33%.
 240     TEST(!mset.empty());
 241     TEST_EQUAL(mset[0].get_percent(), 50);
 242
 243     return true;
 244 }
 245
 246 /// Test that a search with a non-existent term doesn't get 100%.
 247 DEFINE_TESTCASE(topercent5, backend) {
 248     Xapian::Enquire enquire(get_database("apitest_simpledata"));
 249     Xapian::Query q(Xapian::Query::OP_OR,
 250                     Xapian::Query("paragraph"), Xapian::Query("xyzzy"));
 251     enquire.set_query(q);
 252     Xapian::MSet mset = enquire.get_mset(0, 10);
 253     TEST(!mset.empty());
 254     TEST(mset[0].get_percent() < 100);
 255     // It would be odd if the non-existent term was worth more, but in 1.0.x
 256     // the top hit got 4% in this testcase.  In 1.2.x it gets 50%, which is
 257     // better, but >50% would be more natural.
 258     TEST(mset[0].get_percent() >= 50);
 259     return true;
 260 }
 261
 262 /// Test that OP_FILTER doesn't affect percentages.
 263 //  Regression test for bug#590 fixed in 1.3.1 and 1.2.10.
 264 DEFINE_TESTCASE(topercent6, backend) {
 265     Xapian::Enquire enquire(get_database("apitest_simpledata"));
 266     Xapian::Query q(Xapian::Query::OP_OR,
 267                     Xapian::Query("rubbish"), Xapian::Query("letter"));
 268     enquire.set_query(q);
 269     Xapian::MSet mset = enquire.get_mset(0, 10);
 270     TEST(!mset.empty());
 271     TEST(mset[0].get_percent() < 100);
 272
 273     q = Xapian::Query(q.OP_FILTER, q, Xapian::Query("this"));
 274     enquire.set_query(q);
 275     Xapian::MSet mset2 = enquire.get_mset(0, 10);
 276     TEST(!mset2.empty());
 277     TEST_EQUAL(mset[0].get_percent(), mset2[0].get_percent());
 278     return true;
 279 }
 280
 281 static void
 282 make_topercent7_db(Xapian::WritableDatabase &db, const string &)
 283 {
 284     for (int i = 1; i <= 6; ++i) {
 285         Xapian::Document d;
 286         d.set_data(str(i));
 287         d.add_term("boom", 2 + (i - 4)*(i - 2));
 288         if (i != 5)
 289             d.add_boolean_term("XCAT122");
 290         db.add_document(d);
 291     }
 292     db.commit();
 293 }
 294
 295 /// Test that a term with wdf always = 0 gets counted.
 296 //  Regression test for bug introduced in 1.2.10 by the original fix for #590,
 297 //  and fixed in 1.2.13 (and in trunk before 1.3.1 was released).
 298 DEFINE_TESTCASE(topercent7, generated) {
 299     Xapian::Database db(get_database("topercent7", make_topercent7_db));
 300
 301     Xapian::Query q;
 302     q = Xapian::Query(q.OP_OR, Xapian::Query("tomb"), Xapian::Query("boom"));
 303     q = Xapian::Query(q.OP_AND, q, Xapian::Query("XCAT122"));
 304
 305     Xapian::Enquire enq(db);
 306     enq.set_query(q);
 307     Xapian::MSet m = enq.get_mset(0, 10);
 308     TEST(!m.empty());
 309     TEST_REL(m[0].get_percent(),>,60);
 310     return true;
 311 }
 312
 313 class ZWeight : public Xapian::Weight {
 314   public:
 315     ZWeight() { }
 316
 317     void init(double) { }
 318
 319     Weight * clone() const {
 320         return new ZWeight();
 321     }
 322
 323     double get_sumpart(Xapian::termcount,
 324                        Xapian::termcount,
 325                        Xapian::termcount) const {
 326         return 0.0;
 327     }
 328
 329     double get_maxpart() const {
 330         return 0.0;
 331     }
 332
 333     double get_sumextra(Xapian::termcount doclen,
 334                         Xapian::termcount) const {
 335         return 1.0 / doclen;
 336     }
 337
 338     double get_maxextra() const {
 339         return 1.0;
 340     }
 341 };
 342
 343 /// Regression test for bug introduced in 1.3.1 and fixed in 1.3.2.
 344 DEFINE_TESTCASE(checkzeromaxpartopt1, backend && !remote) {
 345     Xapian::Database db = get_database("apitest_simpledata");
 346     Xapian::Enquire enquire(db);
 347     // "this" indexes all documents, so will get replaced with MatchAll
 348     // internally.
 349     const char * terms[] = { "this", "spoken", "blank" };
 350     enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 3));
 351     ZWeight wt;
 352     enquire.set_weighting_scheme(wt);
 353     Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
 354     // No documents match all 3 terms, so the score shouldn't be 100%.
 355     TEST(mset[0].get_percent() != 100);
 356     // Make sure the percentage score isn't 0 or 1 though.
 357     TEST_REL(mset[0].get_percent(), >, 1);
 358     return true;
 359 }