Support: quest -f cjk_ngram
[xapian.git] / xapian-core / tests / api_opsynonym.cc
blob8e6c6f4677326b4b17ccb9cc2365a1f1436c5156
/** @file api_opsynonym.cc
 * @brief tests of OP_SYNONYM and OP_MAX.
 */
/* Copyright 2009,2011,2014 Olly Betts
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#include <config.h>

#include "api_opsynonym.h"

#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <vector>

#include <xapian.h>

#include "backendmanager.h"
#include "testsuite.h"
#include "testutils.h"

#include "apitest.h"

using namespace std;

// #######################################################################
// # Tests start here

44 struct synonym1_data_type {
45 // How many results should have the same weight when combined with
46 // OP_SYNONYM instead of OP_OR.
47 int sameweight_count;
48 // How many results should have a different weight when combined with
49 // OP_SYNONYM instead of OP_OR.
50 int diffweight_count;
51 // How many subqueries.
52 unsigned n_subqs;
53 // The subqueries (use NOQ for unused ones).
54 Xapian::Query subqs[4];
57 #define NOQ Xapian::Query::MatchNothing
58 static synonym1_data_type synonym1_data[] = {
60 // Single term - all 33 results should be same weight.
61 33, 0, 1,
62 { Xapian::Query("date"), NOQ, NOQ, NOQ }
65 // Two terms, which co-occur in some documents.
67 // All 34 results should be different.
68 0, 34, 2,
69 { Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
72 // Two terms which are entirely disjoint, and where the maximum weight
73 // doesn't occur in the first or second match.
75 // All 18 results should be different.
76 0, 18, 2,
77 { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
80 // All 34 results should be different.
81 0, 34, 2,
83 Xapian::Query("date"),
84 Xapian::Query(Xapian::Query::OP_OR,
85 Xapian::Query("sky"),
86 Xapian::Query("glove")),
87 NOQ, NOQ
91 // All 34 results should be different.
92 0, 34, 2,
94 Xapian::Query("date"),
95 Xapian::Query(Xapian::Query::OP_OR,
96 Xapian::Query("sky"),
97 Xapian::Query("date")),
98 NOQ, NOQ
102 // All 34 results should be different.
103 0, 34, 2,
105 Xapian::Query("date"),
106 Xapian::Query(Xapian::Query::OP_AND_MAYBE,
107 Xapian::Query("sky"),
108 Xapian::Query("date")),
109 NOQ, NOQ
113 // All 34 results should be different.
114 0, 34, 2,
116 Xapian::Query("date"),
117 Xapian::Query(Xapian::Query::OP_AND_NOT,
118 Xapian::Query("sky"),
119 Xapian::Query("date")),
120 NOQ, NOQ
124 // The AND only matches 1 document, so the estimated termfreq for the
125 // whole synonym works out as 33 (due to rounding), which is the same
126 // as the termfreq for "date". Therefore most of the weights are the
127 // same as just for the pure "date" search, and the only document which
128 // gets a different weight is the one also matched by "sky" (because it
129 // has a wdf boost).
130 32, 1, 2,
132 Xapian::Query("date"),
133 Xapian::Query(Xapian::Query::OP_AND,
134 Xapian::Query("sky"),
135 Xapian::Query("date")),
136 NOQ, NOQ
140 // All 34 results should be different.
141 0, 34, 2,
143 Xapian::Query("date"),
144 Xapian::Query(Xapian::Query::OP_XOR,
145 Xapian::Query("sky"),
146 Xapian::Query("date")),
147 NOQ, NOQ
151 // When the top-level operator is OR, the synonym part has an estimated
152 // termfreq of 35. When the top-level operator is SYNONYM, the whole
153 // query has an estimated termfreq of 66, which is rather bogus, but
154 // that's the current situation here (1.2 did better as it flattened
155 // this into a single OP_SYNONYM operator and then merged the two
156 // "date" terms to one with wqf=2. We've decided we shouldn't do such
157 // merging from 1.3.x on (merging to sum the scale_factors is fine, but
158 // we don't do that yet - FIXME).
160 // Anyway, this means that currently the weights are different for all
161 // matches.
162 0, 34, 2,
164 Xapian::Query("date"),
165 Xapian::Query(Xapian::Query::OP_SYNONYM,
166 Xapian::Query("sky"),
167 Xapian::Query("date")),
168 NOQ, NOQ
172 // All 35 results should be different.
173 0, 35, 4,
175 Xapian::Query("sky"),
176 Xapian::Query("date"),
177 Xapian::Query("stein"),
178 Xapian::Query("ally")
182 // The estimated term frequency for the synoynm is 2 (because the
183 // estimate for the phrase is 0), which is the same as the term
184 // frequency of "attitud". Thus, the synonym gets the same weight as
185 // "attitud", so documents with only "attitud" (but not the phrase) in
186 // them get the same wdf, and have the same total weight. There turns
187 // out to be exactly one such document.
188 1, 3, 2,
190 Xapian::Query("attitud"),
191 Xapian::Query(Xapian::Query::OP_PHRASE,
192 Xapian::Query("german"),
193 Xapian::Query("adventur")),
194 NOQ, NOQ
198 // All 54 results should be different.
199 0, 54, 2,
201 Xapian::Query("attitud"),
202 Xapian::Query(Xapian::Query::OP_OR,
203 Xapian::Query("german"),
204 Xapian::Query(Xapian::Query::OP_SYNONYM,
205 Xapian::Query("sky"),
206 Xapian::Query("date"))),
207 NOQ, NOQ
212 // Check a synonym search
213 DEFINE_TESTCASE(synonym1, backend) {
214 Xapian::Database db(get_database("etext"));
216 TEST_REL(db.get_doclength_upper_bound(), >, 0);
218 const Xapian::doccount lots = 214;
220 for (size_t subqgroup = 0;
221 subqgroup != sizeof(synonym1_data) / sizeof(synonym1_data[0]);
222 ++subqgroup) {
223 const synonym1_data_type & data = synonym1_data[subqgroup];
224 const Xapian::Query * qlist = data.subqs;
225 const Xapian::Query * qlist_end = qlist + data.n_subqs;
227 // Run two queries, one joining the subqueries with OR and one joining
228 // them with SYNONYM.
229 Xapian::Enquire enquire(db);
231 // Do the search with OP_OR, getting all the results.
232 Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
233 enquire.set_query(orquery);
234 Xapian::MSet ormset = enquire.get_mset(0, lots);
236 // Do the search with OP_SYNONYM, getting all the results.
237 Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
238 enquire.set_query(synquery);
239 Xapian::MSet synmset = enquire.get_mset(0, lots);
241 tout << "Comparing " << orquery << " with " << synquery << '\n';
243 // Check that the queries return some results.
244 TEST_NOT_EQUAL(synmset.size(), 0);
245 // Check that the queries return the same number of results.
246 TEST_EQUAL(synmset.size(), ormset.size());
247 map<Xapian::docid, double> values_or;
248 map<Xapian::docid, double> values_synonym;
249 for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
250 values_or[*ormset[i]] = ormset[i].get_weight();
251 values_synonym[*synmset[i]] = synmset[i].get_weight();
253 TEST_EQUAL(values_or.size(), values_synonym.size());
255 /* Check that the most of the weights for items in the "or" mset are
256 * different from those in the "synonym" mset. */
257 int same_weight = 0;
258 int different_weight = 0;
259 for (map<Xapian::docid, double>::const_iterator
260 j = values_or.begin(); j != values_or.end(); ++j) {
261 Xapian::docid did = j->first;
262 // Check that all the results in the or tree make it to the synonym
263 // tree.
264 TEST(values_synonym.find(did) != values_synonym.end());
265 if (values_or[did] == values_synonym[did]) {
266 ++same_weight;
267 } else {
268 ++different_weight;
273 TEST_EQUAL(different_weight, data.diffweight_count);
274 TEST_EQUAL(same_weight, data.sameweight_count);
276 // Do the search with synonym, but just get the top result.
277 // (Regression test - the OR subquery in the synonym postlist tree used
278 // to shortcut incorrectly, and return the wrong result here).
279 Xapian::MSet mset_top = enquire.get_mset(0, 1);
280 TEST_EQUAL(mset_top.size(), 1);
281 TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
283 return true;
286 // Regression test - test a synonym search with a MultiAndPostlist.
287 DEFINE_TESTCASE(synonym2, backend) {
288 Xapian::Query query;
289 vector<Xapian::Query> subqueries;
290 subqueries.push_back(Xapian::Query("file"));
291 subqueries.push_back(Xapian::Query("the"));
292 subqueries.push_back(Xapian::Query("next"));
293 subqueries.push_back(Xapian::Query("reader"));
294 query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
295 subqueries.clear();
296 subqueries.push_back(query);
297 subqueries.push_back(Xapian::Query("gutenberg"));
298 query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
300 tout << query << '\n';
302 Xapian::Database db(get_database("etext"));
303 Xapian::Enquire enquire(db);
304 enquire.set_query(query);
305 Xapian::MSet mset = enquire.get_mset(0, 10);
306 tout << mset << '\n';
308 // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
309 double maxposs = mset.get_max_possible();
310 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
311 enquire.set_query(query);
312 mset = enquire.get_mset(0, 10);
313 double maxposs2 = mset.get_max_possible();
315 TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
317 return true;
320 static void
321 check_msets_contain_same_docs(const Xapian::MSet & mset1,
322 const Xapian::MSet & mset2)
324 TEST_EQUAL(mset1.size(), mset2.size());
326 set<Xapian::docid> docids;
327 for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
328 docids.insert(*mset1[i]);
331 // Check that all the results in mset1 are in mset2.
332 for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
333 // Check that we can erase each entry from mset2 element. Since mset1
334 // and mset2 are the same size this means we can be sure that there
335 // were no repeated docids in either (it would be a bug if there were).
336 TEST(docids.erase(*mset2[j]));
340 // Test a synonym search which has had its weight scaled to 0.
341 DEFINE_TESTCASE(synonym3, backend) {
342 Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
343 Xapian::Query("sky"),
344 Xapian::Query("date"));
346 Xapian::Database db(get_database("etext"));
347 Xapian::Enquire enquire(db);
348 enquire.set_query(query);
349 Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
351 tout << query << '\n';
352 tout << mset_orig << '\n';
354 // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
355 // (this has a special codepath to avoid doing the synonym calculation).
356 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
357 enquire.set_query(query);
358 Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
360 tout << query << '\n';
361 tout << mset_zero << '\n';
363 // Check that the queries return some results.
364 TEST_NOT_EQUAL(mset_zero.size(), 0);
365 // Check that the queries return the same document IDs, and the zero
366 // one has zero weight.
367 check_msets_contain_same_docs(mset_orig, mset_zero);
368 for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
369 TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
370 TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
373 return true;
376 // Test synonym searches combined with various operators.
377 DEFINE_TESTCASE(synonym4, backend) {
378 Xapian::Database db(get_database("etext"));
379 Xapian::Enquire enquire(db);
380 Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
381 Xapian::Query("gutenberg"),
382 Xapian::Query("blockhead"));
383 Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
384 Xapian::Query("gutenberg"),
385 Xapian::Query("blockhead"));
386 Xapian::Query date_query = Xapian::Query("date");
388 // Check some queries.
389 static const Xapian::Query::op operators[] = {
390 Xapian::Query::OP_AND_MAYBE,
391 Xapian::Query::OP_AND_NOT,
392 Xapian::Query::OP_AND,
393 Xapian::Query::OP_XOR,
394 Xapian::Query::OP_OR,
395 Xapian::Query::OP_SYNONYM
397 const Xapian::Query::op * end;
398 end = operators + sizeof(operators) / sizeof(operators[0]);
399 for (const Xapian::Query::op * i = operators; i != end; ++i) {
400 tout.str(string());
401 Xapian::Query query1(*i, syn_query, date_query);
402 Xapian::Query query2(*i, or_query, date_query);
404 enquire.set_query(query1);
405 tout << "query1:" << query1 << '\n';
406 Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
407 tout << "mset1:" << mset1 << '\n';
408 enquire.set_query(query2);
409 tout << "query2:" << query2 << '\n';
410 Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
411 tout << "mset2:" << mset2 << '\n';
413 TEST_NOT_EQUAL(mset1.size(), 0);
414 if (*i != Xapian::Query::OP_XOR) {
415 TEST_EQUAL(mset1[0].get_percent(), 100);
416 } else {
417 TEST(mset1[0].get_percent() != 100);
419 check_msets_contain_same_docs(mset1, mset2);
422 return true;
425 DEFINE_TESTCASE(opmax1, backend) {
426 Xapian::Database db(get_database("etext"));
427 Xapian::Enquire enq(db);
428 Xapian::Query q1("king");
429 Xapian::Query q2("friedrich");
430 Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);
431 enq.set_query(q1);
432 Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
433 enq.set_query(q2);
434 Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
435 enq.set_query(qmax);
436 Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());
438 // Check that the weights in msetmax are the maximum of the weights in
439 // mset1 and mset2 for each docid.
440 map<Xapian::docid, double> expected_weights;
441 Xapian::MSetIterator i;
442 for (i = mset1.begin(); i != mset1.end(); ++i) {
443 expected_weights[*i] = i.get_weight();
445 for (i = mset2.begin(); i != mset2.end(); ++i) {
446 map<Xapian::docid, double>::iterator j;
447 j = expected_weights.find(*i);
448 if (j != expected_weights.end()) {
449 j->second = max(j->second, i.get_weight());
450 } else {
451 expected_weights[*i] = i.get_weight();
455 for (i = msetmax.begin(); i != msetmax.end(); ++i) {
456 map<Xapian::docid, double>::iterator j;
457 j = expected_weights.find(*i);
458 TEST(j != expected_weights.end());
459 TEST_EQUAL_DOUBLE(j->second, i.get_weight());
460 expected_weights.erase(j);
461 tout << expected_weights.size() << endl;
464 // Any document in mset1 or mset2 should also be in msetmax.
465 TEST_EQUAL(expected_weights.size(), 0);
467 return true;