1 /** @file api_opsynonym.cc
2 * @brief tests of OP_SYNONYM and OP_MAX.
4 /* Copyright 2009,2011,2014 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "api_opsynonym.h"
33 #include "backendmanager.h"
34 #include "testsuite.h"
35 #include "testutils.h"
41 // #######################################################################
44 struct synonym1_data_type
{
45 // How many results should have the same weight when combined with
46 // OP_SYNONYM instead of OP_OR.
48 // How many results should have a different weight when combined with
49 // OP_SYNONYM instead of OP_OR.
51 // How many subqueries.
53 // The subqueries (use NOQ for unused ones).
54 Xapian::Query subqs
[4];
// Test vectors for the synonym1 testcase.  For each entry, synonym1 joins the
// entry's subqueries once with OP_OR and once with OP_SYNONYM and compares the
// weights of the two result sets against the entry's expected counts.
//
// NOQ is shorthand for Xapian::Query::MatchNothing and fills unused subquery
// slots.
//
// NOTE(review): in this listing the leading numeric fields of each entry, the
// braces around most entries and a few subquery lines are not visible, and a
// couple of comments are cut short - confirm against the complete file before
// editing any entry.
57 #define NOQ Xapian::Query::MatchNothing
58 static synonym1_data_type synonym1_data
[] = {
60 // Single term - all 33 results should be same weight.
62 { Xapian::Query("date"), NOQ
, NOQ
, NOQ
}
65 // Two terms, which co-occur in some documents.
67 // All 34 results should be different.
69 { Xapian::Query("sky"), Xapian::Query("date"), NOQ
, NOQ
}
72 // Two terms which are entirely disjoint, and where the maximum weight
73 // doesn't occur in the first or second match.
75 // All 18 results should be different.
77 { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ
, NOQ
}
80 // All 34 results should be different.
83 Xapian::Query("date"),
84 Xapian::Query(Xapian::Query::OP_OR
,
86 Xapian::Query("glove")),
91 // All 34 results should be different.
94 Xapian::Query("date"),
95 Xapian::Query(Xapian::Query::OP_OR
,
97 Xapian::Query("date")),
102 // All 34 results should be different.
105 Xapian::Query("date"),
106 Xapian::Query(Xapian::Query::OP_AND_MAYBE
,
107 Xapian::Query("sky"),
108 Xapian::Query("date")),
113 // All 34 results should be different.
116 Xapian::Query("date"),
117 Xapian::Query(Xapian::Query::OP_AND_NOT
,
118 Xapian::Query("sky"),
119 Xapian::Query("date")),
124 // The AND only matches 1 document, so the estimated termfreq for the
125 // whole synonym works out as 33 (due to rounding), which is the same
126 // as the termfreq for "date". Therefore most of the weights are the
127 // same as just for the pure "date" search, and the only document which
128 // gets a different weight is the one also matched by "sky" (because it
// NOTE(review): the end of the comment above is missing from this listing.
132 Xapian::Query("date"),
133 Xapian::Query(Xapian::Query::OP_AND
,
134 Xapian::Query("sky"),
135 Xapian::Query("date")),
140 // All 34 results should be different.
143 Xapian::Query("date"),
144 Xapian::Query(Xapian::Query::OP_XOR
,
145 Xapian::Query("sky"),
146 Xapian::Query("date")),
151 // When the top-level operator is OR, the synonym part has an estimated
152 // termfreq of 35. When the top-level operator is SYNONYM, the whole
153 // query has an estimated termfreq of 66, which is rather bogus, but
154 // that's the current situation here (1.2 did better as it flattened
155 // this into a single OP_SYNONYM operator and then merged the two
156 // "date" terms to one with wqf=2. We've decided we shouldn't do such
157 // merging from 1.3.x on (merging to sum the scale_factors is fine, but
158 // we don't do that yet - FIXME).
160 // Anyway, this means that currently the weights are different for all
// NOTE(review): the end of the comment above is missing from this listing.
164 Xapian::Query("date"),
165 Xapian::Query(Xapian::Query::OP_SYNONYM
,
166 Xapian::Query("sky"),
167 Xapian::Query("date")),
172 // All 35 results should be different.
175 Xapian::Query("sky"),
176 Xapian::Query("date"),
177 Xapian::Query("stein"),
178 Xapian::Query("ally")
182 // The estimated term frequency for the synonym is 2 (because the
183 // estimate for the phrase is 0), which is the same as the term
184 // frequency of "attitud". Thus, the synonym gets the same weight as
185 // "attitud", so documents with only "attitud" (but not the phrase) in
186 // them get the same wdf, and have the same total weight. There turns
187 // out to be exactly one such document.
190 Xapian::Query("attitud"),
191 Xapian::Query(Xapian::Query::OP_PHRASE
,
192 Xapian::Query("german"),
193 Xapian::Query("adventur")),
198 // All 54 results should be different.
201 Xapian::Query("attitud"),
202 Xapian::Query(Xapian::Query::OP_OR
,
203 Xapian::Query("german"),
204 Xapian::Query(Xapian::Query::OP_SYNONYM
,
205 Xapian::Query("sky"),
206 Xapian::Query("date"))),
212 // Check a synonym search
213 DEFINE_TESTCASE(synonym1
, backend
) {
214 Xapian::Database
db(get_database("etext"));
216 TEST_REL(db
.get_doclength_upper_bound(), >, 0);
218 const Xapian::doccount lots
= 214;
220 for (size_t subqgroup
= 0;
221 subqgroup
!= sizeof(synonym1_data
) / sizeof(synonym1_data
[0]);
223 const synonym1_data_type
& data
= synonym1_data
[subqgroup
];
224 const Xapian::Query
* qlist
= data
.subqs
;
225 const Xapian::Query
* qlist_end
= qlist
+ data
.n_subqs
;
227 // Run two queries, one joining the subqueries with OR and one joining
228 // them with SYNONYM.
229 Xapian::Enquire
enquire(db
);
231 // Do the search with OP_OR, getting all the results.
232 Xapian::Query
orquery(Xapian::Query::OP_OR
, qlist
, qlist_end
);
233 enquire
.set_query(orquery
);
234 Xapian::MSet ormset
= enquire
.get_mset(0, lots
);
236 // Do the search with OP_SYNONYM, getting all the results.
237 Xapian::Query
synquery(Xapian::Query::OP_SYNONYM
, qlist
, qlist_end
);
238 enquire
.set_query(synquery
);
239 Xapian::MSet synmset
= enquire
.get_mset(0, lots
);
241 tout
<< "Comparing " << orquery
<< " with " << synquery
<< '\n';
243 // Check that the queries return some results.
244 TEST_NOT_EQUAL(synmset
.size(), 0);
245 // Check that the queries return the same number of results.
246 TEST_EQUAL(synmset
.size(), ormset
.size());
247 map
<Xapian::docid
, double> values_or
;
248 map
<Xapian::docid
, double> values_synonym
;
249 for (Xapian::doccount i
= 0; i
< synmset
.size(); ++i
) {
250 values_or
[*ormset
[i
]] = ormset
[i
].get_weight();
251 values_synonym
[*synmset
[i
]] = synmset
[i
].get_weight();
253 TEST_EQUAL(values_or
.size(), values_synonym
.size());
255 /* Check that the most of the weights for items in the "or" mset are
256 * different from those in the "synonym" mset. */
258 int different_weight
= 0;
259 for (map
<Xapian::docid
, double>::const_iterator
260 j
= values_or
.begin(); j
!= values_or
.end(); ++j
) {
261 Xapian::docid did
= j
->first
;
262 // Check that all the results in the or tree make it to the synonym
264 TEST(values_synonym
.find(did
) != values_synonym
.end());
265 if (values_or
[did
] == values_synonym
[did
]) {
273 TEST_EQUAL(different_weight
, data
.diffweight_count
);
274 TEST_EQUAL(same_weight
, data
.sameweight_count
);
276 // Do the search with synonym, but just get the top result.
277 // (Regression test - the OR subquery in the synonym postlist tree used
278 // to shortcut incorrectly, and return the wrong result here).
279 Xapian::MSet mset_top
= enquire
.get_mset(0, 1);
280 TEST_EQUAL(mset_top
.size(), 1);
281 TEST(mset_range_is_same(mset_top
, 0, synmset
, 0, 1));
286 // Regression test - test a synonym search with a MultiAndPostlist.
287 DEFINE_TESTCASE(synonym2
, backend
) {
289 vector
<Xapian::Query
> subqueries
;
290 subqueries
.push_back(Xapian::Query("file"));
291 subqueries
.push_back(Xapian::Query("the"));
292 subqueries
.push_back(Xapian::Query("next"));
293 subqueries
.push_back(Xapian::Query("reader"));
294 query
= Xapian::Query(Xapian::Query::OP_AND
, subqueries
.begin(), subqueries
.end());
296 subqueries
.push_back(query
);
297 subqueries
.push_back(Xapian::Query("gutenberg"));
298 query
= Xapian::Query(Xapian::Query::OP_SYNONYM
, subqueries
.begin(), subqueries
.end());
300 tout
<< query
<< '\n';
302 Xapian::Database
db(get_database("etext"));
303 Xapian::Enquire
enquire(db
);
304 enquire
.set_query(query
);
305 Xapian::MSet mset
= enquire
.get_mset(0, 10);
306 tout
<< mset
<< '\n';
308 // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
309 double maxposs
= mset
.get_max_possible();
310 query
= Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT
, query
, 10.0);
311 enquire
.set_query(query
);
312 mset
= enquire
.get_mset(0, 10);
313 double maxposs2
= mset
.get_max_possible();
315 TEST_EQUAL_DOUBLE(maxposs
* 10.0, maxposs2
);
321 check_msets_contain_same_docs(const Xapian::MSet
& mset1
,
322 const Xapian::MSet
& mset2
)
324 TEST_EQUAL(mset1
.size(), mset2
.size());
326 set
<Xapian::docid
> docids
;
327 for (Xapian::doccount i
= 0; i
< mset1
.size(); ++i
) {
328 docids
.insert(*mset1
[i
]);
331 // Check that all the results in mset1 are in mset2.
332 for (Xapian::doccount j
= 0; j
< mset2
.size(); ++j
) {
333 // Check that we can erase each entry from mset2 element. Since mset1
334 // and mset2 are the same size this means we can be sure that there
335 // were no repeated docids in either (it would be a bug if there were).
336 TEST(docids
.erase(*mset2
[j
]));
340 // Test a synonym search which has had its weight scaled to 0.
341 DEFINE_TESTCASE(synonym3
, backend
) {
342 Xapian::Query query
= Xapian::Query(Xapian::Query::OP_SYNONYM
,
343 Xapian::Query("sky"),
344 Xapian::Query("date"));
346 Xapian::Database
db(get_database("etext"));
347 Xapian::Enquire
enquire(db
);
348 enquire
.set_query(query
);
349 Xapian::MSet mset_orig
= enquire
.get_mset(0, db
.get_doccount());
351 tout
<< query
<< '\n';
352 tout
<< mset_orig
<< '\n';
354 // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
355 // (this has a special codepath to avoid doing the synonym calculation).
356 query
= Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT
, query
, 0.0);
357 enquire
.set_query(query
);
358 Xapian::MSet mset_zero
= enquire
.get_mset(0, db
.get_doccount());
360 tout
<< query
<< '\n';
361 tout
<< mset_zero
<< '\n';
363 // Check that the queries return some results.
364 TEST_NOT_EQUAL(mset_zero
.size(), 0);
365 // Check that the queries return the same document IDs, and the zero
366 // one has zero weight.
367 check_msets_contain_same_docs(mset_orig
, mset_zero
);
368 for (Xapian::doccount i
= 0; i
< mset_orig
.size(); ++i
) {
369 TEST_NOT_EQUAL(mset_orig
[i
].get_weight(), 0.0);
370 TEST_EQUAL(mset_zero
[i
].get_weight(), 0.0);
376 // Test synonym searches combined with various operators.
377 DEFINE_TESTCASE(synonym4
, backend
) {
378 Xapian::Database
db(get_database("etext"));
379 Xapian::Enquire
enquire(db
);
380 Xapian::Query syn_query
= Xapian::Query(Xapian::Query::OP_SYNONYM
,
381 Xapian::Query("gutenberg"),
382 Xapian::Query("blockhead"));
383 Xapian::Query or_query
= Xapian::Query(Xapian::Query::OP_OR
,
384 Xapian::Query("gutenberg"),
385 Xapian::Query("blockhead"));
386 Xapian::Query date_query
= Xapian::Query("date");
388 // Check some queries.
389 static const Xapian::Query::op operators
[] = {
390 Xapian::Query::OP_AND_MAYBE
,
391 Xapian::Query::OP_AND_NOT
,
392 Xapian::Query::OP_AND
,
393 Xapian::Query::OP_XOR
,
394 Xapian::Query::OP_OR
,
395 Xapian::Query::OP_SYNONYM
397 const Xapian::Query::op
* end
;
398 end
= operators
+ sizeof(operators
) / sizeof(operators
[0]);
399 for (const Xapian::Query::op
* i
= operators
; i
!= end
; ++i
) {
401 Xapian::Query
query1(*i
, syn_query
, date_query
);
402 Xapian::Query
query2(*i
, or_query
, date_query
);
404 enquire
.set_query(query1
);
405 tout
<< "query1:" << query1
<< '\n';
406 Xapian::MSet mset1
= enquire
.get_mset(0, db
.get_doccount());
407 tout
<< "mset1:" << mset1
<< '\n';
408 enquire
.set_query(query2
);
409 tout
<< "query2:" << query2
<< '\n';
410 Xapian::MSet mset2
= enquire
.get_mset(0, db
.get_doccount());
411 tout
<< "mset2:" << mset2
<< '\n';
413 TEST_NOT_EQUAL(mset1
.size(), 0);
414 if (*i
!= Xapian::Query::OP_XOR
) {
415 TEST_EQUAL(mset1
[0].get_percent(), 100);
417 TEST(mset1
[0].get_percent() != 100);
419 check_msets_contain_same_docs(mset1
, mset2
);
425 DEFINE_TESTCASE(opmax1
, backend
) {
426 Xapian::Database
db(get_database("etext"));
427 Xapian::Enquire
enq(db
);
428 Xapian::Query
q1("king");
429 Xapian::Query
q2("friedrich");
430 Xapian::Query
qmax(Xapian::Query::OP_MAX
, q1
, q2
);
432 Xapian::MSet mset1
= enq
.get_mset(0, db
.get_doccount());
434 Xapian::MSet mset2
= enq
.get_mset(0, db
.get_doccount());
436 Xapian::MSet msetmax
= enq
.get_mset(0, db
.get_doccount());
438 // Check that the weights in msetmax are the maximum of the weights in
439 // mset1 and mset2 for each docid.
440 map
<Xapian::docid
, double> expected_weights
;
441 Xapian::MSetIterator i
;
442 for (i
= mset1
.begin(); i
!= mset1
.end(); ++i
) {
443 expected_weights
[*i
] = i
.get_weight();
445 for (i
= mset2
.begin(); i
!= mset2
.end(); ++i
) {
446 map
<Xapian::docid
, double>::iterator j
;
447 j
= expected_weights
.find(*i
);
448 if (j
!= expected_weights
.end()) {
449 j
->second
= max(j
->second
, i
.get_weight());
451 expected_weights
[*i
] = i
.get_weight();
455 for (i
= msetmax
.begin(); i
!= msetmax
.end(); ++i
) {
456 map
<Xapian::docid
, double>::iterator j
;
457 j
= expected_weights
.find(*i
);
458 TEST(j
!= expected_weights
.end());
459 TEST_EQUAL_DOUBLE(j
->second
, i
.get_weight());
460 expected_weights
.erase(j
);
461 tout
<< expected_weights
.size() << endl
;
464 // Any document in mset1 or mset2 should also be in msetmax.
465 TEST_EQUAL(expected_weights
.size(), 0);