2 * @brief tests snippets
4 /* Copyright 2012 Mihai Bivol
5 * Copyright 2015,2016,2017,2019,2020 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "api_snippets.h"
33 #include "testsuite.h"
34 #include "testutils.h"
// One table-driven snippet test.  The tests in this file read three members
// from each entry: `input` (text passed to snippet()), `len` (requested
// snippet length) and `expect` (the snippet text the test requires).
// NOTE(review): the member declarations are not visible here - confirm
// their exact types.
38 struct snippet_testcase
{
44 /// Test snippets without stemming.
45 DEFINE_TESTCASE(snippet1
, backend
) {
46 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
47 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
48 Xapian::Query("rubbish"),
49 Xapian::Query("mention")));
50 Xapian::MSet mset
= enquire
.get_mset(0, 0);
52 static const snippet_testcase testcases
[] = {
53 // Test highlighting in full sample.
54 { "Rubbish and junk", 20, "<b>Rubbish</b> and junk" },
55 { "Project R.U.B.B.I.S.H. greenlit", 31, "Project <b>R.U.B.B.I.S.H.</b> greenlit" },
56 { "What a load of rubbish", 100, "What a load of <b>rubbish</b>" },
57 { "Mention rubbish", 100, "<b>Mention</b> <b>rubbish</b>" },
58 { "A mention of rubbish", 100, "A <b>mention</b> of <b>rubbish</b>" },
59 { "Rubbish mention of rubbish", 100, "<b>Rubbish</b> <b>mention</b> of <b>rubbish</b>" },
61 // Test selection of snippet.
62 { "Rubbish and junk", 12, "<b>Rubbish</b> and..." },
63 { "Project R.U.B.B.I.S.H. greenlit", 14, "...<b>R.U.B.B.I.S.H.</b>..." },
64 { "What a load of rubbish", 12, "...of <b>rubbish</b>" },
65 { "What a load of rubbish", 8, "...<b>rubbish</b>" },
66 { "Rubbish mention where the start is better than the rubbish ending", 18, "<b>Rubbish</b> <b>mention</b>..." },
68 // Should prefer "interesting" words for context.
69 { "And of the rubbish document to this", 18, "...<b>rubbish</b> document..." },
70 { "And if they document rubbish to be this", 18, "...document <b>rubbish</b>..." },
73 for (auto i
: testcases
) {
74 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
), i
.expect
);
78 /// Test snippets with stemming.
79 DEFINE_TESTCASE(snippetstem1
, backend
) {
80 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
81 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
82 Xapian::Query("rubbish"),
83 Xapian::Query("Zexampl")));
84 Xapian::MSet mset
= enquire
.get_mset(0, 0);
86 // Term Zexampl isn't in the database, but the highlighter should still
88 static const snippet_testcase testcases
[] = {
89 // "rubbish" isn't stemmed, example is.
90 { "You rubbished my ideas", 24, "You rubbished my ideas" },
91 { "Rubbished all my examples", 20, "...all my <b>examples</b>" },
92 { "Examples of text", 20, "<b>Examples</b> of text" },
95 Xapian::Stem
stem("en");
96 for (auto i
: testcases
) {
97 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
, stem
), i
.expect
);
101 /// Test snippets with phrases.
102 DEFINE_TESTCASE(snippetphrase1
, backend
) {
103 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
104 Xapian::Query
q(Xapian::Query::OP_PHRASE
,
105 Xapian::Query("rubbish"),
106 Xapian::Query("mention"));
107 // Regression test - a phrase with a following sibling query would crash in
108 // the highlighting code.
109 enquire
.set_query(q
&~ Xapian::Query("banana"));
110 Xapian::MSet mset
= enquire
.get_mset(0, 0);
112 static const snippet_testcase testcases
[] = {
113 { "A mention of rubbish", 18, "...mention of rubbish" },
114 { "This is a rubbish mention", 20, "...is a <b>rubbish mention</b>" },
115 { "Mention of a rubbish mention of rubbish", 45, "Mention of a <b>rubbish mention</b> of rubbish" },
116 { "Mention of a rubbish mention of rubbish", 18, "...<b>rubbish mention</b> of..." },
117 { "rubbish rubbish mention mention", 45, "rubbish <b>rubbish mention</b> mention" },
118 { "rubbish mention rubbish mention", 45, "<b>rubbish mention</b> <b>rubbish mention</b>" },
121 Xapian::Stem
stem("en");
122 for (auto i
: testcases
) {
123 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
, stem
), i
.expect
);
127 /// Index file to a DB with TermGenerator.
///
/// @param db      Writable database which receives the documents built here.
/// @param source  NOTE(review): presumably names the input file under the
///                source directory's "testdata" subdirectory - its use is
///                not visible here, confirm.
129 make_tg_db(Xapian::WritableDatabase
&db
, const string
& source
)
// The path starts from the test driver's source directory.
131 string file
= test_driver::get_srcdir();
132 file
+= "/testdata/";
136 input
.open(file
.c_str());
// Fail the test straight away if the input can't be opened.
137 if (!input
.is_open()) {
138 FAIL_TEST("Couldn't open input: " << file
);
// Terms are generated with an English stemmer.
141 Xapian::TermGenerator tg
;
142 tg
.set_stemmer(Xapian::Stem("en"));
// Keep going until the stream reports EOF; each pass starts a fresh
// document and attaches it to the term generator.
143 while (!input
.eof()) {
144 Xapian::Document doc
;
145 tg
.set_document(doc
);
148 getline(input
, line
);
// True when no character of the line satisfies C_isnotspace (presumably
// meaning the line is empty or all whitespace - confirm).
149 if (find_if(line
.begin(), line
.end(), C_isnotspace
) == line
.end())
// A single space separates successive pieces appended to `data`.
152 if (!data
.empty()) data
+= ' ';
// Store the document built during this pass.
156 db
.add_document(doc
);
160 /// Test snippets in various ways.
///
/// Uses the "snippet" database generated by make_tg_db, BoolWeight
/// weighting and an English stemmer, and checks snippets of the matching
/// documents' data for a phrase query, an OR of stemmed terms and a
/// wildcard query in turn.
161 DEFINE_TESTCASE(snippetmisc1
, backend
) {
162 Xapian::Database db
= get_database("snippet", make_tg_db
, "snippet");
163 Xapian::Enquire
enquire(db
);
164 enquire
.set_weighting_scheme(Xapian::BoolWeight());
165 Xapian::Stem
stem("en");
// Phrase query built from the three words "do", "we", "have".
167 static const char * const words
[] = { "do", "we", "have" };
168 Xapian::Query
q(Xapian::Query::OP_PHRASE
, words
, words
+ 3);
169 enquire
.set_query(q
);
170 Xapian::MSet mset
= enquire
.get_mset(0, 6);
171 TEST_EQUAL(mset
.size(), 3);
// Only the complete phrase is expected to be highlighted, as one unit.
172 TEST_STRINGS_EQUAL(mset
.snippet(mset
[0].get_document().get_data(), 40, stem
),
173 "How much o'brien <b>do we have</b>? Miles...");
174 TEST_STRINGS_EQUAL(mset
.snippet(mset
[1].get_document().get_data(), 40, stem
),
175 "...Unicode: How much o’brien <b>do we have</b>?");
176 TEST_STRINGS_EQUAL(mset
.snippet(mset
[2].get_document().get_data(), 32, stem
),
177 "We do have we <b>do we have</b> do we.");
// OR of two stemmed terms.
179 enquire
.set_query(Xapian::Query("Zwelcom") | Xapian::Query("Zmike"));
180 mset
= enquire
.get_mset(0, 6);
181 TEST_EQUAL(mset
.size(), 3);
182 TEST_STRINGS_EQUAL(mset
.snippet(mset
[0].get_document().get_data(), 25, stem
),
183 "\"<b>Welcome</b> to <b>Mike's</b>...");
// NOTE(review): the expected string for this length-5 check is not visible
// here - confirm it.
184 TEST_STRINGS_EQUAL(mset
.snippet(mset
[1].get_document().get_data(), 5, stem
),
186 TEST_STRINGS_EQUAL(mset
.snippet(mset
[2].get_document().get_data(), 10, stem
),
187 "...<b>Mike</b> can...");
// Wildcard query on "m"; the expectations below highlight words starting
// with "m" or "M".
189 enquire
.set_query(Xapian::Query(q
.OP_WILDCARD
, "m"));
190 mset
= enquire
.get_mset(0, 6);
191 TEST_EQUAL(mset
.size(), 5);
192 TEST_STRINGS_EQUAL(mset
.snippet(mset
[0].get_document().get_data(), 18, stem
),
193 "...<b>Mike's</b> <b>Mechanical</b>...");
194 TEST_STRINGS_EQUAL(mset
.snippet(mset
[1].get_document().get_data(), 80, stem
),
195 "<b>Mike</b> <b>McDonald</b> is a <b>mechanic</b> who enjoys repairing things of a <b>mechanical</b> sort.");
196 TEST_STRINGS_EQUAL(mset
.snippet(mset
[2].get_document().get_data(), 102, stem
),
197 "From autos to zip-lines, from tea-lights to x-rays, from sea ships to u-boats - <b>Mike</b> can fix them all.");
198 TEST_STRINGS_EQUAL(mset
.snippet(mset
[3].get_document().get_data(), 64, stem
),
199 "How <b>much</b> o'brien do we have? <b>Miles</b> O'Brien, that's how <b>much</b>.");
200 // The requested length is in bytes, so the "fancy" apostrophe results in
201 // fewer Unicode characters in this sample than the previous one.
202 TEST_STRINGS_EQUAL(mset
.snippet(mset
[4].get_document().get_data(), 64, stem
),
203 "...<b>much</b> o’brien do we have? <b>Miles</b> O’Brien, that’s how <b>much</b>.");
206 /// Test snippet term diversity.
207 DEFINE_TESTCASE(snippet_termcover1
, backend
) {
208 static const snippet_testcase testcases
[] = {
209 // "Zexample" isn't in the database, so should get termweight 0. Once
210 // max_tw is added on, "rubbish" should have just under twice the
211 // relevance of "example" so clearly should win in a straight fight.
212 { "A rubbish, but a good example", 14, "...<b>rubbish</b>, but a..."},
213 // But a second occurrence of "rubbish" has half the relevance, so
214 // "example" should add slightly more relevance.
215 { "Rubbish and rubbish, and rubbish examples", 22, "...and <b>rubbish</b> <b>examples</b>"},
217 { "rubbish rubbish example rubbish rubbish", 16, "...<b>example</b> <b>rubbish</b>..." },
220 Xapian::Stem
stem("en");
221 // Disable SNIPPET_BACKGROUND_MODEL so we can test the relevance decay
222 // for repeated terms.
223 unsigned flags
= Xapian::MSet::SNIPPET_EXHAUSTIVE
;
224 for (auto i
: testcases
) {
225 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
226 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
227 Xapian::Query("rubbish"),
228 Xapian::Query("Zexampl")));
230 Xapian::MSet mset
= enquire
.get_mset(0, 0);
231 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
, stem
, flags
), i
.expect
);
235 /// Test snippet term diversity cases with BoolWeight.
236 DEFINE_TESTCASE(snippet_termcover2
, backend
) {
237 // With BoolWeight, all terms have 0 termweight, and so relevance 1.0
238 // (since max_tw is set to 1.0 if it is zero).
239 static const snippet_testcase testcases
[] = {
240 // Diversity should pick two different terms in preference.
241 { "rubbish rubbish example rubbish rubbish", 16, "...<b>example</b> <b>rubbish</b>..." },
243 { "Rubbish and rubbish, and rubbish examples", 22, "...and <b>rubbish</b> <b>examples</b>"},
244 // The last of two equal snippet should win.
245 { "A rubbish, but a good example", 14, "...a good <b>example</b>"},
248 Xapian::Stem
stem("en");
249 // Disable SNIPPET_BACKGROUND_MODEL so we can test the relevance decay
250 // for repeated terms.
251 unsigned flags
= Xapian::MSet::SNIPPET_EXHAUSTIVE
;
252 for (auto i
: testcases
) {
253 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
254 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
255 Xapian::Query("rubbish"),
256 Xapian::Query("Zexampl")));
257 enquire
.set_weighting_scheme(Xapian::BoolWeight());
259 Xapian::MSet mset
= enquire
.get_mset(0, 0);
260 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
, stem
, flags
), i
.expect
);
264 /// Test snippet EMPTY_WITHOUT_MATCH flag
///
/// Runs snippet() on a text with no query term and on a text with matches,
/// both with and without Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH.
// NOTE(review): the declaration of `flags` is not visible here, nor are any
// updates of `len` or `flags` after `input` is reassigned - confirm.
265 DEFINE_TESTCASE(snippet_empty
, backend
) {
266 Xapian::Stem
stem("en");
268 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
269 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
270 Xapian::Query("rubbish"),
271 Xapian::Query("Zexampl")));
273 Xapian::MSet mset
= enquire
.get_mset(0, 0);
275 // A non-matching text
276 const char *input
= "A string without a match.";
277 size_t len
= strlen(input
);
279 // By default, snippet() returns len bytes of input without markup
281 TEST_STRINGS_EQUAL(mset
.snippet(input
, len
, stem
, 0), input
);
283 // force snippet() to return the empty string if no term got matched
284 flags
|= Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH
;
285 TEST_STRINGS_EQUAL(mset
.snippet(input
, len
, stem
, flags
), "");
287 // A text with a match
288 input
= "A rubbish example text";
292 TEST_STRINGS_EQUAL(mset
.snippet(input
, len
, stem
, flags
),
293 "A <b>rubbish</b> <b>example</b> text");
// With a match present, setting the flag must not change the snippet: the
// expected text is the same as in the check above.
295 flags
|= Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH
;
296 TEST_STRINGS_EQUAL(mset
.snippet(input
, len
, stem
, flags
),
297 "A <b>rubbish</b> <b>example</b> text");
300 /// Check snippets include certain preceding punctuation.
///
/// The query is "foo" OR "10".  Each check passes a literal input to
/// snippet() and compares the result with the expected marked-up text.
// NOTE(review): `stem` is used throughout but its declaration is not visible
// here, and several checks below have no visible `input = ...` assignment
// or expected string - confirm them.
301 DEFINE_TESTCASE(snippet_start_nonspace
, backend
) {
302 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
303 enquire
.set_query(Xapian::Query("foo") | Xapian::Query("10"));
305 Xapian::MSet mset
= enquire
.get_mset(0, 0);
309 const char *input
= "[xapian-devel] Re: foo";
310 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
311 "[xapian-devel] Re: <b>foo</b>");
// The requested length (24) is shorter than the input here, and the expected
// snippet starts with "...".
313 input
= "bar [xapian-devel] Re: foo";
314 TEST_STRINGS_EQUAL(mset
.snippet(input
, 24, stem
),
315 "...[xapian-devel] Re: <b>foo</b>");
317 input
= "there is a $1000 prize for foo";
318 TEST_STRINGS_EQUAL(mset
.snippet(input
, 20, stem
),
319 "...$1000 prize for <b>foo</b>");
321 input
= "-1 is less than foo";
322 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
323 "-1 is less than <b>foo</b>");
325 input
= "+1 is less than foo";
326 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
327 "+1 is less than <b>foo</b>");
329 input
= "/bin/sh is a foo";
330 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
331 "/bin/sh is a <b>foo</b>");
333 input
= "'tis pity foo is a bar";
334 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
335 "'tis pity <b>foo</b> is a bar");
337 input
= "\"foo bar\" he whispered";
338 TEST_STRINGS_EQUAL(mset
.snippet(input
, 11, stem
),
339 "\"<b>foo</b> bar\" he...");
341 input
= "\\\\server\\share\\foo is a UNC path";
342 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
343 "\\\\server\\share\\<b>foo</b> is a UNC path");
345 input
= "«foo» is a placeholder";
346 TEST_STRINGS_EQUAL(mset
.snippet(input
, 9, stem
),
347 "«<b>foo</b>» is...");
349 input
= "#include <foo.h> to use libfoo";
350 TEST_STRINGS_EQUAL(mset
.snippet(input
, 12, stem
),
351 "...<<b>foo</b>.h> to...");
354 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
358 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
361 input
= "(foo) test";
362 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
363 "(<b>foo</b>) test");
365 input
= "{foo} test";
366 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
367 "{<b>foo</b>} test");
369 input
= "`foo` test";
370 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
371 "`<b>foo</b>` test");
373 input
= "@foo@ is replaced";
374 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
375 "@<b>foo</b>@ is replaced");
377 input
= "%foo is a perl hash";
378 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
379 "%<b>foo</b> is a perl hash");
381 input
= "&foo takes the address of foo";
382 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
383 "&<b>foo</b> takes the address of <b>foo</b>");
385 input
= "§3.1.4 foo";
386 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
387 "§3.1.4 <b>foo</b>");
390 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
393 input
= "~foo~ test";
394 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
395 "~<b>foo</b>~ test");
398 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
402 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
405 // Check that excessive non-word characters aren't included.
407 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
410 // Check we don't include characters that aren't useful.
412 TEST_STRINGS_EQUAL(mset
.snippet(input
, 5, stem
),
415 // Check trailing characters are included when useful.
416 input
= "/opt/foo/bin/";
417 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
418 "/opt/<b>foo</b>/bin/");
420 input
= "\"foo bar\"";
421 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
422 "\"<b>foo</b> bar\"");
424 input
= "\\\\server\\share\\foo\\";
425 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
426 "\\\\server\\share\\<b>foo</b>\\");
429 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
432 input
= "#include <foo>";
433 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
434 "#include <<b>foo</b>>");
437 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
441 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
445 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
449 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
453 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
// The second query term, "10", is the one highlighted before the trailing
// symbol in this case.
456 input
= "foo for 10¢";
457 TEST_STRINGS_EQUAL(mset
.snippet(input
, strlen(input
), stem
),
458 "<b>foo</b> for <b>10</b>¢");
461 /// Test snippets with small and zero length.
462 DEFINE_TESTCASE(snippet_small_zerolength
, backend
) {
463 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
464 enquire
.set_query(Xapian::Query(Xapian::Query::OP_OR
,
465 Xapian::Query("rubbish"),
466 Xapian::Query("mention")));
467 Xapian::MSet mset
= enquire
.get_mset(0, 0);
469 static const snippet_testcase testcases
[] = {
470 // Test with small length
471 { "mention junk rubbish", 3, "" },
472 { "Project R.U.B.B.I.S.H. greenlit", 5, "" },
473 { "What load rubbish", 3, "" },
474 { "Mention rubbish", 4, "" },
476 // Test with zero length.
477 { "Rubbish and junk", 0, "" },
478 { "Project R.U.B.B.I.S.H. greenlit", 0, "" },
479 { "What a load of rubbish", 0, "" },
480 { "rubbish mention rubbish mention", 0, "" },
483 for (auto i
: testcases
) {
484 TEST_STRINGS_EQUAL(mset
.snippet(i
.input
, i
.len
), i
.expect
);
// Check snippet() with Xapian::MSet::SNIPPET_NGRAMS on text indexed with
// TermGenerator::FLAG_NGRAMS and queried with QueryParser's FLAG_NGRAMS.
// NOTE(review): the declarations of `stem` and `s`, and the rest of the
// lambda's parameter list, are not visible here - confirm.
489 DEFINE_TESTCASE(snippet_ngrams
, backend
) {
// The database is generated by the lambda, which indexes one text with
// FLAG_NGRAMS set and adds the resulting document.
490 Xapian::Database db
= get_database("snippet_ngrams",
491 [](Xapian::WritableDatabase
& wdb
,
494 Xapian::Document doc
;
495 Xapian::TermGenerator tg
;
496 tg
.set_flags(Xapian::TermGenerator::FLAG_NGRAMS
);
497 tg
.set_document(doc
);
498 tg
.index_text("明末時已經有香港地方的概念");
499 wdb
.add_document(doc
);
501 Xapian::Enquire
enquire(db
);
// The query is parsed with FLAG_NGRAMS added to the default flags.
502 Xapian::QueryParser qp
;
503 auto q
= qp
.parse_query("已經完成", qp
.FLAG_DEFAULT
| qp
.FLAG_NGRAMS
);
504 enquire
.set_query(q
);
506 Xapian::MSet mset
= enquire
.get_mset(0, 0);
// strlen() gives the length in bytes, not characters.
509 const char *input
= "明末時已經有香港地方的概念";
510 size_t len
= strlen(input
);
512 unsigned flags
= Xapian::MSet::SNIPPET_NGRAMS
;
// Full length: the whole input is expected back, with each matching
// character highlighted separately.
514 s
= mset
.snippet(input
, len
, stem
, flags
, "<b>", "</b>", "...");
515 TEST_STRINGS_EQUAL(s
, "明末時<b>已</b><b>經</b>有香港地方的概念");
// Half the byte length: the expected snippet has "..." at both ends.
517 s
= mset
.snippet(input
, len
/ 2, stem
, flags
, "<b>", "</b>", "...");
518 TEST_STRINGS_EQUAL(s
, "...<b>已</b><b>經</b>有香港地...");
521 /// Test word break finding.
///
/// Calls snippet() with Xapian::MSet::SNIPPET_WORD_BREAKS through the
/// DO_TEST macro, which has two definitions below.
// NOTE(review): the preprocessor conditional selecting between the two
// DO_TEST definitions, parts of the second definition, the declaration of
// `stem` and the expected string of the last check are not visible here -
// confirm them.
522 DEFINE_TESTCASE(snippet_wordbreaks
, backend
) {
523 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
524 enquire
.set_query(Xapian::Query("已經"));
526 Xapian::MSet mset
= enquire
.get_mset(0, 0);
529 const char *input
= "明末時已經有香港地方的概念";
530 const char *input2
= "明末時已經有香港地方的概念. Hello!";
// `len` is the byte length of `input`; it is also the basis of the length
// used with `input2` below.
531 size_t len
= strlen(input
);
533 unsigned flags
= Xapian::MSet::SNIPPET_WORD_BREAKS
;
// First definition: compare the snippet with the expected result.
536 # define DO_TEST(CODE, RESULT) TEST_STRINGS_EQUAL(CODE, RESULT)
// Second definition: fail unless Xapian::FeatureUnavailableError is thrown;
// the message about needing ICU is checked in the catch handler.
538 # define DO_TEST(CODE, RESULT) \
541 FAIL_TEST("No exception thrown, expected FeatureUnavailableError"); \
542 } catch (const Xapian::FeatureUnavailableError& e) { \
543 TEST_STRINGS_EQUAL( \
545 "SNIPPET_WORD_BREAKS requires building Xapian to use ICU"); \
// The expected result highlights the two-character query term as one unit.
548 DO_TEST(mset
.snippet(input
, len
, stem
, flags
, "<b>", "</b>", "..."),
549 "明末時<b>已經</b>有香港地方的概念");
// This call uses "[" and "]" as highlight markers and "~" as the omission
// marker.
550 DO_TEST(mset
.snippet(input2
, len
/ 2, stem
, flags
, "[", "]", "~"),
555 DEFINE_TESTCASE(snippet_empty_mset
, backend
) {
556 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
557 enquire
.set_query(Xapian::Query());
558 Xapian::MSet mset
= enquire
.get_mset(0, 0);
559 TEST_STRINGS_EQUAL(mset
.snippet("foo", 3), "foo");
// Check that snippet() returns the sample text without markup, in a test
// declared with the `!backend` condition.
// NOTE(review): the declaration of `mset` is not visible here - presumably
// a default-constructed Xapian::MSet; confirm.
562 DEFINE_TESTCASE(snippet_empty_mset2
, !backend
) {
564 TEST_STRINGS_EQUAL(mset
.snippet("foo", 3), "foo");