[python3] Simplify generated wrapper post-processing
[xapian.git] / xapian-core / tests / api_snippets.cc
blob1e5faa0c60369a6c855682a204c20beb3e55e5da
1 /* api_snippets.cc: tests snippets
3 * Copyright 2012 Mihai Bivol
4 * Copyright 2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "api_snippets.h"
26 #include <fstream>
27 #include <string>
29 #include <xapian.h>
31 #include "apitest.h"
32 #include "backendmanager_local.h"
33 #include "testsuite.h"
34 #include "testutils.h"
36 #include <iostream>
38 using namespace std;
/// One table-driven snippet check: the text to snippet, the length to request
/// and the output the test expects.
struct snippet_testcase {
    /// Text handed to MSet::snippet().
    const char* input;

    /// Length argument handed to MSet::snippet().
    size_t len;

    /// Snippet the test expects to get back.
    const char* expect;
};
46 /// Test snippets without stemming.
47 DEFINE_TESTCASE(snippet1, backend) {
48 Xapian::Enquire enquire(get_database("apitest_simpledata"));
49 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
50 Xapian::Query("rubbish"),
51 Xapian::Query("mention")));
52 Xapian::MSet mset = enquire.get_mset(0, 0);
54 static const snippet_testcase testcases[] = {
55 // Test highlighting in full sample.
56 { "Rubbish and junk", 20, "<b>Rubbish</b> and junk" },
57 { "Project R.U.B.B.I.S.H. greenlit", 31, "Project <b>R.U.B.B.I.S.H.</b> greenlit" },
58 { "What a load of rubbish", 100, "What a load of <b>rubbish</b>" },
59 { "Mention rubbish", 100, "<b>Mention</b> <b>rubbish</b>" },
60 { "A mention of rubbish", 100, "A <b>mention</b> of <b>rubbish</b>" },
61 { "Rubbish mention of rubbish", 100, "<b>Rubbish</b> <b>mention</b> of <b>rubbish</b>" },
63 // Test selection of snippet.
64 { "Rubbish and junk", 12, "<b>Rubbish</b> and..." },
65 { "Project R.U.B.B.I.S.H. greenlit", 14, "...<b>R.U.B.B.I.S.H.</b>..." },
66 { "What a load of rubbish", 12, "...of <b>rubbish</b>" },
67 { "What a load of rubbish", 8, "...<b>rubbish</b>" },
68 { "Rubbish mention where the start is better than the rubbish ending", 18, "<b>Rubbish</b> <b>mention</b>..." },
70 // Should prefer "interesting" words for context.
71 { "And of the rubbish document to this", 18, "...<b>rubbish</b> document..." },
72 { "And if they document rubbish to be this", 18, "...document <b>rubbish</b>..." },
75 for (auto i : testcases) {
76 TEST_STRINGS_EQUAL(mset.snippet(i.input, i.len), i.expect);
79 return true;
82 /// Test snippets with stemming.
83 DEFINE_TESTCASE(snippetstem1, backend) {
84 Xapian::Enquire enquire(get_database("apitest_simpledata"));
85 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
86 Xapian::Query("rubbish"),
87 Xapian::Query("Zexampl")));
88 Xapian::MSet mset = enquire.get_mset(0, 0);
90 // Term Zexampl isn't in the database, but the highlighter should still
91 // handle it.
92 static const snippet_testcase testcases[] = {
93 // "rubbish" isn't stemmed, example is.
94 { "You rubbished my ideas", 24, "You rubbished my ideas" },
95 { "Rubbished all my examples", 20, "...all my <b>examples</b>" },
96 { "Examples of text", 20, "<b>Examples</b> of text" },
99 Xapian::Stem stem("en");
100 for (auto i : testcases) {
101 TEST_STRINGS_EQUAL(mset.snippet(i.input, i.len, stem), i.expect);
104 return true;
107 /// Test snippets with phrases.
108 DEFINE_TESTCASE(snippetphrase1, backend) {
109 Xapian::Enquire enquire(get_database("apitest_simpledata"));
110 Xapian::Query q(Xapian::Query::OP_PHRASE,
111 Xapian::Query("rubbish"),
112 Xapian::Query("mention"));
113 // Regression test - a phrase with a following sibling query would crash in
114 // the highlighting code.
115 enquire.set_query(q &~ Xapian::Query("banana"));
116 Xapian::MSet mset = enquire.get_mset(0, 0);
118 static const snippet_testcase testcases[] = {
119 { "A mention of rubbish", 18, "...mention of rubbish" },
120 { "This is a rubbish mention", 20, "...is a <b>rubbish mention</b>" },
121 { "Mention of a rubbish mention of rubbish", 45, "Mention of a <b>rubbish mention</b> of rubbish" },
122 { "Mention of a rubbish mention of rubbish", 18, "...<b>rubbish mention</b> of..." },
123 { "rubbish rubbish mention mention", 45, "rubbish <b>rubbish mention</b> mention" },
124 { "rubbish mention rubbish mention", 45, "<b>rubbish mention</b> <b>rubbish mention</b>" },
127 Xapian::Stem stem("en");
128 for (auto i : testcases) {
129 TEST_STRINGS_EQUAL(mset.snippet(i.input, i.len, stem), i.expect);
132 return true;
135 /// Index file to a DB with TermGenerator.
136 static void
137 make_tg_db(Xapian::WritableDatabase &db, const string & source)
139 string file = test_driver::get_srcdir();
140 file += "/testdata/";
141 file += source;
142 file += ".txt";
143 ifstream input;
144 input.open(file.c_str());
145 if (!input.is_open()) {
146 FAIL_TEST("Couldn't open input: " << file);
149 Xapian::TermGenerator tg;
150 tg.set_stemmer(Xapian::Stem("en"));
151 while (!input.eof()) {
152 Xapian::Document doc;
153 tg.set_document(doc);
154 string line, data;
155 while (true) {
156 getline(input, line);
157 if (find_if(line.begin(), line.end(), C_isnotspace) == line.end())
158 break;
159 tg.index_text(line);
160 if (!data.empty()) data += ' ';
161 data += line;
163 doc.set_data(data);
164 db.add_document(doc);
168 /// Test snippets in various ways.
169 DEFINE_TESTCASE(snippetmisc1, generated) {
170 Xapian::Database db = get_database("snippet", make_tg_db, "snippet");
171 Xapian::Enquire enquire(db);
172 enquire.set_weighting_scheme(Xapian::BoolWeight());
173 Xapian::Stem stem("en");
175 static const char * const words[] = { "do", "we", "have" };
176 Xapian::Query q(Xapian::Query::OP_PHRASE, words, words + 3);
177 enquire.set_query(q);
178 Xapian::MSet mset = enquire.get_mset(0, 6);
179 TEST_EQUAL(mset.size(), 3);
180 TEST_STRINGS_EQUAL(mset.snippet(mset[0].get_document().get_data(), 40, stem),
181 "How much o'brien <b>do we have</b>? Miles...");
182 TEST_STRINGS_EQUAL(mset.snippet(mset[1].get_document().get_data(), 40, stem),
183 "...Unicode: How much o’brien <b>do we have</b>?");
184 TEST_STRINGS_EQUAL(mset.snippet(mset[2].get_document().get_data(), 32, stem),
185 "We do have we <b>do we have</b> do we.");
187 enquire.set_query(Xapian::Query("Zwelcom") | Xapian::Query("Zmike"));
188 mset = enquire.get_mset(0, 6);
189 TEST_EQUAL(mset.size(), 3);
190 TEST_STRINGS_EQUAL(mset.snippet(mset[0].get_document().get_data(), 25, stem),
191 "<b>Welcome</b> to <b>Mike's</b>...");
192 TEST_STRINGS_EQUAL(mset.snippet(mset[1].get_document().get_data(), 5, stem),
193 "<b>Mike</b>...");
194 TEST_STRINGS_EQUAL(mset.snippet(mset[2].get_document().get_data(), 10, stem),
195 "...<b>Mike</b> can...");
197 enquire.set_query(Xapian::Query(q.OP_WILDCARD, "m"));
198 mset = enquire.get_mset(0, 6);
199 TEST_EQUAL(mset.size(), 5);
200 TEST_STRINGS_EQUAL(mset.snippet(mset[0].get_document().get_data(), 18, stem),
201 "...<b>Mike's</b> <b>Mechanical</b>...");
202 TEST_STRINGS_EQUAL(mset.snippet(mset[1].get_document().get_data(), 80, stem),
203 "<b>Mike</b> <b>McDonald</b> is a <b>mechanic</b> who enjoys repairing things of a <b>mechanical</b> sort.");
204 TEST_STRINGS_EQUAL(mset.snippet(mset[2].get_document().get_data(), 102, stem),
205 "From autos to zip-lines, from tea-lights to x-rays, from sea ships to u-boats - <b>Mike</b> can fix them all.");
206 TEST_STRINGS_EQUAL(mset.snippet(mset[3].get_document().get_data(), 64, stem),
207 "How <b>much</b> o'brien do we have? <b>Miles</b> O'Brien, that's how <b>much</b>.");
208 // The requested length is in bytes, so the "fancy" apostrophe results in
209 // fewer Unicode characters in this sample than the previous one.
210 TEST_STRINGS_EQUAL(mset.snippet(mset[4].get_document().get_data(), 64, stem),
211 "...<b>much</b> o’brien do we have? <b>Miles</b> O’Brien, that’s how <b>much</b>.");
213 return true;
216 /// Test snippet term diversity.
217 DEFINE_TESTCASE(snippet_termcover1, backend) {
218 static const snippet_testcase testcases[] = {
219 // "Zexample" isn't in the database, so should get termweight 0. Once
220 // max_tw is added on, "rubbish" should have just under twice the
221 // relevance of "example" so clearly should win in a straight fight.
222 { "A rubbish, but a good example", 14, "...<b>rubbish</b>, but a..."},
223 // But a second occurrence of "rubbish" has half the relevance, so
224 // "example" should add slightly more relevance.
225 { "Rubbish and rubbish, and rubbish examples", 22, "...and <b>rubbish</b> <b>examples</b>"},
226 // And again.
227 { "rubbish rubbish example rubbish rubbish", 16, "...<b>example</b> <b>rubbish</b>..." },
230 Xapian::Stem stem("en");
231 // Disable SNIPPET_BACKGROUND_MODEL so we can test the relevance decay
232 // for repeated terms.
233 unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE;
234 for (auto i : testcases) {
235 Xapian::Enquire enquire(get_database("apitest_simpledata"));
236 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
237 Xapian::Query("rubbish"),
238 Xapian::Query("Zexampl")));
240 Xapian::MSet mset = enquire.get_mset(0, 0);
241 TEST_STRINGS_EQUAL(mset.snippet(i.input, i.len, stem, flags), i.expect);
244 return true;
247 /// Test snippet term diversity cases with BoolWeight.
248 DEFINE_TESTCASE(snippet_termcover2, backend) {
249 // With BoolWeight, all terms have 0 termweight, and so relevance 1.0
250 // (since max_tw is set to 1.0 if it is zero).
251 static const snippet_testcase testcases[] = {
252 // Diversity should pick two different terms in preference.
253 { "rubbish rubbish example rubbish rubbish", 16, "...<b>example</b> <b>rubbish</b>..." },
254 // And again.
255 { "Rubbish and rubbish, and rubbish examples", 22, "...and <b>rubbish</b> <b>examples</b>"},
256 // The last of two equal snippet should win.
257 { "A rubbish, but a good example", 14, "...a good <b>example</b>"},
260 Xapian::Stem stem("en");
261 // Disable SNIPPET_BACKGROUND_MODEL so we can test the relevance decay
262 // for repeated terms.
263 unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE;
264 for (auto i : testcases) {
265 Xapian::Enquire enquire(get_database("apitest_simpledata"));
266 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
267 Xapian::Query("rubbish"),
268 Xapian::Query("Zexampl")));
269 enquire.set_weighting_scheme(Xapian::BoolWeight());
271 Xapian::MSet mset = enquire.get_mset(0, 0);
272 TEST_STRINGS_EQUAL(mset.snippet(i.input, i.len, stem, flags), i.expect);
275 return true;
278 /// Test snippet EMPTY_WITHOUT_MATCH flag
279 DEFINE_TESTCASE(snippet_empty, backend) {
281 Xapian::Stem stem("en");
283 Xapian::Enquire enquire(get_database("apitest_simpledata"));
284 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
285 Xapian::Query("rubbish"),
286 Xapian::Query("Zexampl")));
288 Xapian::MSet mset = enquire.get_mset(0, 0);
290 // A non-matching text
291 const char *input = "A string without a match.";
292 size_t len = strlen(input);
294 // By default, snippet() returns len bytes of input without markup
295 unsigned flags = 0;
296 TEST_STRINGS_EQUAL(mset.snippet(input, len, stem, 0), input);
298 // force snippet() to return the empty string if no term got matched
299 flags |= Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
300 TEST_STRINGS_EQUAL(mset.snippet(input, len, stem, flags), "");
302 // A text with a match
303 input = "A rubbish example text";
304 len = strlen(input);
306 flags = 0;
307 TEST_STRINGS_EQUAL(mset.snippet(input, len, stem, flags),
308 "A <b>rubbish</b> <b>example</b> text");
310 flags |= Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
311 TEST_STRINGS_EQUAL(mset.snippet(input, len, stem, flags),
312 "A <b>rubbish</b> <b>example</b> text");
314 return true;