Adjust new comments to match existing
[xapian.git] / xapian-letor / tests / api_letor.cc
blob5b19eeeff55a71cfbc834900075c41ab8ff68d5d
1 /** @file
2 * @brief test common features of API classes
3 */
4 /* Copyright (C) 2007,2009,2012,2014,2015,2016,2023 Olly Betts
5 * Copyright (C) 2019 Vaibhav Kansagara
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "api_letor.h"
26 #include <cstdlib>
27 #include <fstream>
28 #include <sstream>
30 #include <xapian.h>
31 #include <xapian-letor.h>
33 #include "apitest.h"
34 #include "filetests.h"
35 #include "safeunistd.h"
36 #include "testutils.h"
38 using namespace std;
40 // To check for one document edge
41 static void
42 db_index_one_document(Xapian::WritableDatabase& db, const string&)
44 Xapian::Document doc;
45 Xapian::TermGenerator termgenerator;
46 termgenerator.set_document(doc);
47 termgenerator.set_stemmer(Xapian::Stem("en"));
48 termgenerator.index_text("Tigers are solitary animals", 1, "S");
49 termgenerator.index_text("Might be that only one Tiger is good enough to "
50 "Take out a ranker, a Tiger is a good way to "
51 "check if a test is working or Tiger not. Tiger."
52 "What if the next line contains no Tigers? Would "
53 "it make a difference to your ranker ? Tigers "
54 "for the win.", 1, "XD");
55 termgenerator.index_text("The will.");
56 termgenerator.increase_termpos();
57 termgenerator.index_text("Tigers would not be caught if one calls out the "
58 "Tiger from the den. This document is to check if "
59 "in the massive dataset, you forget the sense of "
60 "something you would not like to stop.");
61 db.add_document(doc);
64 static void
65 db_index_two_documents(Xapian::WritableDatabase& db, const string&)
67 Xapian::Document doc;
68 Xapian::TermGenerator termgenerator;
69 termgenerator.set_document(doc);
70 termgenerator.set_stemmer(Xapian::Stem("en"));
71 termgenerator.index_text("Lions, Tigers, Bears and Giraffes", 1, "S");
72 termgenerator.index_text("This paragraph talks about lions and tigers and "
73 "bears (oh, my!). It mentions giraffes, "
74 "but that's not really very important. Lions "
75 "and tigers are big cats, so they must be really "
76 "cuddly. Bears are famous for being cuddly, at "
77 "least when they're teddy bears.", 1, "XD");
78 termgenerator.index_text("Lions, Tigers, Bears and Giraffes");
79 termgenerator.increase_termpos();
80 termgenerator.index_text("This paragraph talks about lions and tigers and "
81 "bears (oh, my!). It mentions giraffes, "
82 "but that's not really very important. Lions "
83 "and tigers are big cats, so they must be really "
84 "cuddly. Bears are famous for being cuddly, at "
85 "least when they're teddy bears.");
86 db.add_document(doc);
87 doc.clear_terms();
88 termgenerator.index_text("Lions, Tigers and Bears", 1, "S");
89 termgenerator.index_text("This is the paragraph of interest. Tigers are "
90 "massive beasts - I wouldn't want to meet a "
91 "hungry one anywhere. Lions are scary even when "
92 "lyin' down. Bears are scary even when bare. "
93 "Together I suspect they'd be less scary, as the "
94 "tigers, lions, and bears would all keep each "
95 "other busy. On the other hand, bears don't live "
96 "in the same continent as far as I know.", 1,
97 "XD");
98 termgenerator.index_text("Lions, Tigers and Bears");
99 termgenerator.increase_termpos();
100 termgenerator.index_text("This is the paragraph of interest. Tigers are "
101 "massive beasts - I wouldn't want to meet a "
102 "hungry one anywhere. Lions are scary even when "
103 "lyin' down. Bears are scary even when bare. "
104 "Together I suspect they'd be less scary, as the "
105 "tigers, lions, and bears would all keep each "
106 "other busy. On the other hand, bears don't live "
107 "in the same continent as far as I know.");
108 db.add_document(doc);
111 // To check for three documents. out of which one is irrelevant
112 static void
113 db_index_three_documents(Xapian::WritableDatabase& db, const string&)
115 Xapian::Document doc;
116 Xapian::TermGenerator termgenerator;
117 termgenerator.set_document(doc);
118 termgenerator.set_stemmer(Xapian::Stem("en"));
119 termgenerator.index_text("The will", 1, "S");
120 termgenerator.index_text("The will are considered stop words in xapian and "
121 "would be thrown off, so the query I want to say "
122 "is score, yes, score. The Score of a game is "
123 "the determining factor of a game, the score is "
124 "what matters at the end of the day. so my advise "
125 "to everyone is to Score it!.", 1, "XD");
126 termgenerator.index_text("Score might be something else too, but this para "
127 "refers to score only at an abstract. Scores are "
128 "in general scoring. Score it!");
129 termgenerator.increase_termpos();
130 termgenerator.index_text("Score score score is important.");
131 db.add_document(doc);
132 doc.clear_terms();
133 termgenerator.index_text("Score score score score score score", 1, "S");
134 termgenerator.index_text("it might have an absurdly high rank in the qrel "
135 "file or might have no rank at all in another. "
136 "Look out for this as a testcase, might be edgy "
137 "good luck and may this be with you.", 1, "XD");
138 termgenerator.index_text("Another irrelevant paragraph to make sure the tf "
139 "values are down, but this increases idf values "
140 "but let's see how this works out.");
141 termgenerator.increase_termpos();
142 termgenerator.index_text("Nothing to do with the query.");
143 db.add_document(doc);
144 doc.clear_terms();
145 termgenerator.index_text("Document has nothing to do with score", 1, "S");
146 termgenerator.index_text("This is just to check if score is given a higher "
147 "score if it is in the subject or not. Nothing "
148 "special, just judging scores by the look of it. "
149 "Some more scores but a bad qrel should be enough "
150 "to make sure it is ranked down.", 1, "XD");
151 termgenerator.index_text("Score might be something else too, but this para "
152 "refers to score only at an abstract. Scores are "
153 "in general scoring. Score it!");
154 termgenerator.increase_termpos();
155 termgenerator.index_text("Score score score is important.");
156 db.add_document(doc);
159 // To check for three documents in which one has no common terms with other two.
160 static void
161 db_index_three_documents_no_common(Xapian::WritableDatabase& db, const string&)
163 Xapian::Document doc;
164 Xapian::TermGenerator termgenerator;
165 termgenerator.set_document(doc);
166 termgenerator.set_stemmer(Xapian::Stem("en"));
167 termgenerator.index_text("The will", 1, "S");
168 termgenerator.index_text("The will are considered stop words in xapian and "
169 "would be thrown off, so the query I want to say "
170 "is score, yes, score. The Score of a game is "
171 "the determining factor of a game, the score is "
172 "what matters at the end of the day. so my advise "
173 "to everyone is to Score it!.", 1, "XD");
174 termgenerator.index_text("Score might be something else too, but this para "
175 "refers to score only at an abstract. Scores are "
176 "in general scoring. Score it!");
177 termgenerator.increase_termpos();
178 termgenerator.index_text("Score score score is important.");
179 db.add_document(doc);
180 doc.clear_terms();
181 termgenerator.index_text("Document has nothing to do with score", 1, "S");
182 termgenerator.index_text("This is just to check if score is given a higher "
183 "score if it is in the subject or not. Nothing "
184 "special, just judging scores by the look of it. "
185 "Some more scores but a bad qrel should be enough "
186 "to make sure it is ranked down.", 1, "XD");
187 termgenerator.index_text("Score might be something else too, but this para "
188 "refers to score only at an abstract. Scores are "
189 "in general scoring. Score it!");
190 termgenerator.increase_termpos();
191 termgenerator.index_text("Score score score is important.");
192 db.add_document(doc);
193 doc.clear_terms();
194 termgenerator.index_text("Tigers are solitary animals", 1, "S");
195 termgenerator.index_text("Might be that only one Tiger is good enough to "
196 "Take out a ranker, a Tiger is a good way to "
197 "check if a test is working or Tiger not. Tiger."
198 "What if the next line contains no Tigers? Would "
199 "it make a difference to your ranker ? Tigers "
200 "for the win.", 1, "XD");
201 termgenerator.index_text("The will.");
202 termgenerator.increase_termpos();
203 termgenerator.index_text("Tigers would not be caught if one calls out the "
204 "Tiger from the den. This document is to check if "
205 "in the massive dataset, you forget the sense of "
206 "something you would not like to stop.");
207 db.add_document(doc);
210 DEFINE_TESTCASE(createfeaturevector, backend)
212 Xapian::FeatureList fl;
213 Xapian::Database db = get_database("db_index_two_documents",
214 db_index_two_documents);
215 Xapian::Enquire enquire(db);
216 enquire.set_query(Xapian::Query("lions"));
217 Xapian::MSet mset;
218 mset = enquire.get_mset(0, 10);
219 TEST(!mset.empty());
220 TEST_EQUAL(mset.size(), 2);
221 auto fv = fl.create_feature_vectors(mset, Xapian::Query("lions"), db);
222 TEST_EQUAL(fv.size(), 2);
223 TEST_EQUAL(fv[0].get_fcount(), 19);
224 TEST_EQUAL(fv[1].get_fcount(), 19);
227 DEFINE_TESTCASE(createfeaturevectoronevector, backend)
229 Xapian::FeatureList fl;
230 Xapian::Database db = get_database("apitest_ranker2",
231 db_index_one_document);
232 Xapian::Enquire enquire(db);
233 enquire.set_query(Xapian::Query("tigers"));
234 Xapian::MSet mset;
235 mset = enquire.get_mset(0, 10);
236 TEST(!mset.empty());
237 auto fv = fl.create_feature_vectors(mset, Xapian::Query("tigers"), db);
238 TEST_EQUAL(fv.size(), 1);
239 TEST_EQUAL(fv[0].get_fcount(), 19);
242 DEFINE_TESTCASE(createfeaturevectoronevector_wrongquery, backend)
244 Xapian::FeatureList fl;
245 Xapian::Database db = get_database("apitest_ranker3",
246 db_index_one_document);
247 Xapian::Enquire enquire(db);
248 enquire.set_query(Xapian::Query("llamas"));
249 Xapian::MSet mset;
250 mset = enquire.get_mset(0, 10);
251 TEST(mset.empty());
252 auto fv = fl.create_feature_vectors(mset, Xapian::Query("llamas"), db);
253 TEST_EQUAL(fv.size(), 0);
256 DEFINE_TESTCASE(createfeaturevectorthree, backend)
258 Xapian::FeatureList fl;
259 Xapian::Database db = get_database("db_index_three_documents",
260 db_index_three_documents);
261 Xapian::Enquire enquire(db);
262 enquire.set_query(Xapian::Query("score"));
263 Xapian::MSet mset;
264 mset = enquire.get_mset(0, 10);
265 TEST(!mset.empty());
266 auto fv = fl.create_feature_vectors(mset, Xapian::Query("score"), db);
267 TEST_EQUAL(fv.size(), 2);
268 TEST_EQUAL(fv[0].get_fcount(), 19);
269 TEST_EQUAL(fv[1].get_fcount(), 19);
272 DEFINE_TESTCASE(emptyfeaturelist, !backend)
274 vector<Xapian::Feature*> f;
275 TEST_EXCEPTION(Xapian::InvalidArgumentError, Xapian::FeatureList fl(f));
278 DEFINE_TESTCASE(bigfeaturelist, backend)
280 vector<Xapian::Feature*> f;
281 f.push_back(new Xapian::TfFeature());
282 f.push_back(new Xapian::TfDoclenFeature());
283 f.push_back(new Xapian::IdfFeature());
284 f.push_back(new Xapian::CollTfCollLenFeature());
285 f.push_back(new Xapian::TfIdfDoclenFeature());
286 f.push_back(new Xapian::TfDoclenCollTfCollLenFeature());
287 f.push_back(new Xapian::TfFeature());
288 f.push_back(new Xapian::TfDoclenFeature());
290 // pass big feature list.
291 Xapian::FeatureList fl(f);
292 Xapian::Database db = get_database("db_index_two_documents",
293 db_index_two_documents);
294 Xapian::Enquire enquire(db);
295 enquire.set_query(Xapian::Query("tigers"));
296 Xapian::MSet mset;
297 mset = enquire.get_mset(0, 10);
299 TEST(!mset.empty());
301 auto fv = fl.create_feature_vectors(mset, Xapian::Query("tigers"), db);
302 TEST_EQUAL(fv.size(), 2);
303 // Each feature contributes three values and weight as the default one
304 // making total as 25.
305 TEST_EQUAL(fv[0].get_fcount(), 25);
306 TEST_EQUAL(fv[1].get_fcount(), 25);
309 DEFINE_TESTCASE(preparetrainingfileonedb, path && writable)
311 string db_path = get_database_path("apitest_listnet_ranker1",
312 db_index_one_document);
313 string data_directory = test_driver::get_srcdir() + "/testdata/";
314 string query = data_directory + "queryone.txt";
315 string qrel = data_directory + "qrelone.txt";
316 string training_data = data_directory + "training_data_one_document.txt";
317 unlink("training_output_data_one_doc.txt");
318 Xapian::prepare_training_file(db_path, query, qrel, 10,
319 "training_output_data_one_doc.txt");
320 TEST(file_exists("training_output_data_one_doc.txt"));
321 ifstream if1(training_data);
322 ifstream if2("training_output_data_one_doc.txt");
323 string line1;
324 string line2;
325 while (getline(if1, line1)) {
326 TEST(getline(if2, line2));
327 istringstream iss1(line1);
328 istringstream iss2(line2);
329 string temp1;
330 string temp2;
331 int i = 0;
332 while ((iss1 >> temp1) && (iss2 >> temp2)) {
333 // The 0th, 1st and 21st literals taken as input, are strings,
334 // and can be compared directly, They are: For example(test):
335 // ("1", "qid:20001" and "#docid=1") at 0th, 1st, and 21st pos
336 // respectively. Whereas the other values are doubles which
337 // would have to tested under TEST_DOUBLE() against precision.
338 if (i == 0 || i == 1 || i == 21) {
339 TEST_EQUAL(temp1, temp2);
340 } else {
341 size_t t1 = temp1.find_first_of(':');
342 size_t t2 = temp2.find_first_of(':');
343 TEST_EQUAL_DOUBLE(stod(temp1.substr(t1 + 1)),
344 stod(temp2.substr(t2 + 1)));
346 i++;
348 TEST_REL(i, ==, 22);
349 TEST(!(iss2 >> temp2));
351 TEST(!getline(if2, line2));
352 unlink("training_output_data_one_doc.txt");
// Check that prepare_training_file() throws Xapian::LetorParseError when
// given the query file QUERY_FILE (a file name appended to data_directory).
// The expansion refers to the caller's local variables db_path,
// data_directory and qrel, so those must be in scope where it is used.
#define TEST_PARSE_EXCEPTION(QUERY_FILE) \
    TEST_EXCEPTION(Xapian::LetorParseError, \
                   Xapian::prepare_training_file(db_path, \
                                                 data_directory + QUERY_FILE, \
                                                 qrel, 10, \
                                                 "training_output.txt"))
360 // test whether query ids are unique in queryfile.
361 DEFINE_TESTCASE(unique_queryid, path)
363 string db_path = get_database_path("db_index_one_document",
364 db_index_one_document);
365 string data_directory = test_driver::get_srcdir() + "/testdata/";
366 string qrel = data_directory + "qrelone.txt";
367 TEST_PARSE_EXCEPTION("unique_query_id.txt");
370 DEFINE_TESTCASE(parse_querystring, path)
372 // All those cases which are not valid.
373 string db_path = get_database_path("db_index_one_document",
374 db_index_one_document);
375 string data_directory = test_driver::get_srcdir() + "/testdata/";
376 string qrel = data_directory + "qrelone.txt";
377 TEST_PARSE_EXCEPTION("parse_query_noopenquote.txt");
378 TEST_PARSE_EXCEPTION("parse_query_noclosingquote.txt");
379 TEST_PARSE_EXCEPTION("parse_query_empty_string.txt");
380 TEST_PARSE_EXCEPTION("nospace.txt");
381 TEST_PARSE_EXCEPTION("nosinglequotes.txt");
382 TEST_PARSE_EXCEPTION("blank_space_before_query.txt");
384 // All those cases which are valid.
385 Xapian::prepare_training_file(db_path, data_directory +
386 "parse_query_valid.txt", qrel, 10,
387 "training_output.txt");
390 // Check stability for an empty qrel file
391 DEFINE_TESTCASE(preparetrainingfileonedb_empty_qrel, path)
393 string db_path = get_database_path("ranker_empty",
394 db_index_one_document);
395 string data_directory = test_driver::get_srcdir() + "/testdata/";
396 string query = data_directory + "queryone.txt";
397 string qrel = data_directory + "empty_file.txt";
398 string training_data = data_directory + "empty_file.txt";
399 unlink("training_output_empty.txt");
400 Xapian::prepare_training_file(db_path, query, qrel, 10,
401 "training_output_empty.txt");
402 TEST(file_exists("training_output_empty.txt"));
403 ifstream if1(training_data);
404 ifstream if2("training_output_empty.txt");
405 string line1;
406 string line2;
407 while (getline(if1, line1)) {
408 TEST(getline(if2, line2));
409 istringstream iss1(line1);
410 istringstream iss2(line2);
411 string temp1;
412 string temp2;
413 int i = 0;
414 while ((iss1 >> temp1) && (iss2 >> temp2)) {
415 if (i == 0 || i == 1 || i == 21) {
416 TEST_EQUAL(temp1, temp2);
417 } else {
418 size_t t1 = temp1.find_first_of(':');
419 size_t t2 = temp2.find_first_of(':');
420 TEST_EQUAL_DOUBLE(stod(temp1.substr(t1 + 1)),
421 stod(temp2.substr(t2 + 1)));
423 i++;
425 TEST_REL(i, ==, 22);
426 TEST(!(iss2 >> temp2));
428 TEST(!getline(if2, line2));
429 unlink("training_output_empty.txt");
432 DEFINE_TESTCASE(preparetrainingfile_two_docs, path)
434 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
435 string db_path = get_database_path("db_index_two_documents",
436 db_index_two_documents);
437 string data_directory = test_driver::get_srcdir() + "/testdata/";
438 string query = data_directory + "query.txt";
439 string qrel = data_directory + "qrel.txt";
440 string training_data = data_directory + "training_data.txt";
441 unlink("training_output1.txt");
442 Xapian::prepare_training_file(db_path, query, qrel, 10,
443 "training_output1.txt");
444 TEST(file_exists("training_output1.txt"));
445 ifstream if1(training_data);
446 ifstream if2("training_output1.txt");
447 string line1;
448 string line2;
449 while (getline(if1, line1)) {
450 TEST(getline(if2, line2));
451 istringstream iss1(line1);
452 istringstream iss2(line2);
453 string temp1;
454 string temp2;
455 int i = 0;
456 while ((iss1 >> temp1) && (iss2 >> temp2)) {
457 if (i == 0 || i == 1 || i == 21) {
458 TEST_EQUAL(temp1, temp2);
459 } else {
460 size_t t1 = temp1.find_first_of(':');
461 size_t t2 = temp2.find_first_of(':');
462 TEST_EQUAL_DOUBLE(stod(temp1.substr(t1 + 1)),
463 stod(temp2.substr(t2 + 1)));
465 i++;
467 TEST_REL(i, ==, 22);
468 TEST(!(iss2 >> temp2));
470 TEST(!getline(if2, line2));
471 unlink("training_output1.txt");
474 DEFINE_TESTCASE(preparetrainingfilethree, path)
476 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
477 string db_path = get_database_path("db_index_three_documents",
478 db_index_three_documents);
479 string data_directory = test_driver::get_srcdir() + "/testdata/";
480 string query = data_directory + "querythree.txt";
481 string qrel = data_directory + "qrelthree_correct.txt";
482 string training_data = data_directory + "training_data_three_correct.txt";
483 unlink("training_output_three_correct.txt");
484 Xapian::prepare_training_file(db_path, query, qrel, 10,
485 "training_output_three_correct.txt");
486 TEST(file_exists("training_output_three_correct.txt"));
487 ifstream if1(training_data);
488 ifstream if2("training_output_three_correct.txt");
489 string line1;
490 string line2;
491 while (getline(if1, line1)) {
492 TEST(getline(if2, line2));
493 istringstream iss1(line1);
494 istringstream iss2(line2);
495 string temp1;
496 string temp2;
497 int i = 0;
498 while ((iss1 >> temp1) && (iss2 >> temp2)) {
499 if (i == 0 || i == 1 || i == 21) {
500 TEST_EQUAL(temp1, temp2);
501 } else {
502 size_t t1 = temp1.find_first_of(':');
503 size_t t2 = temp2.find_first_of(':');
504 TEST_EQUAL_DOUBLE(stod(temp1.substr(t1 + 1)),
505 stod(temp2.substr(t2 + 1)));
507 i++;
509 TEST_REL(i, ==, 22);
510 TEST(!(iss2 >> temp2));
512 TEST(!getline(if2, line2));
513 unlink("training_output_three_correct.txt");
516 // ListNet_Ranker check
517 DEFINE_TESTCASE(listnet_ranker, path && writable)
519 Xapian::ListNETRanker ranker;
520 TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
521 string db_path = get_database_path("db_index_two_documents",
522 db_index_two_documents);
523 Xapian::Enquire enquire((Xapian::Database(db_path)));
524 enquire.set_query(Xapian::Query("lions"));
525 Xapian::MSet mymset = enquire.get_mset(0, 10);
526 string data_directory = test_driver::get_srcdir() + "/testdata/";
527 string query = data_directory + "query.txt";
528 string qrel = data_directory + "qrel.txt";
529 string training_data = data_directory + "training_data.txt";
530 ranker.set_database_path(db_path);
531 TEST_EQUAL(ranker.get_database_path(), db_path);
532 ranker.set_query(Xapian::Query("lions"));
533 ranker.train_model(training_data);
534 Xapian::docid doc1 = *mymset[0];
535 Xapian::docid doc2 = *mymset[1];
536 ranker.rank(mymset);
537 TEST_EQUAL(doc1, *mymset[1]);
538 TEST_EQUAL(doc2, *mymset[0]);
539 mymset = enquire.get_mset(0, 10);
540 ranker.train_model(training_data, "ListNet_Ranker");
541 ranker.rank(mymset, "ListNet_Ranker");
542 TEST_EQUAL(doc1, *mymset[1]);
543 TEST_EQUAL(doc2, *mymset[0]);
544 TEST_EXCEPTION(Xapian::LetorInternalError,
545 ranker.score(query, qrel, "ListNet_Ranker",
546 "scorer_output.txt", 10, ""));
547 TEST_EXCEPTION(Xapian::FileNotFoundError,
548 ranker.score("", qrel, "ListNet_Ranker",
549 "scorer_output.txt", 10));
550 TEST_EXCEPTION(Xapian::FileNotFoundError,
551 ranker.score(qrel, "", "ListNet_Ranker",
552 "scorer_output.txt", 10));
553 unlink("ndcg_output_listnet_2.txt");
554 ranker.score(query, qrel, "ListNet_Ranker", "ndcg_output_listnet_2.txt",
555 10);
556 TEST(file_exists("ndcg_output_listnet_2.txt"));
557 unlink("ndcg_output_listnet_2.txt");
558 unlink("err_output_listnet_2.txt");
559 ranker.score(query, qrel, "ListNet_Ranker", "err_output_listnet_2.txt",
560 10, "ERRScore");
561 TEST(file_exists("err_output_listnet_2.txt"));
562 unlink("err_output_listnet_2.txt");
565 DEFINE_TESTCASE(listnet_ranker_one_file, path && writable)
567 Xapian::ListNETRanker ranker;
568 TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
569 string db_path = get_database_path("apitest_listnet_ranker5",
570 db_index_one_document);
571 Xapian::Enquire enquire((Xapian::Database(db_path)));
572 enquire.set_query(Xapian::Query("tigers"));
573 Xapian::MSet mymset = enquire.get_mset(0, 10);
574 string data_directory = test_driver::get_srcdir() + "/testdata/";
575 string query = data_directory + "queryone.txt";
576 string qrel = data_directory + "qrelone.txt";
577 string training_data = data_directory + "training_data_one_document.txt";
578 ranker.set_database_path(db_path);
579 TEST_EQUAL(ranker.get_database_path(), db_path);
580 ranker.set_query(Xapian::Query("tigers"));
581 ranker.train_model(training_data);
582 Xapian::docid doc1 = *mymset[0];
583 ranker.rank(mymset);
584 TEST_EQUAL(doc1, *mymset[0]);
585 mymset = enquire.get_mset(0, 10);
586 ranker.train_model(training_data, "ListNet_Ranker");
587 ranker.rank(mymset, "ListNet_Ranker");
588 TEST_EQUAL(doc1, *mymset[0]);
589 TEST_EXCEPTION(Xapian::LetorInternalError,
590 ranker.score(query, qrel, "ListNet_Ranker",
591 "scorer_output.txt", 10, ""));
592 TEST_EXCEPTION(Xapian::FileNotFoundError,
593 ranker.score("", qrel, "ListNet_Ranker",
594 "scorer_output.txt", 10));
595 TEST_EXCEPTION(Xapian::FileNotFoundError,
596 ranker.score(qrel, "", "ListNet_Ranker",
597 "scorer_output.txt", 10));
598 unlink("ndcg_output_listnet_1.txt");
599 ranker.score(query, qrel, "ListNet_Ranker", "ndcg_output_listnet_1.txt",
600 10);
601 TEST(file_exists("ndcg_output_listnet_1.txt"));
602 unlink("ndcg_output_listnet_1.txt");
603 unlink("err_output_listnet_1.txt");
604 ranker.score(query, qrel, "ListNet_Ranker", "err_output_listnet_1.txt", 10,
605 "ERRScore");
606 TEST(file_exists("err_output_listnet_1.txt"));
607 unlink("err_output_listnet_1.txt");
610 DEFINE_TESTCASE(listnet_ranker_three_correct, path && writable)
612 Xapian::ListNETRanker ranker;
613 TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
614 string db_path = get_database_path("db_index_three_documents",
615 db_index_three_documents);
616 Xapian::Enquire enquire((Xapian::Database(db_path)));
617 enquire.set_query(Xapian::Query("score"));
618 Xapian::MSet mymset = enquire.get_mset(0, 10);
619 string data_directory = test_driver::get_srcdir() + "/testdata/";
620 string query = data_directory + "querythree.txt";
621 string qrel = data_directory + "qrelthree_correct.txt";
622 string training_data = data_directory + "training_data_three_correct.txt";
623 ranker.set_database_path(db_path);
624 TEST_EQUAL(ranker.get_database_path(), db_path);
625 ranker.set_query(Xapian::Query("score"));
626 ranker.train_model(training_data);
627 Xapian::docid doc1 = *mymset[0];
628 Xapian::docid doc2 = *mymset[1];
629 ranker.rank(mymset);
630 TEST_EQUAL(doc1, *mymset[1]);
631 TEST_EQUAL(doc2, *mymset[0]);
632 mymset = enquire.get_mset(0, 10);
633 ranker.train_model(training_data, "ListNet_Ranker");
634 ranker.rank(mymset, "ListNet_Ranker");
635 TEST_EQUAL(doc1, *mymset[1]);
636 TEST_EQUAL(doc2, *mymset[0]);
637 TEST_EXCEPTION(Xapian::LetorInternalError,
638 ranker.score(query, qrel, "ListNet_Ranker",
639 "scorer_output.txt", 10, ""));
640 TEST_EXCEPTION(Xapian::FileNotFoundError,
641 ranker.score("", qrel, "ListNet_Ranker",
642 "scorer_output.txt", 10));
643 TEST_EXCEPTION(Xapian::FileNotFoundError,
644 ranker.score(qrel, "", "ListNet_Ranker",
645 "scorer_output.txt", 10));
646 unlink("ndcg_output_listnet_3.txt");
647 ranker.score(query, qrel, "ListNet_Ranker", "ndcg_output_listnet_3.txt",
648 10);
649 TEST(file_exists("ndcg_output_listnet_3.txt"));
650 unlink("ndcg_output_listnet_3.txt");
651 unlink("err_output_listnet_3.txt");
652 ranker.score(query, qrel, "ListNet_Ranker", "err_output_listnet_3.txt", 10,
653 "ERRScore");
654 TEST(file_exists("err_output_listnet_3.txt"));
655 unlink("err_output_listnet_3.txt");
658 DEFINE_TESTCASE(scorer, path && writable)
660 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
661 Xapian::ListNETRanker ranker;
662 string db_path = get_database_path("db_index_three_documents",
663 db_index_three_documents);
664 Xapian::Enquire enquire((Xapian::Database(db_path)));
665 enquire.set_query(Xapian::Query("score"));
666 Xapian::MSet mymset = enquire.get_mset(0, 10);
667 string data_directory = test_driver::get_srcdir() + "/testdata/";
668 string query = data_directory + "querythree.txt";
669 string qrel = data_directory + "qrelthree_correct.txt";
670 string training_data = data_directory + "training_data_three_correct.txt";
671 ranker.set_database_path(db_path);
672 TEST_EQUAL(ranker.get_database_path(), db_path);
673 ranker.set_query(Xapian::Query("score"));
674 ranker.train_model(training_data);
675 Xapian::docid doc1 = *mymset[0];
676 Xapian::docid doc2 = *mymset[1];
677 ranker.rank(mymset);
678 TEST_EQUAL(doc1, *mymset[1]);
679 TEST_EQUAL(doc2, *mymset[0]);
680 unlink("ndcg_score_output.txt");
681 ranker.score(query, qrel, "ListNet_Ranker", "ndcg_score_output.txt",
682 10);
683 TEST(file_exists("ndcg_score_output.txt"));
684 ifstream ndcg_score_file;
685 ndcg_score_file.open("ndcg_score_output.txt", ios::in);
686 string line;
687 getline(ndcg_score_file, line);
688 size_t pos = 1 + line.find_first_of("=");
689 double ndcg_score = stod(line.substr(pos));
690 // It should have the perfect ndcg score(1.0)
691 TEST_EQUAL(ndcg_score, 1.0);
693 unlink("ndcg_score_output.txt");
696 // ListMLE_Ranker check
697 DEFINE_TESTCASE(listmle_ranker, path && writable)
699 Xapian::ListMLERanker ranker;
700 TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
701 string db_path = get_database_path("db_index_two_documents",
702 db_index_two_documents);
703 Xapian::Enquire enquire((Xapian::Database(db_path)));
704 enquire.set_query(Xapian::Query("lions"));
705 Xapian::MSet mymset = enquire.get_mset(0, 10);
706 string data_directory = test_driver::get_srcdir() + "/testdata/";
707 string query = data_directory + "query.txt";
708 string qrel = data_directory + "qrel.txt";
709 string training_data = data_directory + "training_data.txt";
710 ranker.set_database_path(db_path);
711 TEST_EQUAL(ranker.get_database_path(), db_path);
712 ranker.set_query(Xapian::Query("lions"));
713 ranker.train_model(training_data);
714 Xapian::docid doc1 = *mymset[0];
715 Xapian::docid doc2 = *mymset[1];
716 ranker.rank(mymset);
717 TEST_EQUAL(doc1, *mymset[1]);
718 TEST_EQUAL(doc2, *mymset[0]);
719 mymset = enquire.get_mset(0, 10);
720 ranker.train_model(training_data, "ListMLE_Ranker");
721 ranker.rank(mymset, "ListMLE_Ranker");
722 TEST_EQUAL(doc1, *mymset[1]);
723 TEST_EQUAL(doc2, *mymset[0]);
724 TEST_EXCEPTION(Xapian::LetorInternalError,
725 ranker.score(query, qrel, "ListMLE_Ranker",
726 "scorer_output.txt", 10, ""));
727 TEST_EXCEPTION(Xapian::FileNotFoundError,
728 ranker.score("", qrel, "ListMLE_Ranker",
729 "scorer_output.txt", 10));
730 TEST_EXCEPTION(Xapian::FileNotFoundError,
731 ranker.score(qrel, "", "ListMLE_Ranker",
732 "scorer_output.txt", 10));
733 unlink("ndcg_output_listmle_2.txt");
734 ranker.score(query, qrel, "ListMLE_Ranker", "ndcg_output_listmle_2.txt",
735 10);
736 TEST(file_exists("ndcg_output_listmle_2.txt"));
737 unlink("ndcg_output_listmle_2.txt");
738 unlink("err_output_listmle_2.txt");
739 ranker.score(query, qrel, "ListMLE_Ranker", "err_output_listmle_2.txt", 10,
740 "ERRScore");
741 TEST(file_exists("err_output_listmle_2.txt"));
742 unlink("err_output_listmle_2.txt");
745 DEFINE_TESTCASE(listmle_ranker_one_file, path && writable)
747 Xapian::ListMLERanker ranker;
748 TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
749 string db_path = get_database_path("apitest_listmle_ranker1",
750 db_index_one_document);
751 Xapian::Enquire enquire((Xapian::Database(db_path)));
752 enquire.set_query(Xapian::Query("tigers"));
753 Xapian::MSet mymset = enquire.get_mset(0, 10);
754 string data_directory = test_driver::get_srcdir() + "/testdata/";
755 string query = data_directory + "queryone.txt";
756 string qrel = data_directory + "qrelone.txt";
757 string training_data = data_directory + "training_data_one_document.txt";
758 ranker.set_database_path(db_path);
759 TEST_EQUAL(ranker.get_database_path(), db_path);
760 ranker.set_query(Xapian::Query("tigers"));
761 ranker.train_model(training_data);
762 Xapian::docid doc1 = *mymset[0];
763 ranker.rank(mymset);
764 TEST_EQUAL(doc1, *mymset[0]);
765 mymset = enquire.get_mset(0, 10);
766 ranker.train_model(training_data, "ListMLE_Ranker");
767 ranker.rank(mymset, "ListMLE_Ranker");
768 TEST_EQUAL(doc1, *mymset[0]);
769 TEST_EXCEPTION(Xapian::LetorInternalError,
770 ranker.score(query, qrel, "ListMLE_Ranker",
771 "scorer_output.txt", 10, ""));
772 TEST_EXCEPTION(Xapian::FileNotFoundError,
773 ranker.score("", qrel, "ListMLE_Ranker",
774 "scorer_output.txt", 10));
775 TEST_EXCEPTION(Xapian::FileNotFoundError,
776 ranker.score(qrel, "", "ListMLE_Ranker",
777 "scorer_output.txt", 10));
778 unlink("ndcg_output_listmle_1.txt");
779 ranker.score(query, qrel, "ListMLE_Ranker", "ndcg_output_listmle_1.txt",
780 10);
781 TEST(file_exists("ndcg_output_listmle_1.txt"));
782 unlink("ndcg_output_listmle_1.txt");
783 unlink("err_output_listmle_1.txt");
784 ranker.score(query, qrel, "ListMLE_Ranker", "err_output_listmle_1.txt", 10,
785 "ERRScore");
786 TEST(file_exists("err_output_listmle_1.txt"));
787 unlink("err_output_listmle_1.txt");
790 DEFINE_TESTCASE(listmle_ranker_three_correct, path && writable)
792 Xapian::ListMLERanker ranker;
793 string db_path = get_database_path("db_index_three_documents",
794 db_index_three_documents);
795 Xapian::Enquire enquire((Xapian::Database(db_path)));
796 enquire.set_query(Xapian::Query("score"));
797 Xapian::MSet mymset = enquire.get_mset(0, 10);
798 string data_directory = test_driver::get_srcdir() + "/testdata/";
799 string query = data_directory + "querythree.txt";
800 string qrel = data_directory + "qrelthree_correct.txt";
801 string training_data = data_directory + "training_data_three_correct.txt";
802 ranker.set_database_path(db_path);
803 TEST_EQUAL(ranker.get_database_path(), db_path);
804 ranker.set_query(Xapian::Query("score"));
805 ranker.train_model(training_data);
806 Xapian::docid doc1 = *mymset[0];
807 Xapian::docid doc2 = *mymset[1];
808 ranker.rank(mymset);
809 TEST_EQUAL(mymset.size(), 2);
810 TEST_EQUAL(doc1, *mymset[1]);
811 TEST_EQUAL(doc2, *mymset[0]);
812 mymset = enquire.get_mset(0, 10);
813 ranker.train_model(training_data, "ListMLE_Ranker");
814 ranker.rank(mymset, "ListMLE_Ranker");
815 TEST_EQUAL(doc1, *mymset[1]);
816 TEST_EQUAL(doc2, *mymset[0]);
817 TEST_EXCEPTION(Xapian::LetorInternalError,
818 ranker.score(query, qrel, "ListMLE_Ranker",
819 "scorer_output.txt", 10, ""));
820 TEST_EXCEPTION(Xapian::FileNotFoundError,
821 ranker.score("", qrel, "ListMLE_Ranker",
822 "scorer_output.txt", 10));
823 TEST_EXCEPTION(Xapian::FileNotFoundError,
824 ranker.score(qrel, "", "ListMLE_Ranker",
825 "scorer_output.txt", 10));
826 unlink("ndcg_output_listmle_3.txt");
827 ranker.score(query, qrel, "ListMLE_Ranker", "ndcg_output_listmle_3.txt",
828 10);
829 TEST(file_exists("ndcg_output_listmle_3.txt"));
830 unlink("err_output_listmle_3.txt");
831 unlink("ndcg_output_listmle_3.txt");
832 ranker.score(query, qrel, "ListMLE_Ranker", "err_output_listmle_3.txt", 10,
833 "ERRScore");
834 TEST(file_exists("err_output_listmle_3.txt"));
835 unlink("err_output_listmle_3.txt");
838 // Featurename check
839 DEFINE_TESTCASE(featurename, !backend)
841 Xapian::TfDoclenCollTfCollLenFeature feature1;
842 Xapian::TfDoclenFeature feature2;
843 Xapian::IdfFeature feature3;
844 Xapian::TfFeature feature4;
845 Xapian::TfIdfDoclenFeature feature5;
846 Xapian::CollTfCollLenFeature feature6;
847 TEST_EQUAL(feature1.name(), "TfDoclenCollTfCollLenFeature");
848 TEST_EQUAL(feature2.name(), "TfDoclenFeature");
849 TEST_EQUAL(feature3.name(), "IdfFeature");
850 TEST_EQUAL(feature4.name(), "TfFeature");
851 TEST_EQUAL(feature5.name(), "TfIdfDoclenFeature");
852 TEST_EQUAL(feature6.name(), "CollTfCollLenFeature");
855 DEFINE_TESTCASE(err_scorer, !backend)
857 /* Derived from the example mentioned in the blogpost
858 * https://lingpipe-blog.com/2010/03/09/chapelle-metzler-zhang-grinspan-2009-expected-reciprocal-rank-for-graded-relevance/
860 vector<Xapian::FeatureVector> fvv;
861 Xapian::FeatureVector temp1;
862 Xapian::FeatureVector temp2;
863 Xapian::FeatureVector temp3;
864 temp1.set_label(3);
865 fvv.push_back(temp1);
866 temp2.set_label(2);
867 fvv.push_back(temp2);
868 temp3.set_label(4);
869 fvv.push_back(temp3);
870 Xapian::ERRScore err;
871 double err_score = err.score(fvv);
873 TEST(abs(err_score - 0.63) < 0.01);
876 DEFINE_TESTCASE(ndcg_score_test, path && writable)
878 Xapian::ListNETRanker ranker;
879 string db_path = get_database_path("db_index_three_documents",
880 db_index_three_documents);
881 Xapian::Enquire enquire((Xapian::Database(db_path)));
882 enquire.set_query(Xapian::Query("score"));
883 Xapian::MSet mymset = enquire.get_mset(0, 10);
884 string data_directory = test_driver::get_srcdir() + "/testdata/";
885 string query = data_directory + "querythree.txt";
886 string qrel = data_directory + "score_qrel.txt";
887 string training_data = data_directory + "training_data_ndcg.txt";
888 ranker.set_database_path(db_path);
889 ranker.set_query(Xapian::Query("score"));
890 ranker.train_model(training_data, "ListNet_Ranker");
891 ranker.rank(mymset, "ListNet_Ranker");
892 unlink("ndcg_score_test.txt");
893 ranker.score(query, qrel, "ListNet_Ranker", "ndcg_score_test.txt", 10);
894 TEST(file_exists("ndcg_score_test.txt"));
895 unlink("ndcg_score_test.txt");
898 DEFINE_TESTCASE(different_no_features, !backend)
900 Xapian::ListNETRanker ranker;
901 string data_directory = test_driver::get_srcdir() + "/testdata/";
902 string training_data = data_directory +
903 "training_data_different_no_features.txt";
904 TEST_EXCEPTION(Xapian::InvalidArgumentError,
905 ranker.train_model(training_data, "ListNet_Ranker"));
908 // Test createfeaturevector method for TfFeature
909 DEFINE_TESTCASE(createfeaturevector_tffeature, backend)
911 vector<Xapian::Feature*> f;
912 Xapian::TfFeature* f1 = new Xapian::TfFeature();
913 f.push_back(f1);
915 Xapian::FeatureList fl(f);
916 Xapian::Database db = get_database("db_index_three_documents",
917 db_index_three_documents);
918 Xapian::Enquire enquire(db);
919 enquire.set_query(Xapian::Query("score"));
920 Xapian::MSet mset;
921 mset = enquire.get_mset(0, 10);
923 Xapian::QueryParser queryparser;
924 queryparser.set_stemmer(Xapian::Stem("en"));
925 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
926 queryparser.add_prefix("title", "S");
927 queryparser.add_prefix("description", "XD");
929 // As the feature values depend on different prefixed terms in the query
930 // like "title","body" and "whole", so we need to separate it out instead
931 // of just writing Query("score").
932 string querystring = "title:score description:score score";
933 Xapian::Query query = queryparser.parse_query(querystring);
935 auto fv = fl.create_feature_vectors(mset, query, db);
936 TEST_EQUAL(fv.size(), 2);
937 TEST_EQUAL(fv[0].get_fcount(), 4);
938 TEST_EQUAL(fv[1].get_fcount(), 4);
940 vector<double> fvals_doc1 = fv[0].get_fvals();
941 vector<double> fvals_doc2 = fv[1].get_fvals();
942 TEST_EQUAL(fvals_doc1.size(), 4);
943 TEST_EQUAL(fvals_doc2.size(), 4);
945 vector<double> test_vals_doc1(4);
946 // These are the appropriate TfFeature values for the first document.
947 test_vals_doc1[0] = 0.301029995663981;
948 test_vals_doc1[1] = 1.653212513775344;
949 test_vals_doc1[2] = 1.954242509439325;
951 Xapian::MSetIterator it = mset.begin();
952 test_vals_doc1[3] = it.get_weight();
954 vector<double> test_vals_doc2(4);
955 // These are the appropriate TfFeature values for the second document.
956 test_vals_doc2[0] = 0;
957 test_vals_doc2[1] = 1.732393759822969;
958 test_vals_doc2[2] = 1.732393759822969;
960 ++it;
961 test_vals_doc2[3] = it.get_weight();
963 double max_val[4];
964 for (int i = 0; i < 4; ++i) {
965 max_val[i] = max(test_vals_doc1[i], test_vals_doc2[i]);
968 // test for title in normalized form
969 TEST_EQUAL_DOUBLE(fvals_doc1[0], test_vals_doc1[0] / max_val[0]);
970 TEST_EQUAL_DOUBLE(fvals_doc2[0], test_vals_doc2[0] / max_val[0]);
972 // test for body in normalized form
973 TEST_EQUAL_DOUBLE(fvals_doc1[1], test_vals_doc1[1] / max_val[1]);
974 TEST_EQUAL_DOUBLE(fvals_doc2[1], test_vals_doc2[1] / max_val[1]);
976 // test for whole in normalized form
977 TEST_EQUAL_DOUBLE(fvals_doc1[2], test_vals_doc1[2] / max_val[2]);
978 TEST_EQUAL_DOUBLE(fvals_doc2[2], test_vals_doc2[2] / max_val[2]);
980 // test for weight in normalized form
981 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_val[3]);
982 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_val[3]);
985 // Test createfeaturevector method for IdfFeature
986 DEFINE_TESTCASE(createfeaturevector_idffeature, backend)
988 vector<Xapian::Feature*> f;
989 Xapian::IdfFeature* f1 = new Xapian::IdfFeature();
990 f.push_back(f1);
992 Xapian::FeatureList fl(f);
993 Xapian::Database db = get_database("db_index_three_documents",
994 db_index_three_documents);
995 Xapian::Enquire enquire(db);
996 enquire.set_query(Xapian::Query("score"));
997 Xapian::MSet mset;
998 mset = enquire.get_mset(0, 10);
1000 Xapian::QueryParser queryparser;
1001 queryparser.set_stemmer(Xapian::Stem("en"));
1002 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1003 queryparser.add_prefix("title", "S");
1004 queryparser.add_prefix("description", "XD");
1006 // As the feature values depend on different prefixed terms in the query
1007 // like "title","body" and "whole", so we need to separate it out instead
1008 // of just writing Query("score").
1009 string querystring = "title:tigers description:tigers tigers"
1010 " title:score description:score score";
1011 Xapian::Query query = queryparser.parse_query(querystring);
1013 auto fv = fl.create_feature_vectors(mset, query, db);
1014 TEST_EQUAL(fv.size(), 2);
1015 TEST_EQUAL(fv[0].get_fcount(), 4);
1016 TEST_EQUAL(fv[1].get_fcount(), 4);
1018 vector<double> fvals_doc1 = fv[0].get_fvals();
1019 vector<double> fvals_doc2 = fv[1].get_fvals();
1020 TEST_EQUAL(fvals_doc1.size(), 4);
1021 TEST_EQUAL(fvals_doc2.size(), 4);
1023 vector<double> test_vals_doc1(4);
1024 // These are the appropriate IdfFeature values for the first document.
1025 test_vals_doc1[0] = 0.0;
1026 test_vals_doc1[1] = 0.0;
1027 test_vals_doc1[2] = 0.0;
1029 Xapian::MSetIterator it = mset.begin();
1030 test_vals_doc1[3] = it.get_weight();
1032 vector<double> test_vals_doc2(4);
1033 // These are the appropriate IdfFeature values for the second document.
1034 test_vals_doc2[0] = 0.0;
1035 test_vals_doc2[1] = 0.0;
1036 test_vals_doc2[2] = 0.0;
1038 ++it;
1039 test_vals_doc2[3] = it.get_weight();
1041 double max_weight = max(test_vals_doc1[3], test_vals_doc2[3]);
1043 // test for title in normalized form
1044 TEST_EQUAL_DOUBLE(fvals_doc1[0], 0.0);
1045 TEST_EQUAL_DOUBLE(fvals_doc2[0], 0.0);
1047 // test for body in normalized form
1048 TEST_EQUAL_DOUBLE(fvals_doc1[1], 0.0);
1049 TEST_EQUAL_DOUBLE(fvals_doc2[1], 0.0);
1051 // test for whole in normalized form
1052 TEST_EQUAL_DOUBLE(fvals_doc1[2], 0.0);
1053 TEST_EQUAL_DOUBLE(fvals_doc2[2], 0.0);
1055 // test for weight in normalized form
1056 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_weight);
1057 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_weight);
1060 // Test createfeaturevector method for TfDoclenFeature
1061 DEFINE_TESTCASE(createfeaturevector_tfdoclenfeature, backend)
1063 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1064 vector<Xapian::Feature*> f;
1065 Xapian::TfDoclenFeature* f1 = new Xapian::TfDoclenFeature();
1066 f.push_back(f1);
1068 Xapian::FeatureList fl(f);
1069 Xapian::Database db = get_database("db_index_three_documents_no_common",
1070 db_index_three_documents_no_common);
1071 Xapian::Enquire enquire(db);
1072 enquire.set_query(Xapian::Query("score"));
1073 Xapian::MSet mset;
1074 mset = enquire.get_mset(0, 10);
1076 Xapian::QueryParser queryparser;
1077 queryparser.set_stemmer(Xapian::Stem("en"));
1078 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1079 queryparser.add_prefix("title", "S");
1080 queryparser.add_prefix("description", "XD");
1082 // As the feature values depend on different prefixed terms in the query
1083 // like "title","body" and "whole", so we need to separate it out instead
1084 // of just writing Query("score").
1085 string querystring = "title:score description:score score";
1086 Xapian::Query query = queryparser.parse_query(querystring);
1088 auto fv = fl.create_feature_vectors(mset, query, db);
1089 TEST_EQUAL(fv.size(), 2);
1090 TEST_EQUAL(fv[0].get_fcount(), 4);
1091 TEST_EQUAL(fv[1].get_fcount(), 4);
1093 vector<double> fvals_doc1 = fv[0].get_fvals();
1094 vector<double> fvals_doc2 = fv[1].get_fvals();
1095 TEST_EQUAL(fvals_doc1.size(), 4);
1096 TEST_EQUAL(fvals_doc2.size(), 4);
1098 vector<double> test_vals_doc1(4);
1099 // These are the appropriate TfDoclenFeature values for the first
1100 // document.
1101 test_vals_doc1[0] = 0.0511525224473813;
1102 test_vals_doc1[1] = 0.0323089286738408;
1103 test_vals_doc1[2] = 0.0335890631408052;
1105 Xapian::MSetIterator it = mset.begin();
1106 test_vals_doc1[3] = it.get_weight();
1108 vector<double> test_vals_doc2(4);
1109 // These are the appropriate TfDoclenFeature values for the second
1110 // document.
1111 test_vals_doc2[0] = 0.0;
1112 test_vals_doc2[1] = 0.03237347800973529;
1113 test_vals_doc2[2] = 0.03200637092048766;
1115 ++it;
1116 test_vals_doc2[3] = it.get_weight();
1118 double max_val[4];
1119 for (int i = 0; i < 4; ++i) {
1120 max_val[i] = max(test_vals_doc1[i], test_vals_doc2[i]);
1123 // test for title in normalized form
1124 TEST_EQUAL_DOUBLE(fvals_doc1[0], test_vals_doc1[0] / max_val[0]);
1125 TEST_EQUAL_DOUBLE(fvals_doc2[0], test_vals_doc2[0] / max_val[0]);
1127 // test for body in normalized form
1128 TEST_EQUAL_DOUBLE(fvals_doc1[1], test_vals_doc1[1] / max_val[1]);
1129 TEST_EQUAL_DOUBLE(fvals_doc2[1], test_vals_doc2[1] / max_val[1]);
1131 // test for whole in normalized form
1132 TEST_EQUAL_DOUBLE(fvals_doc1[2], test_vals_doc1[2] / max_val[2]);
1133 TEST_EQUAL_DOUBLE(fvals_doc2[2], test_vals_doc2[2] / max_val[2]);
1135 // test for weight in normalized form
1136 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_val[3]);
1137 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_val[3]);
1140 // Test createfeaturevector method for CollTfCollLenFeature
1141 DEFINE_TESTCASE(createfeaturevector_colltfcolllenfeature, backend)
1143 vector<Xapian::Feature*> f;
1144 Xapian::CollTfCollLenFeature* f1 = new Xapian::CollTfCollLenFeature();
1145 f.push_back(f1);
1147 Xapian::FeatureList fl(f);
1148 Xapian::Database db = get_database("db_index_three_documents_no_common",
1149 db_index_three_documents_no_common);
1150 Xapian::Enquire enquire(db);
1151 enquire.set_query(Xapian::Query("score"));
1152 Xapian::MSet mset;
1153 mset = enquire.get_mset(0, 10);
1155 Xapian::QueryParser queryparser;
1156 queryparser.set_stemmer(Xapian::Stem("en"));
1157 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1158 queryparser.add_prefix("title", "S");
1159 queryparser.add_prefix("description", "XD");
1161 // As the feature values depend on different prefixed terms in the query
1162 // like "title","body" and "whole", so we need to separate it out instead
1163 // of just writing Query("score").
1164 string querystring = "title:score description:score score";
1165 Xapian::Query query = queryparser.parse_query(querystring);
1167 auto fv = fl.create_feature_vectors(mset, query, db);
1168 TEST_EQUAL(fv.size(), 2);
1169 TEST_EQUAL(fv[0].get_fcount(), 4);
1170 TEST_EQUAL(fv[1].get_fcount(), 4);
1172 vector<double> fvals_doc1 = fv[0].get_fvals();
1173 vector<double> fvals_doc2 = fv[1].get_fvals();
1174 TEST_EQUAL(fvals_doc1.size(), 4);
1175 TEST_EQUAL(fvals_doc2.size(), 4);
1177 vector<double> test_vals_doc1(4);
1178 // These are the appropriate CollTfCollLenFeature values for the first
1179 // document.
1180 test_vals_doc1[0] = 0.45863784902564930;
1181 test_vals_doc1[1] = 3.13291481930625260;
1182 test_vals_doc1[2] = 4.94672282004904673;
1184 Xapian::MSetIterator it = mset.begin();
1185 test_vals_doc1[3] = it.get_weight();
1187 vector<double> test_vals_doc2(4);
1188 // values will be same as that of the first document
1189 test_vals_doc2[0] = 0.45863784902564930;
1190 test_vals_doc2[1] = 3.13291481930625260;
1191 test_vals_doc2[2] = 4.94672282004904673;
1193 ++it;
1194 test_vals_doc2[3] = it.get_weight();
1196 double max_val[4];
1197 for (int i = 0; i < 4; ++i) {
1198 max_val[i] = max(test_vals_doc1[i], test_vals_doc2[i]);
1201 // test for title in normalized form
1202 TEST_EQUAL_DOUBLE(fvals_doc1[0], test_vals_doc1[0] / max_val[0]);
1203 TEST_EQUAL_DOUBLE(fvals_doc2[0], test_vals_doc2[0] / max_val[0]);
1205 // test for body in normalized form
1206 TEST_EQUAL_DOUBLE(fvals_doc1[1], test_vals_doc1[1] / max_val[1]);
1207 TEST_EQUAL_DOUBLE(fvals_doc2[1], test_vals_doc2[1] / max_val[1]);
1209 // test for whole in normalized form
1210 TEST_EQUAL_DOUBLE(fvals_doc1[2], test_vals_doc1[2] / max_val[2]);
1211 TEST_EQUAL_DOUBLE(fvals_doc2[2], test_vals_doc2[2] / max_val[2]);
1213 // test for weight in normalized form
1214 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_val[3]);
1215 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_val[3]);
1218 // Test createfeaturevector method for TfIdfDoclenFeature
1219 DEFINE_TESTCASE(createfeaturevector_tfidfdoclenfeature, backend)
1221 vector<Xapian::Feature*> f;
1222 Xapian::TfIdfDoclenFeature* f1 = new Xapian::TfIdfDoclenFeature();
1223 f.push_back(f1);
1225 Xapian::FeatureList fl(f);
1226 Xapian::Database db = get_database("db_index_three_documents",
1227 db_index_three_documents);
1228 Xapian::Enquire enquire(db);
1229 enquire.set_query(Xapian::Query("score"));
1230 Xapian::MSet mset;
1231 mset = enquire.get_mset(0, 10);
1233 Xapian::QueryParser queryparser;
1234 queryparser.set_stemmer(Xapian::Stem("en"));
1235 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1236 queryparser.add_prefix("title", "S");
1237 queryparser.add_prefix("description", "XD");
1239 // As the feature values depend on different prefixed terms in the query
1240 // like "title","body" and "whole", so we need to separate it out instead
1241 // of just writing Query("score").
1242 string querystring = "title:score description:score score";
1243 Xapian::Query query = queryparser.parse_query(querystring);
1245 auto fv = fl.create_feature_vectors(mset, query, db);
1246 TEST_EQUAL(fv.size(), 2);
1247 TEST_EQUAL(fv[0].get_fcount(), 4);
1248 TEST_EQUAL(fv[1].get_fcount(), 4);
1250 vector<double> fvals_doc1 = fv[0].get_fvals();
1251 vector<double> fvals_doc2 = fv[1].get_fvals();
1252 TEST_EQUAL(fvals_doc1.size(), 4);
1253 TEST_EQUAL(fvals_doc2.size(), 4);
1255 vector<double> test_vals_doc1(4);
1256 // These are the appropriate TfIdfDoclenFeature values for the first
1257 // document.
1258 test_vals_doc1[0] = 0.0;
1259 test_vals_doc1[1] = 0.0;
1260 test_vals_doc1[2] = 0.0;
1262 Xapian::MSetIterator it = mset.begin();
1263 test_vals_doc1[3] = it.get_weight();
1265 vector<double> test_vals_doc2(4);
1266 // These are the appropriate TfIdfDoclenFeature values for the second
1267 // document.
1268 test_vals_doc2[0] = 0.0;
1269 test_vals_doc2[1] = 0.0;
1270 test_vals_doc2[2] = 0.0;
1272 ++it;
1273 test_vals_doc2[3] = it.get_weight();
1275 double max_weight = max(test_vals_doc1[3], test_vals_doc2[3]);
1277 // test for title in normalized form
1278 TEST_EQUAL_DOUBLE(fvals_doc1[0], 0.0);
1279 TEST_EQUAL_DOUBLE(fvals_doc2[0], 0.0);
1281 // test for body in normalized form
1282 TEST_EQUAL_DOUBLE(fvals_doc1[1], 0.0);
1283 TEST_EQUAL_DOUBLE(fvals_doc2[1], 0.0);
1285 // test for whole in normalized form
1286 TEST_EQUAL_DOUBLE(fvals_doc1[2], 0.0);
1287 TEST_EQUAL_DOUBLE(fvals_doc2[2], 0.0);
1289 // test for weight in normalized form
1290 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_weight);
1291 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_weight);
1294 // Test createfeaturevector method for TfDoclenCollTfCollLenFeature
1295 DEFINE_TESTCASE(createfeaturevector_tfdoclencolllfcolllen, backend)
1297 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1298 vector<Xapian::Feature*> f;
1299 Xapian::TfDoclenCollTfCollLenFeature* f1 =
1300 new Xapian::TfDoclenCollTfCollLenFeature();
1301 f.push_back(f1);
1303 Xapian::FeatureList fl(f);
1304 Xapian::Database db = get_database("db_index_three_documents",
1305 db_index_three_documents);
1306 Xapian::Enquire enquire(db);
1307 enquire.set_query(Xapian::Query("score"));
1308 Xapian::MSet mset;
1309 mset = enquire.get_mset(0, 10);
1311 Xapian::QueryParser queryparser;
1312 queryparser.set_stemmer(Xapian::Stem("en"));
1313 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1314 queryparser.add_prefix("title", "S");
1315 queryparser.add_prefix("description", "XD");
1317 // As the feature values depend on different prefixed terms in the query
1318 // like "title","body" and "whole", so we need to separate it out instead
1319 // of just writing Query("score").
1320 string querystring = "title:score description:score score";
1321 Xapian::Query query = queryparser.parse_query(querystring);
1323 auto fv = fl.create_feature_vectors(mset, query, db);
1324 TEST_EQUAL(fv.size(), 2);
1325 TEST_EQUAL(fv[0].get_fcount(), 4);
1326 TEST_EQUAL(fv[1].get_fcount(), 4);
1328 vector<double> fvals_doc1 = fv[0].get_fvals();
1329 vector<double> fvals_doc2 = fv[1].get_fvals();
1330 TEST_EQUAL(fvals_doc1.size(), 4);
1331 TEST_EQUAL(fvals_doc2.size(), 4);
1333 vector<double> test_vals_doc1(4);
1334 // These are the appropriate TfDoclenCollTfCollLenFeature values for
1335 // the first document.
1336 test_vals_doc1[0] = 0.11394335230683678;
1337 test_vals_doc1[1] = 0.76130720333102619;
1338 test_vals_doc1[2] = 0.90738326700002048;
1340 Xapian::MSetIterator it = mset.begin();
1341 test_vals_doc1[3] = it.get_weight();
1343 vector<double> test_vals_doc2(4);
1344 // These are the appropriate TfDoclenCollTfCollLenFeature values for
1345 // the second document.
1346 test_vals_doc2[0] = 0.0;
1347 test_vals_doc2[1] = 0.77758890362035493;
1348 test_vals_doc2[2] = 0.78786362447009839;
1350 ++it;
1351 test_vals_doc2[3] = it.get_weight();
1353 double max_val[4];
1354 for (int i = 0; i < 4; ++i) {
1355 max_val[i] = max(test_vals_doc1[i], test_vals_doc2[i]);
1358 // test for title in normalized form
1359 TEST_EQUAL_DOUBLE(fvals_doc1[0], test_vals_doc1[0] / max_val[0]);
1360 TEST_EQUAL_DOUBLE(fvals_doc2[0], test_vals_doc2[0] / max_val[0]);
1362 // test for body in normalized form
1363 TEST_EQUAL_DOUBLE(fvals_doc1[1], test_vals_doc1[1] / max_val[1]);
1364 TEST_EQUAL_DOUBLE(fvals_doc2[1], test_vals_doc2[1] / max_val[1]);
1366 // test for whole in normalized form
1367 TEST_EQUAL_DOUBLE(fvals_doc1[2], test_vals_doc1[2] / max_val[2]);
1368 TEST_EQUAL_DOUBLE(fvals_doc2[2], test_vals_doc2[2] / max_val[2]);
1370 // test for weight in normalized form
1371 TEST_EQUAL_DOUBLE(fvals_doc1[3], test_vals_doc1[3] / max_val[3]);
1372 TEST_EQUAL_DOUBLE(fvals_doc2[3], test_vals_doc2[3] / max_val[3]);
1375 class CustomFeature : public Xapian::Feature {
1376 public:
1377 CustomFeature() {
1378 need_stat(Xapian::Feature::TERM_FREQUENCY);
1379 need_stat(Xapian::Feature::DOCUMENT_LENGTH);
1380 need_stat(Xapian::Feature::COLLECTION_TERM_FREQ);
1381 need_stat(Xapian::Feature::COLLECTION_LENGTH);
1382 need_stat(Xapian::Feature::INVERSE_DOCUMENT_FREQUENCY);
1384 std::vector<double> get_values() const {
1385 return vector<double>();
1387 std::string name() const {
1388 return "CustomFeature";
1390 void test_stats() {
1391 // test for term frequency
1392 TEST_EQUAL(get_termfreq("ZStiger"), 1);
1393 TEST_EQUAL(get_termfreq("ZXDtiger"), 6);
1394 TEST_EQUAL(get_termfreq("Ztiger"), 2);
1396 // test for inverse document frequency
1397 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("ZStiger"), 0.176091259055681);
1398 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("ZXDtiger"), 0.176091259055681);
1399 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("Ztiger"), 0.176091259055681);
1401 // test for document length
1402 TEST_EQUAL(get_doc_length("title"), 4);
1403 TEST_EQUAL(get_doc_length("body"), 182);
1404 TEST_EQUAL(get_doc_length("whole"), 186);
1406 // test for collection length
1407 TEST_EQUAL(get_collection_length("title"), 13);
1408 TEST_EQUAL(get_collection_length("body"), 509);
1409 TEST_EQUAL(get_collection_length("whole"), 522);
1411 // test for collection term frequency
1412 TEST_EQUAL(get_collection_termfreq("ZStiger"), 1);
1413 TEST_EQUAL(get_collection_termfreq("ZXDtiger"), 6);
1414 TEST_EQUAL(get_collection_termfreq("Ztiger"), 2);
1418 DEFINE_TESTCASE(populatefeature, backend) {
1419 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1420 vector<Xapian::Feature*> f;
1421 CustomFeature* custom_feature = new CustomFeature();
1422 f.push_back(custom_feature);
1424 Xapian::FeatureList fl(f);
1425 Xapian::Database db = get_database("db_index_three_documents_no_common",
1426 db_index_three_documents_no_common);
1427 Xapian::Enquire enquire(db);
1428 enquire.set_query(Xapian::Query("tigers"));
1429 Xapian::MSet mset;
1430 mset = enquire.get_mset(0, 10);
1432 TEST(!mset.empty());
1434 Xapian::QueryParser queryparser;
1435 queryparser.set_stemmer(Xapian::Stem("en"));
1436 queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z);
1437 queryparser.add_prefix("title", "S");
1438 queryparser.add_prefix("description", "XD");
1440 string querystring = "title:tigers description:tigers tigers";
1441 Xapian::Query query = queryparser.parse_query(querystring);
1443 auto fv = fl.create_feature_vectors(mset, query, db);
1444 TEST_EQUAL(fv.size(), 1);
1446 custom_feature->test_stats();