2 * @brief test common features of API classes
4 /* Copyright (C) 2007,2009,2012,2014,2015,2016,2023 Olly Betts
5 * Copyright (C) 2019 Vaibhav Kansagara
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "api_letor.h"
31 #include <xapian-letor.h>
34 #include "filetests.h"
35 #include "safeunistd.h"
36 #include "testutils.h"
40 // To check for one document edge
42 db_index_one_document(Xapian::WritableDatabase
& db
, const string
&)
45 Xapian::TermGenerator termgenerator
;
46 termgenerator
.set_document(doc
);
47 termgenerator
.set_stemmer(Xapian::Stem("en"));
48 termgenerator
.index_text("Tigers are solitary animals", 1, "S");
49 termgenerator
.index_text("Might be that only one Tiger is good enough to "
50 "Take out a ranker, a Tiger is a good way to "
51 "check if a test is working or Tiger not. Tiger."
52 "What if the next line contains no Tigers? Would "
53 "it make a difference to your ranker ? Tigers "
54 "for the win.", 1, "XD");
55 termgenerator
.index_text("The will.");
56 termgenerator
.increase_termpos();
57 termgenerator
.index_text("Tigers would not be caught if one calls out the "
58 "Tiger from the den. This document is to check if "
59 "in the massive dataset, you forget the sense of "
60 "something you would not like to stop.");
65 db_index_two_documents(Xapian::WritableDatabase
& db
, const string
&)
68 Xapian::TermGenerator termgenerator
;
69 termgenerator
.set_document(doc
);
70 termgenerator
.set_stemmer(Xapian::Stem("en"));
71 termgenerator
.index_text("Lions, Tigers, Bears and Giraffes", 1, "S");
72 termgenerator
.index_text("This paragraph talks about lions and tigers and "
73 "bears (oh, my!). It mentions giraffes, "
74 "but that's not really very important. Lions "
75 "and tigers are big cats, so they must be really "
76 "cuddly. Bears are famous for being cuddly, at "
77 "least when they're teddy bears.", 1, "XD");
78 termgenerator
.index_text("Lions, Tigers, Bears and Giraffes");
79 termgenerator
.increase_termpos();
80 termgenerator
.index_text("This paragraph talks about lions and tigers and "
81 "bears (oh, my!). It mentions giraffes, "
82 "but that's not really very important. Lions "
83 "and tigers are big cats, so they must be really "
84 "cuddly. Bears are famous for being cuddly, at "
85 "least when they're teddy bears.");
88 termgenerator
.index_text("Lions, Tigers and Bears", 1, "S");
89 termgenerator
.index_text("This is the paragraph of interest. Tigers are "
90 "massive beasts - I wouldn't want to meet a "
91 "hungry one anywhere. Lions are scary even when "
92 "lyin' down. Bears are scary even when bare. "
93 "Together I suspect they'd be less scary, as the "
94 "tigers, lions, and bears would all keep each "
95 "other busy. On the other hand, bears don't live "
96 "in the same continent as far as I know.", 1,
98 termgenerator
.index_text("Lions, Tigers and Bears");
99 termgenerator
.increase_termpos();
100 termgenerator
.index_text("This is the paragraph of interest. Tigers are "
101 "massive beasts - I wouldn't want to meet a "
102 "hungry one anywhere. Lions are scary even when "
103 "lyin' down. Bears are scary even when bare. "
104 "Together I suspect they'd be less scary, as the "
105 "tigers, lions, and bears would all keep each "
106 "other busy. On the other hand, bears don't live "
107 "in the same continent as far as I know.");
108 db
.add_document(doc
);
111 // To check for three documents. out of which one is irrelevant
113 db_index_three_documents(Xapian::WritableDatabase
& db
, const string
&)
115 Xapian::Document doc
;
116 Xapian::TermGenerator termgenerator
;
117 termgenerator
.set_document(doc
);
118 termgenerator
.set_stemmer(Xapian::Stem("en"));
119 termgenerator
.index_text("The will", 1, "S");
120 termgenerator
.index_text("The will are considered stop words in xapian and "
121 "would be thrown off, so the query I want to say "
122 "is score, yes, score. The Score of a game is "
123 "the determining factor of a game, the score is "
124 "what matters at the end of the day. so my advise "
125 "to everyone is to Score it!.", 1, "XD");
126 termgenerator
.index_text("Score might be something else too, but this para "
127 "refers to score only at an abstract. Scores are "
128 "in general scoring. Score it!");
129 termgenerator
.increase_termpos();
130 termgenerator
.index_text("Score score score is important.");
131 db
.add_document(doc
);
133 termgenerator
.index_text("Score score score score score score", 1, "S");
134 termgenerator
.index_text("it might have an absurdly high rank in the qrel "
135 "file or might have no rank at all in another. "
136 "Look out for this as a testcase, might be edgy "
137 "good luck and may this be with you.", 1, "XD");
138 termgenerator
.index_text("Another irrelevant paragraph to make sure the tf "
139 "values are down, but this increases idf values "
140 "but let's see how this works out.");
141 termgenerator
.increase_termpos();
142 termgenerator
.index_text("Nothing to do with the query.");
143 db
.add_document(doc
);
145 termgenerator
.index_text("Document has nothing to do with score", 1, "S");
146 termgenerator
.index_text("This is just to check if score is given a higher "
147 "score if it is in the subject or not. Nothing "
148 "special, just judging scores by the look of it. "
149 "Some more scores but a bad qrel should be enough "
150 "to make sure it is ranked down.", 1, "XD");
151 termgenerator
.index_text("Score might be something else too, but this para "
152 "refers to score only at an abstract. Scores are "
153 "in general scoring. Score it!");
154 termgenerator
.increase_termpos();
155 termgenerator
.index_text("Score score score is important.");
156 db
.add_document(doc
);
159 // To check for three documents in which one has no common terms with other two.
161 db_index_three_documents_no_common(Xapian::WritableDatabase
& db
, const string
&)
163 Xapian::Document doc
;
164 Xapian::TermGenerator termgenerator
;
165 termgenerator
.set_document(doc
);
166 termgenerator
.set_stemmer(Xapian::Stem("en"));
167 termgenerator
.index_text("The will", 1, "S");
168 termgenerator
.index_text("The will are considered stop words in xapian and "
169 "would be thrown off, so the query I want to say "
170 "is score, yes, score. The Score of a game is "
171 "the determining factor of a game, the score is "
172 "what matters at the end of the day. so my advise "
173 "to everyone is to Score it!.", 1, "XD");
174 termgenerator
.index_text("Score might be something else too, but this para "
175 "refers to score only at an abstract. Scores are "
176 "in general scoring. Score it!");
177 termgenerator
.increase_termpos();
178 termgenerator
.index_text("Score score score is important.");
179 db
.add_document(doc
);
181 termgenerator
.index_text("Document has nothing to do with score", 1, "S");
182 termgenerator
.index_text("This is just to check if score is given a higher "
183 "score if it is in the subject or not. Nothing "
184 "special, just judging scores by the look of it. "
185 "Some more scores but a bad qrel should be enough "
186 "to make sure it is ranked down.", 1, "XD");
187 termgenerator
.index_text("Score might be something else too, but this para "
188 "refers to score only at an abstract. Scores are "
189 "in general scoring. Score it!");
190 termgenerator
.increase_termpos();
191 termgenerator
.index_text("Score score score is important.");
192 db
.add_document(doc
);
194 termgenerator
.index_text("Tigers are solitary animals", 1, "S");
195 termgenerator
.index_text("Might be that only one Tiger is good enough to "
196 "Take out a ranker, a Tiger is a good way to "
197 "check if a test is working or Tiger not. Tiger."
198 "What if the next line contains no Tigers? Would "
199 "it make a difference to your ranker ? Tigers "
200 "for the win.", 1, "XD");
201 termgenerator
.index_text("The will.");
202 termgenerator
.increase_termpos();
203 termgenerator
.index_text("Tigers would not be caught if one calls out the "
204 "Tiger from the den. This document is to check if "
205 "in the massive dataset, you forget the sense of "
206 "something you would not like to stop.");
207 db
.add_document(doc
);
210 DEFINE_TESTCASE(createfeaturevector
, backend
)
212 Xapian::FeatureList fl
;
213 Xapian::Database db
= get_database("db_index_two_documents",
214 db_index_two_documents
);
215 Xapian::Enquire
enquire(db
);
216 enquire
.set_query(Xapian::Query("lions"));
218 mset
= enquire
.get_mset(0, 10);
220 TEST_EQUAL(mset
.size(), 2);
221 auto fv
= fl
.create_feature_vectors(mset
, Xapian::Query("lions"), db
);
222 TEST_EQUAL(fv
.size(), 2);
223 TEST_EQUAL(fv
[0].get_fcount(), 19);
224 TEST_EQUAL(fv
[1].get_fcount(), 19);
227 DEFINE_TESTCASE(createfeaturevectoronevector
, backend
)
229 Xapian::FeatureList fl
;
230 Xapian::Database db
= get_database("apitest_ranker2",
231 db_index_one_document
);
232 Xapian::Enquire
enquire(db
);
233 enquire
.set_query(Xapian::Query("tigers"));
235 mset
= enquire
.get_mset(0, 10);
237 auto fv
= fl
.create_feature_vectors(mset
, Xapian::Query("tigers"), db
);
238 TEST_EQUAL(fv
.size(), 1);
239 TEST_EQUAL(fv
[0].get_fcount(), 19);
242 DEFINE_TESTCASE(createfeaturevectoronevector_wrongquery
, backend
)
244 Xapian::FeatureList fl
;
245 Xapian::Database db
= get_database("apitest_ranker3",
246 db_index_one_document
);
247 Xapian::Enquire
enquire(db
);
248 enquire
.set_query(Xapian::Query("llamas"));
250 mset
= enquire
.get_mset(0, 10);
252 auto fv
= fl
.create_feature_vectors(mset
, Xapian::Query("llamas"), db
);
253 TEST_EQUAL(fv
.size(), 0);
256 DEFINE_TESTCASE(createfeaturevectorthree
, backend
)
258 Xapian::FeatureList fl
;
259 Xapian::Database db
= get_database("db_index_three_documents",
260 db_index_three_documents
);
261 Xapian::Enquire
enquire(db
);
262 enquire
.set_query(Xapian::Query("score"));
264 mset
= enquire
.get_mset(0, 10);
266 auto fv
= fl
.create_feature_vectors(mset
, Xapian::Query("score"), db
);
267 TEST_EQUAL(fv
.size(), 2);
268 TEST_EQUAL(fv
[0].get_fcount(), 19);
269 TEST_EQUAL(fv
[1].get_fcount(), 19);
272 DEFINE_TESTCASE(emptyfeaturelist
, !backend
)
274 vector
<Xapian::Feature
*> f
;
275 TEST_EXCEPTION(Xapian::InvalidArgumentError
, Xapian::FeatureList
fl(f
));
278 DEFINE_TESTCASE(bigfeaturelist
, backend
)
280 vector
<Xapian::Feature
*> f
;
281 f
.push_back(new Xapian::TfFeature());
282 f
.push_back(new Xapian::TfDoclenFeature());
283 f
.push_back(new Xapian::IdfFeature());
284 f
.push_back(new Xapian::CollTfCollLenFeature());
285 f
.push_back(new Xapian::TfIdfDoclenFeature());
286 f
.push_back(new Xapian::TfDoclenCollTfCollLenFeature());
287 f
.push_back(new Xapian::TfFeature());
288 f
.push_back(new Xapian::TfDoclenFeature());
290 // pass big feature list.
291 Xapian::FeatureList
fl(f
);
292 Xapian::Database db
= get_database("db_index_two_documents",
293 db_index_two_documents
);
294 Xapian::Enquire
enquire(db
);
295 enquire
.set_query(Xapian::Query("tigers"));
297 mset
= enquire
.get_mset(0, 10);
301 auto fv
= fl
.create_feature_vectors(mset
, Xapian::Query("tigers"), db
);
302 TEST_EQUAL(fv
.size(), 2);
303 // Each feature contributes three values and weight as the default one
304 // making total as 25.
305 TEST_EQUAL(fv
[0].get_fcount(), 25);
306 TEST_EQUAL(fv
[1].get_fcount(), 25);
309 DEFINE_TESTCASE(preparetrainingfileonedb
, path
&& writable
)
311 string db_path
= get_database_path("apitest_listnet_ranker1",
312 db_index_one_document
);
313 string data_directory
= test_driver::get_srcdir() + "/testdata/";
314 string query
= data_directory
+ "queryone.txt";
315 string qrel
= data_directory
+ "qrelone.txt";
316 string training_data
= data_directory
+ "training_data_one_document.txt";
317 unlink("training_output_data_one_doc.txt");
318 Xapian::prepare_training_file(db_path
, query
, qrel
, 10,
319 "training_output_data_one_doc.txt");
320 TEST(file_exists("training_output_data_one_doc.txt"));
321 ifstream
if1(training_data
);
322 ifstream
if2("training_output_data_one_doc.txt");
325 while (getline(if1
, line1
)) {
326 TEST(getline(if2
, line2
));
327 istringstream
iss1(line1
);
328 istringstream
iss2(line2
);
332 while ((iss1
>> temp1
) && (iss2
>> temp2
)) {
333 // The 0th, 1st and 21st literals taken as input, are strings,
334 // and can be compared directly, They are: For example(test):
335 // ("1", "qid:20001" and "#docid=1") at 0th, 1st, and 21st pos
336 // respectively. Whereas the other values are doubles which
337 // would have to tested under TEST_DOUBLE() against precision.
338 if (i
== 0 || i
== 1 || i
== 21) {
339 TEST_EQUAL(temp1
, temp2
);
341 size_t t1
= temp1
.find_first_of(':');
342 size_t t2
= temp2
.find_first_of(':');
343 TEST_EQUAL_DOUBLE(stod(temp1
.substr(t1
+ 1)),
344 stod(temp2
.substr(t2
+ 1)));
349 TEST(!(iss2
>> temp2
));
351 TEST(!getline(if2
, line2
));
352 unlink("training_output_data_one_doc.txt");
// Expect Xapian::LetorParseError when prepare_training_file() is given the
// query file TESTFILE (a name relative to data_directory).  The expansion
// uses the locals `db_path`, `data_directory` and `qrel`, which must be in
// scope wherever the macro is used.
#define TEST_PARSE_EXCEPTION(TESTFILE) TEST_EXCEPTION(Xapian::LetorParseError,\
        Xapian::prepare_training_file(db_path,\
                                      data_directory + TESTFILE, qrel, 10,\
                                      "training_output.txt"))
360 // test whether query ids are unique in queryfile.
361 DEFINE_TESTCASE(unique_queryid
, path
)
363 string db_path
= get_database_path("db_index_one_document",
364 db_index_one_document
);
365 string data_directory
= test_driver::get_srcdir() + "/testdata/";
366 string qrel
= data_directory
+ "qrelone.txt";
367 TEST_PARSE_EXCEPTION("unique_query_id.txt");
370 DEFINE_TESTCASE(parse_querystring
, path
)
372 // All those cases which are not valid.
373 string db_path
= get_database_path("db_index_one_document",
374 db_index_one_document
);
375 string data_directory
= test_driver::get_srcdir() + "/testdata/";
376 string qrel
= data_directory
+ "qrelone.txt";
377 TEST_PARSE_EXCEPTION("parse_query_noopenquote.txt");
378 TEST_PARSE_EXCEPTION("parse_query_noclosingquote.txt");
379 TEST_PARSE_EXCEPTION("parse_query_empty_string.txt");
380 TEST_PARSE_EXCEPTION("nospace.txt");
381 TEST_PARSE_EXCEPTION("nosinglequotes.txt");
382 TEST_PARSE_EXCEPTION("blank_space_before_query.txt");
384 // All those cases which are valid.
385 Xapian::prepare_training_file(db_path
, data_directory
+
386 "parse_query_valid.txt", qrel
, 10,
387 "training_output.txt");
390 // Check stability for an empty qrel file
391 DEFINE_TESTCASE(preparetrainingfileonedb_empty_qrel
, path
)
393 string db_path
= get_database_path("ranker_empty",
394 db_index_one_document
);
395 string data_directory
= test_driver::get_srcdir() + "/testdata/";
396 string query
= data_directory
+ "queryone.txt";
397 string qrel
= data_directory
+ "empty_file.txt";
398 string training_data
= data_directory
+ "empty_file.txt";
399 unlink("training_output_empty.txt");
400 Xapian::prepare_training_file(db_path
, query
, qrel
, 10,
401 "training_output_empty.txt");
402 TEST(file_exists("training_output_empty.txt"));
403 ifstream
if1(training_data
);
404 ifstream
if2("training_output_empty.txt");
407 while (getline(if1
, line1
)) {
408 TEST(getline(if2
, line2
));
409 istringstream
iss1(line1
);
410 istringstream
iss2(line2
);
414 while ((iss1
>> temp1
) && (iss2
>> temp2
)) {
415 if (i
== 0 || i
== 1 || i
== 21) {
416 TEST_EQUAL(temp1
, temp2
);
418 size_t t1
= temp1
.find_first_of(':');
419 size_t t2
= temp2
.find_first_of(':');
420 TEST_EQUAL_DOUBLE(stod(temp1
.substr(t1
+ 1)),
421 stod(temp2
.substr(t2
+ 1)));
426 TEST(!(iss2
>> temp2
));
428 TEST(!getline(if2
, line2
));
429 unlink("training_output_empty.txt");
432 DEFINE_TESTCASE(preparetrainingfile_two_docs
, path
)
434 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
435 string db_path
= get_database_path("db_index_two_documents",
436 db_index_two_documents
);
437 string data_directory
= test_driver::get_srcdir() + "/testdata/";
438 string query
= data_directory
+ "query.txt";
439 string qrel
= data_directory
+ "qrel.txt";
440 string training_data
= data_directory
+ "training_data.txt";
441 unlink("training_output1.txt");
442 Xapian::prepare_training_file(db_path
, query
, qrel
, 10,
443 "training_output1.txt");
444 TEST(file_exists("training_output1.txt"));
445 ifstream
if1(training_data
);
446 ifstream
if2("training_output1.txt");
449 while (getline(if1
, line1
)) {
450 TEST(getline(if2
, line2
));
451 istringstream
iss1(line1
);
452 istringstream
iss2(line2
);
456 while ((iss1
>> temp1
) && (iss2
>> temp2
)) {
457 if (i
== 0 || i
== 1 || i
== 21) {
458 TEST_EQUAL(temp1
, temp2
);
460 size_t t1
= temp1
.find_first_of(':');
461 size_t t2
= temp2
.find_first_of(':');
462 TEST_EQUAL_DOUBLE(stod(temp1
.substr(t1
+ 1)),
463 stod(temp2
.substr(t2
+ 1)));
468 TEST(!(iss2
>> temp2
));
470 TEST(!getline(if2
, line2
));
471 unlink("training_output1.txt");
474 DEFINE_TESTCASE(preparetrainingfilethree
, path
)
476 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
477 string db_path
= get_database_path("db_index_three_documents",
478 db_index_three_documents
);
479 string data_directory
= test_driver::get_srcdir() + "/testdata/";
480 string query
= data_directory
+ "querythree.txt";
481 string qrel
= data_directory
+ "qrelthree_correct.txt";
482 string training_data
= data_directory
+ "training_data_three_correct.txt";
483 unlink("training_output_three_correct.txt");
484 Xapian::prepare_training_file(db_path
, query
, qrel
, 10,
485 "training_output_three_correct.txt");
486 TEST(file_exists("training_output_three_correct.txt"));
487 ifstream
if1(training_data
);
488 ifstream
if2("training_output_three_correct.txt");
491 while (getline(if1
, line1
)) {
492 TEST(getline(if2
, line2
));
493 istringstream
iss1(line1
);
494 istringstream
iss2(line2
);
498 while ((iss1
>> temp1
) && (iss2
>> temp2
)) {
499 if (i
== 0 || i
== 1 || i
== 21) {
500 TEST_EQUAL(temp1
, temp2
);
502 size_t t1
= temp1
.find_first_of(':');
503 size_t t2
= temp2
.find_first_of(':');
504 TEST_EQUAL_DOUBLE(stod(temp1
.substr(t1
+ 1)),
505 stod(temp2
.substr(t2
+ 1)));
510 TEST(!(iss2
>> temp2
));
512 TEST(!getline(if2
, line2
));
513 unlink("training_output_three_correct.txt");
516 // ListNet_Ranker check
517 DEFINE_TESTCASE(listnet_ranker
, path
&& writable
)
519 Xapian::ListNETRanker ranker
;
520 TEST_EXCEPTION(Xapian::FileNotFoundError
, ranker
.train_model(""));
521 string db_path
= get_database_path("db_index_two_documents",
522 db_index_two_documents
);
523 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
524 enquire
.set_query(Xapian::Query("lions"));
525 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
526 string data_directory
= test_driver::get_srcdir() + "/testdata/";
527 string query
= data_directory
+ "query.txt";
528 string qrel
= data_directory
+ "qrel.txt";
529 string training_data
= data_directory
+ "training_data.txt";
530 ranker
.set_database_path(db_path
);
531 TEST_EQUAL(ranker
.get_database_path(), db_path
);
532 ranker
.set_query(Xapian::Query("lions"));
533 ranker
.train_model(training_data
);
534 Xapian::docid doc1
= *mymset
[0];
535 Xapian::docid doc2
= *mymset
[1];
537 TEST_EQUAL(doc1
, *mymset
[1]);
538 TEST_EQUAL(doc2
, *mymset
[0]);
539 mymset
= enquire
.get_mset(0, 10);
540 ranker
.train_model(training_data
, "ListNet_Ranker");
541 ranker
.rank(mymset
, "ListNet_Ranker");
542 TEST_EQUAL(doc1
, *mymset
[1]);
543 TEST_EQUAL(doc2
, *mymset
[0]);
544 TEST_EXCEPTION(Xapian::LetorInternalError
,
545 ranker
.score(query
, qrel
, "ListNet_Ranker",
546 "scorer_output.txt", 10, ""));
547 TEST_EXCEPTION(Xapian::FileNotFoundError
,
548 ranker
.score("", qrel
, "ListNet_Ranker",
549 "scorer_output.txt", 10));
550 TEST_EXCEPTION(Xapian::FileNotFoundError
,
551 ranker
.score(qrel
, "", "ListNet_Ranker",
552 "scorer_output.txt", 10));
553 unlink("ndcg_output_listnet_2.txt");
554 ranker
.score(query
, qrel
, "ListNet_Ranker", "ndcg_output_listnet_2.txt",
556 TEST(file_exists("ndcg_output_listnet_2.txt"));
557 unlink("ndcg_output_listnet_2.txt");
558 unlink("err_output_listnet_2.txt");
559 ranker
.score(query
, qrel
, "ListNet_Ranker", "err_output_listnet_2.txt",
561 TEST(file_exists("err_output_listnet_2.txt"));
562 unlink("err_output_listnet_2.txt");
565 DEFINE_TESTCASE(listnet_ranker_one_file
, path
&& writable
)
567 Xapian::ListNETRanker ranker
;
568 TEST_EXCEPTION(Xapian::FileNotFoundError
, ranker
.train_model(""));
569 string db_path
= get_database_path("apitest_listnet_ranker5",
570 db_index_one_document
);
571 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
572 enquire
.set_query(Xapian::Query("tigers"));
573 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
574 string data_directory
= test_driver::get_srcdir() + "/testdata/";
575 string query
= data_directory
+ "queryone.txt";
576 string qrel
= data_directory
+ "qrelone.txt";
577 string training_data
= data_directory
+ "training_data_one_document.txt";
578 ranker
.set_database_path(db_path
);
579 TEST_EQUAL(ranker
.get_database_path(), db_path
);
580 ranker
.set_query(Xapian::Query("tigers"));
581 ranker
.train_model(training_data
);
582 Xapian::docid doc1
= *mymset
[0];
584 TEST_EQUAL(doc1
, *mymset
[0]);
585 mymset
= enquire
.get_mset(0, 10);
586 ranker
.train_model(training_data
, "ListNet_Ranker");
587 ranker
.rank(mymset
, "ListNet_Ranker");
588 TEST_EQUAL(doc1
, *mymset
[0]);
589 TEST_EXCEPTION(Xapian::LetorInternalError
,
590 ranker
.score(query
, qrel
, "ListNet_Ranker",
591 "scorer_output.txt", 10, ""));
592 TEST_EXCEPTION(Xapian::FileNotFoundError
,
593 ranker
.score("", qrel
, "ListNet_Ranker",
594 "scorer_output.txt", 10));
595 TEST_EXCEPTION(Xapian::FileNotFoundError
,
596 ranker
.score(qrel
, "", "ListNet_Ranker",
597 "scorer_output.txt", 10));
598 unlink("ndcg_output_listnet_1.txt");
599 ranker
.score(query
, qrel
, "ListNet_Ranker", "ndcg_output_listnet_1.txt",
601 TEST(file_exists("ndcg_output_listnet_1.txt"));
602 unlink("ndcg_output_listnet_1.txt");
603 unlink("err_output_listnet_1.txt");
604 ranker
.score(query
, qrel
, "ListNet_Ranker", "err_output_listnet_1.txt", 10,
606 TEST(file_exists("err_output_listnet_1.txt"));
607 unlink("err_output_listnet_1.txt");
610 DEFINE_TESTCASE(listnet_ranker_three_correct
, path
&& writable
)
612 Xapian::ListNETRanker ranker
;
613 TEST_EXCEPTION(Xapian::FileNotFoundError
, ranker
.train_model(""));
614 string db_path
= get_database_path("db_index_three_documents",
615 db_index_three_documents
);
616 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
617 enquire
.set_query(Xapian::Query("score"));
618 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
619 string data_directory
= test_driver::get_srcdir() + "/testdata/";
620 string query
= data_directory
+ "querythree.txt";
621 string qrel
= data_directory
+ "qrelthree_correct.txt";
622 string training_data
= data_directory
+ "training_data_three_correct.txt";
623 ranker
.set_database_path(db_path
);
624 TEST_EQUAL(ranker
.get_database_path(), db_path
);
625 ranker
.set_query(Xapian::Query("score"));
626 ranker
.train_model(training_data
);
627 Xapian::docid doc1
= *mymset
[0];
628 Xapian::docid doc2
= *mymset
[1];
630 TEST_EQUAL(doc1
, *mymset
[1]);
631 TEST_EQUAL(doc2
, *mymset
[0]);
632 mymset
= enquire
.get_mset(0, 10);
633 ranker
.train_model(training_data
, "ListNet_Ranker");
634 ranker
.rank(mymset
, "ListNet_Ranker");
635 TEST_EQUAL(doc1
, *mymset
[1]);
636 TEST_EQUAL(doc2
, *mymset
[0]);
637 TEST_EXCEPTION(Xapian::LetorInternalError
,
638 ranker
.score(query
, qrel
, "ListNet_Ranker",
639 "scorer_output.txt", 10, ""));
640 TEST_EXCEPTION(Xapian::FileNotFoundError
,
641 ranker
.score("", qrel
, "ListNet_Ranker",
642 "scorer_output.txt", 10));
643 TEST_EXCEPTION(Xapian::FileNotFoundError
,
644 ranker
.score(qrel
, "", "ListNet_Ranker",
645 "scorer_output.txt", 10));
646 unlink("ndcg_output_listnet_3.txt");
647 ranker
.score(query
, qrel
, "ListNet_Ranker", "ndcg_output_listnet_3.txt",
649 TEST(file_exists("ndcg_output_listnet_3.txt"));
650 unlink("ndcg_output_listnet_3.txt");
651 unlink("err_output_listnet_3.txt");
652 ranker
.score(query
, qrel
, "ListNet_Ranker", "err_output_listnet_3.txt", 10,
654 TEST(file_exists("err_output_listnet_3.txt"));
655 unlink("err_output_listnet_3.txt");
658 DEFINE_TESTCASE(scorer
, path
&& writable
)
660 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
661 Xapian::ListNETRanker ranker
;
662 string db_path
= get_database_path("db_index_three_documents",
663 db_index_three_documents
);
664 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
665 enquire
.set_query(Xapian::Query("score"));
666 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
667 string data_directory
= test_driver::get_srcdir() + "/testdata/";
668 string query
= data_directory
+ "querythree.txt";
669 string qrel
= data_directory
+ "qrelthree_correct.txt";
670 string training_data
= data_directory
+ "training_data_three_correct.txt";
671 ranker
.set_database_path(db_path
);
672 TEST_EQUAL(ranker
.get_database_path(), db_path
);
673 ranker
.set_query(Xapian::Query("score"));
674 ranker
.train_model(training_data
);
675 Xapian::docid doc1
= *mymset
[0];
676 Xapian::docid doc2
= *mymset
[1];
678 TEST_EQUAL(doc1
, *mymset
[1]);
679 TEST_EQUAL(doc2
, *mymset
[0]);
680 unlink("ndcg_score_output.txt");
681 ranker
.score(query
, qrel
, "ListNet_Ranker", "ndcg_score_output.txt",
683 TEST(file_exists("ndcg_score_output.txt"));
684 ifstream ndcg_score_file
;
685 ndcg_score_file
.open("ndcg_score_output.txt", ios::in
);
687 getline(ndcg_score_file
, line
);
688 size_t pos
= 1 + line
.find_first_of("=");
689 double ndcg_score
= stod(line
.substr(pos
));
690 // It should have the perfect ndcg score(1.0)
691 TEST_EQUAL(ndcg_score
, 1.0);
693 unlink("ndcg_score_output.txt");
696 // ListMLE_Ranker check
697 DEFINE_TESTCASE(listmle_ranker
, path
&& writable
)
699 Xapian::ListMLERanker ranker
;
700 TEST_EXCEPTION(Xapian::FileNotFoundError
, ranker
.train_model(""));
701 string db_path
= get_database_path("db_index_two_documents",
702 db_index_two_documents
);
703 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
704 enquire
.set_query(Xapian::Query("lions"));
705 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
706 string data_directory
= test_driver::get_srcdir() + "/testdata/";
707 string query
= data_directory
+ "query.txt";
708 string qrel
= data_directory
+ "qrel.txt";
709 string training_data
= data_directory
+ "training_data.txt";
710 ranker
.set_database_path(db_path
);
711 TEST_EQUAL(ranker
.get_database_path(), db_path
);
712 ranker
.set_query(Xapian::Query("lions"));
713 ranker
.train_model(training_data
);
714 Xapian::docid doc1
= *mymset
[0];
715 Xapian::docid doc2
= *mymset
[1];
717 TEST_EQUAL(doc1
, *mymset
[1]);
718 TEST_EQUAL(doc2
, *mymset
[0]);
719 mymset
= enquire
.get_mset(0, 10);
720 ranker
.train_model(training_data
, "ListMLE_Ranker");
721 ranker
.rank(mymset
, "ListMLE_Ranker");
722 TEST_EQUAL(doc1
, *mymset
[1]);
723 TEST_EQUAL(doc2
, *mymset
[0]);
724 TEST_EXCEPTION(Xapian::LetorInternalError
,
725 ranker
.score(query
, qrel
, "ListMLE_Ranker",
726 "scorer_output.txt", 10, ""));
727 TEST_EXCEPTION(Xapian::FileNotFoundError
,
728 ranker
.score("", qrel
, "ListMLE_Ranker",
729 "scorer_output.txt", 10));
730 TEST_EXCEPTION(Xapian::FileNotFoundError
,
731 ranker
.score(qrel
, "", "ListMLE_Ranker",
732 "scorer_output.txt", 10));
733 unlink("ndcg_output_listmle_2.txt");
734 ranker
.score(query
, qrel
, "ListMLE_Ranker", "ndcg_output_listmle_2.txt",
736 TEST(file_exists("ndcg_output_listmle_2.txt"));
737 unlink("ndcg_output_listmle_2.txt");
738 unlink("err_output_listmle_2.txt");
739 ranker
.score(query
, qrel
, "ListMLE_Ranker", "err_output_listmle_2.txt", 10,
741 TEST(file_exists("err_output_listmle_2.txt"));
742 unlink("err_output_listmle_2.txt");
745 DEFINE_TESTCASE(listmle_ranker_one_file
, path
&& writable
)
747 Xapian::ListMLERanker ranker
;
748 TEST_EXCEPTION(Xapian::FileNotFoundError
, ranker
.train_model(""));
749 string db_path
= get_database_path("apitest_listmle_ranker1",
750 db_index_one_document
);
751 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
752 enquire
.set_query(Xapian::Query("tigers"));
753 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
754 string data_directory
= test_driver::get_srcdir() + "/testdata/";
755 string query
= data_directory
+ "queryone.txt";
756 string qrel
= data_directory
+ "qrelone.txt";
757 string training_data
= data_directory
+ "training_data_one_document.txt";
758 ranker
.set_database_path(db_path
);
759 TEST_EQUAL(ranker
.get_database_path(), db_path
);
760 ranker
.set_query(Xapian::Query("tigers"));
761 ranker
.train_model(training_data
);
762 Xapian::docid doc1
= *mymset
[0];
764 TEST_EQUAL(doc1
, *mymset
[0]);
765 mymset
= enquire
.get_mset(0, 10);
766 ranker
.train_model(training_data
, "ListMLE_Ranker");
767 ranker
.rank(mymset
, "ListMLE_Ranker");
768 TEST_EQUAL(doc1
, *mymset
[0]);
769 TEST_EXCEPTION(Xapian::LetorInternalError
,
770 ranker
.score(query
, qrel
, "ListMLE_Ranker",
771 "scorer_output.txt", 10, ""));
772 TEST_EXCEPTION(Xapian::FileNotFoundError
,
773 ranker
.score("", qrel
, "ListMLE_Ranker",
774 "scorer_output.txt", 10));
775 TEST_EXCEPTION(Xapian::FileNotFoundError
,
776 ranker
.score(qrel
, "", "ListMLE_Ranker",
777 "scorer_output.txt", 10));
778 unlink("ndcg_output_listmle_1.txt");
779 ranker
.score(query
, qrel
, "ListMLE_Ranker", "ndcg_output_listmle_1.txt",
781 TEST(file_exists("ndcg_output_listmle_1.txt"));
782 unlink("ndcg_output_listmle_1.txt");
783 unlink("err_output_listmle_1.txt");
784 ranker
.score(query
, qrel
, "ListMLE_Ranker", "err_output_listmle_1.txt", 10,
786 TEST(file_exists("err_output_listmle_1.txt"));
787 unlink("err_output_listmle_1.txt");
// Exercises Xapian::ListMLERanker with the "db_index_three_documents"
// database: trains a model from training_data_three_correct.txt, checks the
// order of the two matches for the query "score" after ranking, then checks
// the exceptions thrown by score() for bad arguments and that score() leaves
// the requested output files behind.
// NOTE(review): the embedded line numbers skip 791, 808, 828, 833 and 836, so
// the braces, one statement and the tails of two score() calls are not part
// of this listing -- confirm against the upstream file.
790 DEFINE_TESTCASE(listmle_ranker_three_correct
, path
&& writable
)
792 Xapian::ListMLERanker ranker
;
793 string db_path
= get_database_path("db_index_three_documents",
794 db_index_three_documents
);
795 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
796 enquire
.set_query(Xapian::Query("score"));
797 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
// All input files are looked up under <srcdir>/testdata/.
798 string data_directory
= test_driver::get_srcdir() + "/testdata/";
799 string query
= data_directory
+ "querythree.txt";
800 string qrel
= data_directory
+ "qrelthree_correct.txt";
801 string training_data
= data_directory
+ "training_data_three_correct.txt";
802 ranker
.set_database_path(db_path
);
// The getter must hand back exactly the path that was just set.
803 TEST_EQUAL(ranker
.get_database_path(), db_path
);
804 ranker
.set_query(Xapian::Query("score"));
805 ranker
.train_model(training_data
);
// Remember the initial order of the first two matches.
806 Xapian::docid doc1
= *mymset
[0];
807 Xapian::docid doc2
= *mymset
[1];
// NOTE(review): embedded line 808 is absent here; the swapped order asserted
// below suggests mymset is re-ranked at this point -- confirm.
809 TEST_EQUAL(mymset
.size(), 2);
// The two documents are expected to have traded places.
810 TEST_EQUAL(doc1
, *mymset
[1]);
811 TEST_EQUAL(doc2
, *mymset
[0]);
// Repeat with a fresh MSet, this time passing "ListMLE_Ranker" explicitly to
// both train_model() and rank(); the same swapped order is expected.
812 mymset
= enquire
.get_mset(0, 10);
813 ranker
.train_model(training_data
, "ListMLE_Ranker");
814 ranker
.rank(mymset
, "ListMLE_Ranker");
815 TEST_EQUAL(doc1
, *mymset
[1]);
816 TEST_EQUAL(doc2
, *mymset
[0]);
// An empty final argument to score() must raise LetorInternalError
// (presumably this argument names the scorer -- confirm).
817 TEST_EXCEPTION(Xapian::LetorInternalError
,
818 ranker
.score(query
, qrel
, "ListMLE_Ranker",
819 "scorer_output.txt", 10, ""));
// An empty first argument must raise FileNotFoundError.
820 TEST_EXCEPTION(Xapian::FileNotFoundError
,
821 ranker
.score("", qrel
, "ListMLE_Ranker",
822 "scorer_output.txt", 10));
// An empty second argument must raise FileNotFoundError as well; note that
// qrel is passed in the first position here.
823 TEST_EXCEPTION(Xapian::FileNotFoundError
,
824 ranker
.score(qrel
, "", "ListMLE_Ranker",
825 "scorer_output.txt", 10));
// Remove any stale output first so the file_exists() check below can only be
// satisfied by the score() call that follows.
826 unlink("ndcg_output_listmle_3.txt");
827 ranker
.score(query
, qrel
, "ListMLE_Ranker", "ndcg_output_listmle_3.txt",
// NOTE(review): the remainder of the call above (embedded line 828) is absent
// from this listing.
829 TEST(file_exists("ndcg_output_listmle_3.txt"));
// Clear both output files before the second scoring run.
830 unlink("err_output_listmle_3.txt");
831 unlink("ndcg_output_listmle_3.txt");
832 ranker
.score(query
, qrel
, "ListMLE_Ranker", "err_output_listmle_3.txt", 10,
// NOTE(review): the remainder of the call above (embedded line 833) is absent
// from this listing.
834 TEST(file_exists("err_output_listmle_3.txt"));
// Final cleanup of the file produced by this test.
835 unlink("err_output_listmle_3.txt");
839 DEFINE_TESTCASE(featurename
, !backend
)
841 Xapian::TfDoclenCollTfCollLenFeature feature1
;
842 Xapian::TfDoclenFeature feature2
;
843 Xapian::IdfFeature feature3
;
844 Xapian::TfFeature feature4
;
845 Xapian::TfIdfDoclenFeature feature5
;
846 Xapian::CollTfCollLenFeature feature6
;
847 TEST_EQUAL(feature1
.name(), "TfDoclenCollTfCollLenFeature");
848 TEST_EQUAL(feature2
.name(), "TfDoclenFeature");
849 TEST_EQUAL(feature3
.name(), "IdfFeature");
850 TEST_EQUAL(feature4
.name(), "TfFeature");
851 TEST_EQUAL(feature5
.name(), "TfIdfDoclenFeature");
852 TEST_EQUAL(feature6
.name(), "CollTfCollLenFeature");
// Checks Xapian::ERRScore::score() on a vector of three FeatureVector
// objects; the result must lie within 0.01 of 0.63.
// NOTE(review): embedded lines 859, 864, 866 and 868 are absent from this
// listing (apparently the end of the block comment below and whatever
// configures temp1..temp3 before each push_back) -- confirm against the
// upstream file before relying on the expected value.
855 DEFINE_TESTCASE(err_scorer
, !backend
)
857 /* Derived from the example mentioned in the blogpost
858 * https://lingpipe-blog.com/2010/03/09/chapelle-metzler-zhang-grinspan-2009-expected-reciprocal-rank-for-graded-relevance/
860 vector
<Xapian::FeatureVector
> fvv
;
861 Xapian::FeatureVector temp1
;
862 Xapian::FeatureVector temp2
;
863 Xapian::FeatureVector temp3
;
// The vectors are appended in the order temp1, temp2, temp3.
865 fvv
.push_back(temp1
);
867 fvv
.push_back(temp2
);
869 fvv
.push_back(temp3
);
870 Xapian::ERRScore err
;
871 double err_score
= err
.score(fvv
);
// Tolerance comparison rather than exact equality on a double.
// NOTE(review): unqualified abs() needs a double overload in scope; if only
// the int version were visible the difference would be truncated -- confirm
// the includes.
873 TEST(abs(err_score
- 0.63) < 0.01);
876 DEFINE_TESTCASE(ndcg_score_test
, path
&& writable
)
878 Xapian::ListNETRanker ranker
;
879 string db_path
= get_database_path("db_index_three_documents",
880 db_index_three_documents
);
881 Xapian::Enquire
enquire((Xapian::Database(db_path
)));
882 enquire
.set_query(Xapian::Query("score"));
883 Xapian::MSet mymset
= enquire
.get_mset(0, 10);
884 string data_directory
= test_driver::get_srcdir() + "/testdata/";
885 string query
= data_directory
+ "querythree.txt";
886 string qrel
= data_directory
+ "score_qrel.txt";
887 string training_data
= data_directory
+ "training_data_ndcg.txt";
888 ranker
.set_database_path(db_path
);
889 ranker
.set_query(Xapian::Query("score"));
890 ranker
.train_model(training_data
, "ListNet_Ranker");
891 ranker
.rank(mymset
, "ListNet_Ranker");
892 unlink("ndcg_score_test.txt");
893 ranker
.score(query
, qrel
, "ListNet_Ranker", "ndcg_score_test.txt", 10);
894 TEST(file_exists("ndcg_score_test.txt"));
895 unlink("ndcg_score_test.txt");
898 DEFINE_TESTCASE(different_no_features
, !backend
)
900 Xapian::ListNETRanker ranker
;
901 string data_directory
= test_driver::get_srcdir() + "/testdata/";
902 string training_data
= data_directory
+
903 "training_data_different_no_features.txt";
904 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
905 ranker
.train_model(training_data
, "ListNet_Ranker"));
908 // Test createfeaturevector method for TfFeature
// Builds a FeatureList around a TfFeature, creates feature vectors for the
// matches of the query "score" in "db_index_three_documents", and compares
// each of the four values per document against a hard-coded expectation
// divided by the larger of the two documents' expectations for that slot.
// NOTE(review): the embedded line numbers have gaps (e.g. 910, 913, 920,
// 959-960, 962-963, 966), so braces and some statements are not part of this
// listing -- confirm against the upstream file.
909 DEFINE_TESTCASE(createfeaturevector_tffeature
, backend
)
911 vector
<Xapian::Feature
*> f
;
912 Xapian::TfFeature
* f1
= new Xapian::TfFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 913-914 are absent) -- confirm.
915 Xapian::FeatureList
fl(f
);
916 Xapian::Database db
= get_database("db_index_three_documents",
917 db_index_three_documents
);
918 Xapian::Enquire
enquire(db
);
919 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 920) is absent from
// this listing.
921 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
923 Xapian::QueryParser queryparser
;
924 queryparser
.set_stemmer(Xapian::Stem("en"));
925 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
926 queryparser
.add_prefix("title", "S");
927 queryparser
.add_prefix("description", "XD");
929 // As the feature values depend on different prefixed terms in the query
930 // like "title","body" and "whole", so we need to separate it out instead
931 // of just writing Query("score").
932 string querystring
= "title:score description:score score";
933 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
935 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
936 TEST_EQUAL(fv
.size(), 2);
937 TEST_EQUAL(fv
[0].get_fcount(), 4);
938 TEST_EQUAL(fv
[1].get_fcount(), 4);
940 vector
<double> fvals_doc1
= fv
[0].get_fvals();
941 vector
<double> fvals_doc2
= fv
[1].get_fvals();
942 TEST_EQUAL(fvals_doc1
.size(), 4);
943 TEST_EQUAL(fvals_doc2
.size(), 4);
945 vector
<double> test_vals_doc1(4);
946 // These are the appropriate TfFeature values for the first document.
947 test_vals_doc1
[0] = 0.301029995663981;
948 test_vals_doc1
[1] = 1.653212513775344;
949 test_vals_doc1
[2] = 1.954242509439325;
// The fourth expected value is the weight reported by the MSet iterator.
951 Xapian::MSetIterator it
= mset
.begin();
952 test_vals_doc1
[3] = it
.get_weight();
954 vector
<double> test_vals_doc2(4);
955 // These are the appropriate TfFeature values for the second document.
956 test_vals_doc2
[0] = 0;
957 test_vals_doc2
[1] = 1.732393759822969;
958 test_vals_doc2
[2] = 1.732393759822969;
// NOTE(review): embedded lines 959-960 are absent; presumably `it` is advanced
// to the second match before its weight is read -- confirm.
961 test_vals_doc2
[3] = it
.get_weight();
// Per-slot maximum of the two expected vectors; used below as the divisor
// when comparing against the values from create_feature_vectors().
// NOTE(review): the declaration of max_val (embedded lines 962-963) is absent
// from this listing.
964 for (int i
= 0; i
< 4; ++i
) {
965 max_val
[i
] = max(test_vals_doc1
[i
], test_vals_doc2
[i
]);
968 // test for title in normalized form
969 TEST_EQUAL_DOUBLE(fvals_doc1
[0], test_vals_doc1
[0] / max_val
[0]);
970 TEST_EQUAL_DOUBLE(fvals_doc2
[0], test_vals_doc2
[0] / max_val
[0]);
972 // test for body in normalized form
973 TEST_EQUAL_DOUBLE(fvals_doc1
[1], test_vals_doc1
[1] / max_val
[1]);
974 TEST_EQUAL_DOUBLE(fvals_doc2
[1], test_vals_doc2
[1] / max_val
[1]);
976 // test for whole in normalized form
977 TEST_EQUAL_DOUBLE(fvals_doc1
[2], test_vals_doc1
[2] / max_val
[2]);
978 TEST_EQUAL_DOUBLE(fvals_doc2
[2], test_vals_doc2
[2] / max_val
[2]);
980 // test for weight in normalized form
981 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_val
[3]);
982 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_val
[3]);
985 // Test createfeaturevector method for IdfFeature
// Builds a FeatureList around an IdfFeature and creates feature vectors for
// the matches of the query "score" in "db_index_three_documents". The first
// three values of both documents are expected to be 0.0; the fourth is the
// match weight divided by the larger of the two weights.
// NOTE(review): the embedded line numbers have gaps (e.g. 987, 990-991, 997,
// 1037-1038), so braces and some statements are not part of this listing --
// confirm against the upstream file.
986 DEFINE_TESTCASE(createfeaturevector_idffeature
, backend
)
988 vector
<Xapian::Feature
*> f
;
989 Xapian::IdfFeature
* f1
= new Xapian::IdfFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 990-991 are absent) -- confirm.
992 Xapian::FeatureList
fl(f
);
993 Xapian::Database db
= get_database("db_index_three_documents",
994 db_index_three_documents
);
995 Xapian::Enquire
enquire(db
);
996 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 997) is absent from
// this listing.
998 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
1000 Xapian::QueryParser queryparser
;
1001 queryparser
.set_stemmer(Xapian::Stem("en"));
1002 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1003 queryparser
.add_prefix("title", "S");
1004 queryparser
.add_prefix("description", "XD");
1006 // As the feature values depend on different prefixed terms in the query
1007 // like "title","body" and "whole", so we need to separate it out instead
1008 // of just writing Query("score").
// Unlike the sibling tests, this query string covers both "tigers" and
// "score" (two adjacent literals concatenated by the compiler).
1009 string querystring
= "title:tigers description:tigers tigers"
1010 " title:score description:score score";
1011 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
1013 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1014 TEST_EQUAL(fv
.size(), 2);
1015 TEST_EQUAL(fv
[0].get_fcount(), 4);
1016 TEST_EQUAL(fv
[1].get_fcount(), 4);
1018 vector
<double> fvals_doc1
= fv
[0].get_fvals();
1019 vector
<double> fvals_doc2
= fv
[1].get_fvals();
1020 TEST_EQUAL(fvals_doc1
.size(), 4);
1021 TEST_EQUAL(fvals_doc2
.size(), 4);
1023 vector
<double> test_vals_doc1(4);
1024 // These are the appropriate IdfFeature values for the first document.
1025 test_vals_doc1
[0] = 0.0;
1026 test_vals_doc1
[1] = 0.0;
1027 test_vals_doc1
[2] = 0.0;
// The fourth expected value is the weight reported by the MSet iterator.
1029 Xapian::MSetIterator it
= mset
.begin();
1030 test_vals_doc1
[3] = it
.get_weight();
1032 vector
<double> test_vals_doc2(4);
1033 // These are the appropriate IdfFeature values for the second document.
1034 test_vals_doc2
[0] = 0.0;
1035 test_vals_doc2
[1] = 0.0;
1036 test_vals_doc2
[2] = 0.0;
// NOTE(review): embedded lines 1037-1038 are absent; presumably `it` is
// advanced to the second match before its weight is read -- confirm.
1039 test_vals_doc2
[3] = it
.get_weight();
// Only the weight slot is divided by a maximum; the other slots are compared
// with 0.0 directly.
1041 double max_weight
= max(test_vals_doc1
[3], test_vals_doc2
[3]);
1043 // test for title in normalized form
1044 TEST_EQUAL_DOUBLE(fvals_doc1
[0], 0.0);
1045 TEST_EQUAL_DOUBLE(fvals_doc2
[0], 0.0);
1047 // test for body in normalized form
1048 TEST_EQUAL_DOUBLE(fvals_doc1
[1], 0.0);
1049 TEST_EQUAL_DOUBLE(fvals_doc2
[1], 0.0);
1051 // test for whole in normalized form
1052 TEST_EQUAL_DOUBLE(fvals_doc1
[2], 0.0);
1053 TEST_EQUAL_DOUBLE(fvals_doc2
[2], 0.0);
1055 // test for weight in normalized form
1056 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_weight
);
1057 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_weight
);
1060 // Test createfeaturevector method for TfDoclenFeature
// Builds a FeatureList around a TfDoclenFeature, creates feature vectors for
// the matches of the query "score" in "db_index_three_documents_no_common",
// and compares each of the four values per document against a hard-coded
// expectation divided by the larger of the two documents' expectations.
// NOTE(review): the embedded line numbers have gaps (e.g. 1062, 1066-1067,
// 1073, 1114-1115, 1117-1118, 1121), so braces and some statements are not
// part of this listing -- confirm against the upstream file.
1061 DEFINE_TESTCASE(createfeaturevector_tfdoclenfeature
, backend
)
// Presumably marks the test as an expected failure on the "multi" backend;
// the message is the one given in the source.
1063 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1064 vector
<Xapian::Feature
*> f
;
1065 Xapian::TfDoclenFeature
* f1
= new Xapian::TfDoclenFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 1066-1067 are absent) -- confirm.
1068 Xapian::FeatureList
fl(f
);
1069 Xapian::Database db
= get_database("db_index_three_documents_no_common",
1070 db_index_three_documents_no_common
);
1071 Xapian::Enquire
enquire(db
);
1072 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 1073) is absent from
// this listing.
1074 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
1076 Xapian::QueryParser queryparser
;
1077 queryparser
.set_stemmer(Xapian::Stem("en"));
1078 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1079 queryparser
.add_prefix("title", "S");
1080 queryparser
.add_prefix("description", "XD");
1082 // As the feature values depend on different prefixed terms in the query
1083 // like "title","body" and "whole", so we need to separate it out instead
1084 // of just writing Query("score").
1085 string querystring
= "title:score description:score score";
1086 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
1088 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1089 TEST_EQUAL(fv
.size(), 2);
1090 TEST_EQUAL(fv
[0].get_fcount(), 4);
1091 TEST_EQUAL(fv
[1].get_fcount(), 4);
1093 vector
<double> fvals_doc1
= fv
[0].get_fvals();
1094 vector
<double> fvals_doc2
= fv
[1].get_fvals();
1095 TEST_EQUAL(fvals_doc1
.size(), 4);
1096 TEST_EQUAL(fvals_doc2
.size(), 4);
1098 vector
<double> test_vals_doc1(4);
1099 // These are the appropriate TfDoclenFeature values for the first
// document.
1101 test_vals_doc1
[0] = 0.0511525224473813;
1102 test_vals_doc1
[1] = 0.0323089286738408;
1103 test_vals_doc1
[2] = 0.0335890631408052;
// The fourth expected value is the weight reported by the MSet iterator.
1105 Xapian::MSetIterator it
= mset
.begin();
1106 test_vals_doc1
[3] = it
.get_weight();
1108 vector
<double> test_vals_doc2(4);
1109 // These are the appropriate TfDoclenFeature values for the second
// document.
1111 test_vals_doc2
[0] = 0.0;
1112 test_vals_doc2
[1] = 0.03237347800973529;
1113 test_vals_doc2
[2] = 0.03200637092048766;
// NOTE(review): embedded lines 1114-1115 are absent; presumably `it` is
// advanced to the second match before its weight is read -- confirm.
1116 test_vals_doc2
[3] = it
.get_weight();
// Per-slot maximum of the two expected vectors; used below as the divisor
// when comparing against the values from create_feature_vectors().
// NOTE(review): the declaration of max_val (embedded lines 1117-1118) is
// absent from this listing.
1119 for (int i
= 0; i
< 4; ++i
) {
1120 max_val
[i
] = max(test_vals_doc1
[i
], test_vals_doc2
[i
]);
1123 // test for title in normalized form
1124 TEST_EQUAL_DOUBLE(fvals_doc1
[0], test_vals_doc1
[0] / max_val
[0]);
1125 TEST_EQUAL_DOUBLE(fvals_doc2
[0], test_vals_doc2
[0] / max_val
[0]);
1127 // test for body in normalized form
1128 TEST_EQUAL_DOUBLE(fvals_doc1
[1], test_vals_doc1
[1] / max_val
[1]);
1129 TEST_EQUAL_DOUBLE(fvals_doc2
[1], test_vals_doc2
[1] / max_val
[1]);
1131 // test for whole in normalized form
1132 TEST_EQUAL_DOUBLE(fvals_doc1
[2], test_vals_doc1
[2] / max_val
[2]);
1133 TEST_EQUAL_DOUBLE(fvals_doc2
[2], test_vals_doc2
[2] / max_val
[2]);
1135 // test for weight in normalized form
1136 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_val
[3]);
1137 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_val
[3]);
1140 // Test createfeaturevector method for CollTfCollLenFeature
// Builds a FeatureList around a CollTfCollLenFeature, creates feature vectors
// for the matches of the query "score" in
// "db_index_three_documents_no_common", and compares each of the four values
// per document against a hard-coded expectation divided by the larger of the
// two documents' expectations. The first three expectations are identical
// for both documents.
// NOTE(review): the embedded line numbers have gaps (e.g. 1142, 1145-1146,
// 1152, 1192-1193, 1195-1196, 1199), so braces and some statements are not
// part of this listing -- confirm against the upstream file.
1141 DEFINE_TESTCASE(createfeaturevector_colltfcolllenfeature
, backend
)
1143 vector
<Xapian::Feature
*> f
;
1144 Xapian::CollTfCollLenFeature
* f1
= new Xapian::CollTfCollLenFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 1145-1146 are absent) -- confirm.
1147 Xapian::FeatureList
fl(f
);
1148 Xapian::Database db
= get_database("db_index_three_documents_no_common",
1149 db_index_three_documents_no_common
);
1150 Xapian::Enquire
enquire(db
);
1151 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 1152) is absent from
// this listing.
1153 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
1155 Xapian::QueryParser queryparser
;
1156 queryparser
.set_stemmer(Xapian::Stem("en"));
1157 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1158 queryparser
.add_prefix("title", "S");
1159 queryparser
.add_prefix("description", "XD");
1161 // As the feature values depend on different prefixed terms in the query
1162 // like "title","body" and "whole", so we need to separate it out instead
1163 // of just writing Query("score").
1164 string querystring
= "title:score description:score score";
1165 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
1167 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1168 TEST_EQUAL(fv
.size(), 2);
1169 TEST_EQUAL(fv
[0].get_fcount(), 4);
1170 TEST_EQUAL(fv
[1].get_fcount(), 4);
1172 vector
<double> fvals_doc1
= fv
[0].get_fvals();
1173 vector
<double> fvals_doc2
= fv
[1].get_fvals();
1174 TEST_EQUAL(fvals_doc1
.size(), 4);
1175 TEST_EQUAL(fvals_doc2
.size(), 4);
1177 vector
<double> test_vals_doc1(4);
1178 // These are the appropriate CollTfCollLenFeature values for the first
// document.
1180 test_vals_doc1
[0] = 0.45863784902564930;
1181 test_vals_doc1
[1] = 3.13291481930625260;
1182 test_vals_doc1
[2] = 4.94672282004904673;
// The fourth expected value is the weight reported by the MSet iterator.
1184 Xapian::MSetIterator it
= mset
.begin();
1185 test_vals_doc1
[3] = it
.get_weight();
1187 vector
<double> test_vals_doc2(4);
1188 // values will be same as that of the first document
1189 test_vals_doc2
[0] = 0.45863784902564930;
1190 test_vals_doc2
[1] = 3.13291481930625260;
1191 test_vals_doc2
[2] = 4.94672282004904673;
// NOTE(review): embedded lines 1192-1193 are absent; presumably `it` is
// advanced to the second match before its weight is read -- confirm.
1194 test_vals_doc2
[3] = it
.get_weight();
// Per-slot maximum of the two expected vectors; used below as the divisor
// when comparing against the values from create_feature_vectors().
// NOTE(review): the declaration of max_val (embedded lines 1195-1196) is
// absent from this listing.
1197 for (int i
= 0; i
< 4; ++i
) {
1198 max_val
[i
] = max(test_vals_doc1
[i
], test_vals_doc2
[i
]);
1201 // test for title in normalized form
1202 TEST_EQUAL_DOUBLE(fvals_doc1
[0], test_vals_doc1
[0] / max_val
[0]);
1203 TEST_EQUAL_DOUBLE(fvals_doc2
[0], test_vals_doc2
[0] / max_val
[0]);
1205 // test for body in normalized form
1206 TEST_EQUAL_DOUBLE(fvals_doc1
[1], test_vals_doc1
[1] / max_val
[1]);
1207 TEST_EQUAL_DOUBLE(fvals_doc2
[1], test_vals_doc2
[1] / max_val
[1]);
1209 // test for whole in normalized form
1210 TEST_EQUAL_DOUBLE(fvals_doc1
[2], test_vals_doc1
[2] / max_val
[2]);
1211 TEST_EQUAL_DOUBLE(fvals_doc2
[2], test_vals_doc2
[2] / max_val
[2]);
1213 // test for weight in normalized form
1214 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_val
[3]);
1215 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_val
[3]);
1218 // Test createfeaturevector method for TfIdfDoclenFeature
// Builds a FeatureList around a TfIdfDoclenFeature and creates feature
// vectors for the matches of the query "score" in
// "db_index_three_documents". The first three values of both documents are
// expected to be 0.0; the fourth is the match weight divided by the larger
// of the two weights.
// NOTE(review): the embedded line numbers have gaps (e.g. 1220, 1223-1224,
// 1230, 1271-1272), so braces and some statements are not part of this
// listing -- confirm against the upstream file.
1219 DEFINE_TESTCASE(createfeaturevector_tfidfdoclenfeature
, backend
)
1221 vector
<Xapian::Feature
*> f
;
1222 Xapian::TfIdfDoclenFeature
* f1
= new Xapian::TfIdfDoclenFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 1223-1224 are absent) -- confirm.
1225 Xapian::FeatureList
fl(f
);
1226 Xapian::Database db
= get_database("db_index_three_documents",
1227 db_index_three_documents
);
1228 Xapian::Enquire
enquire(db
);
1229 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 1230) is absent from
// this listing.
1231 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
1233 Xapian::QueryParser queryparser
;
1234 queryparser
.set_stemmer(Xapian::Stem("en"));
1235 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1236 queryparser
.add_prefix("title", "S");
1237 queryparser
.add_prefix("description", "XD");
1239 // As the feature values depend on different prefixed terms in the query
1240 // like "title","body" and "whole", so we need to separate it out instead
1241 // of just writing Query("score").
1242 string querystring
= "title:score description:score score";
1243 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
1245 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1246 TEST_EQUAL(fv
.size(), 2);
1247 TEST_EQUAL(fv
[0].get_fcount(), 4);
1248 TEST_EQUAL(fv
[1].get_fcount(), 4);
1250 vector
<double> fvals_doc1
= fv
[0].get_fvals();
1251 vector
<double> fvals_doc2
= fv
[1].get_fvals();
1252 TEST_EQUAL(fvals_doc1
.size(), 4);
1253 TEST_EQUAL(fvals_doc2
.size(), 4);
1255 vector
<double> test_vals_doc1(4);
1256 // These are the appropriate TfIdfDoclenFeature values for the first
// document.
1258 test_vals_doc1
[0] = 0.0;
1259 test_vals_doc1
[1] = 0.0;
1260 test_vals_doc1
[2] = 0.0;
// The fourth expected value is the weight reported by the MSet iterator.
1262 Xapian::MSetIterator it
= mset
.begin();
1263 test_vals_doc1
[3] = it
.get_weight();
1265 vector
<double> test_vals_doc2(4);
1266 // These are the appropriate TfIdfDoclenFeature values for the second
// document.
1268 test_vals_doc2
[0] = 0.0;
1269 test_vals_doc2
[1] = 0.0;
1270 test_vals_doc2
[2] = 0.0;
// NOTE(review): embedded lines 1271-1272 are absent; presumably `it` is
// advanced to the second match before its weight is read -- confirm.
1273 test_vals_doc2
[3] = it
.get_weight();
// Only the weight slot is divided by a maximum; the other slots are compared
// with 0.0 directly.
1275 double max_weight
= max(test_vals_doc1
[3], test_vals_doc2
[3]);
1277 // test for title in normalized form
1278 TEST_EQUAL_DOUBLE(fvals_doc1
[0], 0.0);
1279 TEST_EQUAL_DOUBLE(fvals_doc2
[0], 0.0);
1281 // test for body in normalized form
1282 TEST_EQUAL_DOUBLE(fvals_doc1
[1], 0.0);
1283 TEST_EQUAL_DOUBLE(fvals_doc2
[1], 0.0);
1285 // test for whole in normalized form
1286 TEST_EQUAL_DOUBLE(fvals_doc1
[2], 0.0);
1287 TEST_EQUAL_DOUBLE(fvals_doc2
[2], 0.0);
1289 // test for weight in normalized form
1290 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_weight
);
1291 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_weight
);
1294 // Test createfeaturevector method for TfDoclenCollTfCollLenFeature
// Builds a FeatureList around a TfDoclenCollTfCollLenFeature, creates feature
// vectors for the matches of the query "score" in
// "db_index_three_documents", and compares each of the four values per
// document against a hard-coded expectation divided by the larger of the two
// documents' expectations.
// NOTE(review): the embedded line numbers have gaps (e.g. 1296, 1301-1302,
// 1308, 1349-1350, 1352-1353, 1356), so braces and some statements are not
// part of this listing -- confirm against the upstream file.
1295 DEFINE_TESTCASE(createfeaturevector_tfdoclencolllfcolllen
, backend
)
// Presumably marks the test as an expected failure on the "multi" backend;
// the message is the one given in the source.
1297 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1298 vector
<Xapian::Feature
*> f
;
1299 Xapian::TfDoclenCollTfCollLenFeature
* f1
=
1300 new Xapian::TfDoclenCollTfCollLenFeature();
// NOTE(review): nothing visible here adds f1 to f before f is handed to
// FeatureList (embedded lines 1301-1302 are absent) -- confirm.
1303 Xapian::FeatureList
fl(f
);
1304 Xapian::Database db
= get_database("db_index_three_documents",
1305 db_index_three_documents
);
1306 Xapian::Enquire
enquire(db
);
1307 enquire
.set_query(Xapian::Query("score"));
// NOTE(review): the declaration of mset (embedded line 1308) is absent from
// this listing.
1309 mset
= enquire
.get_mset(0, 10);
// Parser with English stemming that maps the query prefixes "title" and
// "description" to the term prefixes "S" and "XD".
1311 Xapian::QueryParser queryparser
;
1312 queryparser
.set_stemmer(Xapian::Stem("en"));
1313 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1314 queryparser
.add_prefix("title", "S");
1315 queryparser
.add_prefix("description", "XD");
1317 // As the feature values depend on different prefixed terms in the query
1318 // like "title","body" and "whole", so we need to separate it out instead
1319 // of just writing Query("score").
1320 string querystring
= "title:score description:score score";
1321 Xapian::Query query
= queryparser
.parse_query(querystring
);
// Two feature vectors are expected, each carrying four values.
1323 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1324 TEST_EQUAL(fv
.size(), 2);
1325 TEST_EQUAL(fv
[0].get_fcount(), 4);
1326 TEST_EQUAL(fv
[1].get_fcount(), 4);
1328 vector
<double> fvals_doc1
= fv
[0].get_fvals();
1329 vector
<double> fvals_doc2
= fv
[1].get_fvals();
1330 TEST_EQUAL(fvals_doc1
.size(), 4);
1331 TEST_EQUAL(fvals_doc2
.size(), 4);
1333 vector
<double> test_vals_doc1(4);
1334 // These are the appropriate TfDoclenCollTfCollLenFeature values for
1335 // the first document.
1336 test_vals_doc1
[0] = 0.11394335230683678;
1337 test_vals_doc1
[1] = 0.76130720333102619;
1338 test_vals_doc1
[2] = 0.90738326700002048;
// The fourth expected value is the weight reported by the MSet iterator.
1340 Xapian::MSetIterator it
= mset
.begin();
1341 test_vals_doc1
[3] = it
.get_weight();
1343 vector
<double> test_vals_doc2(4);
1344 // These are the appropriate TfDoclenCollTfCollLenFeature values for
1345 // the second document.
1346 test_vals_doc2
[0] = 0.0;
1347 test_vals_doc2
[1] = 0.77758890362035493;
1348 test_vals_doc2
[2] = 0.78786362447009839;
// NOTE(review): embedded lines 1349-1350 are absent; presumably `it` is
// advanced to the second match before its weight is read -- confirm.
1351 test_vals_doc2
[3] = it
.get_weight();
// Per-slot maximum of the two expected vectors; used below as the divisor
// when comparing against the values from create_feature_vectors().
// NOTE(review): the declaration of max_val (embedded lines 1352-1353) is
// absent from this listing.
1354 for (int i
= 0; i
< 4; ++i
) {
1355 max_val
[i
] = max(test_vals_doc1
[i
], test_vals_doc2
[i
]);
1358 // test for title in normalized form
1359 TEST_EQUAL_DOUBLE(fvals_doc1
[0], test_vals_doc1
[0] / max_val
[0]);
1360 TEST_EQUAL_DOUBLE(fvals_doc2
[0], test_vals_doc2
[0] / max_val
[0]);
1362 // test for body in normalized form
1363 TEST_EQUAL_DOUBLE(fvals_doc1
[1], test_vals_doc1
[1] / max_val
[1]);
1364 TEST_EQUAL_DOUBLE(fvals_doc2
[1], test_vals_doc2
[1] / max_val
[1]);
1366 // test for whole in normalized form
1367 TEST_EQUAL_DOUBLE(fvals_doc1
[2], test_vals_doc1
[2] / max_val
[2]);
1368 TEST_EQUAL_DOUBLE(fvals_doc2
[2], test_vals_doc2
[2] / max_val
[2]);
1370 // test for weight in normalized form
1371 TEST_EQUAL_DOUBLE(fvals_doc1
[3], test_vals_doc1
[3] / max_val
[3]);
1372 TEST_EQUAL_DOUBLE(fvals_doc2
[3], test_vals_doc2
[3] / max_val
[3]);
// Xapian::Feature subclass instantiated by the populatefeature test. It
// requests five statistics through need_stat(), returns an empty value
// vector from get_values(), reports the name "CustomFeature", and checks the
// statistic accessors against hard-coded expectations for the stemmed term
// "tiger" under the "S", "XD" and unprefixed forms.
// NOTE(review): the embedded line numbers have gaps (1376-1377, 1383, 1386,
// 1389-1390, 1415-1417), so the access specifier, the header of whatever
// encloses the need_stat() calls, the closing braces and the header of the
// method holding the TEST_* checks (invoked elsewhere as test_stats()) are
// not part of this listing -- confirm against the upstream file.
1375 class CustomFeature
: public Xapian::Feature
{
// Request every statistic that the checks further down read back.
1378 need_stat(Xapian::Feature::TERM_FREQUENCY
);
1379 need_stat(Xapian::Feature::DOCUMENT_LENGTH
);
1380 need_stat(Xapian::Feature::COLLECTION_TERM_FREQ
);
1381 need_stat(Xapian::Feature::COLLECTION_LENGTH
);
1382 need_stat(Xapian::Feature::INVERSE_DOCUMENT_FREQUENCY
);
// This feature contributes no values of its own.
1384 std::vector
<double> get_values() const {
1385 return vector
<double>();
1387 std::string
name() const {
1388 return "CustomFeature";
1391 // test for term frequency
1392 TEST_EQUAL(get_termfreq("ZStiger"), 1);
1393 TEST_EQUAL(get_termfreq("ZXDtiger"), 6);
1394 TEST_EQUAL(get_termfreq("Ztiger"), 2);
1396 // test for inverse document frequency
// The same value is expected for all three prefixed forms.
1397 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("ZStiger"), 0.176091259055681);
1398 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("ZXDtiger"), 0.176091259055681);
1399 TEST_EQUAL_DOUBLE(get_inverse_doc_freq("Ztiger"), 0.176091259055681);
1401 // test for document length
// The expected "whole" length equals the "title" and "body" lengths added
// together (4 + 182 = 186).
1402 TEST_EQUAL(get_doc_length("title"), 4);
1403 TEST_EQUAL(get_doc_length("body"), 182);
1404 TEST_EQUAL(get_doc_length("whole"), 186);
1406 // test for collection length
// Likewise 13 + 509 = 522 for the collection as a whole.
1407 TEST_EQUAL(get_collection_length("title"), 13);
1408 TEST_EQUAL(get_collection_length("body"), 509);
1409 TEST_EQUAL(get_collection_length("whole"), 522);
1411 // test for collection term frequency
1412 TEST_EQUAL(get_collection_termfreq("ZStiger"), 1);
1413 TEST_EQUAL(get_collection_termfreq("ZXDtiger"), 6);
1414 TEST_EQUAL(get_collection_termfreq("Ztiger"), 2);
1418 DEFINE_TESTCASE(populatefeature
, backend
) {
1419 XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
1420 vector
<Xapian::Feature
*> f
;
1421 CustomFeature
* custom_feature
= new CustomFeature();
1422 f
.push_back(custom_feature
);
1424 Xapian::FeatureList
fl(f
);
1425 Xapian::Database db
= get_database("db_index_three_documents_no_common",
1426 db_index_three_documents_no_common
);
1427 Xapian::Enquire
enquire(db
);
1428 enquire
.set_query(Xapian::Query("tigers"));
1430 mset
= enquire
.get_mset(0, 10);
1432 TEST(!mset
.empty());
1434 Xapian::QueryParser queryparser
;
1435 queryparser
.set_stemmer(Xapian::Stem("en"));
1436 queryparser
.set_stemming_strategy(queryparser
.STEM_ALL_Z
);
1437 queryparser
.add_prefix("title", "S");
1438 queryparser
.add_prefix("description", "XD");
1440 string querystring
= "title:tigers description:tigers tigers";
1441 Xapian::Query query
= queryparser
.parse_query(querystring
);
1443 auto fv
= fl
.create_feature_vectors(mset
, query
, db
);
1444 TEST_EQUAL(fv
.size(), 1);
1446 custom_feature
->test_stats();