Add README
[xapian-trec.git] / trec_search.cc
blob43e6eea0008861d4f29689db438cb69dac0dc721
1 /* trec_search.cc: Example batch search for TREC experiments
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2003 Olly Betts
6 * Copyright 2003 Andy MacFarlane, City University
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * USA
22 * -----END-LICENCE-----
25 #include "config_file.h"
26 #include <fstream>
27 #include <xapian.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <string>
31 #include <time.h>
32 #include "stopword.h"
33 #include "split.h"
34 #include "timer.h"
36 using namespace Xapian;
37 using namespace std;
39 int load_query( std::ifstream & queryfile, int & topicno, SW_STORE sw_store, Xapian::Query & query, Xapian::Stem & stemmer ) {
40 // load a query and record its terms
42 if( queryfile.eof() ) return 0;
44 int found_topicno=0;
45 string line;
46 getline(queryfile,line);
47 line[line.size()-1] ='\0';
48 vector<string> data;
49 split(line, ' ', data );
50 vector <string> terms;
51 for( vector<string>::const_iterator start = data.begin(); start != data.end(); start++ ) {
52 string queryword;
53 if( !found_topicno ) {
54 topicno = atoi( start->c_str());
55 found_topicno=1;
56 } else if(!IsStopWord( sw_store, (char *) start->c_str() )) {
57 queryword = stemmer(*start);
58 cout << "Queryword: = " << queryword << endl;
59 terms.push_back(queryword);
60 } // END if
62 } // END for
64 // make the query with the terms
65 Xapian::Query consquery(Xapian::Query::OP_OR, terms.begin(), terms.end());
66 query = consquery;
68 return 1;
70 } // END load_query
72 int main(int argc, char **argv)
74 // Simplest possible options parsing: we just require two or more
75 // parameters.
76 if(argc < 2) {
77 cout << "usage: " << argv[0] << " <config file>" << endl;
78 exit(1);
81 // Catch any Xapian::Error exceptions thrown
82 try {
83 // load the TREC experiment configuration file
84 CONFIG_TREC config;
85 config.setup_config( string(argv[1]) );
86 config.check_search_config();
87 Xapian::Stem stemmer( config.get_language() );
88 struct timeval start_time, finish_time, timelapse; /* timing variables */
90 // Make the database
91 Xapian::Database db(Xapian::Flint::open(config.get_db().c_str()));
93 // Start an enquire session
94 Xapian::Enquire enquire(db);
96 // open the query file
97 std::ifstream queryfile( config.get_queryfile().c_str() );
99 // open the results file
100 std::ofstream resultsfile( config.get_resultsfile().c_str() );
102 // open the transaction file
103 std::ofstream transfile( config.get_transfile().c_str() );
105 // load the stop word list
106 SW_STORE sw_store;
107 Read_SW_File( (char *) config.get_stopsfile().c_str(), &sw_store );
109 // count of no queries done
110 int count=0;
112 // total query time
113 float total_qp_time = 0.0;
115 // process the queries
116 while( !queryfile.eof() ) {
118 int topicno; // topic number for the query
119 int len=0;
121 // Build the query object
122 Xapian::Query query;
123 int gotquery = load_query( queryfile, topicno, sw_store, query, stemmer );
125 if(gotquery && !queryfile.eof()) {
127 // start the timer
128 gettimeofday( &start_time, 0 );
130 cout << "Running " << topicno << ", query = [" << query.get_description() << "] getting " << config.get_noresults() << " docs" << endl;
132 // Give the query object to the enquire session
133 enquire.set_query(query);
135 // Get the top n results of the query
136 Xapian::MSet matches = enquire.get_mset( 0, config.get_noresults() );
138 // record the number of matches made in this query
139 //int queryweightings = enquire.get_totalweightings();
140 //cout << "W's) for this query is -> " << queryweightings << endl;
142 // Display the results cout << matches.size() << " results found" << endl;
143 count++;
144 if( (count % 1000) == 0 ) cout << "QUERIES PROCESSED) " << count << endl;
146 // record the results in a 'trec.log' file
147 for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); i++) {
148 Xapian::Document doc = i.get_document();
149 resultsfile << topicno << " Q0 " << doc.get_data() << " " << i.get_rank() << " " <<
150 i.get_weight() << " " << config.get_runname() << endl;
151 len++;
152 } // END for
154 // record the finish
155 gettimeofday( &finish_time, 0 );
156 diff_time( finish_time, start_time, &timelapse );
157 float qp_time = time_real( timelapse );
158 total_qp_time += qp_time;
159 transfile << topicno << "," << qp_time << "," << len << endl;
160 cout << topicno << "," << qp_time << "," << len << endl;
163 } // END if
164 } // END while
166 // print the total time, and average time per query
168 float avg_qp_time = total_qp_time /(float) count;
169 cout << "Average query time for " << count << " Queries is " <<
170 avg_qp_time << " secs, took a total of " << total_qp_time << " secs" << endl;
172 } catch( const Xapian::Error &error) {
173 cout << "Exception: " << error.get_msg() << endl;
174 } // END try/catch block
176 } // END main