Add README
[xapian-trec.git] / trec_query.cc
blob726e922116c076b72bd70e3522822f955b7d4db2
1 /* trec_query.cc: Example batch query generator for TREC experiments
3 * ----START-LICENCE----
4 * Copyright 2003 Andy MacFarlane, City University
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 * -----END-LICENCE-----
24 #include <xapian.h>
25 #include <algorithm>
26 #include <iostream>
27 #include <string>
28 #include <sys/types.h>
29 #include <dirent.h>
30 #include <sys/stat.h>
31 #include <unistd.h>
32 #include <getopt.h>
33 #include <fcntl.h>
34 #include "htmlparse.h"
35 #include "stopword.h"
36 #include "config_file.h"
37 #include "indextext.h"
38 #include <ctype.h>
39 #include <malloc.h>
40 #include <signal.h>
41 #include <math.h>
42 #include <errno.h>
43 #include <cstring>
45 using namespace Xapian;
46 using namespace std;
/* Capacity limits. */
#define MAX_KEYWORDS 10000   /* number of term (and weight) slots in a QUERY */
#define BIG_BUFFER 100000    /* size of the buffer create_queries() reads the topic file into */

/* Action codes: check_code() stores one of these for each word read, and
   create_queries() switches on it. */
#define NOACTION 0           /* value create_queries() gives the code before the first word */
#define END_TOPIC 1          /* word starts with "</top>" */
#define TOPIC_NUMBER 2       /* word starts with "<num>" */
#define TITLE 3              /* word starts with "<title>" */
#define DESC 4               /* word starts with "<desc>" */
#define NARR 5               /* word starts with "<narr>" */
#define SAVETERM 6           /* any word that matches no recognised tag */

/* C-style boolean values. */
#define TRUE 1
#define FALSE 0
61 typedef struct query {
62 /* data structure for query */
64 int topic_no; /* topic no for the query */
65 char term_set[MAX_KEYWORDS][KW_SIZE]; /* list of query terms */
66 double weights[MAX_KEYWORDS]; /* weights for those terms */
67 int nwords; /* no of words in the query */
69 } QUERY;
/*
 * skip_spaces: move *curpos forward to the start of the next token.
 *
 *   page    - text being scanned
 *   size    - number of bytes that may be read from page
 *   curpos  - in/out: current read position within page
 *
 * Scanning stops at the first alphanumeric character, at a '<' (so that
 * tags are seen by the caller), at a NUL byte, or when the end of the
 * buffer is reached - whichever comes first.  On return *curpos indexes
 * that character (or equals size).
 */
void skip_spaces( char page[], int size, int *curpos ) {

    while( *curpos >= 0 && *curpos < size && page[*curpos] != '\0' ) {
        /* the <ctype.h> classifiers need an unsigned char value; a plain
           char holding a byte above 0x7f may be negative */
        const unsigned char ch = static_cast<unsigned char>( page[*curpos] );

        if( isalnum( ch ) || ch == '<' )
            break;

        *curpos += 1;
    } /*END while*/

} /*END skip_spaces*/
/*
 * fold_word: convert the first len characters of string to lower case,
 * in place.
 *
 *   string  - buffer to modify
 *   len     - number of characters to fold; the buffer need not be
 *             NUL-terminated and nothing beyond len is touched
 */
void fold_word( char string[], int len ) {

    for( int i = 0; i < len; i++ ) {
        /* tolower() is only defined for unsigned char values (and EOF), so
           convert first; a plain char above 0x7f may be negative */
        string[i] = static_cast<char>( tolower( static_cast<unsigned char>( string[i] ) ) );
    }

} /* END fold_word */
/*
 * iswordbreak: report whether c is a character that breaks a word.
 *
 * Returns 1 when c falls in one of the inclusive ranges listed in the
 * table below, 0 otherwise.  Digits, letters, '.', '<' and DEL are not
 * breaks.
 *
 * NOTE(review): '>' lies inside the '='..'@' range and is therefore
 * reported as a break, while '<' is not - confirm this asymmetry is
 * intended for tag processing.
 */
int iswordbreak( char c ) {

    /* { first, last } pairs, both ends inclusive */
    static const char break_ranges[][2] = {
        { 0,   '-' },     /* control characters, space, ! " # $ % & ' ( ) * + , - */
        { '/', '/' },
        { ':', ';' },
        { '=', '@' },     /* = > ? @ */
        { '[', '`' },     /* [ \ ] ^ _ ` */
        { '{', '~' }      /* { | } ~ */
    };
    const int range_count = sizeof break_ranges / sizeof break_ranges[0];

    for( int r = 0; r < range_count; r++ ) {
        if( c >= break_ranges[r][0] && c <= break_ranges[r][1] )
            return 1;
    }

    return 0;

} /* END iswordbreak */
113 void get_chars( char buffer[], int size, int *curpos, char saved_chars[KW_SIZE] ) {
114 /* get each character for a word */
116 int found = FALSE; /* condition for a found word */
117 register int cur_char; /* pos of char in current word */
119 /* loop until end of word found */
120 for( cur_char = 0; cur_char < KW_SIZE && !found; cur_char++ )
122 if ( isspace(buffer[*curpos]) )
124 found = 1;
125 break;
127 else if ( buffer[*curpos] == '\0' )
129 found = 1;
130 *curpos += 1;
131 break;
133 else
134 { /* got a saveable character */
135 saved_chars[cur_char] = buffer[*curpos];
136 *curpos += 1;
137 } /*END if*/
138 } /*END for*/
140 fold_word( saved_chars, cur_char ); /* case fold the word */
142 /* deal with appostr. (mostly <string>'s really) */
143 if( cur_char >= 2 )
145 if( saved_chars[cur_char-2] == '\'' || saved_chars[cur_char-2] == '\"' )
147 saved_chars[cur_char-2] = saved_chars[cur_char-1];
148 saved_chars[cur_char-1] = '\0';
150 else
151 saved_chars[cur_char] = '\0';
152 } /* END if */
153 else
154 saved_chars[cur_char] = '\0';
156 } /* END get_chars */
158 void get_word( char buffer[], int size, char word[KW_SIZE], int *curpos ) {
159 /* get a word from a page */
161 /*skip any number of spaces/irrelevant chars in rows/cols*/
162 skip_spaces( buffer, size, curpos );
164 /* now get each character from the line */
165 get_chars( buffer, size, curpos, word );
166 word[KW_SIZE-1] = '\0'; /*max size keyword to record */
168 } /*END get_word*/
170 void check_code( char word[KW_SIZE], int *code ) {
172 *code = SAVETERM; /* assume its a saveable term */
174 if( strncmp( word,"<top>",5) == 0 )
175 *code = -1;
176 if( strncmp( word,"<num>",5) == 0 )
177 *code = TOPIC_NUMBER;
178 if( strncmp( word,"<title>",7) == 0 )
179 *code = TITLE;
180 if( strncmp( word,"<desc>",6) == 0 )
181 *code = DESC;
182 if( strncmp( word,"<narr>",6) == 0 )
183 *code = NARR;
184 if( strncmp( word,"</top>",6) == 0 )
185 *code = END_TOPIC;
187 } /* END check_code */
189 void clean_word( char word[KW_SIZE] ) {
191 int i;
192 int curpos;
193 char cleanedword[KW_SIZE];
196 Init_str( cleanedword, KW_SIZE );
198 for( i=0, curpos=0; i < strlen(word); i++ ) {
199 if(isalnum(word[i])) cleanedword[curpos++] = word[i];
200 } /* END for */
202 strcpy( word, cleanedword );
204 } /* END clean_word */
/*
 * find_field_status: work out which topic fields the user asked for.
 *
 *   arg        - NUL-terminated string of field letters: 't' (title),
 *                'd' (description), 'n' (narrative); other characters
 *                are ignored
 *   use_title  - set to 1 when 't' occurs in arg, otherwise left alone
 *   use_desc   - set to 1 when 'd' occurs in arg, otherwise left alone
 *   use_narr   - set to 1 when 'n' occurs in arg, otherwise left alone
 *
 * The outcome is reported on stderr: one line per selected field, or an
 * error line when arg holds none of the three letters.
 */
void find_field_status( char arg[], int *use_title, int *use_desc, int *use_narr ) {

    bool recognised = false;   /* did arg contain at least one field letter? */

    for( const char *letter = arg; *letter != '\0'; ++letter ) {
        if( *letter == 't' ) {
            *use_title = 1;
            recognised = true;
        } else if( *letter == 'd' ) {
            *use_desc = 1;
            recognised = true;
        } else if( *letter == 'n' ) {
            *use_narr = 1;
            recognised = true;
        }
    } /* END for */

    if( !recognised ) {
        fprintf(stderr,"ERROR - invalid TREC field entered [%s] please try again\n", arg );
        return;
    } /* END if */

    if( *use_title )
        fprintf(stderr,"FIELD) you have requested the use of the title field\n");
    if( *use_desc )
        fprintf(stderr,"FIELD) you have requested the use of the description field\n");
    if( *use_narr )
        fprintf(stderr,"FIELD) you have requested the use of the narrative field\n");

} /* END find_field_status */
236 void Init_QUERY( QUERY *q ) {
237 /* initalise a query */
239 int i;
241 for( i=0; i < MAX_KEYWORDS; i++ ) {
242 Init_str( q->term_set[i], KW_SIZE );
243 q->weights[i] = 0.0;
244 } /* END for */
245 q->nwords = 0;
247 } /* END Init_QUERY */
249 void Save_QUERY( FILE *qd, QUERY q ) {
250 /* save query details to disk */
252 int i;
253 fprintf( qd, "%d ", q.topic_no );
254 fprintf( stderr, "got query {%d ", q.topic_no );
255 for( i=0; i < q.nwords; i++ ) {
256 fprintf( qd, "%s ", q.term_set[i] );
257 fprintf(stderr, "%s ", q.term_set[i] );
258 } /* END if */
259 fprintf( qd, "\n" );
260 fprintf( stderr, "}\n" );
262 } /* END Save_QUERY */
265 int Isin_query( char word[KW_SIZE], QUERY *q ) {
266 /* returns 1 if word is in query, 0 otherwise */
268 int found=0;
269 int i;
271 for( i=0; i < q->nwords && !found; i++ ) {
272 if( strcmp( word, q->term_set[i] ) == 0 ) found=1;
273 } /* END for */
275 return found;
277 } /* END Isin_query */
279 #define NTOPICSTOPS 66
281 char topic_stops[NTOPICSTOPS][KW_SIZE] = {
282 "document", "documents","discuss","discussed","mention","mentions","cite","cited","include","includes",
283 "included","report","reports","describe","describes","described","relevant","relevance","concern","concerns",
284 "concerned","reveal","specify","specifying","specified","specifics","announce","announces","announced",
285 "announcing","event","provided","occur","present","contain","contains","containing","example","examples",
286 "eg","ie","instance","instances","consider","considered","indicative","note","notes","noted","quote",
287 "quotes","substantive","unless","find","finds", "identify", "identified","identfies", "evidence",
288 "continue","define","determine", "determined", "discussing", "references", "reference" };
290 int stopped_word( char word[KW_SIZE], SW_STORE sw_store ) {
291 /* returns 1 if word is a stop word or frequent topic word, 0 otherwise */
293 int found=0;
294 int i;
296 /* check for frequent topic words */
297 for( i=0; i < NTOPICSTOPS && !found; i++ ) {
298 if( strcmp( word, topic_stops[i] ) == 0 ) found=1;
299 } /* END */
301 /* check for main stops */
302 if (!found) found = IsStopWord( sw_store, word );
304 return found;
306 } /* END if */
308 void Save_Word( char word[KW_SIZE], QUERY *q, SW_STORE sw_store ) {
309 /* save a word in a query, iff it isn't already there */
311 if( !Isin_query( word, q ) && !stopped_word( word, sw_store ) ) {
312 strcpy( q->term_set[q->nwords], word );
313 q->nwords += 1;
314 } /* END if */
316 } /* END Save_Word */
318 void create_queries( CONFIG_TREC & config ) {
320 FILE *query_fd;
321 FILE *topic_fd;
322 QUERY q;
323 int code;
324 char word[KW_SIZE];
325 int use_desc=0;
326 int use_narr=0;
327 int use_title=0;
328 char topics[BIG_BUFFER];
329 int topic_size=0;
330 int curpos=0;
331 int terms_to_save;
332 int save_word=0;
333 SW_STORE sw_store;
335 terms_to_save = config.get_nterms(); /* number of terms to save */
337 /* find out what type of search you want the user wants */
338 find_field_status( (char *) config.get_topicfields().c_str(), &use_title, &use_desc, &use_narr );
340 /* open the files for manipulation */
341 cout << "QUERY) topic file name is " << config.get_topicfile().c_str() << endl;
342 topic_fd = fopen( config.get_topicfile().c_str(), "r" );
343 if(!topic_fd) {
344 cout << "ERROR - can't open topic file" << config.get_queryfile().c_str() << "for reading" << endl;
345 std::exit(-1);
346 } /* END if */
347 for( topic_size=0; topic_size < BIG_BUFFER && !feof(topic_fd); topic_size++ ){
348 topics[topic_size] = fgetc(topic_fd);
349 } // END for
350 fclose( topic_fd );
352 query_fd = fopen( config.get_queryfile().c_str(), "w" );
353 if(!query_fd) {
354 cout << "ERROR - can't open file [" << config.get_queryfile().c_str() << "] for writing" << endl;
355 std::exit(0);
356 } // END if
357 cout << "QUERY) query file name is: " << config.get_queryfile().c_str() << endl;
359 Read_SW_File( (char *) config.get_stopsfile().c_str(), &sw_store );
361 /* iniatisation */
362 Init_QUERY( &q );
363 code = NOACTION;
365 while( curpos < topic_size ) {
367 Init_str( word, KW_SIZE );
368 get_word( topics, BIG_BUFFER, word, &curpos );
369 check_code( word, &code );
370 clean_word( word );
371 switch( code ) {
372 case NOACTION :
374 cout << "No action" << endl;
376 break;
377 case END_TOPIC :
379 /* save the query and clean it */
380 Save_QUERY( query_fd, q );
381 Init_QUERY( &q );
382 save_word=0;
384 break;
385 case TOPIC_NUMBER :
387 get_word( topics, BIG_BUFFER, word, &curpos ); // spin past Number
388 get_word( topics, BIG_BUFFER, word, &curpos );
389 q.topic_no = atoi(word);
390 save_word=0;
392 break;
393 case TITLE :
395 if( use_title ) save_word = 1;
396 else save_word=0;
398 break;
399 case DESC :
401 get_word( topics, BIG_BUFFER, word, &curpos ); /* spin past Description: */
402 if( use_desc ) save_word = 1;
403 else save_word=0;
405 break;
406 case NARR :
408 get_word( topics, BIG_BUFFER, word, &curpos ); /* spin past Narrative: */
409 if( use_narr ) save_word = 1;
410 else save_word=0;
412 break;
413 default : /* save the word if required */
415 if( save_word ) Save_Word( word, &q, sw_store );
417 } /* END switch */
418 } /* END while */
420 } /* END create_queries( */
422 int main(int argc, char **argv)
425 // only one parameter is required for this program
426 if(argc < 2) {
427 cout << "usage: " << argv[0] << " <config file>" << endl;
428 std::exit(1);
431 // Catch any Xapian::Error exceptions thrown
432 try {
434 CONFIG_TREC config;
435 config.setup_config( string(argv[1]) );
436 config.check_query_config();
438 create_queries( config );
440 } catch(const Xapian::Error &error) {
441 cout << "Exception: " << error.get_msg() << endl;
443 } // END main