1 /* trec_query.cc: Example batch query generator for TREC experiments
3 * ----START-LICENCE----
4 * Copyright 2003 Andy MacFarlane, City University
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * -----END-LICENCE-----
28 #include <sys/types.h>
34 #include "htmlparse.h"
36 #include "config_file.h"
37 #include "indextext.h"
45 using namespace Xapian
;
/* Upper bound on the number of terms kept for one query; it dimensions the
   term_set and weights arrays of the query structure. */
48 #define MAX_KEYWORDS 10000
/* Size in bytes of the in-memory buffer that create_queries() reads the
   topic file into. */
49 #define BIG_BUFFER 100000
/* Code stored by check_code() for a word that begins with "<num>". */
52 #define TOPIC_NUMBER 2
/* One topic in memory: its number plus the terms collected for it.
   Terms are stored in fixed KW_SIZE-byte slots, so the structure occupies
   at least MAX_KEYWORDS * KW_SIZE bytes.
   NOTE(review): the closing line of this typedef is not visible here; the
   rest of the file refers to the type as QUERY. */
61 typedef struct query
{
62 /* data structure for query */
64 int topic_no
; /* topic no for the query */
65 char term_set
[MAX_KEYWORDS
][KW_SIZE
]; /* list of query terms */
66 double weights
[MAX_KEYWORDS
]; /* weights for those terms */
67 int nwords
; /* no of words in the query */
/* skip_spaces: advance *curpos to the start of the next token in page.
 *
 *   page   - text buffer being scanned
 *   size   - number of bytes of page that may be examined
 *   curpos - in/out scan position (index into page)
 *
 * Scanning stops at the first alphanumeric character, at a '<' (so that
 * tags are kept), at a NUL byte, or when *curpos reaches size, whichever
 * comes first.  The size bound keeps the scan inside the buffer even when
 * the text carries no terminating NUL.
 */
void skip_spaces( char page[], int size, int *curpos ) {

  while( *curpos < size && page[*curpos] != '\0' ) {
    /* the <ctype.h> classifiers need an unsigned char value */
    const unsigned char c = static_cast<unsigned char>( page[*curpos] );

    if( isalnum( c ) || c == '<' ) break;
    ++*curpos;
  } /* END while */

} /* END skip_spaces */
/* fold_word: convert the first len characters of string to lower case,
 * in place.
 *
 *   string - character buffer to fold
 *   len    - number of leading characters to convert
 *
 * Each character is passed to tolower() as an unsigned char value, which
 * is what the <ctype.h> functions require; characters with the high bit
 * set are therefore handled without undefined behaviour.
 */
void fold_word( char string[], int len ) {

  for( int i = 0; i < len; i++ ) {
    const unsigned char c = static_cast<unsigned char>( string[i] );
    string[i] = static_cast<char>( tolower( c ) );
  }

} /* END fold_word */
/* iswordbreak: classify c by character range.
   The ranges tested are NUL..'-', '['..'`', '{'..'~', ':'..';' and
   '='..'@'.  Letters, digits, '.', '/' and '<' fall outside every range,
   as does any negative char value; note that '>' lies inside '='..'@'.
   NOTE(review): the statement executed under each test and the final
   return are not visible here - presumably a true value on a match and a
   false value otherwise; confirm. */
92 int iswordbreak( char c
) {
93 /* see if there is a break in the middle of a word, such as a hyper
94 less than sign etc, if so return true else false - version2 */
95 /* note: used for the chamber: ignores <> chars for tag processing */
97 if( c
>= 0 && c
<= '-' )
99 if( c
>= '[' && c
<= '`' )
101 if( c
>= '{' && c
<= '~' )
103 if( c
>= ':' && c
<= ';' )
105 if( c
>= '=' && c
<= '@' )
111 } /* END iswordbreak */
/* get_chars: copy the characters of the next word from buffer, starting at
   index *curpos, into saved_chars, then case fold and NUL-terminate it.
     buffer      - text being scanned
     size        - NOTE(review): not referenced in the visible lines
     curpos      - in/out index into buffer
     saved_chars - receives the word
   The copy loop runs for at most KW_SIZE iterations and tests the current
   character for whitespace and for NUL.
   NOTE(review): the statements that advance *curpos and set found are not
   visible here. */
113 void get_chars( char buffer
[], int size
, int *curpos
, char saved_chars
[KW_SIZE
] ) {
114 /* get each character for a word */
116 int found
= FALSE
; /* condition for a found word */
117 register int cur_char
; /* pos of char in current word */
119 /* loop until end of word found */
120 for( cur_char
= 0; cur_char
< KW_SIZE
&& !found
; cur_char
++ )
122 if ( isspace(buffer
[*curpos
]) )
127 else if ( buffer
[*curpos
] == '\0' )
134 { /* got a saveable character */
135 saved_chars
[cur_char
] = buffer
[*curpos
];
/* fold only the cur_char characters handled so far */
140 fold_word( saved_chars
, cur_char
); /* case fold the word */
142 /* deal with appostr. (mostly <string>'s really) */
/* If the last-but-one character is a quote, the final character is moved
   over it and the word is shortened by one.
   NOTE(review): this indexes cur_char-2 and cur_char-1; confirm a guard
   for cur_char < 2 exists in the lines not shown. */
145 if( saved_chars
[cur_char
-2] == '\'' || saved_chars
[cur_char
-2] == '\"' )
147 saved_chars
[cur_char
-2] = saved_chars
[cur_char
-1];
148 saved_chars
[cur_char
-1] = '\0';
/* NOTE(review): if the loop can end with cur_char == KW_SIZE, the stores
   below index one past saved_chars; confirm against the hidden lines. */
151 saved_chars
[cur_char
] = '\0';
154 saved_chars
[cur_char
] = '\0';
156 } /* END get_chars */
/* get_word: fetch the next word from buffer into word.
     buffer - text being scanned
     size   - passed through unchanged to skip_spaces() and get_chars()
     word   - receives the word; always NUL-terminated at KW_SIZE-1
     curpos - in/out index into buffer, moved on by the helpers
   Leading separators are skipped first, then the word's characters are
   collected. */
158 void get_word( char buffer
[], int size
, char word
[KW_SIZE
], int *curpos
) {
159 /* get a word from a page */
161 /*skip any number of spaces/irrelevant chars in rows/cols*/
162 skip_spaces( buffer
, size
, curpos
);
164 /* now get each character from the line */
165 get_chars( buffer
, size
, curpos
, word
);
/* force termination in case the word filled the whole slot */
166 word
[KW_SIZE
-1] = '\0'; /*max size keyword to record */
/* check_code: classify word as a topic-file tag or an ordinary term.
     word - word to classify
     code - out: SAVETERM unless word begins with one of the tags below
   Tags are recognised by prefix (strncmp), so trailing characters after
   the tag text do not prevent a match.  A "<num>" prefix yields
   TOPIC_NUMBER.
   NOTE(review): the codes assigned for "<top>", "<title>", "<desc>",
   "<narr>" and "</top>" are not visible here. */
170 void check_code( char word
[KW_SIZE
], int *code
) {
172 *code
= SAVETERM
; /* assume its a saveable term */
174 if( strncmp( word
,"<top>",5) == 0 )
176 if( strncmp( word
,"<num>",5) == 0 )
177 *code
= TOPIC_NUMBER
;
178 if( strncmp( word
,"<title>",7) == 0 )
180 if( strncmp( word
,"<desc>",6) == 0 )
182 if( strncmp( word
,"<narr>",6) == 0 )
184 if( strncmp( word
,"</top>",6) == 0 )
187 } /* END check_code */
/* clean_word: remove every non-alphanumeric character from word, in place.
 *
 *   word - NUL-terminated string to clean; on return it holds only its
 *          alphanumeric characters, in their original order.  The
 *          parameter is written as char[] - the same type as the
 *          char[KW_SIZE] spelling used elsewhere in this file.
 *
 * The string is compacted in a single pass with a read and a write
 * cursor, so no scratch buffer is needed and a word of any length is
 * handled safely.
 */
void clean_word( char word[] ) {

  char *out = word;

  for( const char *in = word; *in != '\0'; ++in ) {
    /* the <ctype.h> classifiers need an unsigned char value */
    if( isalnum( static_cast<unsigned char>( *in ) ) ) *out++ = *in;
  }
  *out = '\0';

} /* END clean_word */
/* find_field_status: decode the topic-field selection string.
     arg       - string of field letters
     use_title - out: set to 1 when arg contains 't'
     use_desc  - out: set to 1 when arg contains 'd'
     use_narr  - out: set to 1 when arg contains 'n'
   Each selected field is reported on stderr.
   NOTE(review): the switch header, whether each case ends in a break, the
   initial values of the three flags, and the condition guarding the error
   message are not visible here. */
207 void find_field_status( char arg
[], int *use_title
, int *use_desc
, int *use_narr
) {
208 /* find out what type of search you want the user wants */
213 for( i
=0; i
< strlen(arg
); i
++ ) {
215 case 't' : *use_title
=1; foundone
=1;
217 case 'd' : *use_desc
=1; foundone
=1;
219 case 'n' : *use_narr
=1; foundone
=1;
/* tell the user which fields were selected */
226 if(*use_title
) fprintf(stderr
,"FIELD) you have requested the use of the title field\n");
227 if(*use_desc
) fprintf(stderr
,"FIELD) you have requested the use of the description field\n");
228 if(*use_narr
) fprintf(stderr
,"FIELD) you have requested the use of the narrative field\n");
/* presumably reached when no valid field letter was found - confirm */
231 fprintf(stderr
,"ERROR - invalid TREC field entered [%s] please try again\n", arg
);
234 } /* END find_field_status */
/* Init_QUERY: reset a query before it is filled.
     q - query to reset
   Every one of the MAX_KEYWORDS term slots is passed to Init_str() with
   its size, KW_SIZE.
   NOTE(review): any resetting of weights, nwords and topic_no is in lines
   not visible here. */
236 void Init_QUERY( QUERY
*q
) {
237 /* initialise a query */
241 for( i
=0; i
< MAX_KEYWORDS
; i
++ ) {
242 Init_str( q
->term_set
[i
], KW_SIZE
);
247 } /* END Init_QUERY */
/* Save_QUERY: write one query to the query file and echo it to stderr.
     qd - open output stream for the query file
     q  - query to write
   The file receives the topic number followed by the first nwords terms,
   each followed by a space; stderr receives the same inside
   "got query {...}".
   NOTE(review): q is passed by value, so each call copies the whole QUERY
   structure, term_set included; a pointer or const reference would avoid
   that copy.
   NOTE(review): whatever ends the line in qd is not visible here. */
249 void Save_QUERY( FILE *qd
, QUERY q
) {
250 /* save query details to disk */
253 fprintf( qd
, "%d ", q
.topic_no
);
254 fprintf( stderr
, "got query {%d ", q
.topic_no
);
255 for( i
=0; i
< q
.nwords
; i
++ ) {
256 fprintf( qd
, "%s ", q
.term_set
[i
] );
257 fprintf(stderr
, "%s ", q
.term_set
[i
] );
260 fprintf( stderr
, "}\n" );
262 } /* END Save_QUERY */
/* Isin_query: test whether word is already one of the query's terms.
     word - term to look for
     q    - query whose first nwords terms are searched
   The search is a linear scan using exact, case-sensitive strcmp() and
   stops at the first match.
   NOTE(review): the declaration of found and the return statement are not
   visible here. */
265 int Isin_query( char word
[KW_SIZE
], QUERY
*q
) {
266 /* returns 1 if word is in query, 0 otherwise */
271 for( i
=0; i
< q
->nwords
&& !found
; i
++ ) {
272 if( strcmp( word
, q
->term_set
[i
] ) == 0 ) found
=1;
277 } /* END Isin_query */
279 #define NTOPICSTOPS 66
281 char topic_stops
[NTOPICSTOPS
][KW_SIZE
] = {
282 "document", "documents","discuss","discussed","mention","mentions","cite","cited","include","includes",
283 "included","report","reports","describe","describes","described","relevant","relevance","concern","concerns",
284 "concerned","reveal","specify","specifying","specified","specifics","announce","announces","announced",
285 "announcing","event","provided","occur","present","contain","contains","containing","example","examples",
286 "eg","ie","instance","instances","consider","considered","indicative","note","notes","noted","quote",
287 "quotes","substantive","unless","find","finds", "identify", "identified","identfies", "evidence",
288 "continue","define","determine", "determined", "discussing", "references", "reference" };
/* stopped_word: decide whether word should be excluded from a query.
     word     - term to test
     sw_store - stop-word store handed on to IsStopWord()
   The topic_stops table is scanned first with exact strcmp(); only when
   that finds nothing is IsStopWord() consulted.
   NOTE(review): sw_store is received by value; whether that copy is cheap
   depends on the SW_STORE type, which is declared elsewhere.
   NOTE(review): the declaration of found and the return statement are not
   visible here. */
290 int stopped_word( char word
[KW_SIZE
], SW_STORE sw_store
) {
291 /* returns 1 if word is a stop word or frequent topic word, 0 otherwise */
296 /* check for frequent topic words */
297 for( i
=0; i
< NTOPICSTOPS
&& !found
; i
++ ) {
298 if( strcmp( word
, topic_stops
[i
] ) == 0 ) found
=1;
301 /* check for main stops */
302 if (!found
) found
= IsStopWord( sw_store
, word
);
308 void Save_Word( char word
[KW_SIZE
], QUERY
*q
, SW_STORE sw_store
) {
309 /* save a word in a query, iff it isn't already there */
311 if( !Isin_query( word
, q
) && !stopped_word( word
, sw_store
) ) {
312 strcpy( q
->term_set
[q
->nwords
], word
);
316 } /* END Save_Word */
/* create_queries: turn the topic file named in config into a query file.
     config - supplies the topic file, query file and stop-word file names,
                the topic fields to use and the number of terms to save
   The topic file is read into a local buffer, the buffer is split into
   words with get_word(), each word is classified with check_code(), and
   words from the requested fields are added to the current query with
   Save_Word().  Queries are written with Save_QUERY().
   NOTE(review): local declarations, the switch and case labels, and the
   handling after a failed fopen() are not visible here. */
318 void create_queries( CONFIG_TREC
& config
) {
/* NOTE(review): BIG_BUFFER bytes of automatic storage; the read loop below
   stops at BIG_BUFFER characters, so a longer topic file is only partly
   processed. */
328 char topics
[BIG_BUFFER
];
335 terms_to_save
= config
.get_nterms(); /* number of terms to save */
337 /* find out what type of search you want the user wants */
338 find_field_status( (char *) config
.get_topicfields().c_str(), &use_title
, &use_desc
, &use_narr
);
340 /* open the files for manipulation */
341 cout
<< "QUERY) topic file name is " << config
.get_topicfile().c_str() << endl
;
342 topic_fd
= fopen( config
.get_topicfile().c_str(), "r" );
/* NOTE(review): this message is about the topic file but prints
   get_queryfile(), and has no space on either side of the name. */
344 cout
<< "ERROR - can't open topic file" << config
.get_queryfile().c_str() << "for reading" << endl
;
/* Read the topic file one character at a time.
   NOTE(review): the loop is controlled by feof(), which only becomes true
   after a read has failed, so the value fgetc() returns at end of file is
   stored as well; no NUL terminator is visibly appended to topics. */
347 for( topic_size
=0; topic_size
< BIG_BUFFER
&& !feof(topic_fd
); topic_size
++ ){
348 topics
[topic_size
] = fgetc(topic_fd
);
352 query_fd
= fopen( config
.get_queryfile().c_str(), "w" );
354 cout
<< "ERROR - can't open file [" << config
.get_queryfile().c_str() << "] for writing" << endl
;
357 cout
<< "QUERY) query file name is: " << config
.get_queryfile().c_str() << endl
;
/* load the stop-word list used by Save_Word() */
359 Read_SW_File( (char *) config
.get_stopsfile().c_str(), &sw_store
);
/* main scan: one word per iteration until the buffered text is used up */
365 while( curpos
< topic_size
) {
367 Init_str( word
, KW_SIZE
);
/* NOTE(review): the size handed to get_word() is the buffer capacity,
   BIG_BUFFER, not the number of characters actually read, topic_size. */
368 get_word( topics
, BIG_BUFFER
, word
, &curpos
);
369 check_code( word
, &code
);
/* NOTE(review): the case labels that select between the actions below are
   not visible here; the grouping follows the inline comments only. */
374 cout
<< "No action" << endl
;
379 /* save the query and clean it */
380 Save_QUERY( query_fd
, q
);
/* topic number: skip one word, then convert the next one with atoi() */
387 get_word( topics
, BIG_BUFFER
, word
, &curpos
); // spin past Number
388 get_word( topics
, BIG_BUFFER
, word
, &curpos
);
389 q
.topic_no
= atoi(word
);
/* save_word is switched on when the field being entered was requested */
395 if( use_title
) save_word
= 1;
401 get_word( topics
, BIG_BUFFER
, word
, &curpos
); /* spin past Description: */
402 if( use_desc
) save_word
= 1;
408 get_word( topics
, BIG_BUFFER
, word
, &curpos
); /* spin past Narrative: */
409 if( use_narr
) save_word
= 1;
413 default : /* save the word if required */
415 if( save_word
) Save_Word( word
, &q
, sw_store
);
420 } /* END create_queries( */
422 int main(int argc
, char **argv
)
425 // only one parameter is required for this program
427 cout
<< "usage: " << argv
[0] << " <config file>" << endl
;
431 // Catch any Xapian::Error exceptions thrown
435 config
.setup_config( string(argv
[1]) );
436 config
.check_query_config();
438 create_queries( config
);
440 } catch(const Xapian::Error
&error
) {
441 cout
<< "Exception: " << error
.get_msg() << endl
;