Add README
[xapian-trec.git] / trec_index.cc
blobe7274a01a214ed71b9b44ea7c3f332c6b0e6a9a9
1 /* trec_index.cc: indexer for trec experiments
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2003 Olly Betts
6 * Copyright 2003 Andy MacFarlane, City University
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * USA
22 * -----END-LICENCE-----
25 #include <xapian.h>
26 #include <algorithm>
27 #include <iostream>
28 #include <string>
30 #include <sys/types.h>
31 #include <dirent.h>
32 #include <sys/stat.h>
33 #include <unistd.h>
34 #include <getopt.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <fcntl.h>
38 #include "htmlparse.h"
39 #include "stopword.h"
40 #include "config_file.h"
41 #include "indextext.h"
42 #include "P98_gzip.h"
43 #include <time.h>
44 #include "timer.h"
46 using namespace Xapian;
47 using namespace std;
// Marker that get_document() searches for to detect the last line of a
// document inside the decompressed bundle.
49 #define ENDDOC "</DOC>"
// NOTE(review): MAX_URL_LENGTH is not referenced anywhere in the visible part
// of this file - confirm whether it is still needed.
51 static const unsigned int MAX_URL_LENGTH = 240;
52 // chamber (from hashbld) is where the input bundles are decompressed.
// CHAMBER_SIZE is the capacity in bytes handed to decompress_bundle(); the
// buffer is a single file-scope array that index_file() refills per file.
53 #define CHAMBER_SIZE 30000000
54 char chamber[CHAMBER_SIZE];
// Running totals updated by index_file() and reported by main().
// ttextsize is accumulated as (decompressed bytes / 1048576).
56 float ttextsize=0; // total amount of text in mb indexed
57 int totaldocs=0; // total number of documents indexed
59 class SGMLParser : public HtmlParser {
60 public:
61 bool in_script_tag;
62 bool in_style_tag;
63 string title, sample, keywords, dump;
64 bool indexing_allowed;
65 void process_text(const string &text);
66 void opening_tag(const string &tag, const map<string,string> &p);
67 void closing_tag(const string &tag);
68 SGMLParser() :
69 in_script_tag(false),
70 in_style_tag(false),
71 indexing_allowed(true) { }
74 void SGMLParser::process_text(const string &text) {
75 // some tags are meaningful mid-word so this is simplistic at best...
77 if (!in_script_tag && !in_style_tag) {
78 string::size_type firstchar = text.find_first_not_of(" \t\n\r");
79 if (firstchar != string::npos) {
80 dump += text.substr(firstchar);
81 dump += " ";
86 void
87 SGMLParser::opening_tag(const string &tag, const map<string,string> &p) {
90 if (tag == "meta") {
91 map<string, string>::const_iterator i, j;
92 if ((i = p.find("content")) != p.end()) {
93 if ((j = p.find("name")) != p.end()) {
94 string name = j->second;
95 lowercase_term(name);
96 if (name == "description") {
97 if (sample.empty()) {
98 sample = i->second;
99 decode_entities(sample);
101 } else if (name == "keywords") {
102 if (!keywords.empty()) keywords += ' ';
103 string tmp = i->second;
104 decode_entities(tmp);
105 keywords += tmp;
106 } else if (name == "robots") {
107 string val = i->second;
108 decode_entities(val);
109 lowercase_term(val);
110 if (val.find("none") != string::npos ||
111 val.find("noindex") != string::npos) {
112 //indexing_allowed = false;
113 //cout << "HELP!) found a robot tag which is difficult to index :(" << endl;
118 } else if (tag == "script") {
119 in_script_tag = true;
120 } else if (tag == "style") {
121 in_style_tag = true;
122 } else if (tag == "body") {
123 dump = "";
127 void
128 SGMLParser::closing_tag(const string &tag)
130 //cout << "closing_tag) : " << tag << endl;
131 if (tag == "docno") {
132 if( dump.size() < 30 ) // nasty hack to get round problems on terabyte track with robot tags
133 title = dump;
134 else
135 title = "DOCNO-ERROR";
136 dump = "";
137 } else if (tag == "script") {
138 in_script_tag = false;
139 } else if (tag == "style") {
140 in_style_tag = false;
141 } // END if
144 string getline( int & curpos, int uncolen ) {
146 string line;
147 for( ; curpos < uncolen && chamber[curpos] !='\n'; curpos++ )
148 line += chamber[curpos];
149 curpos++;
150 return line;
152 } // END getline
154 string get_document( int & curpos, int uncolen ) {
155 // alternative version of get document
157 int end_found=0;
158 string document;
159 while( !end_found ) {
160 string line = getline( curpos, uncolen );
161 document += line;
162 string::size_type pos = line.find(ENDDOC,0);
163 if( pos != string::npos ) end_found=1;
164 } // END while
166 return document;
168 } // END get_document
170 Xapian::Document remove_stopwords( Xapian::Document doc, SW_STORE & sw_store ) {
171 // take a list of keywords and remove
173 Xapian::Document wordlist;
174 char word[100];
176 for( TermIterator t = doc.termlist_begin(); t != doc.termlist_end(); t++ ) {
177 for( int i=0; i < (*t).size(); i++ ) word[i] = (*t)[i];
178 if(!IsStopWord( sw_store, word )) wordlist.add_term( *t );
180 } // END for
182 return wordlist;
184 } // END remove_stopwords
186 Xapian::Document stem_document( Xapian::Document & doc ) {
188 Stem stemmer("english");
189 Xapian::Document wordlist;
191 for( TermIterator t = doc.termlist_begin(); t != doc.termlist_end(); t++ ) {
192 wordlist.add_term(stemmer(*t) );
194 } // END for
196 return wordlist;
199 } // END stem_document
// True when c is one of the suffix characters '+' or '-'.
inline static bool
p_plusminus(unsigned int c)
{
    switch (c) {
	case '+':
	case '-':
	    return true;
	default:
	    return false;
    }
}
// Split the text `s` into terms and add them to `doc`.
//
//  s        text to tokenise
//  doc      document that receives the terms
//  stemmer  applied to each (lowercased) term before it is added
//  wdfinc   within-document-frequency increment passed to add_posting() /
//           add_term()
//  prefix   prepended to every stemmed term; raw terms use prefix + 'R'
//           (with a ':' inserted first for multi-character prefixes)
//  pos      position of the first term, or (Xapian::termpos)-1 to add terms
//           without positional information
//
// Returns `pos` as advanced by the stemmed terms that were added.
207 Xapian::termpos
208 index_text(const string &s, Xapian::Document &doc, Xapian::Stem &stemmer,
209 Xapian::termcount wdfinc, const string &prefix,
210 Xapian::termpos pos)
212 string rprefix = prefix;
213 // If we're using a multi-character prefix, make sure to add a colon when
214 // generating raw (R) terms as otherwise XFOO + Rterm will collide with
215 // XFOOR + term
216 if (rprefix.size() > 1 && rprefix[rprefix.size() - 1] != ':')
217 rprefix += ':';
218 rprefix += 'R';
// The text is walked through AccentNormalisingItor, which is declared outside
// this file (presumably it folds accented characters - confirm in its header).
220 AccentNormalisingItor j(s.begin());
221 const AccentNormalisingItor s_end(s.end());
222 while (true) {
// Skip forward to the first alphanumeric character; stop at end of input.
223 AccentNormalisingItor first = j;
224 while (first != s_end && !isalnum(*first)) ++first;
225 if (first == s_end) break;
226 AccentNormalisingItor last;
227 string term;
// Acronym candidate: upper-case letters each separated by a single '.'
// (e.g. "U.S.A") are collected without the dots.  The candidate is dropped
// when fewer than two letters were collected or when an alphanumeric
// character follows directly.
228 if (isupper(*first)) {
229 j = first;
230 term = *j;
231 while (++j != s_end && *j == '.' && ++j != s_end && isupper(*j)) {
232 term += *j;
234 if (term.length() < 2 || (j != s_end && isalnum(*j))) {
235 term = "";
237 last = j;
// Ordinary term: a run of alphanumerics.  An '&' is kept as part of the term
// only when an alphanumeric follows it immediately.
239 if (term.empty()) {
240 j = first;
241 while (isalnum(*j)) {
242 term += *j;
243 ++j;
244 if (j == s_end) break;
245 if (*j == '&') {
246 AccentNormalisingItor next = j;
247 ++next;
248 if (next == s_end || !isalnum(*next)) break;
249 term += '&';
250 j = next;
// Trailing '+' / '-' characters are appended to the term, but the term is cut
// back to its length before them when an alphanumeric follows the suffix.
253 string::size_type len = term.length();
254 last = j;
255 while (j != s_end && p_plusminus(*j)) {
256 term += *j;
257 ++j;
259 if (j != s_end && isalnum(*j)) {
260 term.resize(len);
261 } else {
262 last = j;
// Over-long terms are not indexed.  NOTE(review): the closing brace of this
// test is not visible here; presumably it encloses the rest of the loop body.
265 if (term.length() <= MAX_PROB_TERM_LENGTH) {
266 lowercase_term(term);
// A term that began with an upper-case character also gets an unstemmed raw
// term under rprefix; pos is not advanced for it.
267 if (isupper(*first)) {
268 if (pos != static_cast<Xapian::termpos>(-1)
269 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
271 doc.add_posting(rprefix + term, pos, wdfinc);
272 } else {
273 doc.add_term(rprefix + term, wdfinc);
// The stemmed form is added under prefix; pos advances only when positions
// are in use.
277 term = stemmer(term);
278 if (pos != static_cast<Xapian::termpos>(-1)
279 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
281 doc.add_posting(prefix + term, pos++, wdfinc);
282 } else {
283 doc.add_term(prefix + term, wdfinc);
287 return pos;
290 static void index_file( const string &file,
291 CONFIG_TREC &config,
292 Xapian::WritableDatabase & db,
293 SW_STORE sw_store ) {
294 // index a file containing a number of SGML/HTML documents
296 if (file.empty()) {
297 cout << "can't read \"" << file << "\" - skipping\n";
298 return;
299 } //else cout << "Indexing [" << file << "]" << endl;
301 int curpos=0;
302 Xapian::Stem stemmer( config.get_language() );
303 int uncolen;
305 uncolen = decompress_bundle( (u_char*)file.c_str(), (u_char *) chamber, CHAMBER_SIZE);
306 //cout << "DEBUG) decompresses file done, size = " << uncolen << endl;
308 // accumulate the text size read in
309 ttextsize += ( (float) uncolen / 1048576.0);
311 while( curpos < uncolen ) {
313 // get a document
314 string rawdoc = get_document( curpos, uncolen );
315 //cout << "DEBUG) got a document, size = " << rawdoc.size() <<
316 // ", curpos = " << curpos << endl;
318 if( rawdoc.size() > 1 ) {
320 // parse the document for the data
321 SGMLParser p;
322 p.parse_html(rawdoc);
324 // Add postings for terms to the document
325 Xapian::Document doc;
326 Xapian::termpos pos = 1;
327 pos = index_text( p.title, doc, stemmer, pos);
328 pos = index_text( p.keywords, doc, stemmer, pos + 1);
330 // index the document
331 Xapian::Document doc_stopsremoved = remove_stopwords( doc, sw_store );
332 Xapian::Document stemdoc = stem_document( doc_stopsremoved );
333 //cout << "DOCID = " << p.title << endl;
334 stemdoc.set_data(p.title); // set the data
335 db.add_document(stemdoc);
337 // record the total no of docs done
338 totaldocs++;
339 //if( (totaldocs % 10000) == 0 ) cout << "DOCUMENTS PROCESSED) " << totaldocs << endl;
340 } // END if
342 } // END while
344 } // END index_file
346 static void index_directory( const string &dir, CONFIG_TREC & config, Xapian::WritableDatabase & db,
347 SW_STORE sw_store )
349 DIR *d;
350 struct dirent *ent;
351 string path = dir;
353 //cout << "[Entering directory " << dir << "]" << endl;
355 d = opendir(path.c_str());
356 if (d == NULL) {
357 cout << "Can't open directory \"" << path << "\" - skipping\n";
358 return;
360 while ((ent = readdir(d)) != NULL) {
361 struct stat statbuf;
362 // ".", "..", and other hidden files
363 if (ent->d_name[0] == '.') continue;
364 string file = dir;
365 if (!file.empty() && file[file.size() - 1] != '/') file += '/';
366 file += ent->d_name;
367 if (stat(file.c_str(), &statbuf) == -1) {
368 cout << "Can't stat \"" << file << "\" - skipping\n";
369 continue;
370 } // END if
372 if (S_ISDIR(statbuf.st_mode)) {
373 // file is a directory
374 try {
375 index_directory( file, config, db, sw_store );
377 catch (...) {
378 cout << "Caught unknown exception in index_directory, rethrowing" << endl;
379 throw;
381 continue;
382 } // END if
384 if (S_ISREG(statbuf.st_mode)) {
385 // file is a regular indexable text file
386 string ext;
387 string::size_type dot = file.find_last_of('.');
388 if (dot != string::npos) ext = file.substr(dot + 1);
390 index_file( file, config, db, sw_store );
391 continue;
392 } // END if
394 cout << "Not a regular file \"" << file << "\" - skipping\n";
396 closedir(d);
398 } // END index_directory
400 int main(int argc, char **argv)
403 // check for proper useage of program
404 if(argc < 2) {
405 cout << "usage: " << argv[0] << " <config file>" << endl;
406 exit(1);
407 } // END if
409 CONFIG_TREC trec_config;
410 trec_config.setup_config( string(argv[1]) );
412 if( !trec_config.check_index_config() ) {
413 cout << "ERROR - configure file invalid, pls check" << endl;
414 exit(1);
417 SW_STORE sw_store;
418 string stopsfilename = trec_config.get_stopsfile();
419 Read_SW_File( (char *) stopsfilename.c_str(), &sw_store );
421 // Catch any Xapian::Error exceptions thrown
422 try {
423 // Make the database
424 Xapian::WritableDatabase db(Xapian::Flint::open(trec_config.get_db().c_str(), Xapian::DB_CREATE_OR_OPEN));
426 struct timeval start_time, finish_time, timelapse; /* timing variables */
428 // start the timer
429 gettimeofday( &start_time, 0 );
431 // index the text collection
432 index_directory( trec_config.get_textfile(), trec_config, db, sw_store );
433 db.flush();
435 // start the timer
436 gettimeofday( &finish_time, 0 );
438 // print the total time, and average time per query -
439 diff_time( finish_time, start_time, &timelapse );
440 cout << "Total time for " << totaldocs << " documents is " << time_real( timelapse ) << " secs, text size = " << ttextsize
441 << " mb" << endl;
442 cout << "Total number of documents in the database is now " << db.get_doccount() << " docs" << endl;
444 } catch (const Xapian::Error &e) {
445 cout << "Exception: " << e.get_msg() << endl;
446 return 1;
447 } catch (const string &s) {
448 cout << "Exception: " << s << endl;
449 return 1;
450 } catch (const char *s) {
451 cout << "Exception: " << s << endl;
452 return 1;
453 } catch (...) {
454 cout << "Caught unknown exception" << endl;
455 return 1;
456 } // END catch
458 } // END main