Add README
[xapian-trec.git] / stopword.cc
blob1bab8e9fa8d76f350987275e42c847b301c556c8
/* stopword.cc: stop word manipulation routines
 *
 * ----START-LICENCE----
 * Copyright 2003 Andy MacFarlane, City University
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 * -----END-LICENCE-----
 */
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <ctype.h>
29 #include "stopword.h"
/* Constants private to the stop word routines. */
#define LSTRING   80    /* a long string of characters */
#define TRUE      1     /* a condition is true */
#define FALSE     0     /* a condition is false */
#define IDENTICAL 0     /* two words are equal in strcmp */
#define LATER     1     /* one word is lexicographically later than another */
#define EARLIER   (-1)  /* one word is lexicographically earlier than another;
                           parenthesised so the expansion is safe inside
                           larger expressions */
#define NL        '\n'  /* newline marker */
#define KW_SIZE   80    /* maximum size of a keyword */
/*
 * Init_str: clear a character buffer.
 *
 *   str  - buffer to clear
 *   size - number of bytes of str to set to '\0'
 *
 * A NULL buffer or a non-positive size is a no-op, so the call can never
 * write through a null pointer or hand a negative length to memset.
 */
void Init_str( char str[], int size ) {

    if( str == NULL || size <= 0 )
        return;

    memset( str, '\0', (size_t) size );

} /* END Init_str */
50 void Read_Extra_SWord( FILE *f_id, char word[KW_SIZE] ) {
51 /* read a stop word from the extra stop word file*/
53 int i;
54 char c; /* character read in */
56 /*initialize word*/
57 Init_str( word, KW_SIZE );
59 /* get character from file until newline or end of file found */
60 for( i=0; i < KW_SIZE && (c=fgetc(f_id)) != EOF && c != NL; i++ )
61 word[i] = c;
62 /* END for */
64 } /*END Read_Extra_SWord */
67 void Read_SW_File( char sw_file[], SW_STORE * sw_store ) {
68 /* read stop word file into stop word store
69 version which goes directly to the file, with need for specifiying dir */
71 FILE *swf_id; /* file id for stopword file */
72 int i;
73 char sw_fname[LSTRING]; /* full path and name of sw file */
75 sw_store->nstops = 0; /* init just in case! */
77 strcpy( sw_fname, sw_file );
79 swf_id = fopen( sw_fname, "r" ); /*open stop word file*/
80 if( !swf_id )
82 fprintf(stderr,"ERROR - can't open the extra stop word file [%s]\n",sw_fname );
83 exit( 0 );
84 } /* END if */
86 /*read stop words into stop word store*/
87 for( i=0; i < MAX_STOPWORDS && !feof(swf_id); i++) {
88 Read_Extra_SWord( swf_id, sw_store->words[i] );
89 if(!feof(swf_id))
91 sw_store->nstops++;
92 //fprintf(stderr,"DEBUG) sw(%d) %s\n", i,
93 // sw_store->words[i] );
94 } /* END if */
95 } /*END for*/
96 //fprintf(stderr,"STOP_WORDS) read in %d stop words\n", sw_store->nstops );
98 fclose( swf_id ); /*close down stop word file*/
100 } /* END Read_SW_File */
102 int IsStopWord( SW_STORE sw_store, char word[KW_SIZE] ) {
103 /* see if word is a extra stop word / stop integer during iterative search or
104 relevance feedback process */
106 int found = FALSE; /* word found status */
107 int low = 0; /* min position of search */
108 int high; /* max position of search */
109 int middle; /* middle position of current search */
110 int result; /* result of compare between two words */
112 high = sw_store.nstops;
114 if( isdigit(word[0]) )
115 { /* if its a digit don't index it */
116 found = TRUE;
118 else /* its a word, so compare it with the full stop word list - using binary chop */
120 while( low <= high && !found )
122 middle = (high + low) / 2;
124 result = strcmp( word, sw_store.words[middle] );
125 //fprintf(stderr,"COMPARE) [%s] with sw[%s], RESULT=%d\n", word, sw_store.words[middle], result );
126 if (result == IDENTICAL)
127 found = 1;
128 else if (result >= LATER)
129 low = middle + 1;
130 else
131 high = middle - 1;
132 /*END if*/
133 } /*END while*/
134 } /* END if */
135 //fprintf(stderr,"RESULT) %d\n", found );
137 return found; /* result of search */
139 } /* END IsStopWord */