/* stopword.cc: stop word manipulation routines
 *
 * ----START-LICENCE----
 * Copyright 2003 Andy MacFarlane, City University
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * -----END-LICENCE-----
 */
#define LSTRING 80      /* A Long String of characters */
#define TRUE 1          /* a condition is true */
#define FALSE 0         /* a condition is false */
#define IDENTICAL 0     /* two words are equal in strcmp */
#define LATER 1         /* 1 word is lexicographically later than another */
#define EARLIER (-1)    /* 1 word is lexicographically earlier than another;
                           parenthesized so the expansion stays a single operand */
#define NL '\n'         /* newline marker */
#define KW_SIZE 80      /* maximum size of a keyword */
void Init_str( char str[], int size ) {
/* Initialise a string buffer by setting its first `size` characters to '\0'.

   str  - buffer to clear; a null pointer is ignored
   size - number of characters to clear; zero or negative clears nothing

   NOTE(review): the loop body was not visible in the reviewed text; it is
   assumed to NUL-fill the buffer, as the "initialise a string" comment
   suggests - confirm against the upstream file. */

    int i;

    if( !str )
        return;

    for( i = 0; i < size; i++ )
        str[i] = '\0';

} /* END Init_str */
50 void Read_Extra_SWord( FILE *f_id
, char word
[KW_SIZE
] ) {
51 /* read a stop word from the extra stop word file*/
54 char c
; /* character read in */
57 Init_str( word
, KW_SIZE
);
59 /* get character from file until newline or end of file found */
60 for( i
=0; i
< KW_SIZE
&& (c
=fgetc(f_id
)) != EOF
&& c
!= NL
; i
++ )
64 } /*END Read_Extra_SWord */
67 void Read_SW_File( char sw_file
[], SW_STORE
* sw_store
) {
68 /* read stop word file into stop word store
69 version which goes directly to the file, with need for specifiying dir */
71 FILE *swf_id
; /* file id for stopword file */
73 char sw_fname
[LSTRING
]; /* full path and name of sw file */
75 sw_store
->nstops
= 0; /* init just in case! */
77 strcpy( sw_fname
, sw_file
);
79 swf_id
= fopen( sw_fname
, "r" ); /*open stop word file*/
82 fprintf(stderr
,"ERROR - can't open the extra stop word file [%s]\n",sw_fname
);
86 /*read stop words into stop word store*/
87 for( i
=0; i
< MAX_STOPWORDS
&& !feof(swf_id
); i
++) {
88 Read_Extra_SWord( swf_id
, sw_store
->words
[i
] );
92 //fprintf(stderr,"DEBUG) sw(%d) %s\n", i,
93 // sw_store->words[i] );
96 //fprintf(stderr,"STOP_WORDS) read in %d stop words\n", sw_store->nstops );
98 fclose( swf_id
); /*close down stop word file*/
100 } /* END Read_SW_File */
102 int IsStopWord( SW_STORE sw_store
, char word
[KW_SIZE
] ) {
103 /* see if word is a extra stop word / stop integer during iterative search or
104 relevance feedback process */
106 int found
= FALSE
; /* word found status */
107 int low
= 0; /* min position of search */
108 int high
; /* max position of search */
109 int middle
; /* middle position of current search */
110 int result
; /* result of compare between two words */
112 high
= sw_store
.nstops
;
114 if( isdigit(word
[0]) )
115 { /* if its a digit don't index it */
118 else /* its a word, so compare it with the full stop word list - using binary chop */
120 while( low
<= high
&& !found
)
122 middle
= (high
+ low
) / 2;
124 result
= strcmp( word
, sw_store
.words
[middle
] );
125 //fprintf(stderr,"COMPARE) [%s] with sw[%s], RESULT=%d\n", word, sw_store.words[middle], result );
126 if (result
== IDENTICAL
)
128 else if (result
>= LATER
)
135 //fprintf(stderr,"RESULT) %d\n", found );
137 return found
; /* result of search */
139 } /* END IsStopWord */