1 /* index_utils.cc - utility functions for indexing testcase data
3 * Copyright (C) 2005,2007,2013 Olly Betts
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "index_utils.h"
24 #include "stringutils.h"
// Forward declaration: munge_term() is called from FileIndexer::index_to()
// before its definition later in this file.
33 static string
munge_term(const string
&term
);
35 /// Read a paragraph from stream @a input.
37 get_paragraph(istream
&input
)
// find_if() returning line.end() means no character of `line` satisfies
// C_isnotspace - presumably a blank or whitespace-only line (TODO confirm
// against the C_isnotspace definition in stringutils.h).
42 if (find_if(line
.begin(), line
.end(), C_isnotspace
) == line
.end())
/// Index paragraphs read from the input file(s) into @a db.
///
/// Each paragraph is fetched with get_paragraph(); values 0 to 13 and
/// stemmed term postings are set on `doc`, which is then passed to
/// db.add_document().
50 FileIndexer::index_to(Xapian::WritableDatabase
& db
)
// English stemmer, applied to every munged word before it is indexed.
52 Xapian::Stem
stemmer("english");
// Continue while filenames remain, or while the current stream is open and
// has not yet reached EOF.
54 while (file
!= end
|| (input
.is_open() && !input
.eof())) {
// The current stream is at EOF, so move on to the next file.
55 if (input
.eof()) next_file();
58 string para
= get_paragraph(input
);
61 // Value 0 contains all possible character values so we can check that
62 // none of them cause problems.
63 string
value0("X\0\0\0 \1\t"
64 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
65 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
66 "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
67 "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
68 "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
69 "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
70 "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
71 "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
72 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
73 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
74 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
75 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
76 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
77 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
78 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
79 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
// When the paragraph has a third character, it replaces the leading 'X' of
// value0.
81 if (para
.size() > 2) value0
[0] = para
[2];
83 doc
.add_value(0, value0
);
// Values 1 up to min(para.length(), 10), set in descending order: value i is
// the single character of para at offset i.  When i == para.length() the
// substr() call yields an empty string.
85 for (Xapian::valueno i
= min(para
.length(), size_t(10)); i
>= 1; --i
) {
86 doc
.add_value(i
, para
.substr(i
, 1));
88 // Value 11 is useful for tests of sorting
89 doc
.add_value(11, Xapian::sortable_serialise(para
.size()));
91 // Value 12 is useful for tests of collapsing
92 doc
.add_value(12, Xapian::sortable_serialise(para
.size() % 5));
94 // Value 13 contains the first 3 letters of the paragraph
95 doc
.add_value(13, para
.substr(0, 3));
// Split para into words and index them.  `pos` is pre-incremented for each
// posting added, so the first term gets position 1.
97 Xapian::termcount pos
= 0;
98 string::const_iterator word_end
= para
.begin();
99 // Need a const_iterator version of para.end() for find_if.
100 const string::const_iterator para_end
= para
.end();
101 while (word_end
!= para_end
) {
102 string::const_iterator word_start
;
// A word starts at the next character matching C_isnotspace and ends at the
// following character matching C_isspace (or at the end of para).
103 word_start
= find_if(word_end
, para_end
, C_isnotspace
);
104 word_end
= find_if(word_start
, para_end
, C_isspace
);
// Normalise the word with munge_term(), then stem the result.
105 string word
= stemmer(munge_term(string(word_start
, word_end
)));
// Words which end up empty are not indexed and do not consume a position.
106 if (!word
.empty()) doc
.add_posting(word
, ++pos
);
// Hand the completed document to the database.
109 db
.add_document(doc
);
113 // Strip unwanted characters, force to lower case, and handle \ escapes.
115 munge_term(const string
&term
)
// Walk the term one character at a time.
118 for (string::const_iterator i
= term
.begin(); i
!= term
.end(); ++i
) {
// Characters kept as-is are appended in lower case.
121 result
+= C_tolower(ch
);
// A backslash introduces an escape sequence.
122 else if (ch
== '\\') {
// Only decode an escape if a character is available at `i`.
124 if (i
!= term
.end()) {
// Single-character escapes: a literal backslash, NUL, newline, carriage
// return and tab respectively.
126 case '\\': ch
= '\\'; break;
127 case '0': ch
= '\0'; break;
128 case 'n': ch
= '\n'; break;
129 case 'r': ch
= '\r'; break;
130 case 't': ch
= '\t'; break;
132 // Check we can read the next two characters.
// `i - term.begin()` is the offset of the current character; fewer than two
// characters follow it when that offset is >= term.size() - 2.
133 if (size_t(i
- term
.begin()) >= term
.size() - 2) {
137 string::const_iterator j
= i
;
// Both characters must be hexadecimal digits.
140 if (!C_isxdigit(b
) || !C_isxdigit(c
)) {
// Combine the two hex digits into one byte: b is the high nibble and c the
// low nibble.
143 ch
= (hex_digit(b
) << 4) | hex_digit(c
);
156 FileIndexer::next_file()
158 if (input
.is_open()) {
160 // MSVC doesn't clear fail() on close() and re-open().
164 // Find the next non-empty filename.
165 while (file
!= end
&& (*file
).empty()) {
168 if (file
== end
) return;
171 if (!datadir
.empty()) {
173 bool need_slash
= true;
174 for (char dir_sep
: DIR_SEPS_LIST
) {
175 if (filename
.back() == dir_sep
) {
180 if (need_slash
) filename
+= '/';
185 input
.open(filename
.c_str());
186 // Need to check is_open() - just using operator! fails with MSVC.
187 if (!input
.is_open()) {
188 string msg
= "Can't read file '";
190 msg
+= "' for indexing (";
191 msg
+= strerror(errno
);