xapian-core/tests/harness/index_utils.cc

   1 /* index_utils.cc - utility functions for indexing testcase data
   2  *
   3  * Copyright (C) 2005,2007,2013 Olly Betts
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18  */
  19
  20 #include <config.h>
  21
  22 #include "index_utils.h"
  23
  24 #include "stringutils.h"
  25
  26 #include <algorithm>
  27 #include <cerrno>
  28 #include <cstring>
  29 #include <fstream>
  30
  31 using namespace std;
  32
  33 static string munge_term(const string &term);
  34
  35 /// Read a paragraph from stream @a input.
  36 static string
  37 get_paragraph(istream &input)
  38 {
  39     string para, line;
  40     while (true) {
  41         getline(input, line);
  42         if (find_if(line.begin(), line.end(), C_isnotspace) == line.end())
  43             return para;
  44         para += line;
  45         para += '\n';
  46     }
  47 }
  48
  49 void
  50 FileIndexer::index_to(Xapian::WritableDatabase & db)
  51 {
  52     Xapian::Stem stemmer("english");
  53
  54     while (file != end || (input.is_open() && !input.eof())) {
  55         if (input.eof()) next_file();
  56
  57         Xapian::Document doc;
  58         string para = get_paragraph(input);
  59         doc.set_data(para);
  60
  61         // Value 0 contains all possible character values so we can check that
  62         // none of them cause problems.
  63         string value0("X\0\0\0 \1\t"
  64             "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
  65             "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
  66             "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
  67             "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
  68             "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
  69             "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
  70             "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
  71             "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
  72             "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
  73             "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
  74             "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
  75             "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
  76             "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
  77             "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
  78             "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
  79             "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
  80             7 + 256);
  81         if (para.size() > 2) value0[0] = para[2];
  82         value0 += para;
  83         doc.add_value(0, value0);
  84
  85         for (Xapian::valueno i = min(para.length(), size_t(10)); i >= 1; --i) {
  86             doc.add_value(i, para.substr(i, 1));
  87         }
  88         // Value 11 is useful for tests of sorting
  89         doc.add_value(11, Xapian::sortable_serialise(para.size()));
  90
  91         // Value 12 is useful for tests of collapsing
  92         doc.add_value(12, Xapian::sortable_serialise(para.size() % 5));
  93
  94         // Value 13 contains the first 3 letters of the paragraph
  95         doc.add_value(13, para.substr(0, 3));
  96
  97         Xapian::termcount pos = 0;
  98         string::const_iterator word_end = para.begin();
  99         // Need a const_iterator version of para.end() for find_if.
 100         const string::const_iterator para_end = para.end();
 101         while (word_end != para_end) {
 102             string::const_iterator word_start;
 103             word_start = find_if(word_end, para_end, C_isnotspace);
 104             word_end = find_if(word_start, para_end, C_isspace);
 105             string word = stemmer(munge_term(string(word_start, word_end)));
 106             if (!word.empty()) doc.add_posting(word, ++pos);
 107         }
 108
 109         db.add_document(doc);
 110     }
 111 }
 112
 113 // Strip unwanted characters, force to lower case, and handle \ escapes.
 114 static string
 115 munge_term(const string &term)
 116 {
 117     string result;
 118     for (string::const_iterator i = term.begin(); i != term.end(); ++i) {
 119         char ch = *i;
 120         if (C_isalnum(ch))
 121             result += C_tolower(ch);
 122         else if (ch == '\\') {
 123             ++i;
 124             if (i != term.end()) {
 125                 switch (*i) {
 126                     case '\\': ch = '\\'; break;
 127                     case '0': ch = '\0'; break;
 128                     case 'n': ch = '\n'; break;
 129                     case 'r': ch = '\r'; break;
 130                     case 't': ch = '\t'; break;
 131                     case 'x': {
 132                         // Check we can read the next two characters.
 133                         if (size_t(i - term.begin()) >= term.size() - 2) {
 134                             --i;
 135                             break;
 136                         }
 137                         string::const_iterator j = i;
 138                         char b = *++i;
 139                         char c = *++i;
 140                         if (!C_isxdigit(b) || !C_isxdigit(c)) {
 141                             i = j - 1;
 142                         } else {
 143                             ch = (hex_digit(b) << 4) | hex_digit(c);
 144                         }
 145                         break;
 146                     }
 147                 }
 148             }
 149             result += ch;
 150         }
 151     }
 152     return result;
 153 }
 154
 155 void
 156 FileIndexer::next_file()
 157 {
 158     if (input.is_open()) {
 159         input.close();
 160         // MSVC doesn't clear fail() on close() and re-open().
 161         input.clear();
 162     }
 163
 164     // Find the next non-empty filename.
 165     while (file != end && (*file).empty()) {
 166         ++file;
 167     }
 168     if (file == end) return;
 169
 170     string filename;
 171     if (!datadir.empty()) {
 172         filename = datadir;
 173         bool need_slash = true;
 174         for (char dir_sep : DIR_SEPS_LIST) {
 175             if (filename.back() == dir_sep) {
 176                 need_slash = false;
 177                 break;
 178             }
 179         }
 180         if (need_slash) filename += '/';
 181     }
 182     filename += *file++;
 183     filename += ".txt";
 184
 185     input.open(filename.c_str());
 186     // Need to check is_open() - just using operator! fails with MSVC.
 187     if (!input.is_open()) {
 188         string msg = "Can't read file '";
 189         msg += filename;
 190         msg += "' for indexing (";
 191         msg += strerror(errno);
 192         msg += ')';
 193         throw msg;
 194     }
 195 }