xapian-applications/omega/scriptindex.cc

   1 /* scriptindex.cc
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 Sam Liddicott
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include <xapian.h>
  27
  28 #include <algorithm>
  29 #include <fstream>
  30 #include <iostream>
  31 #include <list>
  32 #include <map>
  33 #include <string>
  34 #include <vector>
  35 #include <cstring>
  36
  37 #include <cstdlib>
  38 #include "safeerrno.h"
  39 #include <cstdio>
  40 #include <ctime>
  41
  42 #include "commonhelp.h"
  43 #include "hashterm.h"
  44 #include "loadfile.h"
  45 #include "myhtmlparse.h"
  46 #include "str.h"
  47 #include "stringutils.h"
  48 #include "timegm.h"
  49 #include "utf8truncate.h"
  50 #include "utils.h"
  51 #include "values.h"
  52
  53 #include "gnu_getopt.h"
  54
  55 using namespace std;
  56
  57 #define PROG_NAME "scriptindex"
  58 #define PROG_DESC "index arbitrary data as described by an index script"
  59
  60 static bool verbose;
  61 static int addcount;
  62 static int repcount;
  63 static int delcount;
  64
  65 inline bool
  66 prefix_needs_colon(const string & prefix, unsigned ch)
  67 {
  68     if (!C_isupper(ch) && ch != ':') return false;
  69     string::size_type len = prefix.length();
  70     return (len > 1 && prefix[len - 1] != ':');
  71 }
  72
  73 const char * action_names[] = {
  74     "bad", "new",
  75     "boolean", "date", "field", "hash", "hextobin", "index", "indexnopos",
  76     "load", "lower", "parsedate", "spell", "truncate", "unhtml", "unique",
  77     "value", "valuenumeric", "valuepacked", "weight"
  78 };
  79
  80 // For debugging:
  81 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
  82
  83 class Action {
  84 public:
  85     typedef enum {
  86         BAD, NEW,
  87         BOOLEAN, DATE, FIELD, HASH, HEXTOBIN, INDEX, INDEXNOPOS, LOAD, LOWER,
  88         PARSEDATE, SPELL, TRUNCATE, UNHTML, UNIQUE, VALUE,
  89         VALUENUMERIC, VALUEPACKED, WEIGHT
  90     } type;
  91 private:
  92     type action;
  93     int num_arg;
  94     string string_arg;
  95 public:
  96     Action(type action_) : action(action_), num_arg(0) { }
  97     Action(type action_, const string & arg)
  98         : action(action_), string_arg(arg) {
  99         num_arg = atoi(string_arg.c_str());
 100     }
 101     Action(type action_, const string & arg, int num)
 102         : action(action_), num_arg(num), string_arg(arg) { }
 103     type get_action() const { return action; }
 104     int get_num_arg() const { return num_arg; }
 105     void set_num_arg(int num) { num_arg = num; }
 106     const string & get_string_arg() const { return string_arg; }
 107 };
 108
 109 static void
 110 report_useless_action(const string &file, size_t line, size_t pos,
 111                       const string &action)
 112 {
 113     cerr << file << ':' << line;
 114     if (pos != string::npos) cerr << ':' << pos;
 115     cerr << ": Warning: Index action '" << action << "' has no effect" << endl;
 116
 117     static bool given_left_to_right_warning = false;
 118     if (!given_left_to_right_warning) {
 119         given_left_to_right_warning = true;
 120         cerr << file << ':' << line
 121              << ": Warning: Note that actions are executed from left to right"
 122              << endl;
 123     }
 124 }
 125
 126 static map<string, vector<Action> > index_spec;
 127
 128 static void
 129 parse_index_script(const string &filename)
 130 {
 131     ifstream script(filename.c_str());
 132     if (!script.is_open()) {
 133         cerr << filename << ": " << strerror(errno) << endl;
 134         exit(1);
 135     }
 136     string line;
 137     size_t line_no = 0;
 138     bool had_unique = false;
 139     while (getline(script, line)) {
 140         ++line_no;
 141         vector<string> fields;
 142         vector<Action> actions;
 143         string::const_iterator i, j;
 144         const string &s = line;
 145         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 146         if (i == s.end() || *i == '#') continue;
 147         while (true) {
 148             if (!C_isalnum(*i)) {
 149                 cerr << filename << ':' << line_no
 150                      << ": field name must start with alphanumeric" << endl;
 151                 exit(1);
 152             }
 153             j = find_if(i, s.end(),
 154                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 155             fields.push_back(string(i, j));
 156             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 157             if (i == s.end()) break;
 158             if (*i == ':') {
 159                 ++i;
 160                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 161                 break;
 162             }
 163             if (i == j) {
 164                 cerr << filename << ':' << line_no
 165                      << ": bad character '" << *j << "' in fieldname" << endl;
 166                 exit(1);
 167             }
 168         }
 169         Xapian::termcount weight = 1;
 170         size_t useless_weight_pos = string::npos;
 171         map<string, Action::type> boolmap;
 172         j = i;
 173         while (j != s.end()) {
 174             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 175             string action(s, j - s.begin(), i - j);
 176             Action::type code = Action::BAD;
 177             unsigned min_args = 0, max_args = 0;
 178             bool takes_integer_argument = false;
 179             if (!action.empty()) {
 180                 switch (action[0]) {
 181                     case 'b':
 182                         if (action == "boolean") {
 183                             code = Action::BOOLEAN;
 184                             max_args = 1;
 185                         }
 186                         break;
 187                     case 'd':
 188                         if (action == "date") {
 189                             code = Action::DATE;
 190                             min_args = max_args = 1;
 191                         }
 192                         break;
 193                     case 'f':
 194                         if (action == "field") {
 195                             code = Action::FIELD;
 196                             max_args = 1;
 197                         }
 198                         break;
 199                     case 'h':
 200                         if (action == "hash") {
 201                             code = Action::HASH;
 202                             max_args = 1;
 203                             takes_integer_argument = true;
 204                         } else if (action == "hextobin") {
 205                             code = Action::HEXTOBIN;
 206                         }
 207                         break;
 208                     case 'i':
 209                         if (action == "index") {
 210                             code = Action::INDEX;
 211                             max_args = 1;
 212                         } else if (action == "indexnopos") {
 213                             code = Action::INDEXNOPOS;
 214                             max_args = 1;
 215                         }
 216                         break;
 217                     case 'l':
 218                         if (action == "lower") {
 219                             code = Action::LOWER;
 220                         } else if (action == "load") {
 221                             code = Action::LOAD;
 222                         }
 223                         break;
 224                     case 'p':
 225                         if (action == "parsedate") {
 226                             code = Action::PARSEDATE;
 227                             min_args = max_args = 1;
 228                         }
 229                         break;
 230                     case 's':
 231                         if (action == "spell") {
 232                             code = Action::SPELL;
 233                         }
 234                         break;
 235                     case 't':
 236                         if (action == "truncate") {
 237                             code = Action::TRUNCATE;
 238                             min_args = max_args = 1;
 239                             takes_integer_argument = true;
 240                         }
 241                         break;
 242                     case 'u':
 243                         if (action == "unhtml") {
 244                             code = Action::UNHTML;
 245                         } else if (action == "unique") {
 246                             code = Action::UNIQUE;
 247                             min_args = max_args = 1;
 248                         }
 249                         break;
 250                     case 'v':
 251                         if (action == "value") {
 252                             code = Action::VALUE;
 253                             min_args = max_args = 1;
 254                             takes_integer_argument = true;
 255                         } else if (action == "valuenumeric") {
 256                             code = Action::VALUENUMERIC;
 257                             min_args = max_args = 1;
 258                             takes_integer_argument = true;
 259                         } else if (action == "valuepacked") {
 260                             code = Action::VALUEPACKED;
 261                             min_args = max_args = 1;
 262                             takes_integer_argument = true;
 263                         }
 264                         break;
 265                     case 'w':
 266                         if (action == "weight") {
 267                             code = Action::WEIGHT;
 268                             min_args = max_args = 1;
 269                             takes_integer_argument = true;
 270                         }
 271                         break;
 272                 }
 273             }
 274             if (code == Action::BAD) {
 275                 cerr << filename << ':' << line_no
 276                      << ": Unknown index action '" << action << "'" << endl;
 277                 exit(1);
 278             }
 279             auto i_after_action = i;
 280             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 281
 282             if (i != s.end() && *i == '=') {
 283                 if (i != i_after_action) {
 284                     cerr << filename << ':' << line_no
 285                          << ": warning: putting spaces between the action and "
 286                             "'=' is deprecated." << endl;
 287                 }
 288
 289                 if (max_args == 0) {
 290                     cerr << filename << ':' << line_no
 291                          << ": Index action '" << action
 292                          << "' doesn't take an argument" << endl;
 293                     exit(1);
 294                 }
 295
 296                 ++i;
 297                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 298                 if (i != j) {
 299                     cerr << filename << ':' << line_no
 300                          << ": warning: putting spaces between '=' and the "
 301                             "argument is deprecated." << endl;
 302                 }
 303
 304                 vector<string> vals;
 305                 while (true) {
 306                     if (j != s.end() && *j == '"') {
 307                         // Quoted argument.
 308                         ++j;
 309                         i = find(j, s.end(), '"');
 310                         if (i == s.end()) {
 311                             cerr << filename << ':' << line_no
 312                                  << ": No closing quote" << endl;
 313                             exit(1);
 314                         }
 315                         vals.emplace_back(j, i);
 316                         ++i;
 317                         if (i == s.end() || C_isspace(*i)) break;
 318                         if (*i != ',') {
 319                             cerr << filename << ':' << line_no
 320                                  << ": Unexpected character '" << *i
 321                                  << "' after closing quote" << endl;
 322                             exit(1);
 323                         }
 324                         ++i;
 325                     } else if (max_args > 1) {
 326                         // Unquoted argument, split on comma.
 327                         i = find_if(j, s.end(),
 328                                     [](char ch) {
 329                                         return C_isspace(ch) || ch == ',';
 330                                     });
 331                         vals.emplace_back(j, i);
 332                         if (*i != ',') break;
 333                         ++i;
 334                     } else {
 335                         // Unquoted argument, including any commas.
 336                         i = find_if(j, s.end(),
 337                                     [](char ch) { return C_isspace(ch); });
 338                         vals.emplace_back(j, i);
 339                         break;
 340                     }
 341
 342                     if (vals.size() == max_args) {
 343                         cerr << filename << ':' << line_no
 344                              << ": Index action '" << action
 345                              << "' takes at most " << max_args << " arguments"
 346                              << endl;
 347                         exit(1);
 348                     }
 349                 }
 350
 351                 if (vals.size() < min_args) {
 352                     if (min_args == max_args) {
 353                         cerr << filename << ':' << line_no
 354                              << ": Index action '" << action
 355                              << "' requires " << min_args << " arguments"
 356                              << endl;
 357                         exit(1);
 358                     }
 359                     cerr << filename << ':' << line_no
 360                          << ": Index action '" << action
 361                          << "' requires at least " << min_args << " arguments"
 362                          << endl;
 363                     exit(1);
 364                 }
 365
 366                 string val;
 367                 if (!vals.empty()) {
 368                     val = vals.front();
 369                 }
 370
 371                 if (takes_integer_argument) {
 372                     if (val.find('.') != string::npos) {
 373                         cerr << filename << ':' << line_no
 374                              << ": Warning: Index action '" << action
 375                              << "' takes an integer argument" << endl;
 376                     }
 377                 }
 378                 switch (code) {
 379                     case Action::INDEX:
 380                     case Action::INDEXNOPOS:
 381                         actions.emplace_back(code, val, weight);
 382                         useless_weight_pos = string::npos;
 383                         break;
 384                     case Action::WEIGHT:
 385                         // We don't push an Action for WEIGHT - instead we
 386                         // store it ready to use in the INDEX and INDEXNOPOS
 387                         // Actions.
 388                         weight = atoi(val.c_str());
 389                         if (useless_weight_pos != string::npos) {
 390                             report_useless_action(filename, line_no,
 391                                                   useless_weight_pos, action);
 392                         }
 393                         useless_weight_pos = j - s.begin();
 394                         break;
 395                     case Action::TRUNCATE:
 396                         if (!actions.empty() &&
 397                             actions.back().get_action() == Action::LOAD) {
 398                             /* Turn "load truncate=n" into "load" with
 399                              * num_arg n, so that we don't needlessly
 400                              * allocate memory and read data we're just
 401                              * going to ignore.
 402                              */
 403                             actions.pop_back();
 404                             code = Action::LOAD;
 405                         }
 406                         actions.emplace_back(code, val);
 407                         break;
 408                     case Action::UNIQUE:
 409                         if (had_unique) {
 410                             cerr << filename << ':' << line_no
 411                                 << ": Index action 'unique' used more than "
 412                                    "once" << endl;
 413                             exit(1);
 414                         }
 415                         had_unique = true;
 416                         if (boolmap.find(val) == boolmap.end())
 417                             boolmap[val] = Action::UNIQUE;
 418                         actions.emplace_back(code, val);
 419                         break;
 420                     case Action::HASH: {
 421                         actions.emplace_back(code, val);
 422                         auto& obj = actions.back();
 423                         auto max_length = obj.get_num_arg();
 424                         if (max_length < 6) {
 425                             cerr << filename << ':' << line_no
 426                                  << ": Index action 'hash' takes an integer "
 427                                     "argument which must be at least 6" << endl;
 428                             exit(1);
 429                         }
 430                         break;
 431                     }
 432                     case Action::BOOLEAN:
 433                         boolmap[val] = Action::BOOLEAN;
 434                         /* FALLTHRU */
 435                     default:
 436                         actions.emplace_back(code, val);
 437                 }
 438                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 439             } else {
 440                 if (min_args > 0) {
 441                     if (min_args == max_args) {
 442                         cerr << filename << ':' << line_no
 443                              << ": Index action '" << action
 444                              << "' requires " << min_args << " arguments"
 445                              << endl;
 446                         exit(1);
 447                     }
 448                     cerr << filename << ':' << line_no
 449                          << ": Index action '" << action
 450                          << "' requires at least " << min_args << " arguments"
 451                          << endl;
 452                     exit(1);
 453                 }
 454                 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
 455                     useless_weight_pos = string::npos;
 456                     actions.emplace_back(code, "", weight);
 457                 } else if (code == Action::HASH) {
 458                     actions.emplace_back(code, "", MAX_SAFE_TERM_LENGTH - 1);
 459                 } else {
 460                     actions.emplace_back(code);
 461                 }
 462             }
 463             j = i;
 464         }
 465
 466         if (useless_weight_pos != string::npos) {
 467             report_useless_action(filename, line_no, useless_weight_pos,
 468                                   "weight");
 469         }
 470
 471         while (!actions.empty()) {
 472             bool done = true;
 473             Action::type action = actions.back().get_action();
 474             switch (action) {
 475                 case Action::HASH:
 476                 case Action::HEXTOBIN:
 477                 case Action::LOWER:
 478                 case Action::PARSEDATE:
 479                 case Action::SPELL:
 480                 case Action::TRUNCATE:
 481                 case Action::UNHTML:
 482                     done = false;
 483                     report_useless_action(filename, line_no, string::npos,
 484                                           action_names[action]);
 485                     actions.pop_back();
 486                     break;
 487                 default:
 488                     break;
 489             }
 490             if (done) break;
 491         }
 492
 493         map<string, Action::type>::const_iterator boolpfx;
 494         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 495             if (boolpfx->second == Action::UNIQUE) {
 496                 cerr << filename << ':' << line_no
 497                      << ": Warning: Index action 'unique=" << boolpfx->first
 498                      << "' without 'boolean=" << boolpfx->first << "'" << endl;
 499                 static bool given_doesnt_imply_boolean_warning = false;
 500                 if (!given_doesnt_imply_boolean_warning) {
 501                     given_doesnt_imply_boolean_warning = true;
 502                     cerr << filename << ':' << line_no
 503                          << ": Warning: Note 'unique' doesn't implicitly add "
 504                             "a boolean term" << endl;
 505                 }
 506             }
 507         }
 508
 509         vector<string>::const_iterator field;
 510         for (field = fields.begin(); field != fields.end(); ++field) {
 511             vector<Action> &v = index_spec[*field];
 512             if (v.empty()) {
 513                 if (fields.size() == 1) {
 514                     // Optimise common case where there's only one fieldname
 515                     // for a list of actions.
 516                     v = std::move(actions);
 517                 } else {
 518                     v = actions;
 519                 }
 520             } else {
 521                 v.emplace_back(Action::NEW);
 522                 v.insert(v.end(), actions.begin(), actions.end());
 523             }
 524         }
 525     }
 526
 527     if (index_spec.empty()) {
 528         cerr << filename << ": No rules found in index script" << endl;
 529         exit(1);
 530     }
 531 }
 532
 533 static void
 534 index_file(const char *fname, istream &stream,
 535            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
 536 {
 537     string line;
 538     size_t line_no = 0;
 539     while (!stream.eof() && getline(stream, line)) {
 540         ++line_no;
 541         Xapian::Document doc;
 542         indexer.set_document(doc);
 543         Xapian::docid docid = 0;
 544         map<string, list<string> > fields;
 545         bool seen_content = false;
 546         while (!line.empty()) {
 547             // Cope with files from MS Windows (\r\n end of lines).
 548             // Trim multiple \r characters, since that seems the best way
 549             // to handle that case.
 550             string::size_type last = line.find_last_not_of('\r');
 551             if (last == string::npos) break;
 552             line.resize(last + 1);
 553
 554             string::size_type eq = line.find('=');
 555             if (eq == string::npos && !line.empty()) {
 556                 cerr << fname << ':' << line_no << ": expected = somewhere "
 557                     "in this line" << endl;
 558                 // FIXME: die or what?
 559             }
 560             string field(line, 0, eq);
 561             string value(line, eq + 1, string::npos);
 562             while (getline(stream, line)) {
 563                 ++line_no;
 564                 if (line.empty() || line[0] != '=') break;
 565                 // Cope with files from MS Windows (\r\n end of lines).
 566                 // Trim multiple \r characters, since that seems the best way
 567                 // to handle that case.
 568                 last = line.find_last_not_of('\r');
 569                 // line[0] == '=', so last != string::npos.
 570                 // Replace the '=' with a '\n' so we don't have to use substr.
 571                 line[0] = '\n';
 572                 line.resize(last + 1);
 573                 value += line;
 574             }
 575
 576             // Default to not indexing spellings.
 577             indexer.set_flags(Xapian::TermGenerator::flags(0));
 578
 579             const vector<Action> &v = index_spec[field];
 580             string old_value = value;
 581             vector<Action>::const_iterator i;
 582             bool this_field_is_content = true;
 583             for (i = v.begin(); i != v.end(); ++i) {
 584                 switch (i->get_action()) {
 585                     case Action::BAD:
 586                         abort();
 587                     case Action::NEW:
 588                         value = old_value;
 589                         // We're processing the same field again - give it a
 590                         // reprieve.
 591                         this_field_is_content = true;
 592                         break;
 593                     case Action::FIELD:
 594                         if (!value.empty()) {
 595                             string f = i->get_string_arg();
 596                             if (f.empty()) f = field;
 597                             // replace newlines with spaces
 598                             string s = value;
 599                             string::size_type j = 0;
 600                             while ((j = s.find('\n', j)) != string::npos)
 601                                 s[j] = ' ';
 602                             fields[f].push_back(s);
 603                         }
 604                         break;
 605                     case Action::INDEX:
 606                         indexer.index_text(value,
 607                                            i->get_num_arg(),
 608                                            i->get_string_arg());
 609                         break;
 610                     case Action::INDEXNOPOS:
 611                         // No positional information so phrase searching
 612                         // won't work.  However, the database will use much
 613                         // less diskspace.
 614                         indexer.index_text_without_positions(value,
 615                                                              i->get_num_arg(),
 616                                                              i->get_string_arg());
 617                         break;
 618                     case Action::BOOLEAN: {
 619                         // Do nothing if there's no text.
 620                         if (value.empty()) break;
 621
 622                         string term = i->get_string_arg();
 623                         if (prefix_needs_colon(term, value[0])) term += ':';
 624                         term += value;
 625
 626                         doc.add_boolean_term(term);
 627                         break;
 628                     }
 629                     case Action::HASH: {
 630                         unsigned int max_length = i->get_num_arg();
 631                         if (value.length() > max_length)
 632                             value = hash_long_term(value, max_length);
 633                         break;
 634                     }
 635                     case Action::HEXTOBIN: {
 636                         size_t len = value.length();
 637                         if (len & 1) {
 638                             cerr << "hextobin: input must have even length"
 639                                  << endl;
 640                         } else {
 641                             string output;
 642                             output.reserve(len / 2);
 643                             for (size_t j = 0; j < len; j += 2) {
 644                                 char a = value[j];
 645                                 char b = value[j + 1];
 646                                 if (!C_isxdigit(a) || !C_isxdigit(b)) {
 647                                     cerr << "hextobin: input must be all hex "
 648                                             "digits" << endl;
 649                                     goto badhex;
 650                                 }
 651                                 char r = (hex_digit(a) << 4) | hex_digit(b);
 652                                 output.push_back(r);
 653                             }
 654                             value = std::move(output);
 655                         }
 656 badhex:
 657                         break;
 658                     }
 659                     case Action::LOWER:
 660                         value = Xapian::Unicode::tolower(value);
 661                         break;
 662                     case Action::LOAD: {
 663                         bool truncated = false;
 664                         // FIXME: Use NOATIME if we own the file or are root.
 665                         if (!load_file(value, i->get_num_arg(), NOCACHE,
 666                                        value, truncated)) {
 667                             cerr << "Couldn't load file '" << value << "': "
 668                                  << strerror(errno) << endl;
 669                             value.resize(0);
 670                         }
 671                         if (!truncated) break;
 672                     }
 673                     /* FALLTHRU */
 674                     case Action::TRUNCATE:
 675                         utf8_truncate(value, i->get_num_arg());
 676                         break;
 677                     case Action::SPELL:
 678                         indexer.set_flags(indexer.FLAG_SPELLING);
 679                         break;
 680                     case Action::UNHTML: {
 681                         MyHtmlParser p;
 682                         try {
 683                             // Default HTML character set is latin 1, though
 684                             // not specifying one is deprecated these days.
 685                             p.parse_html(value, "iso-8859-1", false);
 686                         } catch (const string & newcharset) {
 687                             p.reset();
 688                             p.parse_html(value, newcharset, true);
 689                         }
 690                         if (p.indexing_allowed)
 691                             value = p.dump;
 692                         else
 693                             value = "";
 694                         break;
 695                     }
 696                     case Action::UNIQUE: {
 697                         // If there's no text, just issue a warning.
 698                         if (value.empty()) {
 699                             cerr << fname << ':' << line_no
 700                                  << ": Ignoring UNIQUE action on empty text"
 701                                  << endl;
 702                             break;
 703                         }
 704
 705                         // Ensure that the value of this field is unique.
 706                         // If a record already exists with the same value,
 707                         // it will be replaced with the new record.
 708
 709                         // Unique fields aren't considered content - if
 710                         // there are no other fields in the document, the
 711                         // document is to be deleted.
 712                         this_field_is_content = false;
 713
 714                         // Argument is the prefix to add to the field value
 715                         // to get the unique term.
 716                         string t = i->get_string_arg();
 717                         if (prefix_needs_colon(t, value[0])) t += ':';
 718                         t += value;
 719 again:
 720                         try {
 721                             Xapian::PostingIterator p = database.postlist_begin(t);
 722                             if (p != database.postlist_end(t)) {
 723                                 docid = *p;
 724                             }
 725                         } catch (const Xapian::Error &e) {
 726                             // Hmm, what happened?
 727                             cerr << "Caught exception in UNIQUE!" << endl;
 728                             cerr << "E: " << e.get_description() << endl;
 729                             database.commit();
 730                             goto again;
 731                         }
 732                         break;
 733                     }
 734                     case Action::VALUE:
 735                         if (!value.empty())
 736                             doc.add_value(i->get_num_arg(), value);
 737                         break;
 738                     case Action::VALUENUMERIC: {
 739                         if (value.empty()) break;
 740                         char * end;
 741                         double dbl = strtod(value.c_str(), &end);
 742                         if (*end) {
 743                             cerr << fname << ':' << line_no << ": Warning: "
 744                                     "Trailing characters in VALUENUMERIC: '"
 745                                  << value << "'" << endl;
 746                         }
 747                         doc.add_value(i->get_num_arg(),
 748                                       Xapian::sortable_serialise(dbl));
 749                         break;
 750                     }
 751                     case Action::VALUEPACKED: {
 752                         uint32_t word = 0;
 753                         if (value.empty() || !C_isdigit(value[0])) {
 754                             // strtoul() accepts leading whitespace and negated
 755                             // values, neither of which we want to allow.
 756                             errno = EINVAL;
 757                         } else {
 758                             errno = 0;
 759                             char* q;
 760                             word = strtoul(value.c_str(), &q, 10);
 761                             if (!errno && *q != '\0') {
 762                                 // Trailing characters after converted value.
 763                                 errno = EINVAL;
 764                             }
 765                         }
 766                         if (errno) {
 767                             cerr << fname << ':' << line_no << ": Warning: "
 768                                     "valuepacked \"" << value << "\" ";
 769                             if (errno == ERANGE) {
 770                                 cerr << "out of range";
 771                             } else {
 772                                 cerr << "not an unsigned integer";
 773                             }
 774                             cerr << endl;
 775                         }
 776                         int valueslot = i->get_num_arg();
 777                         doc.add_value(valueslot, int_to_binary_string(word));
 778                         break;
 779                     }
 780                     case Action::DATE: {
 781                         const string & type = i->get_string_arg();
 782                         string yyyymmdd;
 783                         if (type == "unix") {
 784                             time_t t = atoi(value.c_str());
 785                             struct tm *tm = localtime(&t);
 786                             int y = tm->tm_year + 1900;
 787                             int m = tm->tm_mon + 1;
 788                             yyyymmdd = date_to_string(y, m, tm->tm_mday);
 789                         } else if (type == "yyyymmdd") {
 790                             if (value.length() == 8) yyyymmdd = value;
 791                         }
 792                         if (yyyymmdd.empty()) break;
 793                         // Date (YYYYMMDD)
 794                         doc.add_boolean_term("D" + yyyymmdd);
 795                         yyyymmdd.resize(6);
 796                         // Month (YYYYMM)
 797                         doc.add_boolean_term("M" + yyyymmdd);
 798                         yyyymmdd.resize(4);
 799                         // Year (YYYY)
 800                         doc.add_boolean_term("Y" + yyyymmdd);
 801                         break;
 802                     }
 803                     case Action::PARSEDATE: {
 804                         string dateformat = i->get_string_arg();
 805                         struct tm tm;
 806                         memset(&tm, 0, sizeof(tm));
 807                         auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
 808                         if (ret == NULL) {
 809                             cerr << fname << ':' << line_no << ": Warning: "
 810                                     "\"" << value << "\" doesn't match format "
 811                                     "\"" << dateformat << '\"' << endl;
 812                             break;
 813                         }
 814
 815                         if (*ret != '\0') {
 816                             cerr << fname << ':' << line_no << ": Warning: "
 817                                     "\"" << value << "\" not fully matched by "
 818                                     "format \"" << dateformat << "\" "
 819                                     "(\"" << ret << "\" left over) but "
 820                                     "indexing anyway" << endl;
 821                         }
 822
 823                         value = str(timegm(&tm));
 824                         break;
 825                     }
 826                     default:
 827                         /* Empty default case to avoid "unhandled enum value"
 828                          * warnings. */
 829                         break;
 830                 }
 831             }
 832             if (this_field_is_content) seen_content = true;
 833             if (stream.eof()) break;
 834         }
 835
 836         // If we haven't seen any fields (other than unique identifiers)
 837         // the document is to be deleted.
 838         if (!seen_content) {
 839             if (docid) {
 840                 database.delete_document(docid);
 841                 if (verbose) cout << "Del: " << docid << endl;
 842                 delcount ++;
 843             }
 844         } else {
 845             string data;
 846             for (auto&& i : fields) {
 847                 for (auto&& field_val : i.second) {
 848                     data += i.first;
 849                     data += '=';
 850                     data += field_val;
 851                     data += '\n';
 852                 }
 853             }
 854
 855             // Put the data in the document
 856             doc.set_data(data);
 857
 858             // Add the document to the database
 859             if (docid) {
 860                 try {
 861                     database.replace_document(docid, doc);
 862                     if (verbose) cout << "Replace: " << docid << endl;
 863                     repcount ++;
 864                 } catch (const Xapian::Error &e) {
 865                     cerr << "E: " << e.get_description() << endl;
 866                     // Possibly the document was deleted by another
 867                     // process in the meantime...?
 868                     docid = database.add_document(doc);
 869                     cerr << "Replace failed, adding as new: " << docid << endl;
 870                 }
 871             } else {
 872                 docid = database.add_document(doc);
 873                 if (verbose) cout << "Add: " << docid << endl;
 874                 addcount ++;
 875             }
 876         }
 877     }
 878
 879     // Commit after each file to make sure all changes from that file make it
 880     // in.
 881     if (verbose) cout << "Committing: " << endl;
 882     database.commit();
 883 }
 884
 885 int
 886 main(int argc, char **argv)
 887 try {
 888     // If the database already exists, default to updating not overwriting.
 889     int database_mode = Xapian::DB_CREATE_OR_OPEN;
 890     verbose = false;
 891     Xapian::Stem stemmer("english");
 892
 893     static const struct option longopts[] = {
 894         { "help",       no_argument,    NULL, 'h' },
 895         { "version",    no_argument,    NULL, 'V' },
 896         { "stemmer",    required_argument,      NULL, 's' },
 897         { "overwrite",  no_argument,    NULL, 'o' },
 898         { "verbose",    no_argument,    NULL, 'v' },
 899         { 0, 0, NULL, 0 }
 900     };
 901
 902     bool more = true, show_help = false;
 903     while (more) {
 904         switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
 905             case EOF:
 906                 more = false;
 907                 break;
 908             default:
 909             case 'h': // --help
 910                 show_help = true;
 911                 more = false;
 912                 break;
 913             case 'V': // --version
 914                 print_package_info(PROG_NAME);
 915                 return 0;
 916             case 'o': // --overwrite
 917                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
 918                 break;
 919             case 'v':
 920                 verbose = true;
 921                 break;
 922             case 's':
 923                 try {
 924                     stemmer = Xapian::Stem(optarg);
 925                 } catch (const Xapian::InvalidArgumentError &) {
 926                     cerr << "Unknown stemming language '" << optarg << "'.\n";
 927                     cerr << "Available language names are: "
 928                          << Xapian::Stem::get_available_languages() << endl;
 929                     return 1;
 930                 }
 931                 break;
 932         }
 933     }
 934
 935     argv += optind;
 936     argc -= optind;
 937     if (show_help || argc < 2) {
 938         cout << PROG_NAME " - " PROG_DESC "\n"
 939 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
 940 "\n"
 941 "Creates or updates a Xapian database with the data from the input files listed\n"
 942 "on the command line.  If no files are specified, data is read from stdin.\n"
 943 "\n"
 944 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
 945 "format for INDEXER_SCRIPT.\n"
 946 "\n"
 947 "Options:\n"
 948 "  -v, --verbose       display additional messages to aid debugging\n"
 949 "      --overwrite     create the database anew (the default is to update if\n"
 950 "                      the database already exists)\n";
 951         print_stemmer_help("");
 952         print_help_and_version_help("");
 953         exit(show_help ? 0 : 1);
 954     }
 955
 956     parse_index_script(argv[1]);
 957
 958     // Open the database.  If another process is currently updating the
 959     // database, wait for the lock to become available.
 960     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
 961     Xapian::WritableDatabase database(argv[0], flags);
 962
 963     Xapian::TermGenerator indexer;
 964     indexer.set_stemmer(stemmer);
 965     // Set the database for spellings to be added to by the "spell" action.
 966     indexer.set_database(database);
 967
 968     addcount = 0;
 969     repcount = 0;
 970     delcount = 0;
 971
 972     if (argc == 2) {
 973         // Read from stdin.
 974         index_file("<stdin>", cin, database, indexer);
 975     } else {
 976         // Read file(s) listed on the command line.
 977         for (int i = 2; i < argc; ++i) {
 978             ifstream stream(argv[i]);
 979             if (stream) {
 980                 index_file(argv[i], stream, database, indexer);
 981             } else {
 982                 cerr << "Can't open file " << argv[i] << endl;
 983             }
 984         }
 985     }
 986
 987     cout << "records (added, replaced, deleted) = (" << addcount << ", "
 988          << repcount << ", " << delcount << ")" << endl;
 989 } catch (const Xapian::Error &error) {
 990     cerr << "Exception: " << error.get_description() << endl;
 991     exit(1);
 992 } catch (const std::bad_alloc &) {
 993     cerr << "Exception: std::bad_alloc" << endl;
 994     exit(1);
 995 } catch (...) {
 996     cerr << "Unknown Exception" << endl;
 997     exit(1);
 998 }