xapian-applications/omega/scriptindex.cc

   1 /* scriptindex.cc
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 Sam Liddicott
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017 Olly Betts
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include <xapian.h>
  27
  28 #include <algorithm>
  29 #include <fstream>
  30 #include <iostream>
  31 #include <list>
  32 #include <map>
  33 #include <string>
  34 #include <vector>
  35 #include <cstring>
  36
  37 #include <cstdlib>
  38 #include "safeerrno.h"
  39 #include <cstdio>
  40 #include <ctime>
  41
  42 #include "commonhelp.h"
  43 #include "hashterm.h"
  44 #include "loadfile.h"
  45 #include "myhtmlparse.h"
  46 #include "str.h"
  47 #include "stringutils.h"
  48 #include "timegm.h"
  49 #include "utf8truncate.h"
  50 #include "utils.h"
  51 #include "values.h"
  52
  53 #include "gnu_getopt.h"
  54
  55 using namespace std;
  56
  57 #define PROG_NAME "scriptindex"
  58 #define PROG_DESC "index arbitrary data as described by an index script"
  59
  60 static bool verbose;
  61 static int addcount;
  62 static int repcount;
  63 static int delcount;
  64
  65 inline bool
  66 prefix_needs_colon(const string & prefix, unsigned ch)
  67 {
  68     if (!C_isupper(ch) && ch != ':') return false;
  69     string::size_type len = prefix.length();
  70     return (len > 1 && prefix[len - 1] != ':');
  71 }
  72
  73 const char * action_names[] = {
  74     "bad", "new",
  75     "boolean", "date", "field", "hash", "index", "indexnopos", "load", "lower",
  76     "parsedate", "spell", "truncate", "unhtml", "unique", "value",
  77     "valuenumeric", "valuepacked", "weight"
  78 };
  79
  80 // For debugging:
  81 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
  82
  83 class Action {
  84 public:
  85     typedef enum {
  86         BAD, NEW,
  87         BOOLEAN, DATE, FIELD, HASH, INDEX, INDEXNOPOS, LOAD, LOWER,
  88         PARSEDATE, SPELL, TRUNCATE, UNHTML, UNIQUE, VALUE,
  89         VALUENUMERIC, VALUEPACKED, WEIGHT
  90     } type;
  91 private:
  92     type action;
  93     int num_arg;
  94     string string_arg;
  95 public:
  96     Action(type action_) : action(action_), num_arg(0) { }
  97     Action(type action_, const string & arg)
  98         : action(action_), string_arg(arg) {
  99         num_arg = atoi(string_arg.c_str());
 100     }
 101     Action(type action_, const string & arg, int num)
 102         : action(action_), num_arg(num), string_arg(arg) { }
 103     type get_action() const { return action; }
 104     int get_num_arg() const { return num_arg; }
 105     const string & get_string_arg() const { return string_arg; }
 106 };
 107
 108 static void
 109 report_useless_action(const string &file, size_t line, size_t pos,
 110                       const string &action)
 111 {
 112     cout << file << ':' << line;
 113     if (pos != string::npos) cout << ':' << pos;
 114     cout << ": Warning: Index action '" << action << "' has no effect" << endl;
 115
 116     static bool given_left_to_right_warning = false;
 117     if (!given_left_to_right_warning) {
 118         given_left_to_right_warning = true;
 119         cout << file << ':' << line
 120              << ": Warning: Note that actions are executed from left to right"
 121              << endl;
 122     }
 123 }
 124
 125 static map<string, vector<Action> > index_spec;
 126
 127 static void
 128 parse_index_script(const string &filename)
 129 {
 130     ifstream script(filename.c_str());
 131     if (!script.is_open()) {
 132         cout << filename << ": " << strerror(errno) << endl;
 133         exit(1);
 134     }
 135     string line;
 136     size_t line_no = 0;
 137     bool had_unique = false;
 138     while (getline(script, line)) {
 139         ++line_no;
 140         vector<string> fields;
 141         vector<Action> actions;
 142         string::const_iterator i, j;
 143         const string &s = line;
 144         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 145         if (i == s.end() || *i == '#') continue;
 146         while (true) {
 147             if (!C_isalnum(*i)) {
 148                 cout << filename << ':' << line_no
 149                      << ": field name must start with alphanumeric" << endl;
 150                 exit(1);
 151             }
 152             j = find_if(i, s.end(),
 153                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 154             fields.push_back(string(i, j));
 155             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 156             if (i == s.end()) break;
 157             if (*i == ':') {
 158                 ++i;
 159                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 160                 break;
 161             }
 162             if (i == j) {
 163                 cout << filename << ':' << line_no
 164                      << ": bad character '" << *j << "' in fieldname" << endl;
 165                 exit(1);
 166             }
 167         }
 168         Xapian::termcount weight = 1;
 169         size_t useless_weight_pos = string::npos;
 170         map<string, Action::type> boolmap;
 171         j = i;
 172         while (j != s.end()) {
 173             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 174             string action(s, j - s.begin(), i - j);
 175             Action::type code = Action::BAD;
 176             enum {NO, OPT, YES} arg = NO;
 177             bool takes_integer_argument = false;
 178             if (!action.empty()) {
 179                 switch (action[0]) {
 180                     case 'b':
 181                         if (action == "boolean") {
 182                             code = Action::BOOLEAN;
 183                             arg = OPT;
 184                         }
 185                         break;
 186                     case 'd':
 187                         if (action == "date") {
 188                             code = Action::DATE;
 189                             arg = YES;
 190                         }
 191                         break;
 192                     case 'f':
 193                         if (action == "field") {
 194                             code = Action::FIELD;
 195                             arg = OPT;
 196                         }
 197                         break;
 198                     case 'h':
 199                         if (action == "hash") {
 200                             code = Action::HASH;
 201                             arg = OPT;
 202                             takes_integer_argument = true;
 203                         }
 204                         break;
 205                     case 'i':
 206                         if (action == "index") {
 207                             code = Action::INDEX;
 208                             arg = OPT;
 209                         } else if (action == "indexnopos") {
 210                             code = Action::INDEXNOPOS;
 211                             arg = OPT;
 212                         }
 213                         break;
 214                     case 'l':
 215                         if (action == "lower") {
 216                             code = Action::LOWER;
 217                         } else if (action == "load") {
 218                             code = Action::LOAD;
 219                         }
 220                         break;
 221                     case 'p':
 222                         if (action == "parsedate") {
 223                             code = Action::PARSEDATE;
 224                             arg = YES;
 225                         }
 226                         break;
 227                     case 's':
 228                         if (action == "spell") {
 229                             code = Action::SPELL;
 230                         }
 231                         break;
 232                     case 't':
 233                         if (action == "truncate") {
 234                             code = Action::TRUNCATE;
 235                             arg = YES;
 236                             takes_integer_argument = true;
 237                         }
 238                         break;
 239                     case 'u':
 240                         if (action == "unhtml") {
 241                             code = Action::UNHTML;
 242                         } else if (action == "unique") {
 243                             code = Action::UNIQUE;
 244                             arg = YES;
 245                         }
 246                         break;
 247                     case 'v':
 248                         if (action == "value") {
 249                             code = Action::VALUE;
 250                             arg = YES;
 251                             takes_integer_argument = true;
 252                         } else if (action == "valuenumeric") {
 253                             code = Action::VALUENUMERIC;
 254                             arg = YES;
 255                             takes_integer_argument = true;
 256                         } else if (action == "valuepacked") {
 257                             code = Action::VALUEPACKED;
 258                             arg = YES;
 259                             takes_integer_argument = true;
 260                         }
 261                         break;
 262                     case 'w':
 263                         if (action == "weight") {
 264                             code = Action::WEIGHT;
 265                             arg = YES;
 266                             takes_integer_argument = true;
 267                         }
 268                         break;
 269                 }
 270             }
 271             if (code == Action::BAD) {
 272                 cout << filename << ':' << line_no
 273                      << ": Unknown index action '" << action << "'" << endl;
 274                 exit(1);
 275             }
 276             auto i_after_action = i;
 277             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 278
 279             if (i != s.end() && *i == '=') {
 280                 if (i != i_after_action) {
 281                     cout << filename << ':' << line_no
 282                          << ": warning: putting spaces between the action and "
 283                             "'=' is deprecated." << endl;
 284                 }
 285
 286                 if (arg == NO) {
 287                     cout << filename << ':' << line_no
 288                          << ": Index action '" << action
 289                          << "' doesn't take an argument" << endl;
 290                     exit(1);
 291                 }
 292                 ++i;
 293                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 294                 if (i != j) {
 295                     cout << filename << ':' << line_no
 296                          << ": warning: putting spaces between '=' and the "
 297                             "argument is deprecated." << endl;
 298                 }
 299                 string val;
 300                 if (j != s.end() && *j == '"') {
 301                     // Quoted argument.
 302                     ++j;
 303                     i = find(j, s.end(), '"');
 304                     if (i == s.end()) {
 305                         cout << filename << ':' << line_no << ": No closing quote" << endl;
 306                         exit(1);
 307                     }
 308                     val.assign(j, i);
 309                     ++i;
 310                 } else {
 311                     // Unquoted argument.
 312                     i = find_if(j, s.end(), [](char ch) { return C_isspace(ch); });
 313                     val.assign(j, i);
 314                 }
 315                 if (takes_integer_argument) {
 316                     if (val.find('.') != string::npos) {
 317                         cout << filename << ':' << line_no
 318                              << ": Warning: Index action '" << action
 319                              << "' takes an integer argument" << endl;
 320                     }
 321                 }
 322                 switch (code) {
 323                     case Action::INDEX:
 324                     case Action::INDEXNOPOS:
 325                         actions.push_back(Action(code, val, weight));
 326                         useless_weight_pos = string::npos;
 327                         break;
 328                     case Action::WEIGHT:
 329                         // We don't push an Action for WEIGHT - instead we
 330                         // store it ready to use in the INDEX and INDEXNOPOS
 331                         // Actions.
 332                         weight = atoi(val.c_str());
 333                         if (useless_weight_pos != string::npos) {
 334                             report_useless_action(filename, line_no,
 335                                                   useless_weight_pos, action);
 336                         }
 337                         useless_weight_pos = j - s.begin();
 338                         break;
 339                     case Action::TRUNCATE:
 340                         if (!actions.empty() &&
 341                             actions.back().get_action() == Action::LOAD) {
 342                             /* Turn "load truncate=n" into "load" with
 343                              * num_arg n, so that we don't needlessly
 344                              * allocate memory and read data we're just
 345                              * going to ignore.
 346                              */
 347                             actions.pop_back();
 348                             code = Action::LOAD;
 349                         }
 350                         actions.push_back(Action(code, val));
 351                         break;
 352                     case Action::UNIQUE:
 353                         if (had_unique) {
 354                             cout << filename << ':' << line_no
 355                                 << ": Index action 'unique' used more than "
 356                                    "once" << endl;
 357                             exit(1);
 358                         }
 359                         had_unique = true;
 360                         if (boolmap.find(val) == boolmap.end())
 361                             boolmap[val] = Action::UNIQUE;
 362                         actions.push_back(Action(code, val));
 363                         break;
 364                     case Action::BOOLEAN:
 365                         boolmap[val] = Action::BOOLEAN;
 366                         /* FALLTHRU */
 367                     default:
 368                         actions.push_back(Action(code, val));
 369                 }
 370                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 371             } else {
 372                 if (arg == YES) {
 373                     cout << filename << ':' << line_no
 374                          << ": Index action '" << action
 375                          << "' must have an argument" << endl;
 376                     exit(1);
 377                 }
 378                 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
 379                     useless_weight_pos = string::npos;
 380                     actions.push_back(Action(code, "", weight));
 381                 } else {
 382                     actions.push_back(Action(code));
 383                 }
 384             }
 385             j = i;
 386         }
 387
 388         if (useless_weight_pos != string::npos) {
 389             report_useless_action(filename, line_no, useless_weight_pos,
 390                                   "weight");
 391         }
 392
 393         while (!actions.empty()) {
 394             bool done = true;
 395             Action::type action = actions.back().get_action();
 396             switch (action) {
 397                 case Action::HASH:
 398                 case Action::LOWER:
 399                 case Action::SPELL:
 400                 case Action::TRUNCATE:
 401                 case Action::UNHTML:
 402                     done = false;
 403                     report_useless_action(filename, line_no, string::npos,
 404                                           action_names[action]);
 405                     actions.pop_back();
 406                     break;
 407                 default:
 408                     break;
 409             }
 410             if (done) break;
 411         }
 412
 413         map<string, Action::type>::const_iterator boolpfx;
 414         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 415             if (boolpfx->second == Action::UNIQUE) {
 416                 cout << filename << ':' << line_no
 417                      << ": Warning: Index action 'unique=" << boolpfx->first
 418                      << "' without 'boolean=" << boolpfx->first << "'" << endl;
 419                 static bool given_doesnt_imply_boolean_warning = false;
 420                 if (!given_doesnt_imply_boolean_warning) {
 421                     given_doesnt_imply_boolean_warning = true;
 422                     cout << filename << ':' << line_no
 423                          << ": Warning: Note 'unique' doesn't implicitly add "
 424                             "a boolean term" << endl;
 425                 }
 426             }
 427         }
 428
 429         vector<string>::const_iterator field;
 430         for (field = fields.begin(); field != fields.end(); ++field) {
 431             vector<Action> &v = index_spec[*field];
 432             if (v.empty()) {
 433                 v = actions;
 434             } else {
 435                 v.push_back(Action(Action::NEW));
 436                 v.insert(v.end(), actions.begin(), actions.end());
 437             }
 438         }
 439     }
 440
 441     if (index_spec.empty()) {
 442         cout << filename << ": No rules found in index script" << endl;
 443         exit(1);
 444     }
 445 }
 446
 447 static bool
 448 index_file(const char *fname, istream &stream,
 449            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
 450 {
 451     string line;
 452     size_t line_no = 0;
 453     while (!stream.eof() && getline(stream, line)) {
 454         ++line_no;
 455         Xapian::Document doc;
 456         indexer.set_document(doc);
 457         Xapian::docid docid = 0;
 458         map<string, list<string> > fields;
 459         bool seen_content = false;
 460         while (!line.empty()) {
 461             // Cope with files from MS Windows (\r\n end of lines).
 462             // Trim multiple \r characters, since that seems the best way
 463             // to handle that case.
 464             string::size_type last = line.find_last_not_of('\r');
 465             if (last == string::npos) break;
 466             line.resize(last + 1);
 467
 468             string::size_type eq = line.find('=');
 469             if (eq == string::npos && !line.empty()) {
 470                 cout << fname << ':' << line_no << ": expected = somewhere "
 471                     "in this line" << endl;
 472                 // FIXME: die or what?
 473             }
 474             string field(line, 0, eq);
 475             string value(line, eq + 1, string::npos);
 476             while (getline(stream, line)) {
 477                 ++line_no;
 478                 if (line.empty() || line[0] != '=') break;
 479                 // Cope with files from MS Windows (\r\n end of lines).
 480                 // Trim multiple \r characters, since that seems the best way
 481                 // to handle that case.
 482                 last = line.find_last_not_of('\r');
 483                 // line[0] == '=', so last != string::npos.
 484                 // Replace the '=' with a '\n' so we don't have to use substr.
 485                 line[0] = '\n';
 486                 line.resize(last + 1);
 487                 value += line;
 488             }
 489
 490             // Default to not indexing spellings.
 491             indexer.set_flags(Xapian::TermGenerator::flags(0));
 492
 493             const vector<Action> &v = index_spec[field];
 494             string old_value = value;
 495             vector<Action>::const_iterator i;
 496             bool this_field_is_content = true;
 497             for (i = v.begin(); i != v.end(); ++i) {
 498                 switch (i->get_action()) {
 499                     case Action::BAD:
 500                         abort();
 501                     case Action::NEW:
 502                         value = old_value;
 503                         // We're processing the same field again - give it a
 504                         // reprieve.
 505                         this_field_is_content = true;
 506                         break;
 507                     case Action::FIELD:
 508                         if (!value.empty()) {
 509                             string f = i->get_string_arg();
 510                             if (f.empty()) f = field;
 511                             // replace newlines with spaces
 512                             string s = value;
 513                             string::size_type j = 0;
 514                             while ((j = s.find('\n', j)) != string::npos)
 515                                 s[j] = ' ';
 516                             fields[f].push_back(s);
 517                         }
 518                         break;
 519                     case Action::INDEX:
 520                         indexer.index_text(value,
 521                                            i->get_num_arg(),
 522                                            i->get_string_arg());
 523                         break;
 524                     case Action::INDEXNOPOS:
 525                         // No positional information so phrase searching
 526                         // won't work.  However, the database will use much
 527                         // less diskspace.
 528                         indexer.index_text_without_positions(value,
 529                                                              i->get_num_arg(),
 530                                                              i->get_string_arg());
 531                         break;
 532                     case Action::BOOLEAN: {
 533                         // Do nothing if there's no text.
 534                         if (value.empty()) break;
 535
 536                         string term = i->get_string_arg();
 537                         if (prefix_needs_colon(term, value[0])) term += ':';
 538                         term += value;
 539
 540                         doc.add_boolean_term(term);
 541                         break;
 542                     }
 543                     case Action::HASH: {
 544                         unsigned int max_length = i->get_num_arg();
 545                         if (max_length == 0)
 546                             max_length = MAX_SAFE_TERM_LENGTH - 1;
 547                         if (value.length() > max_length)
 548                             value = hash_long_term(value, max_length);
 549                         break;
 550                     }
 551                     case Action::LOWER:
 552                         value = Xapian::Unicode::tolower(value);
 553                         break;
 554                     case Action::LOAD: {
 555                         bool truncated = false;
 556                         // FIXME: Use NOATIME if we own the file or are root.
 557                         if (!load_file(value, i->get_num_arg(), NOCACHE,
 558                                        value, truncated)) {
 559                             cerr << "Couldn't load file '" << value << "': "
 560                                  << strerror(errno) << endl;
 561                             value.resize(0);
 562                         }
 563                         if (!truncated) break;
 564                     }
 565                     /* FALLTHRU */
 566                     case Action::TRUNCATE:
 567                         utf8_truncate(value, i->get_num_arg());
 568                         break;
 569                     case Action::SPELL:
 570                         indexer.set_flags(indexer.FLAG_SPELLING);
 571                         break;
 572                     case Action::UNHTML: {
 573                         MyHtmlParser p;
 574                         try {
 575                             // Default HTML character set is latin 1, though
 576                             // not specifying one is deprecated these days.
 577                             p.parse_html(value, "iso-8859-1", false);
 578                         } catch (const string & newcharset) {
 579                             p.reset();
 580                             p.parse_html(value, newcharset, true);
 581                         }
 582                         if (p.indexing_allowed)
 583                             value = p.dump;
 584                         else
 585                             value = "";
 586                         break;
 587                     }
 588                     case Action::UNIQUE: {
 589                         // If there's no text, just issue a warning.
 590                         if (value.empty()) {
 591                             cout << fname << ':' << line_no
 592                                  << ": Ignoring UNIQUE action on empty text"
 593                                  << endl;
 594                             break;
 595                         }
 596
 597                         // Ensure that the value of this field is unique.
 598                         // If a record already exists with the same value,
 599                         // it will be replaced with the new record.
 600
 601                         // Unique fields aren't considered content - if
 602                         // there are no other fields in the document, the
 603                         // document is to be deleted.
 604                         this_field_is_content = false;
 605
 606                         // Argument is the prefix to add to the field value
 607                         // to get the unique term.
 608                         string t = i->get_string_arg();
 609                         if (prefix_needs_colon(t, value[0])) t += ':';
 610                         t += value;
 611 again:
 612                         try {
 613                             Xapian::PostingIterator p = database.postlist_begin(t);
 614                             if (p != database.postlist_end(t)) {
 615                                 docid = *p;
 616                             }
 617                         } catch (const Xapian::Error &e) {
 618                             // Hmm, what happened?
 619                             cout << "Caught exception in UNIQUE!" << endl;
 620                             cout << "E: " << e.get_description() << endl;
 621                             database.commit();
 622                             goto again;
 623                         }
 624                         break;
 625                     }
 626                     case Action::VALUE:
 627                         if (!value.empty())
 628                             doc.add_value(i->get_num_arg(), value);
 629                         break;
 630                     case Action::VALUENUMERIC: {
 631                         if (value.empty()) break;
 632                         char * end;
 633                         double dbl = strtod(value.c_str(), &end);
 634                         if (*end) {
 635                             cout << fname << ':' << line_no << ": Warning: "
 636                                     "Trailing characters in VALUENUMERIC: '"
 637                                  << value << "'" << endl;
 638                         }
 639                         doc.add_value(i->get_num_arg(),
 640                                       Xapian::sortable_serialise(dbl));
 641                         break;
 642                     }
 643                     case Action::VALUEPACKED: {
 644                         uint32_t word = 0;
 645                         if (value.empty() || !C_isdigit(value[0])) {
 646                             // strtoul() accepts leading whitespace and negated
 647                             // values, neither of which we want to allow.
 648                             errno = EINVAL;
 649                         } else {
 650                             errno = 0;
 651                             char* q;
 652                             word = strtoul(value.c_str(), &q, 10);
 653                             if (!errno && *q != '\0') {
 654                                 // Trailing characters after converted value.
 655                                 errno = EINVAL;
 656                             }
 657                         }
 658                         if (errno) {
 659                             cout << fname << ':' << line_no << ": Warning: "
 660                                     "valuepacked \"" << value << "\" ";
 661                             if (errno == ERANGE) {
 662                                 cout << "out of range";
 663                             } else {
 664                                 cout << "not an unsigned integer";
 665                             }
 666                             cout << endl;
 667                         }
 668                         int valueslot = i->get_num_arg();
 669                         doc.add_value(valueslot, int_to_binary_string(word));
 670                         break;
 671                     }
 672                     case Action::DATE: {
 673                         const string & type = i->get_string_arg();
 674                         string yyyymmdd;
 675                         if (type == "unix") {
 676                             time_t t = atoi(value.c_str());
 677                             struct tm *tm = localtime(&t);
 678                             int y = tm->tm_year + 1900;
 679                             int m = tm->tm_mon + 1;
 680                             yyyymmdd = date_to_string(y, m, tm->tm_mday);
 681                         } else if (type == "yyyymmdd") {
 682                             if (value.length() == 8) yyyymmdd = value;
 683                         }
 684                         if (yyyymmdd.empty()) break;
 685                         // Date (YYYYMMDD)
 686                         doc.add_boolean_term("D" + yyyymmdd);
 687                         yyyymmdd.resize(6);
 688                         // Month (YYYYMM)
 689                         doc.add_boolean_term("M" + yyyymmdd);
 690                         yyyymmdd.resize(4);
 691                         // Year (YYYY)
 692                         doc.add_boolean_term("Y" + yyyymmdd);
 693                         break;
 694                     }
 695                     case Action::PARSEDATE: {
 696                         string dateformat = i->get_string_arg();
 697                         struct tm tm;
 698                         memset(&tm, 0, sizeof(tm));
 699                         auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
 700                         if (ret == NULL) {
 701                             cout << fname << ':' << line_no << ": Warning: "
 702                                     "\"" << value << "\" doesn't match format "
 703                                     "\"" << dateformat << '\"' << endl;
 704                             break;
 705                         }
 706
 707                         if (*ret != '\0') {
 708                             cout << fname << ':' << line_no << ": Warning: "
 709                                     "\"" << value << "\" not fully matched by "
 710                                     "format \"" << dateformat << "\" "
 711                                     "(\"" << ret << "\" left over) but "
 712                                     "indexing anyway" << endl;
 713                         }
 714
 715                         value = str(timegm(&tm));
 716                         break;
 717                     }
 718                     default:
 719                         /* Empty default case to avoid "unhandled enum value"
 720                          * warnings. */
 721                         break;
 722                 }
 723             }
 724             if (this_field_is_content) seen_content = true;
 725             if (stream.eof()) break;
 726         }
 727
 728         // If we haven't seen any fields (other than unique identifiers)
 729         // the document is to be deleted.
 730         if (!seen_content) {
 731             if (docid) {
 732                 database.delete_document(docid);
 733                 if (verbose) cout << "Del: " << docid << endl;
 734                 delcount ++;
 735             }
 736         } else {
 737             string data;
 738             for (auto&& i : fields) {
 739                 for (auto&& field_val : i.second) {
 740                     data += i.first;
 741                     data += '=';
 742                     data += field_val;
 743                     data += '\n';
 744                 }
 745             }
 746
 747             // Put the data in the document
 748             doc.set_data(data);
 749
 750             // Add the document to the database
 751             if (docid) {
 752                 try {
 753                     database.replace_document(docid, doc);
 754                     if (verbose) cout << "Replace: " << docid << endl;
 755                     repcount ++;
 756                 } catch (const Xapian::Error &e) {
 757                     cout << "E: " << e.get_description() << endl;
 758                     // Possibly the document was deleted by another
 759                     // process in the meantime...?
 760                     docid = database.add_document(doc);
 761                     cout << "Replace failed, adding as new: " << docid << endl;
 762                 }
 763             } else {
 764                 docid = database.add_document(doc);
 765                 if (verbose) cout << "Add: " << docid << endl;
 766                 addcount ++;
 767             }
 768         }
 769     }
 770
 771     // Commit after each file to make sure all changes from that file make it
 772     // in.
 773     if (verbose) cout << "Committing: " << endl;
 774     database.commit();
 775
 776     return true;
 777 }
 778
 779 int
 780 main(int argc, char **argv)
 781 try {
 782     // If the database already exists, default to updating not overwriting.
 783     int database_mode = Xapian::DB_CREATE_OR_OPEN;
 784     verbose = false;
 785     Xapian::Stem stemmer("english");
 786
 787     static const struct option longopts[] = {
 788         { "help",       no_argument,    NULL, 'h' },
 789         { "version",    no_argument,    NULL, 'V' },
 790         { "stemmer",    required_argument,      NULL, 's' },
 791         { "overwrite",  no_argument,    NULL, 'o' },
 792         { "verbose",    no_argument,    NULL, 'v' },
 793         { 0, 0, NULL, 0 }
 794     };
 795
 796     bool more = true, show_help = false;
 797     while (more) {
 798         switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
 799             case EOF:
 800                 more = false;
 801                 break;
 802             default:
 803             case 'h': // --help
 804                 show_help = true;
 805                 more = false;
 806                 break;
 807             case 'V': // --version
 808                 print_package_info(PROG_NAME);
 809                 return 0;
 810             case 'o': // --overwrite
 811                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
 812                 break;
 813             case 'v':
 814                 verbose = true;
 815                 break;
 816             case 's':
 817                 try {
 818                     stemmer = Xapian::Stem(optarg);
 819                 } catch (const Xapian::InvalidArgumentError &) {
 820                     cerr << "Unknown stemming language '" << optarg << "'.\n";
 821                     cerr << "Available language names are: "
 822                          << Xapian::Stem::get_available_languages() << endl;
 823                     return 1;
 824                 }
 825                 break;
 826         }
 827     }
 828
 829     argv += optind;
 830     argc -= optind;
 831     if (show_help || argc < 2) {
 832         cout << PROG_NAME " - " PROG_DESC "\n"
 833 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
 834 "\n"
 835 "Creates or updates a Xapian database with the data from the input files listed\n"
 836 "on the command line.  If no files are specified, data is read from stdin.\n"
 837 "\n"
 838 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
 839 "format for INDEXER_SCRIPT.\n"
 840 "\n"
 841 "Options:\n"
 842 "  -v, --verbose       display additional messages to aid debugging\n"
 843 "      --overwrite     create the database anew (the default is to update if\n"
 844 "                      the database already exists)\n";
 845         print_stemmer_help("");
 846         print_help_and_version_help("");
 847         exit(show_help ? 0 : 1);
 848     }
 849
 850     parse_index_script(argv[1]);
 851
 852     // Open the database.  If another process is currently updating the
 853     // database, wait for the lock to become available.
 854     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
 855     Xapian::WritableDatabase database(argv[0], flags);
 856
 857     Xapian::TermGenerator indexer;
 858     indexer.set_stemmer(stemmer);
 859     // Set the database for spellings to be added to by the "spell" action.
 860     indexer.set_database(database);
 861
 862     addcount = 0;
 863     repcount = 0;
 864     delcount = 0;
 865
 866     if (argc == 2) {
 867         // Read from stdin.
 868         index_file("<stdin>", cin, database, indexer);
 869     } else {
 870         // Read file(s) listed on the command line.
 871         for (int i = 2; i < argc; ++i) {
 872             ifstream stream(argv[i]);
 873             if (stream) {
 874                 index_file(argv[i], stream, database, indexer);
 875             } else {
 876                 cout << "Can't open file " << argv[i] << endl;
 877             }
 878         }
 879     }
 880
 881     cout << "records (added, replaced, deleted) = (" << addcount << ", "
 882          << repcount << ", " << delcount << ")" << endl;
 883 } catch (const Xapian::Error &error) {
 884     cout << "Exception: " << error.get_description() << endl;
 885     exit(1);
 886 } catch (const std::bad_alloc &) {
 887     cout << "Exception: std::bad_alloc" << endl;
 888     exit(1);
 889 } catch (...) {
 890     cout << "Unknown Exception" << endl;
 891     exit(1);
 892 }