xapian-applications/omega/scriptindex.cc

   1 /* scriptindex.cc
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 Sam Liddicott
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include <xapian.h>
  27
  28 #include <algorithm>
  29 #include <fstream>
  30 #include <iostream>
  31 #include <list>
  32 #include <map>
  33 #include <memory>
  34 #include <string>
  35 #include <unordered_set>
  36 #include <vector>
  37 #include <cstring>
  38
  39 #include <cerrno>
  40 #include <cstdio>
  41 #include <cstdlib>
  42 #include <ctime>
  43
  44 #include "commonhelp.h"
  45 #include "hashterm.h"
  46 #include "loadfile.h"
  47 #include "myhtmlparse.h"
  48 #include "str.h"
  49 #include "stringutils.h"
  50 #include "timegm.h"
  51 #include "utf8truncate.h"
  52 #include "utils.h"
  53 #include "values.h"
  54
  55 #include "gnu_getopt.h"
  56
  57 using namespace std;
  58
  59 #define PROG_NAME "scriptindex"
  60 #define PROG_DESC "index arbitrary data as described by an index script"
  61
// Program-wide state.
//
// NOTE(review): none of these are referenced in the part of the file reviewed
// here.  Judging by the names, `verbose` is presumably the verbose-output
// flag and the counters presumably tally documents added, replaced and
// deleted during the run - confirm against the code which uses them.
static bool verbose;
static int addcount;
static int repcount;
static int delcount;
  66
  67 inline bool
  68 prefix_needs_colon(const string & prefix, unsigned ch)
  69 {
  70     if (!C_isupper(ch) && ch != ':') return false;
  71     string::size_type len = prefix.length();
  72     return (len > 1 && prefix[len - 1] != ':');
  73 }
  74
// Names of the index actions as they are written in an index script.
//
// This table is indexed by Action::type values (see DUMP_ACTION below and
// the "has no effect" warnings in parse_index_script()), so the entries must
// be kept in exactly the same order as the enumerators of Action::type.
const char * action_names[] = {
    "bad", "new",
    "boolean", "date", "field", "hash", "hextobin", "index", "indexnopos",
    "load", "lower", "parsedate", "spell", "split", "truncate", "unhtml",
    "unique", "value", "valuenumeric", "valuepacked", "weight"
};

// For debugging: writes "name(string_arg,num_arg)" for Action A to stdout.
#define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
  84
  85 class Action {
  86 public:
  87     typedef enum {
  88         BAD, NEW,
  89         BOOLEAN, DATE, FIELD, HASH, HEXTOBIN, INDEX, INDEXNOPOS, LOAD, LOWER,
  90         PARSEDATE, SPELL, SPLIT, TRUNCATE, UNHTML, UNIQUE, VALUE,
  91         VALUENUMERIC, VALUEPACKED, WEIGHT
  92     } type;
  93     enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT };
  94 private:
  95     type action;
  96     int num_arg;
  97     string string_arg;
  98     // Offset into indexscript line.
  99     size_t pos;
 100 public:
 101     Action(type action_, size_t pos_)
 102         : action(action_), num_arg(0), pos(pos_) { }
 103     Action(type action_, size_t pos_, const string & arg)
 104         : action(action_), string_arg(arg), pos(pos_) {
 105         num_arg = atoi(string_arg.c_str());
 106     }
 107     Action(type action_, size_t pos_, const string & arg, int num)
 108         : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }
 109     type get_action() const { return action; }
 110     int get_num_arg() const { return num_arg; }
 111     void set_num_arg(int num) { num_arg = num; }
 112     const string & get_string_arg() const { return string_arg; }
 113     size_t get_pos() const { return pos; }
 114 };
 115
 116 enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };
 117
 118 static void
 119 report_location(enum diag_type type,
 120                 const string& filename,
 121                 size_t line = 0,
 122                 size_t pos = string::npos)
 123 {
 124     cerr << filename;
 125     if (line != 0) {
 126         cerr << ':' << line;
 127     }
 128     if (pos != string::npos) {
 129         // The first column is numbered 1.
 130         cerr << ':' << pos + 1;
 131     }
 132     switch (type) {
 133         case DIAG_ERROR:
 134             cerr << ": error: ";
 135             break;
 136         case DIAG_WARN:
 137             cerr << ": warning: ";
 138             break;
 139         case DIAG_NOTE:
 140             cerr << ": note: ";
 141             break;
 142     }
 143 }
 144
 145 static void
 146 report_useless_action(const string &file, size_t line, size_t pos,
 147                       const string &action)
 148 {
 149     report_location(DIAG_WARN, file, line, pos);
 150     cerr << "Index action '" << action << "' has no effect" << endl;
 151
 152     static bool given_left_to_right_warning = false;
 153     if (!given_left_to_right_warning) {
 154         given_left_to_right_warning = true;
 155         report_location(DIAG_NOTE, file, line, pos);
 156         cerr << "Actions are executed from left to right" << endl;
 157     }
 158 }
 159
// The parsed index script: maps each field name to the sequence of actions to
// run for that field.  Filled in by parse_index_script().  When several
// script lines name the same field, their action lists are concatenated with
// an Action::NEW entry between them.
static map<string, vector<Action>> index_spec;
 161
 162 static void
 163 parse_index_script(const string &filename)
 164 {
 165     ifstream script(filename.c_str());
 166     if (!script.is_open()) {
 167         report_location(DIAG_ERROR, filename);
 168         cerr << strerror(errno) << endl;
 169         exit(1);
 170     }
 171     string line;
 172     size_t line_no = 0;
 173     bool had_unique = false;
 174     while (getline(script, line)) {
 175         ++line_no;
 176         vector<string> fields;
 177         vector<Action> actions;
 178         string::const_iterator i, j;
 179         const string &s = line;
 180         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 181         if (i == s.end() || *i == '#') continue;
 182         while (true) {
 183             if (!C_isalnum(*i)) {
 184                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 185                 cerr << "field name must start with alphanumeric" << endl;
 186                 exit(1);
 187             }
 188             j = find_if(i, s.end(),
 189                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 190             fields.push_back(string(i, j));
 191             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 192             if (i == s.end()) break;
 193             if (*i == ':') {
 194                 ++i;
 195                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 196                 break;
 197             }
 198             if (i == j) {
 199                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 200                 cerr << "bad character '" << *i << "' in fieldname" << endl;
 201                 exit(1);
 202             }
 203         }
 204         Xapian::termcount weight = 1;
 205         size_t useless_weight_pos = string::npos;
 206         map<string, Action::type> boolmap;
 207         j = i;
 208         while (j != s.end()) {
 209             size_t action_pos = j - s.begin();
 210             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 211             string action(s, j - s.begin(), i - j);
 212             Action::type code = Action::BAD;
 213             unsigned min_args = 0, max_args = 0;
 214             bool takes_integer_argument = false;
 215             if (!action.empty()) {
 216                 switch (action[0]) {
 217                     case 'b':
 218                         if (action == "boolean") {
 219                             code = Action::BOOLEAN;
 220                             max_args = 1;
 221                         }
 222                         break;
 223                     case 'd':
 224                         if (action == "date") {
 225                             code = Action::DATE;
 226                             min_args = max_args = 1;
 227                         }
 228                         break;
 229                     case 'f':
 230                         if (action == "field") {
 231                             code = Action::FIELD;
 232                             max_args = 1;
 233                         }
 234                         break;
 235                     case 'h':
 236                         if (action == "hash") {
 237                             code = Action::HASH;
 238                             max_args = 1;
 239                             takes_integer_argument = true;
 240                         } else if (action == "hextobin") {
 241                             code = Action::HEXTOBIN;
 242                         }
 243                         break;
 244                     case 'i':
 245                         if (action == "index") {
 246                             code = Action::INDEX;
 247                             max_args = 1;
 248                         } else if (action == "indexnopos") {
 249                             code = Action::INDEXNOPOS;
 250                             max_args = 1;
 251                         }
 252                         break;
 253                     case 'l':
 254                         if (action == "lower") {
 255                             code = Action::LOWER;
 256                         } else if (action == "load") {
 257                             code = Action::LOAD;
 258                         }
 259                         break;
 260                     case 'p':
 261                         if (action == "parsedate") {
 262                             code = Action::PARSEDATE;
 263                             min_args = max_args = 1;
 264                         }
 265                         break;
 266                     case 's':
 267                         if (action == "spell") {
 268                             code = Action::SPELL;
 269                         } else if (action == "split") {
 270                             code = Action::SPLIT;
 271                             min_args = 1;
 272                             max_args = 2;
 273                         }
 274                         break;
 275                     case 't':
 276                         if (action == "truncate") {
 277                             code = Action::TRUNCATE;
 278                             min_args = max_args = 1;
 279                             takes_integer_argument = true;
 280                         }
 281                         break;
 282                     case 'u':
 283                         if (action == "unhtml") {
 284                             code = Action::UNHTML;
 285                         } else if (action == "unique") {
 286                             code = Action::UNIQUE;
 287                             min_args = max_args = 1;
 288                         }
 289                         break;
 290                     case 'v':
 291                         if (action == "value") {
 292                             code = Action::VALUE;
 293                             min_args = max_args = 1;
 294                             takes_integer_argument = true;
 295                         } else if (action == "valuenumeric") {
 296                             code = Action::VALUENUMERIC;
 297                             min_args = max_args = 1;
 298                             takes_integer_argument = true;
 299                         } else if (action == "valuepacked") {
 300                             code = Action::VALUEPACKED;
 301                             min_args = max_args = 1;
 302                             takes_integer_argument = true;
 303                         }
 304                         break;
 305                     case 'w':
 306                         if (action == "weight") {
 307                             code = Action::WEIGHT;
 308                             min_args = max_args = 1;
 309                             takes_integer_argument = true;
 310                         }
 311                         break;
 312                 }
 313             }
 314             if (code == Action::BAD) {
 315                 report_location(DIAG_ERROR, filename, line_no, action_pos);
 316                 cerr << "Unknown index action '" << action << "'" << endl;
 317                 exit(1);
 318             }
 319             auto i_after_action = i;
 320             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 321
 322             if (i != s.end() && *i == '=') {
 323                 if (i != i_after_action) {
 324                     report_location(DIAG_WARN, filename, line_no,
 325                                     i_after_action - s.begin());
 326                     cerr << "putting spaces between the action and '=' is "
 327                             "deprecated." << endl;
 328                 }
 329
 330                 if (max_args == 0) {
 331                     report_location(DIAG_ERROR, filename, line_no,
 332                                     i - s.begin());
 333                     cerr << "Index action '" << action
 334                          << "' doesn't take an argument" << endl;
 335                     exit(1);
 336                 }
 337
 338                 ++i;
 339                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 340                 if (i != j) {
 341                     report_location(DIAG_WARN, filename, line_no,
 342                                     i - s.begin());
 343                     cerr << "putting spaces between '=' and the argument is "
 344                             "deprecated." << endl;
 345                 }
 346
 347                 vector<string> vals;
 348                 while (true) {
 349                     if (j != s.end() && *j == '"') {
 350                         // Quoted argument.
 351                         ++j;
 352                         string arg;
 353                         while (true) {
 354                             i = find_if(j, s.end(),
 355                                         [](char ch) {
 356                                             return ch == '"' || ch == '\\';
 357                                         });
 358                             if (i == s.end()) {
 359                                 report_location(DIAG_ERROR, filename, line_no,
 360                                                 s.size());
 361                                 cerr << "No closing quote" << endl;
 362                                 exit(1);
 363                             }
 364                             arg.append(j, i);
 365                             if (*i++ == '"')
 366                                 break;
 367
 368                             // Escape sequence.
 369                             if (i == s.end()) {
 370 bad_escaping:
 371                                 report_location(DIAG_ERROR, filename, line_no,
 372                                                 i - s.begin());
 373                                 cerr << "Bad escaping in quoted action argument"
 374                                      << endl;
 375                                 exit(1);
 376                             }
 377
 378                             char ch = *i;
 379                             switch (ch) {
 380                                 case '\\':
 381                                 case '"':
 382                                     break;
 383                                 case '0':
 384                                     ch = '\0';
 385                                     break;
 386                                 case 'n':
 387                                     ch = '\n';
 388                                     break;
 389                                 case 'r':
 390                                     ch = '\r';
 391                                     break;
 392                                 case 't':
 393                                     ch = '\t';
 394                                     break;
 395                                 case 'x': {
 396                                     if (++i == s.end())
 397                                         goto bad_escaping;
 398                                     char ch1 = *i;
 399                                     if (++i == s.end())
 400                                         goto bad_escaping;
 401                                     char ch2 = *i;
 402                                     if (!C_isxdigit(ch1) ||
 403                                         !C_isxdigit(ch2))
 404                                         goto bad_escaping;
 405                                     ch = hex_digit(ch1) << 4 |
 406                                          hex_digit(ch2);
 407                                     break;
 408                                 }
 409                                 default:
 410                                     goto bad_escaping;
 411                             }
 412                             arg += ch;
 413                             j = i + 1;
 414                         }
 415                         vals.emplace_back(std::move(arg));
 416                         if (i == s.end() || C_isspace(*i)) break;
 417                         if (*i != ',') {
 418                             report_location(DIAG_ERROR, filename, line_no,
 419                                             i - s.begin());
 420                             cerr << "Unexpected character '" << *i
 421                                  << "' after closing quote" << endl;
 422                             exit(1);
 423                         }
 424                         ++i;
 425                     } else if (max_args > 1) {
 426                         // Unquoted argument, split on comma.
 427                         i = find_if(j, s.end(),
 428                                     [](char ch) {
 429                                         return C_isspace(ch) || ch == ',';
 430                                     });
 431                         vals.emplace_back(j, i);
 432                         if (*i != ',') break;
 433                         ++i;
 434                     } else {
 435                         // Unquoted argument, including any commas.
 436                         i = find_if(j, s.end(),
 437                                     [](char ch) { return C_isspace(ch); });
 438                         vals.emplace_back(j, i);
 439                         break;
 440                     }
 441                     j = i;
 442
 443                     if (vals.size() == max_args) {
 444                         report_location(DIAG_ERROR, filename, line_no,
 445                                         i - s.begin());
 446                         cerr << "Index action '" << action
 447                              << "' takes at most " << max_args << " arguments"
 448                              << endl;
 449                         exit(1);
 450                     }
 451                 }
 452
 453                 if (vals.size() < min_args) {
 454                     report_location(DIAG_ERROR, filename, line_no,
 455                                     i - s.begin());
 456                     if (min_args == max_args) {
 457                         cerr << "Index action '" << action
 458                              << "' requires " << min_args << " arguments"
 459                              << endl;
 460                         exit(1);
 461                     }
 462                     cerr << "Index action '" << action
 463                          << "' requires at least " << min_args << " arguments"
 464                          << endl;
 465                     exit(1);
 466                 }
 467
 468                 string val;
 469                 if (!vals.empty()) {
 470                     val = vals.front();
 471                 }
 472
 473                 if (takes_integer_argument) {
 474                     auto dot = val.find('.');
 475                     if (dot != string::npos) {
 476                         report_location(DIAG_WARN, filename, line_no,
 477                                         j - s.begin() + dot);
 478                         cerr << "Index action '" << action
 479                              << "' takes an integer argument" << endl;
 480                     }
 481                 }
 482                 switch (code) {
 483                     case Action::INDEX:
 484                     case Action::INDEXNOPOS:
 485                         actions.emplace_back(code, action_pos, val, weight);
 486                         useless_weight_pos = string::npos;
 487                         break;
 488                     case Action::WEIGHT:
 489                         // We don't push an Action for WEIGHT - instead we
 490                         // store it ready to use in the INDEX and INDEXNOPOS
 491                         // Actions.
 492                         weight = atoi(val.c_str());
 493                         if (useless_weight_pos != string::npos) {
 494                             report_useless_action(filename, line_no,
 495                                                   useless_weight_pos, action);
 496                         }
 497                         useless_weight_pos = action_pos;
 498                         break;
 499                     case Action::SPLIT: {
 500                         if (val.empty()) {
 501                             report_location(DIAG_ERROR, filename, line_no);
 502                             cerr << "Split delimiter can't be empty" << endl;
 503                             exit(1);
 504                         }
 505                         int operation = Action::SPLIT_NONE;
 506                         if (vals.size() >= 2) {
 507                             if (vals[1] == "dedup") {
 508                                 operation = Action::SPLIT_DEDUP;
 509                             } else if (vals[1] == "sort") {
 510                                 operation = Action::SPLIT_SORT;
 511                             } else if (vals[1] == "none") {
 512                                 operation = Action::SPLIT_NONE;
 513                             } else {
 514                                 report_location(DIAG_ERROR, filename, line_no);
 515                                 cerr << "Bad split operation '" << vals[1]
 516                                      << "'" << endl;
 517                                 exit(1);
 518                             }
 519                         }
 520                         actions.emplace_back(code, action_pos, val, operation);
 521                         break;
 522                     }
 523                     case Action::TRUNCATE:
 524                         if (!actions.empty() &&
 525                             actions.back().get_action() == Action::LOAD) {
 526                             /* Turn "load truncate=n" into "load" with
 527                              * num_arg n, so that we don't needlessly
 528                              * allocate memory and read data we're just
 529                              * going to ignore.
 530                              */
 531                             actions.pop_back();
 532                             code = Action::LOAD;
 533                         }
 534                         actions.emplace_back(code, action_pos, val);
 535                         break;
 536                     case Action::UNIQUE:
 537                         if (had_unique) {
 538                             report_location(DIAG_ERROR, filename, line_no,
 539                                             action_pos);
 540                             cerr << "Index action 'unique' used more than once"
 541                                  << endl;
 542                             exit(1);
 543                         }
 544                         had_unique = true;
 545                         if (boolmap.find(val) == boolmap.end())
 546                             boolmap[val] = Action::UNIQUE;
 547                         actions.emplace_back(code, action_pos, val);
 548                         break;
 549                     case Action::HASH: {
 550                         actions.emplace_back(code, action_pos, val);
 551                         auto& obj = actions.back();
 552                         auto max_length = obj.get_num_arg();
 553                         if (max_length < 6) {
 554                             report_location(DIAG_ERROR, filename, line_no,
 555                                             obj.get_pos() + 4 + 1);
 556                             cerr << "Index action 'hash' takes an integer "
 557                                     "argument which must be at least 6" << endl;
 558                             exit(1);
 559                         }
 560                         break;
 561                     }
 562                     case Action::BOOLEAN:
 563                         boolmap[val] = Action::BOOLEAN;
 564                         /* FALLTHRU */
 565                     default:
 566                         actions.emplace_back(code, action_pos, val);
 567                 }
 568                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 569             } else {
 570                 if (min_args > 0) {
 571                     report_location(DIAG_ERROR, filename, line_no,
 572                                     i_after_action - s.begin());
 573                     if (min_args == max_args) {
 574                         cerr << "Index action '" << action << "' requires "
 575                              << min_args << " arguments" << endl;
 576                         exit(1);
 577                     }
 578                     cerr << "Index action '" << action << "' requires at least "
 579                          << min_args << " arguments" << endl;
 580                     exit(1);
 581                 }
 582                 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
 583                     useless_weight_pos = string::npos;
 584                     actions.emplace_back(code, action_pos, "", weight);
 585                 } else if (code == Action::HASH) {
 586                     actions.emplace_back(code, action_pos, "",
 587                                          MAX_SAFE_TERM_LENGTH - 1);
 588                 } else {
 589                     actions.emplace_back(code, action_pos);
 590                 }
 591             }
 592             j = i;
 593         }
 594
 595         if (useless_weight_pos != string::npos) {
 596             report_useless_action(filename, line_no, useless_weight_pos,
 597                                   "weight");
 598         }
 599
 600         while (!actions.empty()) {
 601             bool done = true;
 602             Action::type action = actions.back().get_action();
 603             switch (action) {
 604                 case Action::HASH:
 605                 case Action::HEXTOBIN:
 606                 case Action::LOWER:
 607                 case Action::PARSEDATE:
 608                 case Action::SPELL:
 609                 case Action::TRUNCATE:
 610                 case Action::UNHTML:
 611                     done = false;
 612                     report_useless_action(filename, line_no,
 613                                           actions.back().get_pos(),
 614                                           action_names[action]);
 615                     actions.pop_back();
 616                     break;
 617                 default:
 618                     break;
 619             }
 620             if (done) break;
 621         }
 622
 623         map<string, Action::type>::const_iterator boolpfx;
 624         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 625             if (boolpfx->second == Action::UNIQUE) {
 626                 report_location(DIAG_WARN, filename, line_no);
 627                 cerr << "Index action 'unique=" << boolpfx->first
 628                      << "' without 'boolean=" << boolpfx->first << "'" << endl;
 629                 static bool given_doesnt_imply_boolean_warning = false;
 630                 if (!given_doesnt_imply_boolean_warning) {
 631                     given_doesnt_imply_boolean_warning = true;
 632                     report_location(DIAG_NOTE, filename, line_no);
 633                     cerr << "'unique' doesn't implicitly add a boolean term"
 634                          << endl;
 635                 }
 636             }
 637         }
 638
 639         vector<string>::const_iterator field;
 640         for (field = fields.begin(); field != fields.end(); ++field) {
 641             vector<Action> &v = index_spec[*field];
 642             if (v.empty()) {
 643                 if (fields.size() == 1) {
 644                     // Optimise common case where there's only one fieldname
 645                     // for a list of actions.
 646                     v = std::move(actions);
 647                 } else {
 648                     v = actions;
 649                 }
 650             } else {
 651                 v.emplace_back(Action::NEW, string::npos);
 652                 v.insert(v.end(), actions.begin(), actions.end());
 653             }
 654         }
 655     }
 656
 657     if (index_spec.empty()) {
 658         report_location(DIAG_ERROR, filename, line_no);
 659         cerr << "No rules found in index script" << endl;
 660         exit(1);
 661     }
 662 }
 663
 664 static bool
 665 run_actions(vector<Action>::const_iterator action_it,
 666             vector<Action>::const_iterator action_end,
 667             Xapian::WritableDatabase& database,
 668             Xapian::TermGenerator& indexer,
 669             const string& old_value,
 670             bool& this_field_is_content, Xapian::Document& doc,
 671             map<string, list<string>>& fields,
 672             string& field, const char* fname,
 673             size_t line_no, Xapian::docid& docid)
 674 {
 675     string value = old_value;
 676     while (action_it != action_end) {
 677         auto& action = *action_it++;
 678         switch (action.get_action()) {
 679             case Action::BAD:
 680                 abort();
 681             case Action::NEW:
 682                 value = old_value;
 683                 // We're processing the same field again - give it a reprieve.
 684                 this_field_is_content = true;
 685                 break;
 686             case Action::FIELD:
 687                 if (!value.empty()) {
 688                     string f = action.get_string_arg();
 689                     if (f.empty()) f = field;
 690                     // replace newlines with spaces
 691                     string s = value;
 692                     string::size_type j = 0;
 693                     while ((j = s.find('\n', j)) != string::npos)
 694                         s[j] = ' ';
 695                     fields[f].push_back(s);
 696                 }
 697                 break;
 698             case Action::INDEX:
 699                 indexer.index_text(value,
 700                                    action.get_num_arg(),
 701                                    action.get_string_arg());
 702                 break;
 703             case Action::INDEXNOPOS:
 704                 // No positional information so phrase searching won't work.
 705                 // However, the database will use much less diskspace.
 706                 indexer.index_text_without_positions(value,
 707                                                      action.get_num_arg(),
 708                                                      action.get_string_arg());
 709                 break;
 710             case Action::BOOLEAN: {
 711                 // Do nothing if there's no text.
 712                 if (value.empty()) break;
 713
 714                 string term = action.get_string_arg();
 715                 if (prefix_needs_colon(term, value[0])) term += ':';
 716                 term += value;
 717
 718                 doc.add_boolean_term(term);
 719                 break;
 720             }
 721             case Action::HASH: {
 722                 unsigned int max_length = action.get_num_arg();
 723                 if (value.length() > max_length)
 724                     value = hash_long_term(value, max_length);
 725                 break;
 726             }
 727             case Action::HEXTOBIN: {
 728                 size_t len = value.length();
 729                 if (len & 1) {
 730                     report_location(DIAG_ERROR, fname, line_no);
 731                     cerr << "hextobin: input must have even length"
 732                          << endl;
 733                 } else {
 734                     string output;
 735                     output.reserve(len / 2);
 736                     for (size_t j = 0; j < len; j += 2) {
 737                         char a = value[j];
 738                         char b = value[j + 1];
 739                         if (!C_isxdigit(a) || !C_isxdigit(b)) {
 740                             report_location(DIAG_ERROR, fname, line_no);
 741                             cerr << "hextobin: input must be all hex "
 742                                     "digits" << endl;
 743                             goto badhex;
 744                         }
 745                         char r = (hex_digit(a) << 4) | hex_digit(b);
 746                         output.push_back(r);
 747                     }
 748                     value = std::move(output);
 749                 }
 750 badhex:
 751                 break;
 752             }
 753             case Action::LOWER:
 754                 value = Xapian::Unicode::tolower(value);
 755                 break;
 756             case Action::LOAD: {
 757                 bool truncated = false;
 758                 // FIXME: Use NOATIME if we own the file or are root.
 759                 if (!load_file(value, action.get_num_arg(), NOCACHE,
 760                                value, truncated)) {
 761                     report_location(DIAG_ERROR, fname, line_no);
 762                     cerr << "Couldn't load file '" << value << "': "
 763                          << strerror(errno) << endl;
 764                     value.resize(0);
 765                 }
 766                 if (!truncated) break;
 767             }
 768             /* FALLTHRU */
 769             case Action::TRUNCATE:
 770                 utf8_truncate(value, action.get_num_arg());
 771                 break;
 772             case Action::SPELL:
 773                 indexer.set_flags(indexer.FLAG_SPELLING);
 774                 break;
 775             case Action::SPLIT: {
 776                 // Execute actions on the split up to the first NEW, if any.
 777                 vector<Action>::const_iterator split_end = action_it;
 778                 while (split_end != action_end &&
 779                        split_end->get_action() != Action::NEW) {
 780                     ++split_end;
 781                 }
 782
 783                 if (value.empty()) {
 784                     // Nothing to do.
 785                 } else if (action.get_num_arg() != Action::SPLIT_SORT) {
 786                     // Generate split as we consume it.
 787                     const string& delimiter = action.get_string_arg();
 788
 789                     unique_ptr<unordered_set<string>> seen;
 790                     if (action.get_num_arg() == Action::SPLIT_DEDUP) {
 791                         seen.reset(new unordered_set<string>);
 792                     }
 793
 794                     if (delimiter.size() == 1) {
 795                         // Special case for common single character delimiter.
 796                         char ch = delimiter[0];
 797                         string::size_type i = 0;
 798                         while (true) {
 799                             string::size_type j = value.find(ch, i);
 800                             if (i != j) {
 801                                 string val(value, i, j - i);
 802                                 if (!seen.get() || seen->insert(val).second) {
 803                                     run_actions(action_it, split_end,
 804                                                 database, indexer,
 805                                                 val,
 806                                                 this_field_is_content, doc,
 807                                                 fields,
 808                                                 field, fname, line_no,
 809                                                 docid);
 810                                 }
 811                             }
 812                             if (j == string::npos) break;
 813                             i = j + 1;
 814                         }
 815                     } else {
 816                         string::size_type i = 0;
 817                         while (true) {
 818                             string::size_type j = value.find(delimiter, i);
 819                             if (i != j) {
 820                                 string val(value, i, j - i);
 821                                 if (!seen.get() || seen->insert(val).second) {
 822                                     run_actions(action_it, split_end,
 823                                                 database, indexer,
 824                                                 val,
 825                                                 this_field_is_content, doc,
 826                                                 fields,
 827                                                 field, fname, line_no,
 828                                                 docid);
 829                                 }
 830                             }
 831                             if (j == string::npos) break;
 832                             i = j + delimiter.size();
 833                         }
 834                     }
 835                 } else {
 836                     vector<string> split_values;
 837                     const string& delimiter = action.get_string_arg();
 838                     if (delimiter.size() == 1) {
 839                         // Special case for common single character delimiter.
 840                         char ch = delimiter[0];
 841                         string::size_type i = 0;
 842                         while (true) {
 843                             string::size_type j = value.find(ch, i);
 844                             if (i != j) {
 845                                 split_values.emplace_back(value, i, j - i);
 846                             }
 847                             if (j == string::npos) break;
 848                             i = j + 1;
 849                         }
 850                     } else {
 851                         string::size_type i = 0;
 852                         while (true) {
 853                             string::size_type j = value.find(delimiter, i);
 854                             if (i != j) {
 855                                 split_values.emplace_back(value, i, j - i);
 856                             }
 857                             if (j == string::npos) break;
 858                             i = j + delimiter.size();
 859                         }
 860                     }
 861
 862                     sort(split_values.begin(), split_values.end());
 863
 864                     for (auto&& val : split_values) {
 865                         run_actions(action_it, split_end,
 866                                     database, indexer, val,
 867                                     this_field_is_content, doc, fields,
 868                                     field, fname, line_no,
 869                                     docid);
 870                     }
 871                 }
 872
 873                 action_it = split_end;
 874                 break;
 875             }
 876             case Action::UNHTML: {
 877                 MyHtmlParser p;
 878                 try {
 879                     // Default HTML character set is latin 1, though
 880                     // not specifying one is deprecated these days.
 881                     p.parse_html(value, "iso-8859-1", false);
 882                 } catch (const string & newcharset) {
 883                     p.reset();
 884                     p.parse_html(value, newcharset, true);
 885                 }
 886                 if (p.indexing_allowed)
 887                     value = p.dump;
 888                 else
 889                     value = "";
 890                 break;
 891             }
 892             case Action::UNIQUE: {
 893                 // If there's no text, just issue a warning.
 894                 if (value.empty()) {
 895                     report_location(DIAG_WARN, fname, line_no);
 896                     cerr << "Ignoring UNIQUE action on empty text"
 897                          << endl;
 898                     break;
 899                 }
 900
 901                 // Ensure that the value of this field is unique.
 902                 // If a record already exists with the same value,
 903                 // it will be replaced with the new record.
 904
 905                 // Unique fields aren't considered content - if
 906                 // there are no other fields in the document, the
 907                 // document is to be deleted.
 908                 this_field_is_content = false;
 909
 910                 // Argument is the prefix to add to the field value
 911                 // to get the unique term.
 912                 string t = action.get_string_arg();
 913                 if (prefix_needs_colon(t, value[0])) t += ':';
 914                 t += value;
 915                 Xapian::PostingIterator p = database.postlist_begin(t);
 916                 if (p != database.postlist_end(t)) {
 917                     docid = *p;
 918                 }
 919                 break;
 920             }
 921             case Action::VALUE:
 922                 if (!value.empty())
 923                     doc.add_value(action.get_num_arg(), value);
 924                 break;
 925             case Action::VALUENUMERIC: {
 926                 if (value.empty()) break;
 927                 char * end;
 928                 double dbl = strtod(value.c_str(), &end);
 929                 if (*end) {
 930                     report_location(DIAG_WARN, fname, line_no);
 931                     cerr << "Trailing characters in VALUENUMERIC: '"
 932                          << value << "'" << endl;
 933                 }
 934                 doc.add_value(action.get_num_arg(),
 935                               Xapian::sortable_serialise(dbl));
 936                 break;
 937             }
 938             case Action::VALUEPACKED: {
 939                 uint32_t word = 0;
 940                 if (value.empty() || !C_isdigit(value[0])) {
 941                     // strtoul() accepts leading whitespace and negated
 942                     // values, neither of which we want to allow.
 943                     errno = EINVAL;
 944                 } else {
 945                     errno = 0;
 946                     char* q;
 947                     word = strtoul(value.c_str(), &q, 10);
 948                     if (!errno && *q != '\0') {
 949                         // Trailing characters after converted value.
 950                         errno = EINVAL;
 951                     }
 952                 }
 953                 if (errno) {
 954                     report_location(DIAG_WARN, fname, line_no);
 955                     cerr << "valuepacked \"" << value << "\" ";
 956                     if (errno == ERANGE) {
 957                         cerr << "out of range";
 958                     } else {
 959                         cerr << "not an unsigned integer";
 960                     }
 961                     cerr << endl;
 962                 }
 963                 int valueslot = action.get_num_arg();
 964                 doc.add_value(valueslot, int_to_binary_string(word));
 965                 break;
 966             }
 967             case Action::DATE: {
 968                 const string & type = action.get_string_arg();
 969                 string yyyymmdd;
 970                 if (type == "unix") {
 971                     time_t t = atoi(value.c_str());
 972                     struct tm *tm = localtime(&t);
 973                     int y = tm->tm_year + 1900;
 974                     int m = tm->tm_mon + 1;
 975                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
 976                 } else if (type == "yyyymmdd") {
 977                     if (value.length() == 8) yyyymmdd = value;
 978                 }
 979                 if (yyyymmdd.empty()) break;
 980                 // Date (YYYYMMDD)
 981                 doc.add_boolean_term("D" + yyyymmdd);
 982                 yyyymmdd.resize(6);
 983                 // Month (YYYYMM)
 984                 doc.add_boolean_term("M" + yyyymmdd);
 985                 yyyymmdd.resize(4);
 986                 // Year (YYYY)
 987                 doc.add_boolean_term("Y" + yyyymmdd);
 988                 break;
 989             }
 990             case Action::PARSEDATE: {
 991                 string dateformat = action.get_string_arg();
 992                 struct tm tm;
 993                 memset(&tm, 0, sizeof(tm));
 994                 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
 995                 if (ret == NULL) {
 996                     report_location(DIAG_WARN, fname, line_no);
 997                     cerr << "\"" << value << "\" doesn't match format "
 998                             "\"" << dateformat << '\"' << endl;
 999                     break;
1000                 }
1001
1002                 if (*ret != '\0') {
1003                     report_location(DIAG_WARN, fname, line_no);
1004                     cerr << "\"" << value << "\" not fully matched by "
1005                             "format \"" << dateformat << "\" "
1006                             "(\"" << ret << "\" left over) but "
1007                             "indexing anyway" << endl;
1008                 }
1009
1010                 value = str(timegm(&tm));
1011                 break;
1012             }
1013             default:
1014                 /* Empty default case to avoid "unhandled enum value"
1015                  * warnings. */
1016                 break;
1017         }
1018     }
1019     return true;
1020 }
1021
1022 static void
1023 index_file(const char *fname, istream &stream,
1024            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1025 {
1026     string line;
1027     size_t line_no = 0;
1028     while (!stream.eof() && getline(stream, line)) {
1029         ++line_no;
1030         Xapian::Document doc;
1031         indexer.set_document(doc);
1032         Xapian::docid docid = 0;
1033         map<string, list<string>> fields;
1034         bool seen_content = false;
1035         while (!line.empty()) {
1036             // Cope with files from MS Windows (\r\n end of lines).
1037             // Trim multiple \r characters, since that seems the best way
1038             // to handle that case.
1039             string::size_type last = line.find_last_not_of('\r');
1040             if (last == string::npos) break;
1041             line.resize(last + 1);
1042
1043             string::size_type eq = line.find('=');
1044             if (eq == string::npos && !line.empty()) {
1045                 report_location(DIAG_ERROR, fname, line_no, line.size());
1046                 cerr << "expected = somewhere in this line" << endl;
1047                 // FIXME: die or what?
1048             }
1049             string field(line, 0, eq);
1050             string value(line, eq + 1, string::npos);
1051             while (getline(stream, line)) {
1052                 ++line_no;
1053                 if (line.empty() || line[0] != '=') break;
1054                 // Cope with files from MS Windows (\r\n end of lines).
1055                 // Trim multiple \r characters, since that seems the best way
1056                 // to handle that case.
1057                 last = line.find_last_not_of('\r');
1058                 // line[0] == '=', so last != string::npos.
1059                 // Replace the '=' with a '\n' so we don't have to use substr.
1060                 line[0] = '\n';
1061                 line.resize(last + 1);
1062                 value += line;
1063             }
1064
1065             // Default to not indexing spellings.
1066             indexer.set_flags(Xapian::TermGenerator::flags(0));
1067
1068             bool this_field_is_content = true;
1069             const vector<Action>& v = index_spec[field];
1070             run_actions(v.begin(), v.end(),
1071                         database, indexer, value,
1072                         this_field_is_content, doc, fields,
1073                         field, fname, line_no,
1074                         docid);
1075             if (this_field_is_content) seen_content = true;
1076             if (stream.eof()) break;
1077         }
1078
1079         // If we haven't seen any fields (other than unique identifiers)
1080         // the document is to be deleted.
1081         if (!seen_content) {
1082             if (docid) {
1083                 database.delete_document(docid);
1084                 if (verbose) cout << "Del: " << docid << endl;
1085                 ++delcount;
1086             }
1087         } else {
1088             string data;
1089             for (auto&& i : fields) {
1090                 for (auto&& field_val : i.second) {
1091                     data += i.first;
1092                     data += '=';
1093                     data += field_val;
1094                     data += '\n';
1095                 }
1096             }
1097
1098             // Put the data in the document
1099             doc.set_data(data);
1100
1101             // Add the document to the database
1102             if (docid) {
1103                 database.replace_document(docid, doc);
1104                 if (verbose) cout << "Replace: " << docid << endl;
1105                 ++repcount;
1106             } else {
1107                 docid = database.add_document(doc);
1108                 if (verbose) cout << "Add: " << docid << endl;
1109                 ++addcount;
1110             }
1111         }
1112     }
1113
1114     // Commit after each file to make sure all changes from that file make it
1115     // in.
1116     if (verbose) cout << "Committing: " << endl;
1117     database.commit();
1118 }
1119
1120 int
1121 main(int argc, char **argv)
1122 try {
1123     // If the database already exists, default to updating not overwriting.
1124     int database_mode = Xapian::DB_CREATE_OR_OPEN;
1125     verbose = false;
1126     Xapian::Stem stemmer("english");
1127
1128     constexpr auto NO_ARG = no_argument;
1129     constexpr auto REQ_ARG = required_argument;
1130     static const struct option longopts[] = {
1131         { "help",       NO_ARG,         NULL, 'h' },
1132         { "version",    NO_ARG,         NULL, 'V' },
1133         { "stemmer",    REQ_ARG,        NULL, 's' },
1134         { "overwrite",  NO_ARG,         NULL, 'o' },
1135         { "verbose",    NO_ARG,         NULL, 'v' },
1136         { 0, 0, NULL, 0 }
1137     };
1138
1139     bool more = true, show_help = false;
1140     while (more) {
1141         switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
1142             case EOF:
1143                 more = false;
1144                 break;
1145             default:
1146             case 'h': // --help
1147                 show_help = true;
1148                 more = false;
1149                 break;
1150             case 'V': // --version
1151                 print_package_info(PROG_NAME);
1152                 return 0;
1153             case 'o': // --overwrite
1154                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1155                 break;
1156             case 'v':
1157                 verbose = true;
1158                 break;
1159             case 's':
1160                 try {
1161                     stemmer = Xapian::Stem(optarg);
1162                 } catch (const Xapian::InvalidArgumentError &) {
1163                     cerr << "Unknown stemming language '" << optarg << "'.\n";
1164                     cerr << "Available language names are: "
1165                          << Xapian::Stem::get_available_languages() << endl;
1166                     return 1;
1167                 }
1168                 break;
1169         }
1170     }
1171
1172     argv += optind;
1173     argc -= optind;
1174     if (show_help || argc < 2) {
1175         cout << PROG_NAME " - " PROG_DESC "\n"
1176 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1177 "\n"
1178 "Creates or updates a Xapian database with the data from the input files listed\n"
1179 "on the command line.  If no files are specified, data is read from stdin.\n"
1180 "\n"
1181 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1182 "format for INDEXER_SCRIPT.\n"
1183 "\n"
1184 "Options:\n"
1185 "  -v, --verbose       display additional messages to aid debugging\n"
1186 "      --overwrite     create the database anew (the default is to update if\n"
1187 "                      the database already exists)\n";
1188         print_stemmer_help("");
1189         print_help_and_version_help("");
1190         exit(show_help ? 0 : 1);
1191     }
1192
1193     parse_index_script(argv[1]);
1194
1195     // Open the database.  If another process is currently updating the
1196     // database, wait for the lock to become available.
1197     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1198     Xapian::WritableDatabase database(argv[0], flags);
1199
1200     Xapian::TermGenerator indexer;
1201     indexer.set_stemmer(stemmer);
1202     // Set the database for spellings to be added to by the "spell" action.
1203     indexer.set_database(database);
1204
1205     addcount = 0;
1206     repcount = 0;
1207     delcount = 0;
1208
1209     if (argc == 2) {
1210         // Read from stdin.
1211         index_file("<stdin>", cin, database, indexer);
1212     } else {
1213         // Read file(s) listed on the command line.
1214         for (int i = 2; i < argc; ++i) {
1215             ifstream stream(argv[i]);
1216             if (stream) {
1217                 index_file(argv[i], stream, database, indexer);
1218             } else {
1219                 cerr << "Can't open file " << argv[i] << endl;
1220             }
1221         }
1222     }
1223
1224     cout << "records (added, replaced, deleted) = (" << addcount << ", "
1225          << repcount << ", " << delcount << ")" << endl;
1226 } catch (const Xapian::Error &error) {
1227     cerr << "Exception: " << error.get_description() << endl;
1228     exit(1);
1229 } catch (const std::bad_alloc &) {
1230     cerr << "Exception: std::bad_alloc" << endl;
1231     exit(1);
1232 } catch (...) {
1233     cerr << "Unknown Exception" << endl;
1234     exit(1);
1235 }