[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / scriptindex.cc
blob68f03a647b8a8ea956459b323539f536c1e76e3d
1 /** @file
2 * @brief index arbitrary data as described by an index script
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2023 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
25 #include <config.h>
27 #ifdef __CYGWIN__
28 // Needed to get setenv() and strptime() declared.
29 # define _GNU_SOURCE
30 #endif
32 #include <xapian.h>
34 #include <algorithm>
35 #include <fstream>
36 #include <iostream>
37 #include <list>
38 #include <map>
39 #include <memory>
40 #include <string>
41 #include <unordered_set>
42 #include <vector>
43 #include <cstring>
45 #include <cerrno>
46 #include <cstdio>
47 #include <cstdlib>
48 #include <ctime>
50 #include "commonhelp.h"
51 #include "datetime.h"
52 #include "genericxmlparser.h"
53 #include "hashterm.h"
54 #include "htmlparser.h"
55 #include "loadfile.h"
56 #include "parseint.h"
57 #include "setenv.h"
58 #include "str.h"
59 #include "stringutils.h"
60 #include "timegm.h"
61 #include "utf8truncate.h"
62 #include "values.h"
64 #ifndef HAVE_STRPTIME
65 #include "portability/strptime.h"
66 #endif
68 #include "gnu_getopt.h"
70 using namespace std;
72 #define PROG_NAME "scriptindex"
73 #define PROG_DESC "index arbitrary data as described by an index script"
// Run-wide state.  None of these are modified in parse_index_script();
// NOTE(review): the counters presumably tally records added / replaced /
// deleted / skipped - confirm against the code which updates them.
static bool verbose;
static int addcount;
static int repcount;
static int delcount;
static int skipcount;

/** What to do if there's a UNIQUE action but a record doesn't use it.
 *
 *  Selected by the optional second argument of the `unique` index action
 *  (`missing=error`, `missing=new`, `missing=warn+new`, `missing=skip`,
 *  `missing=warn+skip`).  The default is to treat it as an error.
 */
static enum {
    UNIQUE_ERROR,
    UNIQUE_WARN_NEW,
    UNIQUE_NEW,
    UNIQUE_WARN_SKIP,
    UNIQUE_SKIP
} unique_missing = UNIQUE_ERROR;

/// Track if UNIQUE action is unused in the current record.
static bool unique_unused;

/// Track if the current record is being skipped.
static bool skipping_record = false;
97 static inline bool
98 prefix_needs_colon(const string & prefix, unsigned ch)
100 if (!C_isupper(ch) && ch != ':') return false;
101 string::size_type len = prefix.length();
102 return (len > 1 && prefix[len - 1] != ':');
/** Names of the index actions, indexed by Action::type.
 *
 *  The order here must be kept in step with the enumerators of
 *  Action::type, since entries are looked up as action_names[code].
 */
const char * action_names[] = {
    // Actions used internally:
    "bad",
    "new",
    // Actual actions:
    "boolean",
    "date",
    "field",
    "gap",
    "hash",
    "hextobin",
    "index",
    "indexnopos",
    "load",
    "lower",
    "ltrim",
    "parsedate",
    "rtrim",
    "spell",
    "split",
    "squash",
    "trim",
    "truncate",
    "unhtml",
    "unique",
    "unxml",
    "value",
    "valuenumeric",
    "valuepacked",
    "weight"
};

// For debugging: write "name(string_arg,num_arg)" for Action object A to
// stdout.
#define DUMP_ACTION(A) \
    std::cout << action_names[(A).get_action()] << "(" \
              << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
/** One step of an index script rule.
 *
 *  An Action records which operation to perform, an optional string
 *  argument, an optional integer argument, and the offset in the index
 *  script line at which the action appeared (used for diagnostics).
 */
class Action {
  public:
    /// Action codes.  The order must match the entries of action_names[].
    typedef enum {
        // Actions used internally:
        BAD,
        NEW,
        // Actual actions:
        BOOLEAN,
        DATE,
        FIELD,
        GAP,
        HASH,
        HEXTOBIN,
        INDEX,
        INDEXNOPOS,
        LOAD,
        LOWER,
        LTRIM,
        PARSEDATE,
        RTRIM,
        SPELL,
        SPLIT,
        SQUASH,
        TRIM,
        TRUNCATE,
        UNHTML,
        UNIQUE,
        UNXML,
        VALUE,
        VALUENUMERIC,
        VALUEPACKED,
        WEIGHT
    } type;

    /// Operation codes stored as the integer argument of a SPLIT action.
    enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT, SPLIT_PREFIXES };

  private:
    type action;

    int num_arg = 0;

    std::string string_arg;

    // Offset into indexscript line.
    std::size_t pos;

  public:
    /// Construct an action with no arguments (num_arg is 0).
    Action(type action_, std::size_t pos_)
        : action(action_), pos(pos_) { }

    /** Construct an action from a string argument.
     *
     *  The integer argument is set to the result of atoi() on @a arg, so
     *  it's 0 if @a arg doesn't start with a number.
     */
    Action(type action_, std::size_t pos_, const std::string & arg)
        : action(action_), string_arg(arg), pos(pos_) {
        num_arg = std::atoi(string_arg.c_str());
    }

    /// Construct an action with explicit string and integer arguments.
    Action(type action_, std::size_t pos_, const std::string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }

    type get_action() const { return action; }

    int get_num_arg() const { return num_arg; }

    void set_num_arg(int num) { num_arg = num; }

    const std::string & get_string_arg() const { return string_arg; }

    std::size_t get_pos() const { return pos; }
};
196 // These allow searching for an Action with a particular Action::type using
197 // std::find().
199 inline bool
200 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
202 inline bool
203 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
205 inline bool
206 operator!=(const Action& a, Action::type t) { return !(a == t); }
208 inline bool
209 operator!=(Action::type t, const Action& a) { return !(t == a); }
/** Remove leading characters from a string, in place.
 *
 *  @param s      String to trim.
 *  @param chars  Set of characters to remove from the start of @a s.
 *
 *  If @a s consists entirely of characters in @a chars it becomes empty.
 */
static void
ltrim(std::string& s, const std::string& chars)
{
    auto i = s.find_first_not_of(chars);
    // If nothing is kept then i is npos and erase(0, npos) empties s.
    if (i) s.erase(0, i);
}
/** Remove trailing characters from a string, in place.
 *
 *  @param s      String to trim.
 *  @param chars  Set of characters to remove from the end of @a s.
 *
 *  If @a s consists entirely of characters in @a chars it becomes empty.
 */
static void
rtrim(std::string& s, const std::string& chars)
{
    // If nothing is kept find_last_not_of() returns npos, and npos + 1
    // wraps to 0 so s is emptied.
    s.resize(s.find_last_not_of(chars) + 1);
}
/** Collapse runs of characters in a string, in place.
 *
 *  @param s      String to process.
 *  @param chars  Set of characters to squash.
 *
 *  Each run of one or more characters from @a chars between two other
 *  pieces of text is replaced by a single space; leading and trailing
 *  runs are removed entirely.
 */
static void
squash(std::string& s, const std::string& chars)
{
    std::string output;
    output.reserve(s.size());
    std::string::size_type i = 0;
    while ((i = s.find_first_not_of(chars, i)) != std::string::npos) {
        // [i, j) is the next piece of text to keep.  If it runs to the end
        // of s then j is npos, and append() clamps the length for us.
        auto j = s.find_first_of(chars, i);
        if (!output.empty()) output += ' ';
        output.append(s, i, j - i);
        i = j;
    }
    s = std::move(output);
}
/// Severity of a diagnostic message.
enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };

/// Number of DIAG_ERROR diagnostics reported so far via report_location().
static unsigned error_count = 0;

/** Write the location prefix of a diagnostic to stderr.
 *
 *  Emits "FILENAME[:LINE[:COLUMN]]: error: " (or "warning: " / "note: ").
 *  The caller is expected to follow this with the message text and a
 *  newline.  Reporting a DIAG_ERROR also increments error_count.
 *
 *  @param type      Severity of the diagnostic.
 *  @param filename  File the diagnostic relates to.
 *  @param line      Line number, or 0 to omit line and column.
 *  @param pos       Zero-based offset into the line, or std::string::npos
 *                   to omit the column.
 */
static void
report_location(enum diag_type type,
                const std::string& filename,
                std::size_t line = 0,
                std::size_t pos = std::string::npos)
{
    std::cerr << filename;
    if (line != 0) {
        std::cerr << ':' << line;
        if (pos != std::string::npos) {
            // The first column is numbered 1.
            std::cerr << ':' << pos + 1;
        }
    }
    switch (type) {
        case DIAG_ERROR:
            std::cerr << ": error: ";
            ++error_count;
            break;
        case DIAG_WARN:
            std::cerr << ": warning: ";
            break;
        case DIAG_NOTE:
            std::cerr << ": note: ";
            break;
    }
}
271 static void
272 report_useless_action(const string &file, size_t line, size_t pos,
273 const string &action)
275 report_location(DIAG_WARN, file, line, pos);
276 cerr << "Index action '" << action << "' has no effect\n";
278 static bool given_left_to_right_warning = false;
279 if (!given_left_to_right_warning) {
280 given_left_to_right_warning = true;
281 report_location(DIAG_NOTE, file, line, pos);
282 cerr << "Actions are executed from left to right\n";
/** Return true if we can support %z on the current platform.
 *
 *  If struct tm has no tm_gmtoff member (HAVE_STRUCT_TM_TM_GMTOFF isn't
 *  defined) this is always false.  Otherwise the answer comes from a probe
 *  of strptime() which is run on the first call and cached.
 */
static inline bool
parsedate_supports_z()
{
#ifndef HAVE_STRUCT_TM_TM_GMTOFF
    // Without tm_gmtoff we aren't going to get the timezone information from
    // strptime().
    return false;
#else
    // Perform a simple run-time test to check if %z is suitably supported.
    static bool cached_result = ([]() {
        struct tm tm;
        memset(&tm, 0, sizeof(tm));
        auto ret = strptime("+1245", "%z", &tm);
        // The whole input must be consumed and the offset must come back
        // as +12:45 expressed in seconds.
        return ret && *ret == '\0' && tm.tm_gmtoff == (12 * 60 + 45) * 60;
    })();
    return cached_result;
#endif
}
306 static bool index_spec_uses_unique = false;
308 static map<string, vector<Action>> index_spec;
310 // Like std::getline() but handle \r\n line endings too.
311 static istream&
312 getline_portable(istream& stream, string& line)
314 istream& result = getline(stream, line);
315 // Trim multiple \r characters, since that seems the best way to handle
316 // that case.
317 line.resize(UNSIGNED_OVERFLOW_OK(line.find_last_not_of('\r') + 1));
318 return result;
321 static void
322 parse_index_script(const string &filename)
324 ifstream script(filename.c_str());
325 if (!script.is_open()) {
326 report_location(DIAG_ERROR, filename);
327 cerr << strerror(errno) << '\n';
328 exit(1);
330 string line;
331 size_t line_no = 0;
332 // Line number where we saw a `unique` action, or 0 if we haven't.
333 int unique_line_no = 0;
334 // Offset into line unique_line_no where the `unique` action was.
335 size_t unique_pos = 0;
336 while (getline(script, line)) {
337 ++line_no;
338 vector<string> fields;
339 vector<Action> actions;
340 string::const_iterator i, j;
341 const string &s = line;
342 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
343 if (i == s.end() || *i == '#') {
344 // Blank line or comment.
345 continue;
347 while (true) {
348 if (!C_isalnum(*i)) {
349 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
350 cerr << "field name must start with alphanumeric\n";
352 j = find_if(i + 1, s.end(),
353 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
354 fields.push_back(string(i, j));
355 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
356 if (i == s.end()) break;
357 if (*i == ':') {
358 ++i;
359 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
360 break;
362 if (i == j) {
363 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
364 cerr << "bad character '" << *i << "' in field name\n";
365 ++i;
366 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
367 if (i == s.end()) break;
370 Xapian::termcount weight = 1;
371 size_t useless_weight_pos = string::npos;
372 map<string, Action::type> boolmap;
373 j = i;
374 while (j != s.end()) {
375 size_t action_pos = j - s.begin();
376 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
377 string action(s, j - s.begin(), i - j);
378 Action::type code = Action::BAD;
379 unsigned min_args = 0, max_args = 0;
380 bool takes_integer_argument = false;
381 if (!action.empty()) {
382 switch (action[0]) {
383 case 'b':
384 if (action == "boolean") {
385 code = Action::BOOLEAN;
386 max_args = 1;
388 break;
389 case 'd':
390 if (action == "date") {
391 code = Action::DATE;
392 min_args = max_args = 1;
394 break;
395 case 'f':
396 if (action == "field") {
397 code = Action::FIELD;
398 max_args = 1;
400 break;
401 case 'g':
402 if (action == "gap") {
403 code = Action::GAP;
404 max_args = 1;
405 takes_integer_argument = true;
407 break;
408 case 'h':
409 if (action == "hash") {
410 code = Action::HASH;
411 max_args = 1;
412 takes_integer_argument = true;
413 } else if (action == "hextobin") {
414 code = Action::HEXTOBIN;
416 break;
417 case 'i':
418 if (action == "index") {
419 code = Action::INDEX;
420 max_args = 1;
421 } else if (action == "indexnopos") {
422 code = Action::INDEXNOPOS;
423 max_args = 1;
425 break;
426 case 'l':
427 if (action == "lower") {
428 code = Action::LOWER;
429 } else if (action == "load") {
430 code = Action::LOAD;
431 } else if (action == "ltrim") {
432 code = Action::LTRIM;
433 max_args = 1;
435 break;
436 case 'p':
437 if (action == "parsedate") {
438 code = Action::PARSEDATE;
439 min_args = max_args = 1;
441 break;
442 case 'r':
443 if (action == "rtrim") {
444 code = Action::RTRIM;
445 max_args = 1;
447 break;
448 case 's':
449 if (action == "spell") {
450 code = Action::SPELL;
451 } else if (action == "split") {
452 code = Action::SPLIT;
453 min_args = 1;
454 max_args = 2;
455 } else if (action == "squash") {
456 code = Action::SQUASH;
457 max_args = 1;
459 break;
460 case 't':
461 if (action == "truncate") {
462 code = Action::TRUNCATE;
463 min_args = max_args = 1;
464 takes_integer_argument = true;
465 } else if (action == "trim") {
466 code = Action::TRIM;
467 max_args = 1;
469 break;
470 case 'u':
471 if (action == "unhtml") {
472 code = Action::UNHTML;
473 } else if (action == "unique") {
474 code = Action::UNIQUE;
475 min_args = 1;
476 max_args = 2;
477 } else if (action == "unxml") {
478 code = Action::UNXML;
480 break;
481 case 'v':
482 if (action == "value") {
483 code = Action::VALUE;
484 min_args = max_args = 1;
485 takes_integer_argument = true;
486 } else if (action == "valuenumeric") {
487 code = Action::VALUENUMERIC;
488 min_args = max_args = 1;
489 takes_integer_argument = true;
490 } else if (action == "valuepacked") {
491 code = Action::VALUEPACKED;
492 min_args = max_args = 1;
493 takes_integer_argument = true;
495 break;
496 case 'w':
497 if (action == "weight") {
498 code = Action::WEIGHT;
499 min_args = max_args = 1;
500 // Don't set takes_integer_argument since we parse
501 // it with parse_unsigned() and issue an error there
502 // - setting takes_integer_argument would give a
503 // double error for arguments with a decimal point.
505 break;
508 if (code == Action::BAD) {
509 report_location(DIAG_ERROR, filename, line_no, action_pos);
510 if (action.empty()) {
511 i = find_if(i, s.end(), C_isspace);
512 cerr << "Expected index action, found '"
513 << string(s, j - s.begin(), i - j) << "'\n";
514 } else {
515 cerr << "Unknown index action '" << action << "'\n";
518 auto i_after_action = i;
519 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
521 if (i != s.end() && *i == '=') {
522 if (i != i_after_action) {
523 report_location(DIAG_WARN, filename, line_no,
524 i_after_action - s.begin());
525 cerr << "putting spaces between the action and '=' is "
526 "deprecated\n";
529 if (max_args == 0) {
530 report_location(DIAG_ERROR, filename, line_no,
531 i - s.begin());
532 cerr << "Index action '" << action
533 << "' doesn't take an argument\n";
536 ++i;
537 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
538 if (i != j) {
539 report_location(DIAG_WARN, filename, line_no,
540 i - s.begin());
541 cerr << "putting spaces between '=' and the argument is "
542 "deprecated\n";
545 vector<string> vals;
546 while (true) {
547 if (j != s.end() && *j == '"') {
548 // Quoted argument.
549 ++j;
550 string arg;
551 while (true) {
552 i = find_if(j, s.end(),
553 [](char ch) {
554 return ch == '"' || ch == '\\';
556 if (i == s.end()) {
557 report_location(DIAG_ERROR, filename, line_no,
558 s.size());
559 cerr << "No closing quote\n";
560 break;
562 arg.append(j, i);
563 if (*i++ == '"')
564 break;
566 // Escape sequence.
567 if (i == s.end()) {
568 bad_escaping:
569 report_location(DIAG_ERROR, filename, line_no,
570 i - s.begin());
571 cerr << "Bad escaping in quoted action "
572 "argument\n";
573 break;
576 char ch = *i;
577 switch (ch) {
578 case '\\':
579 case '"':
580 break;
581 case '0':
582 ch = '\0';
583 break;
584 case 'n':
585 ch = '\n';
586 break;
587 case 'r':
588 ch = '\r';
589 break;
590 case 't':
591 ch = '\t';
592 break;
593 case 'x': {
594 if (++i == s.end())
595 goto bad_escaping;
596 char ch1 = *i;
597 if (!C_isxdigit(ch1)) {
598 bad_hex_digit:
599 report_location(DIAG_ERROR, filename,
600 line_no, i - s.begin());
601 cerr << "Bad hex digit in escaping\n";
602 --i;
603 break;
605 if (++i == s.end())
606 goto bad_escaping;
607 char ch2 = *i;
608 if (!C_isxdigit(ch2)) {
609 goto bad_hex_digit;
611 ch = hex_decode(ch1, ch2);
612 break;
614 default:
615 report_location(DIAG_ERROR, filename,
616 line_no, i - s.begin());
617 cerr << "Bad escape sequence '\\" << ch
618 << "'\n";
619 break;
621 arg += ch;
622 j = i + 1;
624 vals.emplace_back(std::move(arg));
625 if (i == s.end() || C_isspace(*i)) break;
626 if (*i == ',') {
627 ++i;
628 } else {
629 report_location(DIAG_ERROR, filename, line_no,
630 i - s.begin());
631 cerr << "Unexpected character '" << *i
632 << "' after closing quote\n";
633 do {
634 ++i;
635 } while (i != s.end() && *i != ',' && !C_isspace(*i));
636 if (*i != ',') break;
637 ++i;
639 } else if (max_args > 1) {
640 // Unquoted argument, split on comma.
641 i = find_if(j, s.end(),
642 [](char ch) {
643 return C_isspace(ch) || ch == ',';
645 vals.emplace_back(j, i);
646 if (*i != ',') break;
647 ++i;
648 } else {
649 // Unquoted argument, including any commas.
650 i = find_if(j, s.end(),
651 [](char ch) { return C_isspace(ch); });
652 vals.emplace_back(j, i);
653 break;
655 j = i;
657 if (vals.size() == max_args) {
658 report_location(DIAG_ERROR, filename, line_no,
659 i - s.begin());
660 cerr << "Index action '" << action << "' takes at most "
661 << max_args << " arguments\n";
665 if (vals.size() < min_args) {
666 report_location(DIAG_ERROR, filename, line_no,
667 i - s.begin());
668 if (min_args == max_args) {
669 cerr << "Index action '" << action << "' requires "
670 << min_args << " arguments\n";
671 } else {
672 cerr << "Index action '" << action << "' requires "
673 "at least " << min_args << " arguments\n";
675 // Allow action handling code to assume there are min_args
676 // arguments.
677 vals.resize(min_args);
680 string val;
681 if (!vals.empty()) {
682 val = vals.front();
685 if (takes_integer_argument) {
686 auto dot = val.find('.');
687 if (dot != string::npos) {
688 report_location(DIAG_ERROR, filename, line_no,
689 j - s.begin() + dot);
690 cerr << "Index action '" << action
691 << "' takes an integer argument\n";
694 switch (code) {
695 case Action::DATE:
696 if (val != "unix" &&
697 val != "unixutc" &&
698 val != "yyyymmdd") {
699 report_location(DIAG_ERROR, filename, line_no,
700 j - s.begin());
701 cerr << "Invalid parameter '" << val
702 << "' for action 'date'\n";
704 actions.emplace_back(code, action_pos, val);
705 break;
706 case Action::INDEX:
707 case Action::INDEXNOPOS:
708 actions.emplace_back(code, action_pos, val, weight);
709 useless_weight_pos = string::npos;
710 break;
711 case Action::WEIGHT:
712 // We don't push an Action for WEIGHT - instead we
713 // store it ready to use in the INDEX and INDEXNOPOS
714 // Actions.
715 if (!parse_unsigned(val.c_str(), weight)) {
716 report_location(DIAG_ERROR, filename, line_no,
717 j - s.begin());
718 cerr << "Index action 'weight' takes a "
719 "non-negative integer argument\n";
720 weight = 0;
722 if (useless_weight_pos != string::npos) {
723 report_useless_action(filename, line_no,
724 useless_weight_pos, action);
726 useless_weight_pos = action_pos;
727 break;
728 case Action::PARSEDATE: {
729 auto bad_code = val.find("%Z");
730 if (bad_code != val.npos) {
731 report_location(DIAG_ERROR, filename, line_no,
732 j - s.begin() + bad_code);
733 cerr << "Parsing timezone names with %Z is not "
734 "supported\n";
736 bad_code = val.find("%z");
737 if (bad_code != val.npos && !parsedate_supports_z()) {
738 report_location(DIAG_ERROR, filename, line_no,
739 j - s.begin() + bad_code);
740 cerr << "Parsing timezone offsets with %z is not "
741 "supported on this platform\n";
743 actions.emplace_back(code, action_pos, val);
744 break;
746 case Action::SPLIT: {
747 if (val.empty()) {
748 report_location(DIAG_ERROR, filename, line_no,
749 j - s.begin());
750 cerr << "Split delimiter can't be empty\n";
752 int operation = Action::SPLIT_NONE;
753 if (vals.size() >= 2) {
754 if (vals[1] == "dedup") {
755 operation = Action::SPLIT_DEDUP;
756 } else if (vals[1] == "sort") {
757 operation = Action::SPLIT_SORT;
758 } else if (vals[1] == "none") {
759 operation = Action::SPLIT_NONE;
760 } else if (vals[1] == "prefixes") {
761 operation = Action::SPLIT_PREFIXES;
762 } else {
763 // FIXME: Column should be for where the `op`
764 // parameter starts, which this isn't if the
765 // value is quoted, contains escape sequences,
766 // etc.
767 report_location(DIAG_ERROR, filename, line_no,
768 i - s.begin() - vals[1].size());
769 cerr << "Bad split operation '" << vals[1]
770 << "'\n";
773 actions.emplace_back(code, action_pos, val, operation);
774 break;
776 case Action::TRUNCATE:
777 if (!actions.empty() &&
778 actions.back().get_action() == Action::LOAD) {
779 /* Turn "load truncate=n" into "load" with
780 * num_arg n, so that we don't needlessly
781 * allocate memory and read data we're just
782 * going to ignore.
784 actions.pop_back();
785 code = Action::LOAD;
787 actions.emplace_back(code, action_pos, val);
788 break;
789 case Action::UNIQUE:
790 if (unique_line_no) {
791 report_location(DIAG_ERROR, filename, line_no,
792 action_pos);
793 cerr << "Index action 'unique' used more than "
794 "once\n";
795 report_location(DIAG_NOTE, filename,
796 unique_line_no, unique_pos);
797 cerr << "Previously used here\n";
799 unique_line_no = line_no;
800 unique_pos = action_pos;
801 if (boolmap.find(val) == boolmap.end())
802 boolmap[val] = Action::UNIQUE;
803 if (vals.size() >= 2) {
804 if (vals[1] == "missing=error") {
805 unique_missing = UNIQUE_ERROR;
806 } else if (vals[1] == "missing=new") {
807 unique_missing = UNIQUE_NEW;
808 } else if (vals[1] == "missing=warn+new") {
809 unique_missing = UNIQUE_WARN_NEW;
810 } else if (vals[1] == "missing=skip") {
811 unique_missing = UNIQUE_SKIP;
812 } else if (vals[1] == "missing=warn+skip") {
813 unique_missing = UNIQUE_WARN_SKIP;
814 } else {
815 report_location(DIAG_ERROR, filename, line_no);
816 cerr << "Bad unique parameter '" << vals[1]
817 << "'\n";
820 actions.emplace_back(code, action_pos, val);
821 break;
822 case Action::GAP: {
823 actions.emplace_back(code, action_pos, val);
824 auto& obj = actions.back();
825 auto gap_size = obj.get_num_arg();
826 if (gap_size <= 0) {
827 report_location(DIAG_ERROR, filename, line_no,
828 obj.get_pos() + 3 + 1);
829 cerr << "Index action 'gap' takes a strictly "
830 "positive integer argument\n";
832 break;
834 case Action::HASH: {
835 actions.emplace_back(code, action_pos, val);
836 auto& obj = actions.back();
837 auto max_length = obj.get_num_arg();
838 if (max_length < 6) {
839 report_location(DIAG_ERROR, filename, line_no,
840 obj.get_pos() + 4 + 1);
841 cerr << "Index action 'hash' takes an integer "
842 "argument which must be at least 6\n";
844 break;
846 case Action::LTRIM:
847 case Action::RTRIM:
848 case Action::SQUASH:
849 case Action::TRIM:
850 for (unsigned char ch : val) {
851 if (ch >= 0x80) {
852 auto column = actions.back().get_pos() +
853 strlen(action_names[code]) + 1;
854 report_location(DIAG_ERROR, filename, line_no,
855 column);
856 cerr << "Index action '" << action_names[code]
857 << "' only support ASCII characters "
858 "currently\n";
861 actions.emplace_back(code, action_pos, val);
862 break;
863 case Action::BOOLEAN:
864 boolmap[val] = Action::BOOLEAN;
865 /* FALLTHRU */
866 default:
867 actions.emplace_back(code, action_pos, val);
869 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
870 } else {
871 if (min_args > 0) {
872 report_location(DIAG_ERROR, filename, line_no,
873 i_after_action - s.begin());
874 if (min_args == max_args) {
875 cerr << "Index action '" << action << "' requires "
876 << min_args << " arguments\n";
877 } else {
878 cerr << "Index action '" << action << "' requires "
879 "at least " << min_args << " arguments\n";
882 switch (code) {
883 case Action::INDEX:
884 case Action::INDEXNOPOS:
885 useless_weight_pos = string::npos;
886 actions.emplace_back(code, action_pos, "", weight);
887 break;
888 case Action::GAP:
889 actions.emplace_back(code, action_pos, "", 100);
890 break;
891 case Action::HASH:
892 actions.emplace_back(code, action_pos, "",
893 MAX_SAFE_TERM_LENGTH - 1);
894 break;
895 case Action::LTRIM:
896 case Action::RTRIM:
897 case Action::SQUASH:
898 case Action::TRIM:
899 actions.emplace_back(code, action_pos, " \t\f\v\r\n");
900 break;
901 default:
902 actions.emplace_back(code, action_pos);
903 break;
906 j = i;
909 if (useless_weight_pos != string::npos) {
910 report_useless_action(filename, line_no, useless_weight_pos,
911 "weight");
914 while (!actions.empty()) {
915 bool done = true;
916 Action::type action = actions.back().get_action();
917 switch (action) {
918 case Action::HASH:
919 case Action::HEXTOBIN:
920 case Action::LOWER:
921 case Action::LTRIM:
922 case Action::PARSEDATE:
923 case Action::RTRIM:
924 case Action::SPELL:
925 case Action::SQUASH:
926 case Action::TRIM:
927 case Action::TRUNCATE:
928 case Action::UNHTML:
929 case Action::UNXML:
930 done = false;
931 report_useless_action(filename, line_no,
932 actions.back().get_pos(),
933 action_names[action]);
934 actions.pop_back();
935 break;
936 default:
937 break;
939 if (done) break;
942 map<string, Action::type>::const_iterator boolpfx;
943 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
944 if (boolpfx->second == Action::UNIQUE) {
945 report_location(DIAG_WARN, filename, unique_line_no,
946 unique_pos);
947 cerr << "Index action 'unique=" << boolpfx->first
948 << "' without 'boolean=" << boolpfx->first << "'\n";
949 static bool given_doesnt_imply_boolean_warning = false;
950 if (!given_doesnt_imply_boolean_warning) {
951 given_doesnt_imply_boolean_warning = true;
952 report_location(DIAG_NOTE, filename, unique_line_no,
953 unique_pos);
954 cerr << "'unique' doesn't implicitly add a boolean term\n";
959 vector<string>::const_iterator field;
960 for (field = fields.begin(); field != fields.end(); ++field) {
961 vector<Action> &v = index_spec[*field];
962 if (v.empty()) {
963 if (fields.size() == 1) {
964 // Optimise common case where there's only one fieldname
965 // for a list of actions.
966 v = std::move(actions);
967 } else {
968 v = actions;
970 } else {
971 v.emplace_back(Action::NEW, string::npos);
972 v.insert(v.end(), actions.begin(), actions.end());
977 if (index_spec.empty()) {
978 report_location(DIAG_ERROR, filename, line_no);
979 cerr << "No rules found in index script\n";
982 if (error_count) {
983 exit(1);
986 index_spec_uses_unique = (unique_line_no > 0);
989 static bool
990 run_actions(vector<Action>::const_iterator action_it,
991 vector<Action>::const_iterator action_end,
992 Xapian::WritableDatabase& database,
993 Xapian::TermGenerator& indexer,
994 const string& old_value,
995 bool& this_field_is_content, Xapian::Document& doc,
996 map<string, list<string>>& fields,
997 string& field, const char* fname,
998 size_t line_no, Xapian::docid& docid)
1000 string value = old_value;
1001 while (action_it != action_end) {
1002 auto& action = *action_it++;
1003 switch (action.get_action()) {
1004 case Action::BAD:
1005 abort();
1006 case Action::NEW:
1007 value = old_value;
1008 break;
1009 case Action::FIELD:
1010 if (!value.empty()) {
1011 string f = action.get_string_arg();
1012 if (f.empty()) f = field;
1013 // replace newlines with spaces
1014 string s = value;
1015 string::size_type j = 0;
1016 while ((j = s.find('\n', j)) != string::npos)
1017 s[j] = ' ';
1018 fields[f].push_back(s);
1020 break;
1021 case Action::INDEX:
1022 indexer.index_text(value,
1023 action.get_num_arg(),
1024 action.get_string_arg());
1025 break;
1026 case Action::INDEXNOPOS:
1027 // No positional information so phrase searching won't work.
1028 // However, the database will use much less diskspace.
1029 indexer.index_text_without_positions(value,
1030 action.get_num_arg(),
1031 action.get_string_arg());
1032 break;
1033 case Action::BOOLEAN: {
1034 // Do nothing if there's no text.
1035 if (value.empty()) break;
1037 string term = action.get_string_arg();
1038 if (prefix_needs_colon(term, value[0])) term += ':';
1039 term += value;
1041 doc.add_boolean_term(term);
1042 break;
1044 case Action::GAP:
1045 indexer.increase_termpos(action.get_num_arg());
1046 break;
1047 case Action::HASH: {
1048 unsigned int max_length = action.get_num_arg();
1049 if (value.length() > max_length)
1050 value = hash_long_term(value, max_length);
1051 break;
1053 case Action::HEXTOBIN: {
1054 size_t len = value.length();
1055 if (len & 1) {
1056 report_location(DIAG_ERROR, fname, line_no);
1057 cerr << "hextobin: input must have even length\n";
1058 exit(1);
1061 string output;
1062 output.reserve(len / 2);
1063 for (size_t j = 0; j < len; j += 2) {
1064 char a = value[j];
1065 char b = value[j + 1];
1066 if (!C_isxdigit(a) || !C_isxdigit(b)) {
1067 report_location(DIAG_ERROR, fname, line_no);
1068 cerr << "hextobin: input must be all hex digits\n";
1069 exit(1);
1071 char r = hex_decode(a, b);
1072 output.push_back(r);
1074 value = std::move(output);
1075 break;
1077 case Action::LOWER:
1078 value = Xapian::Unicode::tolower(value);
1079 break;
1080 case Action::LTRIM:
1081 ltrim(value, action.get_string_arg());
1082 break;
1083 case Action::RTRIM:
1084 rtrim(value, action.get_string_arg());
1085 break;
1086 case Action::TRIM:
1087 rtrim(value, action.get_string_arg());
1088 ltrim(value, action.get_string_arg());
1089 break;
1090 case Action::SQUASH:
1091 squash(value, action.get_string_arg());
1092 break;
1093 case Action::LOAD: {
1094 // If there's no input, just issue a warning.
1095 if (value.empty()) {
1096 report_location(DIAG_WARN, fname, line_no);
1097 cerr << "Empty filename in LOAD action\n";
1098 break;
1100 bool truncated = false;
1101 string filename = std::move(value);
1102 // FIXME: Use NOATIME if we own the file or are root.
1103 if (!load_file(filename, action.get_num_arg(), NOCACHE,
1104 value, truncated)) {
1105 report_location(DIAG_ERROR, fname, line_no);
1106 cerr << "Couldn't load file '" << filename << "': "
1107 << strerror(errno) << '\n';
1108 exit(1);
1110 if (!truncated) break;
1112 /* FALLTHRU */
1113 case Action::TRUNCATE:
1114 utf8_truncate(value, action.get_num_arg());
1115 break;
1116 case Action::SPELL:
1117 indexer.set_flags(indexer.FLAG_SPELLING);
1118 break;
1119 case Action::SPLIT: {
1120 // Find the end of the actions which split should execute.
1121 auto split_end = find(action_it, action_end, Action::NEW);
1123 int split_type = action.get_num_arg();
1124 if (value.empty()) {
1125 // Nothing to do.
1126 } else if (split_type != Action::SPLIT_SORT) {
1127 // Generate split as we consume it.
1128 const string& delimiter = action.get_string_arg();
1130 unique_ptr<unordered_set<string>> seen;
1131 if (split_type == Action::SPLIT_DEDUP) {
1132 seen.reset(new unordered_set<string>);
1135 if (delimiter.size() == 1) {
1136 // Special case for common single character delimiter.
1137 char ch = delimiter[0];
1138 string::size_type i = 0;
1139 while (true) {
1140 string::size_type j = value.find(ch, i);
1141 if (split_type == Action::SPLIT_PREFIXES) {
1142 if (j > 0) {
1143 string val(value, 0, j);
1144 run_actions(action_it, split_end,
1145 database, indexer,
1146 val,
1147 this_field_is_content, doc,
1148 fields,
1149 field, fname, line_no,
1150 docid);
1152 } else if (i != j) {
1153 string val(value, i, j - i);
1154 if (!seen.get() || seen->insert(val).second) {
1155 run_actions(action_it, split_end,
1156 database, indexer,
1157 val,
1158 this_field_is_content, doc,
1159 fields,
1160 field, fname, line_no,
1161 docid);
1164 if (j == string::npos) break;
1165 i = j + 1;
1167 } else {
1168 string::size_type i = 0;
1169 while (true) {
1170 string::size_type j = value.find(delimiter, i);
1171 if (split_type == Action::SPLIT_PREFIXES) {
1172 if (j > 0) {
1173 string val(value, 0, j);
1174 run_actions(action_it, split_end,
1175 database, indexer,
1176 val,
1177 this_field_is_content, doc,
1178 fields,
1179 field, fname, line_no,
1180 docid);
1182 } else if (i != j) {
1183 string val(value, i, j - i);
1184 if (!seen.get() || seen->insert(val).second) {
1185 run_actions(action_it, split_end,
1186 database, indexer,
1187 val,
1188 this_field_is_content, doc,
1189 fields,
1190 field, fname, line_no,
1191 docid);
1194 if (j == string::npos) break;
1195 i = j + delimiter.size();
1198 } else {
1199 vector<string> split_values;
1200 const string& delimiter = action.get_string_arg();
1201 if (delimiter.size() == 1) {
1202 // Special case for common single character delimiter.
1203 char ch = delimiter[0];
1204 string::size_type i = 0;
1205 while (true) {
1206 string::size_type j = value.find(ch, i);
1207 if (i != j) {
1208 split_values.emplace_back(value, i, j - i);
1210 if (j == string::npos) break;
1211 i = j + 1;
1213 } else {
1214 string::size_type i = 0;
1215 while (true) {
1216 string::size_type j = value.find(delimiter, i);
1217 if (i != j) {
1218 split_values.emplace_back(value, i, j - i);
1220 if (j == string::npos) break;
1221 i = j + delimiter.size();
1225 sort(split_values.begin(), split_values.end());
1227 for (auto&& val : split_values) {
1228 run_actions(action_it, split_end,
1229 database, indexer, val,
1230 this_field_is_content, doc, fields,
1231 field, fname, line_no,
1232 docid);
1236 action_it = split_end;
1237 break;
1239 case Action::UNHTML: {
1240 HtmlParser p;
1241 try {
1242 // Default HTML character set is latin 1, though
1243 // not specifying one is deprecated these days.
1244 p.parse(value, "iso-8859-1", false);
1245 } catch (const string & newcharset) {
1246 p.reset();
1247 p.parse(value, newcharset, true);
1249 if (p.indexing_allowed)
1250 value = p.dump;
1251 else
1252 value = "";
1253 break;
1255 case Action::UNXML: {
1256 GenericXmlParser p;
1257 p.parse(value);
1258 value = std::move(p.dump);
1259 break;
1261 case Action::UNIQUE: {
1262 unique_unused = false;
1264 if (value.empty()) {
1265 enum diag_type diag = DIAG_WARN;
1266 switch (unique_missing) {
1267 case UNIQUE_ERROR:
1268 diag = DIAG_ERROR;
1269 /* FALLTHRU */
1270 case UNIQUE_WARN_NEW:
1271 case UNIQUE_WARN_SKIP:
1272 report_location(diag, fname, line_no);
1273 cerr << "UNIQUE action on empty text\n";
1274 default:
1275 break;
1277 switch (unique_missing) {
1278 case UNIQUE_ERROR:
1279 exit(1);
1280 case UNIQUE_SKIP:
1281 case UNIQUE_WARN_SKIP:
1282 skipping_record = true;
1283 break;
1284 case UNIQUE_NEW:
1285 case UNIQUE_WARN_NEW:
1286 break;
1288 break;
1291 // Ensure that the value of this field is unique.
1292 // If a record already exists with the same value,
1293 // it will be replaced with the new record.
1295 // Unique fields aren't considered content - if
1296 // there are no other fields in the document, the
1297 // document is to be deleted.
1298 this_field_is_content = false;
1300 // Argument is the prefix to add to the field value
1301 // to get the unique term.
1302 string t = action.get_string_arg();
1303 if (prefix_needs_colon(t, value[0])) t += ':';
1304 t += value;
1305 Xapian::PostingIterator p = database.postlist_begin(t);
1306 if (p != database.postlist_end(t)) {
1307 docid = *p;
1309 break;
1311 case Action::VALUE:
1312 if (!value.empty())
1313 doc.add_value(action.get_num_arg(), value);
1314 break;
1315 case Action::VALUENUMERIC: {
1316 if (value.empty()) break;
1317 char * end;
1318 double dbl = strtod(value.c_str(), &end);
1319 if (*end) {
1320 report_location(DIAG_WARN, fname, line_no);
1321 cerr << "Trailing characters in VALUENUMERIC: '"
1322 << value << "'\n";
1324 doc.add_value(action.get_num_arg(),
1325 Xapian::sortable_serialise(dbl));
1326 break;
1328 case Action::VALUEPACKED: {
1329 uint32_t word = 0;
1330 if (value.empty() || !C_isdigit(value[0])) {
1331 // strtoul() accepts leading whitespace and negated
1332 // values, neither of which we want to allow.
1333 errno = EINVAL;
1334 } else {
1335 errno = 0;
1336 char* q;
1337 word = strtoul(value.c_str(), &q, 10);
1338 if (!errno && *q != '\0') {
1339 // Trailing characters after converted value.
1340 errno = EINVAL;
1343 if (errno) {
1344 report_location(DIAG_WARN, fname, line_no);
1345 cerr << "valuepacked \"" << value << "\" ";
1346 if (errno == ERANGE) {
1347 cerr << "out of range\n";
1348 } else {
1349 cerr << "not an unsigned integer\n";
1352 int valueslot = action.get_num_arg();
1353 doc.add_value(valueslot, int_to_binary_string(word));
1354 break;
1356 case Action::DATE: {
1357 // Do nothing for empty input.
1358 if (value.empty()) break;
1360 const string & type = action.get_string_arg();
1361 string yyyymmdd;
1362 if (type == "unix") {
1363 time_t t;
1364 if (!parse_signed(value.c_str(), t)) {
1365 report_location(DIAG_WARN, fname, line_no);
1366 cerr << "Date value (in secs) for action DATE "
1367 "must be an integer - ignoring\n";
1368 break;
1370 struct tm *tm = localtime(&t);
1371 int y = tm->tm_year + 1900;
1372 int m = tm->tm_mon + 1;
1373 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1374 } else if (type == "unixutc") {
1375 time_t t;
1376 if (!parse_signed(value.c_str(), t)) {
1377 report_location(DIAG_WARN, fname, line_no);
1378 cerr << "Date value (in secs) for action DATE "
1379 "must be an integer - ignoring\n";
1380 break;
1382 struct tm *tm = gmtime(&t);
1383 int y = tm->tm_year + 1900;
1384 int m = tm->tm_mon + 1;
1385 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1386 } else if (type == "yyyymmdd") {
1387 if (value.length() != 8) {
1388 report_location(DIAG_WARN, fname, line_no);
1389 cerr << "date=yyyymmdd expects an 8 character value "
1390 "- ignoring\n";
1391 break;
1393 yyyymmdd = value;
1396 // Date (YYYYMMDD)
1397 doc.add_boolean_term("D" + yyyymmdd);
1398 yyyymmdd.resize(6);
1399 // Month (YYYYMM)
1400 doc.add_boolean_term("M" + yyyymmdd);
1401 yyyymmdd.resize(4);
1402 // Year (YYYY)
1403 doc.add_boolean_term("Y" + yyyymmdd);
1404 break;
1406 case Action::PARSEDATE: {
1407 string dateformat = action.get_string_arg();
1408 struct tm tm;
1409 memset(&tm, 0, sizeof(tm));
1410 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1411 if (ret == NULL) {
1412 report_location(DIAG_WARN, fname, line_no);
1413 cerr << "\"" << value << "\" doesn't match format "
1414 "\"" << dateformat << '\"' << '\n';
1415 break;
1418 if (*ret != '\0') {
1419 report_location(DIAG_WARN, fname, line_no);
1420 cerr << "\"" << value << "\" not fully matched by "
1421 "format \"" << dateformat << "\" "
1422 "(\"" << ret << "\" left over) but "
1423 "indexing anyway\n";
1425 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1426 auto gmtoff = tm.tm_gmtoff;
1427 #endif
1428 auto secs_since_epoch = timegm(&tm);
1429 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1430 secs_since_epoch -= gmtoff;
1431 #endif
1432 value = str(secs_since_epoch);
1433 break;
1435 default:
1436 /* Empty default case to avoid "unhandled enum value"
1437 * warnings. */
1438 break;
1441 return true;
1444 static void
1445 index_file(const char *fname, istream &stream,
1446 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1448 string line;
1449 size_t line_no = 0;
1450 while (!stream.eof() && getline_portable(stream, line)) {
1451 ++line_no;
1452 // Allow blank lines before the first record and multiple blank lines
1453 // between records.
1454 if (line.empty()) continue;
1456 Xapian::Document doc;
1457 indexer.set_document(doc);
1458 Xapian::docid docid = 0;
1459 map<string, list<string>> fields;
1460 bool seen_content = false;
1461 skipping_record = false;
1462 unique_unused = index_spec_uses_unique;
1463 while (!line.empty()) {
1464 string::size_type eq = line.find('=');
1465 if (eq == string::npos && !line.empty()) {
1466 report_location(DIAG_ERROR, fname, line_no);
1467 cerr << "Expected = somewhere in this line\n";
1468 exit(1);
1470 string field(line, 0, eq);
1471 string value(line, eq + 1, string::npos);
1472 line.clear();
1473 while (getline_portable(stream, line)) {
1474 ++line_no;
1475 if (line.empty() || line[0] != '=') break;
1476 // Replace the '=' with a '\n'.
1477 line[0] = '\n';
1478 value += line;
1479 line.erase();
1482 if (skipping_record) continue;
1484 // Default to not indexing spellings.
1485 indexer.set_flags(Xapian::TermGenerator::flags(0));
1487 bool this_field_is_content = true;
1488 const vector<Action>& v = index_spec[field];
1489 run_actions(v.begin(), v.end(),
1490 database, indexer, value,
1491 this_field_is_content, doc, fields,
1492 field, fname, line_no,
1493 docid);
1494 if (this_field_is_content) seen_content = true;
1497 if (unique_unused) {
1498 enum diag_type diag = DIAG_WARN;
1499 switch (unique_missing) {
1500 case UNIQUE_ERROR:
1501 diag = DIAG_ERROR;
1502 /* FALLTHRU */
1503 case UNIQUE_WARN_NEW:
1504 case UNIQUE_WARN_SKIP:
1505 report_location(diag, fname, line_no);
1506 cerr << "UNIQUE action unused in this record\n";
1507 default:
1508 break;
1510 switch (unique_missing) {
1511 case UNIQUE_ERROR:
1512 exit(1);
1513 case UNIQUE_SKIP:
1514 case UNIQUE_WARN_SKIP:
1515 skipping_record = true;
1516 break;
1517 case UNIQUE_NEW:
1518 case UNIQUE_WARN_NEW:
1519 break;
1523 if (skipping_record) {
1524 ++skipcount;
1525 } else if (!seen_content) {
1526 // We haven't seen any fields (other than unique identifiers)
1527 // so the document is to be deleted.
1528 if (docid) {
1529 database.delete_document(docid);
1530 if (verbose) cout << "Del: " << docid << '\n';
1531 ++delcount;
1533 } else {
1534 string data;
1535 for (auto&& i : fields) {
1536 for (auto&& field_val : i.second) {
1537 data += i.first;
1538 data += '=';
1539 data += field_val;
1540 data += '\n';
1544 // Put the data in the document
1545 doc.set_data(data);
1547 // Add the document to the database
1548 if (docid) {
1549 database.replace_document(docid, doc);
1550 if (verbose) cout << "Replace: " << docid << '\n';
1551 ++repcount;
1552 } else {
1553 docid = database.add_document(doc);
1554 if (verbose) cout << "Add: " << docid << '\n';
1555 ++addcount;
1560 // Commit after each file to make sure all changes from that file make it
1561 // in.
1562 if (verbose) cout << "Committing\n";
1563 database.commit();
1566 [[noreturn]]
1567 static void
1568 show_help(int exit_code)
1570 cout << PROG_NAME " - " PROG_DESC "\n"
1571 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1572 "\n"
1573 "Creates or updates a Xapian database with the data from the input files listed\n"
1574 "on the command line. If no files are specified, data is read from stdin.\n"
1575 "\n"
1576 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1577 "format for INDEXER_SCRIPT.\n"
1578 "\n"
1579 "Options:\n"
1580 " -v, --verbose display additional messages to aid debugging\n"
1581 " --overwrite create the database anew (the default is to update if\n"
1582 " the database already exists)\n";
1583 print_stemmer_help("");
1584 print_help_and_version_help("");
1585 exit(exit_code);
1589 main(int argc, char **argv)
1590 try {
1591 // If the database already exists, default to updating not overwriting.
1592 int database_mode = Xapian::DB_CREATE_OR_OPEN;
1593 verbose = false;
1594 Xapian::Stem stemmer("english");
1596 // Without this, strptime() seems to treat formats without a timezone as
1597 // being local time, including %s.
1598 setenv("TZ", "UTC", 1);
1600 constexpr auto NO_ARG = no_argument;
1601 constexpr auto REQ_ARG = required_argument;
1602 static const struct option longopts[] = {
1603 { "help", NO_ARG, NULL, 'h' },
1604 { "version", NO_ARG, NULL, 'V' },
1605 { "stemmer", REQ_ARG, NULL, 's' },
1606 { "overwrite", NO_ARG, NULL, 'o' },
1607 { "verbose", NO_ARG, NULL, 'v' },
1608 { 0, 0, NULL, 0 }
1611 int getopt_ret;
1612 while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1613 longopts, NULL)) != -1) {
1614 switch (getopt_ret) {
1615 default:
1616 show_help(1);
1617 break;
1618 case 'h': // --help
1619 show_help(0);
1620 break;
1621 case 'V': // --version
1622 print_package_info(PROG_NAME);
1623 return 0;
1624 case 'o': // --overwrite
1625 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1626 break;
1627 case 'v':
1628 verbose = true;
1629 break;
1630 case 's':
1631 try {
1632 stemmer = Xapian::Stem(optarg);
1633 } catch (const Xapian::InvalidArgumentError &) {
1634 cerr << "Unknown stemming language '" << optarg << "'.\n";
1635 cerr << "Available language names are: "
1636 << Xapian::Stem::get_available_languages() << '\n';
1637 return 1;
1639 break;
1643 argv += optind;
1644 argc -= optind;
1645 if (argc < 2) {
1646 show_help(1);
1649 parse_index_script(argv[1]);
1651 // Open the database. If another process is currently updating the
1652 // database, wait for the lock to become available.
1653 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1654 Xapian::WritableDatabase database(argv[0], flags);
1656 Xapian::TermGenerator indexer;
1657 indexer.set_stemmer(stemmer);
1658 // Set the database for spellings to be added to by the "spell" action.
1659 indexer.set_database(database);
1661 addcount = 0;
1662 repcount = 0;
1663 delcount = 0;
1664 skipcount = 0;
1666 if (argc == 2) {
1667 // Read from stdin.
1668 index_file("<stdin>", cin, database, indexer);
1669 } else {
1670 // Read file(s) listed on the command line.
1671 for (int i = 2; i < argc; ++i) {
1672 ifstream stream(argv[i]);
1673 if (stream) {
1674 index_file(argv[i], stream, database, indexer);
1675 } else {
1676 cerr << "Can't open file " << argv[i] << '\n';
1681 cout << "records (added, replaced, deleted, skipped) = ("
1682 << addcount << ", "
1683 << repcount << ", "
1684 << delcount << ", "
1685 << skipcount << ")\n";
1686 } catch (const Xapian::Error &error) {
1687 cerr << "Exception: " << error.get_description() << '\n';
1688 exit(1);
1689 } catch (const std::bad_alloc &) {
1690 cerr << "Exception: std::bad_alloc\n";
1691 exit(1);
1692 } catch (...) {
1693 cerr << "Unknown Exception\n";
1694 exit(1);