scriptindex: Improve error handling for LOAD action
[xapian.git] / xapian-applications / omega / scriptindex.cc
blob8b93ad34f7ea4658375a5d11d049e6fdaa2ac9cb
1 /* scriptindex.cc
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Sam Liddicott
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include <xapian.h>
28 #include <algorithm>
29 #include <fstream>
30 #include <iostream>
31 #include <list>
32 #include <map>
33 #include <memory>
34 #include <string>
35 #include <unordered_set>
36 #include <vector>
37 #include <cstring>
39 #include <cerrno>
40 #include <cstdio>
41 #include <cstdlib>
42 #include <ctime>
44 #include "commonhelp.h"
45 #include "hashterm.h"
46 #include "loadfile.h"
47 #include "myhtmlparse.h"
48 #include "str.h"
49 #include "stringutils.h"
50 #include "timegm.h"
51 #include "utf8truncate.h"
52 #include "utils.h"
53 #include "values.h"
55 #include "gnu_getopt.h"
57 using namespace std;
// Program name and one-line description; both are used to build the usage
// message in main(), and PROG_NAME is also passed to print_package_info().
59 #define PROG_NAME "scriptindex"
60 #define PROG_DESC "index arbitrary data as described by an index script"
// Set by the -v/--verbose option; when true, index_file() reports each
// document added, replaced or deleted on stdout.
62 static bool verbose;
// Numbers of documents added, replaced and deleted; updated by index_file()
// and reported by main() once all input has been processed.
63 static int addcount;
64 static int repcount;
65 static int delcount;
67 static inline bool
68 prefix_needs_colon(const string & prefix, unsigned ch)
70 if (!C_isupper(ch) && ch != ':') return false;
71 string::size_type len = prefix.length();
72 return (len > 1 && prefix[len - 1] != ':');
// Names of the index actions.  This array is indexed by Action::type values
// (see DUMP_ACTION below and the "has no effect" warnings), so the entries
// must be kept in the same order as that enum.
const char * action_names[] = {
    "bad",
    "new",
    "boolean",
    "date",
    "field",
    "hash",
    "hextobin",
    "index",
    "indexnopos",
    "load",
    "lower",
    "parsedate",
    "spell",
    "split",
    "truncate",
    "unhtml",
    "unique",
    "value",
    "valuenumeric",
    "valuepacked",
    "weight"
};

// For debugging:
#define DUMP_ACTION(A) \
    cout << action_names[(A).get_action()] \
         << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" \
         << endl
/** A single step in the list of actions an index script gives for a field.
 *
 *  Each Action has a type, an optional string argument, an integer argument
 *  and the offset in the index script line at which the action appeared
 *  (used when reporting diagnostics about the action).
 */
class Action {
  public:
    /// The kind of action.
    enum type {
        BAD, NEW,
        BOOLEAN, DATE, FIELD, HASH, HEXTOBIN, INDEX, INDEXNOPOS, LOAD, LOWER,
        PARSEDATE, SPELL, SPLIT, TRUNCATE, UNHTML, UNIQUE, VALUE,
        VALUENUMERIC, VALUEPACKED, WEIGHT
    };

    /// Operations which a SPLIT action can carry in its integer argument.
    enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT };

  private:
    type action;

    int num_arg;

    std::string string_arg;

    // Offset into indexscript line.
    size_t pos;

  public:
    /// Create an action with no argument (integer argument is 0).
    Action(type action_, size_t pos_)
        : action(action_), num_arg(0), string_arg(), pos(pos_) { }

    /** Create an action with a string argument.
     *
     *  The integer argument is set to the result of atoi() on @a arg.
     */
    Action(type action_, size_t pos_, const std::string & arg)
        : action(action_), num_arg(std::atoi(arg.c_str())), string_arg(arg),
          pos(pos_) { }

    /// Create an action with explicit string and integer arguments.
    Action(type action_, size_t pos_, const std::string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }

    type get_action() const { return action; }

    int get_num_arg() const { return num_arg; }

    void set_num_arg(int num) { num_arg = num; }

    const std::string & get_string_arg() const { return string_arg; }

    size_t get_pos() const { return pos; }
};
/// Severity of a diagnostic message.
enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };

/** Write the location prefix of a diagnostic message to stderr.
 *
 *  The output has the form "FILENAME[:LINE[:COLUMN]]: SEVERITY: " with no
 *  newline, so the caller is expected to follow it with the message text.
 *
 *  @param type      Severity, which selects "error", "warning" or "note".
 *  @param filename  Name of the file the diagnostic relates to.
 *  @param line      Line number, or 0 (the default) to omit it.
 *  @param pos       Zero-based offset within the line, or string::npos (the
 *                   default) to omit the column.
 */
static void
report_location(enum diag_type type,
                const std::string& filename,
                size_t line = 0,
                size_t pos = std::string::npos)
{
    std::cerr << filename;
    if (line != 0) {
        std::cerr << ':' << line;
    }
    if (pos != std::string::npos) {
        // The first column is numbered 1.
        std::cerr << ':' << pos + 1;
    }

    const char* severity = nullptr;
    switch (type) {
        case DIAG_ERROR:
            severity = ": error: ";
            break;
        case DIAG_WARN:
            severity = ": warning: ";
            break;
        case DIAG_NOTE:
            severity = ": note: ";
            break;
    }
    if (severity) std::cerr << severity;
}
145 static void
146 report_useless_action(const string &file, size_t line, size_t pos,
147 const string &action)
149 report_location(DIAG_WARN, file, line, pos);
150 cerr << "Index action '" << action << "' has no effect" << endl;
152 static bool given_left_to_right_warning = false;
153 if (!given_left_to_right_warning) {
154 given_left_to_right_warning = true;
155 report_location(DIAG_NOTE, file, line, pos);
156 cerr << "Actions are executed from left to right" << endl;
// The parsed index script: maps each field name to the list of actions to run
// on values of that field.  Filled in by parse_index_script() and looked up
// by field name in index_file().
160 static map<string, vector<Action>> index_spec;
162 static void
163 parse_index_script(const string &filename)
165 ifstream script(filename.c_str());
166 if (!script.is_open()) {
167 report_location(DIAG_ERROR, filename);
168 cerr << strerror(errno) << endl;
169 exit(1);
171 string line;
172 size_t line_no = 0;
173 bool had_unique = false;
174 while (getline(script, line)) {
175 ++line_no;
176 vector<string> fields;
177 vector<Action> actions;
178 string::const_iterator i, j;
179 const string &s = line;
180 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
181 if (i == s.end() || *i == '#') continue;
182 while (true) {
183 if (!C_isalnum(*i)) {
184 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
185 cerr << "field name must start with alphanumeric" << endl;
186 exit(1);
188 j = find_if(i, s.end(),
189 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
190 fields.push_back(string(i, j));
191 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
192 if (i == s.end()) break;
193 if (*i == ':') {
194 ++i;
195 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
196 break;
198 if (i == j) {
199 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
200 cerr << "bad character '" << *i << "' in fieldname" << endl;
201 exit(1);
204 Xapian::termcount weight = 1;
205 size_t useless_weight_pos = string::npos;
206 map<string, Action::type> boolmap;
207 j = i;
208 while (j != s.end()) {
209 size_t action_pos = j - s.begin();
210 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
211 string action(s, j - s.begin(), i - j);
212 Action::type code = Action::BAD;
213 unsigned min_args = 0, max_args = 0;
214 bool takes_integer_argument = false;
215 if (!action.empty()) {
216 switch (action[0]) {
217 case 'b':
218 if (action == "boolean") {
219 code = Action::BOOLEAN;
220 max_args = 1;
222 break;
223 case 'd':
224 if (action == "date") {
225 code = Action::DATE;
226 min_args = max_args = 1;
228 break;
229 case 'f':
230 if (action == "field") {
231 code = Action::FIELD;
232 max_args = 1;
234 break;
235 case 'h':
236 if (action == "hash") {
237 code = Action::HASH;
238 max_args = 1;
239 takes_integer_argument = true;
240 } else if (action == "hextobin") {
241 code = Action::HEXTOBIN;
243 break;
244 case 'i':
245 if (action == "index") {
246 code = Action::INDEX;
247 max_args = 1;
248 } else if (action == "indexnopos") {
249 code = Action::INDEXNOPOS;
250 max_args = 1;
252 break;
253 case 'l':
254 if (action == "lower") {
255 code = Action::LOWER;
256 } else if (action == "load") {
257 code = Action::LOAD;
259 break;
260 case 'p':
261 if (action == "parsedate") {
262 code = Action::PARSEDATE;
263 min_args = max_args = 1;
265 break;
266 case 's':
267 if (action == "spell") {
268 code = Action::SPELL;
269 } else if (action == "split") {
270 code = Action::SPLIT;
271 min_args = 1;
272 max_args = 2;
274 break;
275 case 't':
276 if (action == "truncate") {
277 code = Action::TRUNCATE;
278 min_args = max_args = 1;
279 takes_integer_argument = true;
281 break;
282 case 'u':
283 if (action == "unhtml") {
284 code = Action::UNHTML;
285 } else if (action == "unique") {
286 code = Action::UNIQUE;
287 min_args = max_args = 1;
289 break;
290 case 'v':
291 if (action == "value") {
292 code = Action::VALUE;
293 min_args = max_args = 1;
294 takes_integer_argument = true;
295 } else if (action == "valuenumeric") {
296 code = Action::VALUENUMERIC;
297 min_args = max_args = 1;
298 takes_integer_argument = true;
299 } else if (action == "valuepacked") {
300 code = Action::VALUEPACKED;
301 min_args = max_args = 1;
302 takes_integer_argument = true;
304 break;
305 case 'w':
306 if (action == "weight") {
307 code = Action::WEIGHT;
308 min_args = max_args = 1;
309 takes_integer_argument = true;
311 break;
314 if (code == Action::BAD) {
315 report_location(DIAG_ERROR, filename, line_no, action_pos);
316 cerr << "Unknown index action '" << action << "'" << endl;
317 exit(1);
319 auto i_after_action = i;
320 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
322 if (i != s.end() && *i == '=') {
323 if (i != i_after_action) {
324 report_location(DIAG_WARN, filename, line_no,
325 i_after_action - s.begin());
326 cerr << "putting spaces between the action and '=' is "
327 "deprecated." << endl;
330 if (max_args == 0) {
331 report_location(DIAG_ERROR, filename, line_no,
332 i - s.begin());
333 cerr << "Index action '" << action
334 << "' doesn't take an argument" << endl;
335 exit(1);
338 ++i;
339 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
340 if (i != j) {
341 report_location(DIAG_WARN, filename, line_no,
342 i - s.begin());
343 cerr << "putting spaces between '=' and the argument is "
344 "deprecated." << endl;
347 vector<string> vals;
348 while (true) {
349 if (j != s.end() && *j == '"') {
350 // Quoted argument.
351 ++j;
352 string arg;
353 while (true) {
354 i = find_if(j, s.end(),
355 [](char ch) {
356 return ch == '"' || ch == '\\';
358 if (i == s.end()) {
359 report_location(DIAG_ERROR, filename, line_no,
360 s.size());
361 cerr << "No closing quote" << endl;
362 exit(1);
364 arg.append(j, i);
365 if (*i++ == '"')
366 break;
368 // Escape sequence.
369 if (i == s.end()) {
370 bad_escaping:
371 report_location(DIAG_ERROR, filename, line_no,
372 i - s.begin());
373 cerr << "Bad escaping in quoted action argument"
374 << endl;
375 exit(1);
378 char ch = *i;
379 switch (ch) {
380 case '\\':
381 case '"':
382 break;
383 case '0':
384 ch = '\0';
385 break;
386 case 'n':
387 ch = '\n';
388 break;
389 case 'r':
390 ch = '\r';
391 break;
392 case 't':
393 ch = '\t';
394 break;
395 case 'x': {
396 if (++i == s.end())
397 goto bad_escaping;
398 char ch1 = *i;
399 if (++i == s.end())
400 goto bad_escaping;
401 char ch2 = *i;
402 if (!C_isxdigit(ch1) ||
403 !C_isxdigit(ch2))
404 goto bad_escaping;
405 ch = hex_digit(ch1) << 4 |
406 hex_digit(ch2);
407 break;
409 default:
410 goto bad_escaping;
412 arg += ch;
413 j = i + 1;
415 vals.emplace_back(std::move(arg));
416 if (i == s.end() || C_isspace(*i)) break;
417 if (*i != ',') {
418 report_location(DIAG_ERROR, filename, line_no,
419 i - s.begin());
420 cerr << "Unexpected character '" << *i
421 << "' after closing quote" << endl;
422 exit(1);
424 ++i;
425 } else if (max_args > 1) {
426 // Unquoted argument, split on comma.
427 i = find_if(j, s.end(),
428 [](char ch) {
429 return C_isspace(ch) || ch == ',';
431 vals.emplace_back(j, i);
432 if (*i != ',') break;
433 ++i;
434 } else {
435 // Unquoted argument, including any commas.
436 i = find_if(j, s.end(),
437 [](char ch) { return C_isspace(ch); });
438 vals.emplace_back(j, i);
439 break;
441 j = i;
443 if (vals.size() == max_args) {
444 report_location(DIAG_ERROR, filename, line_no,
445 i - s.begin());
446 cerr << "Index action '" << action
447 << "' takes at most " << max_args << " arguments"
448 << endl;
449 exit(1);
453 if (vals.size() < min_args) {
454 report_location(DIAG_ERROR, filename, line_no,
455 i - s.begin());
456 if (min_args == max_args) {
457 cerr << "Index action '" << action
458 << "' requires " << min_args << " arguments"
459 << endl;
460 exit(1);
462 cerr << "Index action '" << action
463 << "' requires at least " << min_args << " arguments"
464 << endl;
465 exit(1);
468 string val;
469 if (!vals.empty()) {
470 val = vals.front();
473 if (takes_integer_argument) {
474 auto dot = val.find('.');
475 if (dot != string::npos) {
476 report_location(DIAG_WARN, filename, line_no,
477 j - s.begin() + dot);
478 cerr << "Index action '" << action
479 << "' takes an integer argument" << endl;
482 switch (code) {
483 case Action::INDEX:
484 case Action::INDEXNOPOS:
485 actions.emplace_back(code, action_pos, val, weight);
486 useless_weight_pos = string::npos;
487 break;
488 case Action::WEIGHT:
489 // We don't push an Action for WEIGHT - instead we
490 // store it ready to use in the INDEX and INDEXNOPOS
491 // Actions.
492 weight = atoi(val.c_str());
493 if (useless_weight_pos != string::npos) {
494 report_useless_action(filename, line_no,
495 useless_weight_pos, action);
497 useless_weight_pos = action_pos;
498 break;
499 case Action::SPLIT: {
500 if (val.empty()) {
501 report_location(DIAG_ERROR, filename, line_no);
502 cerr << "Split delimiter can't be empty" << endl;
503 exit(1);
505 int operation = Action::SPLIT_NONE;
506 if (vals.size() >= 2) {
507 if (vals[1] == "dedup") {
508 operation = Action::SPLIT_DEDUP;
509 } else if (vals[1] == "sort") {
510 operation = Action::SPLIT_SORT;
511 } else if (vals[1] == "none") {
512 operation = Action::SPLIT_NONE;
513 } else {
514 report_location(DIAG_ERROR, filename, line_no);
515 cerr << "Bad split operation '" << vals[1]
516 << "'" << endl;
517 exit(1);
520 actions.emplace_back(code, action_pos, val, operation);
521 break;
523 case Action::TRUNCATE:
524 if (!actions.empty() &&
525 actions.back().get_action() == Action::LOAD) {
526 /* Turn "load truncate=n" into "load" with
527 * num_arg n, so that we don't needlessly
528 * allocate memory and read data we're just
529 * going to ignore.
531 actions.pop_back();
532 code = Action::LOAD;
534 actions.emplace_back(code, action_pos, val);
535 break;
536 case Action::UNIQUE:
537 if (had_unique) {
538 report_location(DIAG_ERROR, filename, line_no,
539 action_pos);
540 cerr << "Index action 'unique' used more than once"
541 << endl;
542 exit(1);
544 had_unique = true;
545 if (boolmap.find(val) == boolmap.end())
546 boolmap[val] = Action::UNIQUE;
547 actions.emplace_back(code, action_pos, val);
548 break;
549 case Action::HASH: {
550 actions.emplace_back(code, action_pos, val);
551 auto& obj = actions.back();
552 auto max_length = obj.get_num_arg();
553 if (max_length < 6) {
554 report_location(DIAG_ERROR, filename, line_no,
555 obj.get_pos() + 4 + 1);
556 cerr << "Index action 'hash' takes an integer "
557 "argument which must be at least 6" << endl;
558 exit(1);
560 break;
562 case Action::BOOLEAN:
563 boolmap[val] = Action::BOOLEAN;
564 /* FALLTHRU */
565 default:
566 actions.emplace_back(code, action_pos, val);
568 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
569 } else {
570 if (min_args > 0) {
571 report_location(DIAG_ERROR, filename, line_no,
572 i_after_action - s.begin());
573 if (min_args == max_args) {
574 cerr << "Index action '" << action << "' requires "
575 << min_args << " arguments" << endl;
576 exit(1);
578 cerr << "Index action '" << action << "' requires at least "
579 << min_args << " arguments" << endl;
580 exit(1);
582 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
583 useless_weight_pos = string::npos;
584 actions.emplace_back(code, action_pos, "", weight);
585 } else if (code == Action::HASH) {
586 actions.emplace_back(code, action_pos, "",
587 MAX_SAFE_TERM_LENGTH - 1);
588 } else {
589 actions.emplace_back(code, action_pos);
592 j = i;
595 if (useless_weight_pos != string::npos) {
596 report_useless_action(filename, line_no, useless_weight_pos,
597 "weight");
600 while (!actions.empty()) {
601 bool done = true;
602 Action::type action = actions.back().get_action();
603 switch (action) {
604 case Action::HASH:
605 case Action::HEXTOBIN:
606 case Action::LOWER:
607 case Action::PARSEDATE:
608 case Action::SPELL:
609 case Action::TRUNCATE:
610 case Action::UNHTML:
611 done = false;
612 report_useless_action(filename, line_no,
613 actions.back().get_pos(),
614 action_names[action]);
615 actions.pop_back();
616 break;
617 default:
618 break;
620 if (done) break;
623 map<string, Action::type>::const_iterator boolpfx;
624 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
625 if (boolpfx->second == Action::UNIQUE) {
626 report_location(DIAG_WARN, filename, line_no);
627 cerr << "Index action 'unique=" << boolpfx->first
628 << "' without 'boolean=" << boolpfx->first << "'" << endl;
629 static bool given_doesnt_imply_boolean_warning = false;
630 if (!given_doesnt_imply_boolean_warning) {
631 given_doesnt_imply_boolean_warning = true;
632 report_location(DIAG_NOTE, filename, line_no);
633 cerr << "'unique' doesn't implicitly add a boolean term"
634 << endl;
639 vector<string>::const_iterator field;
640 for (field = fields.begin(); field != fields.end(); ++field) {
641 vector<Action> &v = index_spec[*field];
642 if (v.empty()) {
643 if (fields.size() == 1) {
644 // Optimise common case where there's only one fieldname
645 // for a list of actions.
646 v = std::move(actions);
647 } else {
648 v = actions;
650 } else {
651 v.emplace_back(Action::NEW, string::npos);
652 v.insert(v.end(), actions.begin(), actions.end());
657 if (index_spec.empty()) {
658 report_location(DIAG_ERROR, filename, line_no);
659 cerr << "No rules found in index script" << endl;
660 exit(1);
664 static bool
665 run_actions(vector<Action>::const_iterator action_it,
666 vector<Action>::const_iterator action_end,
667 Xapian::WritableDatabase& database,
668 Xapian::TermGenerator& indexer,
669 const string& old_value,
670 bool& this_field_is_content, Xapian::Document& doc,
671 map<string, list<string>>& fields,
672 string& field, const char* fname,
673 size_t line_no, Xapian::docid& docid)
675 string value = old_value;
676 while (action_it != action_end) {
677 auto& action = *action_it++;
678 switch (action.get_action()) {
679 case Action::BAD:
680 abort();
681 case Action::NEW:
682 value = old_value;
683 // We're processing the same field again - give it a reprieve.
684 this_field_is_content = true;
685 break;
686 case Action::FIELD:
687 if (!value.empty()) {
688 string f = action.get_string_arg();
689 if (f.empty()) f = field;
690 // replace newlines with spaces
691 string s = value;
692 string::size_type j = 0;
693 while ((j = s.find('\n', j)) != string::npos)
694 s[j] = ' ';
695 fields[f].push_back(s);
697 break;
698 case Action::INDEX:
699 indexer.index_text(value,
700 action.get_num_arg(),
701 action.get_string_arg());
702 break;
703 case Action::INDEXNOPOS:
704 // No positional information so phrase searching won't work.
705 // However, the database will use much less diskspace.
706 indexer.index_text_without_positions(value,
707 action.get_num_arg(),
708 action.get_string_arg());
709 break;
710 case Action::BOOLEAN: {
711 // Do nothing if there's no text.
712 if (value.empty()) break;
714 string term = action.get_string_arg();
715 if (prefix_needs_colon(term, value[0])) term += ':';
716 term += value;
718 doc.add_boolean_term(term);
719 break;
721 case Action::HASH: {
722 unsigned int max_length = action.get_num_arg();
723 if (value.length() > max_length)
724 value = hash_long_term(value, max_length);
725 break;
727 case Action::HEXTOBIN: {
728 size_t len = value.length();
729 if (len & 1) {
730 report_location(DIAG_ERROR, fname, line_no);
731 cerr << "hextobin: input must have even length"
732 << endl;
733 } else {
734 string output;
735 output.reserve(len / 2);
736 for (size_t j = 0; j < len; j += 2) {
737 char a = value[j];
738 char b = value[j + 1];
739 if (!C_isxdigit(a) || !C_isxdigit(b)) {
740 report_location(DIAG_ERROR, fname, line_no);
741 cerr << "hextobin: input must be all hex "
742 "digits" << endl;
743 goto badhex;
745 char r = (hex_digit(a) << 4) | hex_digit(b);
746 output.push_back(r);
748 value = std::move(output);
750 badhex:
751 break;
753 case Action::LOWER:
754 value = Xapian::Unicode::tolower(value);
755 break;
756 case Action::LOAD: {
757 bool truncated = false;
758 string filename = std::move(value);
759 // FIXME: Use NOATIME if we own the file or are root.
760 if (!load_file(filename, action.get_num_arg(), NOCACHE,
761 value, truncated)) {
762 report_location(DIAG_ERROR, fname, line_no);
763 cerr << "Couldn't load file '" << filename << "': "
764 << strerror(errno) << endl;
765 value.resize(0);
766 break;
768 if (!truncated) break;
770 /* FALLTHRU */
771 case Action::TRUNCATE:
772 utf8_truncate(value, action.get_num_arg());
773 break;
774 case Action::SPELL:
775 indexer.set_flags(indexer.FLAG_SPELLING);
776 break;
777 case Action::SPLIT: {
778 // Execute actions on the split up to the first NEW, if any.
779 vector<Action>::const_iterator split_end = action_it;
780 while (split_end != action_end &&
781 split_end->get_action() != Action::NEW) {
782 ++split_end;
785 if (value.empty()) {
786 // Nothing to do.
787 } else if (action.get_num_arg() != Action::SPLIT_SORT) {
788 // Generate split as we consume it.
789 const string& delimiter = action.get_string_arg();
791 unique_ptr<unordered_set<string>> seen;
792 if (action.get_num_arg() == Action::SPLIT_DEDUP) {
793 seen.reset(new unordered_set<string>);
796 if (delimiter.size() == 1) {
797 // Special case for common single character delimiter.
798 char ch = delimiter[0];
799 string::size_type i = 0;
800 while (true) {
801 string::size_type j = value.find(ch, i);
802 if (i != j) {
803 string val(value, i, j - i);
804 if (!seen.get() || seen->insert(val).second) {
805 run_actions(action_it, split_end,
806 database, indexer,
807 val,
808 this_field_is_content, doc,
809 fields,
810 field, fname, line_no,
811 docid);
814 if (j == string::npos) break;
815 i = j + 1;
817 } else {
818 string::size_type i = 0;
819 while (true) {
820 string::size_type j = value.find(delimiter, i);
821 if (i != j) {
822 string val(value, i, j - i);
823 if (!seen.get() || seen->insert(val).second) {
824 run_actions(action_it, split_end,
825 database, indexer,
826 val,
827 this_field_is_content, doc,
828 fields,
829 field, fname, line_no,
830 docid);
833 if (j == string::npos) break;
834 i = j + delimiter.size();
837 } else {
838 vector<string> split_values;
839 const string& delimiter = action.get_string_arg();
840 if (delimiter.size() == 1) {
841 // Special case for common single character delimiter.
842 char ch = delimiter[0];
843 string::size_type i = 0;
844 while (true) {
845 string::size_type j = value.find(ch, i);
846 if (i != j) {
847 split_values.emplace_back(value, i, j - i);
849 if (j == string::npos) break;
850 i = j + 1;
852 } else {
853 string::size_type i = 0;
854 while (true) {
855 string::size_type j = value.find(delimiter, i);
856 if (i != j) {
857 split_values.emplace_back(value, i, j - i);
859 if (j == string::npos) break;
860 i = j + delimiter.size();
864 sort(split_values.begin(), split_values.end());
866 for (auto&& val : split_values) {
867 run_actions(action_it, split_end,
868 database, indexer, val,
869 this_field_is_content, doc, fields,
870 field, fname, line_no,
871 docid);
875 action_it = split_end;
876 break;
878 case Action::UNHTML: {
879 MyHtmlParser p;
880 try {
881 // Default HTML character set is latin 1, though
882 // not specifying one is deprecated these days.
883 p.parse_html(value, "iso-8859-1", false);
884 } catch (const string & newcharset) {
885 p.reset();
886 p.parse_html(value, newcharset, true);
888 if (p.indexing_allowed)
889 value = p.dump;
890 else
891 value = "";
892 break;
894 case Action::UNIQUE: {
895 // If there's no text, just issue a warning.
896 if (value.empty()) {
897 report_location(DIAG_WARN, fname, line_no);
898 cerr << "Ignoring UNIQUE action on empty text"
899 << endl;
900 break;
903 // Ensure that the value of this field is unique.
904 // If a record already exists with the same value,
905 // it will be replaced with the new record.
907 // Unique fields aren't considered content - if
908 // there are no other fields in the document, the
909 // document is to be deleted.
910 this_field_is_content = false;
912 // Argument is the prefix to add to the field value
913 // to get the unique term.
914 string t = action.get_string_arg();
915 if (prefix_needs_colon(t, value[0])) t += ':';
916 t += value;
917 Xapian::PostingIterator p = database.postlist_begin(t);
918 if (p != database.postlist_end(t)) {
919 docid = *p;
921 break;
923 case Action::VALUE:
924 if (!value.empty())
925 doc.add_value(action.get_num_arg(), value);
926 break;
927 case Action::VALUENUMERIC: {
928 if (value.empty()) break;
929 char * end;
930 double dbl = strtod(value.c_str(), &end);
931 if (*end) {
932 report_location(DIAG_WARN, fname, line_no);
933 cerr << "Trailing characters in VALUENUMERIC: '"
934 << value << "'" << endl;
936 doc.add_value(action.get_num_arg(),
937 Xapian::sortable_serialise(dbl));
938 break;
940 case Action::VALUEPACKED: {
941 uint32_t word = 0;
942 if (value.empty() || !C_isdigit(value[0])) {
943 // strtoul() accepts leading whitespace and negated
944 // values, neither of which we want to allow.
945 errno = EINVAL;
946 } else {
947 errno = 0;
948 char* q;
949 word = strtoul(value.c_str(), &q, 10);
950 if (!errno && *q != '\0') {
951 // Trailing characters after converted value.
952 errno = EINVAL;
955 if (errno) {
956 report_location(DIAG_WARN, fname, line_no);
957 cerr << "valuepacked \"" << value << "\" ";
958 if (errno == ERANGE) {
959 cerr << "out of range";
960 } else {
961 cerr << "not an unsigned integer";
963 cerr << endl;
965 int valueslot = action.get_num_arg();
966 doc.add_value(valueslot, int_to_binary_string(word));
967 break;
969 case Action::DATE: {
970 const string & type = action.get_string_arg();
971 string yyyymmdd;
972 if (type == "unix") {
973 time_t t = atoi(value.c_str());
974 struct tm *tm = localtime(&t);
975 int y = tm->tm_year + 1900;
976 int m = tm->tm_mon + 1;
977 yyyymmdd = date_to_string(y, m, tm->tm_mday);
978 } else if (type == "yyyymmdd") {
979 if (value.length() == 8) yyyymmdd = value;
981 if (yyyymmdd.empty()) break;
982 // Date (YYYYMMDD)
983 doc.add_boolean_term("D" + yyyymmdd);
984 yyyymmdd.resize(6);
985 // Month (YYYYMM)
986 doc.add_boolean_term("M" + yyyymmdd);
987 yyyymmdd.resize(4);
988 // Year (YYYY)
989 doc.add_boolean_term("Y" + yyyymmdd);
990 break;
992 case Action::PARSEDATE: {
993 string dateformat = action.get_string_arg();
994 struct tm tm;
995 memset(&tm, 0, sizeof(tm));
996 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
997 if (ret == NULL) {
998 report_location(DIAG_WARN, fname, line_no);
999 cerr << "\"" << value << "\" doesn't match format "
1000 "\"" << dateformat << '\"' << endl;
1001 break;
1004 if (*ret != '\0') {
1005 report_location(DIAG_WARN, fname, line_no);
1006 cerr << "\"" << value << "\" not fully matched by "
1007 "format \"" << dateformat << "\" "
1008 "(\"" << ret << "\" left over) but "
1009 "indexing anyway" << endl;
1012 value = str(timegm(&tm));
1013 break;
1015 default:
1016 /* Empty default case to avoid "unhandled enum value"
1017 * warnings. */
1018 break;
1021 return true;
1024 static void
1025 index_file(const char *fname, istream &stream,
1026 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1028 string line;
1029 size_t line_no = 0;
1030 while (!stream.eof() && getline(stream, line)) {
1031 ++line_no;
1032 Xapian::Document doc;
1033 indexer.set_document(doc);
1034 Xapian::docid docid = 0;
1035 map<string, list<string>> fields;
1036 bool seen_content = false;
1037 while (!line.empty()) {
1038 // Cope with files from MS Windows (\r\n end of lines).
1039 // Trim multiple \r characters, since that seems the best way
1040 // to handle that case.
1041 string::size_type last = line.find_last_not_of('\r');
1042 if (last == string::npos) break;
1043 line.resize(last + 1);
1045 string::size_type eq = line.find('=');
1046 if (eq == string::npos && !line.empty()) {
1047 report_location(DIAG_ERROR, fname, line_no, line.size());
1048 cerr << "expected = somewhere in this line" << endl;
1049 // FIXME: die or what?
1051 string field(line, 0, eq);
1052 string value(line, eq + 1, string::npos);
1053 while (getline(stream, line)) {
1054 ++line_no;
1055 if (line.empty() || line[0] != '=') break;
1056 // Cope with files from MS Windows (\r\n end of lines).
1057 // Trim multiple \r characters, since that seems the best way
1058 // to handle that case.
1059 last = line.find_last_not_of('\r');
1060 // line[0] == '=', so last != string::npos.
1061 // Replace the '=' with a '\n' so we don't have to use substr.
1062 line[0] = '\n';
1063 line.resize(last + 1);
1064 value += line;
1067 // Default to not indexing spellings.
1068 indexer.set_flags(Xapian::TermGenerator::flags(0));
1070 bool this_field_is_content = true;
1071 const vector<Action>& v = index_spec[field];
1072 run_actions(v.begin(), v.end(),
1073 database, indexer, value,
1074 this_field_is_content, doc, fields,
1075 field, fname, line_no,
1076 docid);
1077 if (this_field_is_content) seen_content = true;
1078 if (stream.eof()) break;
1081 // If we haven't seen any fields (other than unique identifiers)
1082 // the document is to be deleted.
1083 if (!seen_content) {
1084 if (docid) {
1085 database.delete_document(docid);
1086 if (verbose) cout << "Del: " << docid << endl;
1087 ++delcount;
1089 } else {
1090 string data;
1091 for (auto&& i : fields) {
1092 for (auto&& field_val : i.second) {
1093 data += i.first;
1094 data += '=';
1095 data += field_val;
1096 data += '\n';
1100 // Put the data in the document
1101 doc.set_data(data);
1103 // Add the document to the database
1104 if (docid) {
1105 database.replace_document(docid, doc);
1106 if (verbose) cout << "Replace: " << docid << endl;
1107 ++repcount;
1108 } else {
1109 docid = database.add_document(doc);
1110 if (verbose) cout << "Add: " << docid << endl;
1111 ++addcount;
1116 // Commit after each file to make sure all changes from that file make it
1117 // in.
1118 if (verbose) cout << "Committing: " << endl;
1119 database.commit();
1123 main(int argc, char **argv)
1124 try {
1125 // If the database already exists, default to updating not overwriting.
1126 int database_mode = Xapian::DB_CREATE_OR_OPEN;
1127 verbose = false;
1128 Xapian::Stem stemmer("english");
1130 constexpr auto NO_ARG = no_argument;
1131 constexpr auto REQ_ARG = required_argument;
1132 static const struct option longopts[] = {
1133 { "help", NO_ARG, NULL, 'h' },
1134 { "version", NO_ARG, NULL, 'V' },
1135 { "stemmer", REQ_ARG, NULL, 's' },
1136 { "overwrite", NO_ARG, NULL, 'o' },
1137 { "verbose", NO_ARG, NULL, 'v' },
1138 { 0, 0, NULL, 0 }
1141 bool more = true, show_help = false;
1142 while (more) {
1143 switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
1144 case EOF:
1145 more = false;
1146 break;
1147 default:
1148 case 'h': // --help
1149 show_help = true;
1150 more = false;
1151 break;
1152 case 'V': // --version
1153 print_package_info(PROG_NAME);
1154 return 0;
1155 case 'o': // --overwrite
1156 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1157 break;
1158 case 'v':
1159 verbose = true;
1160 break;
1161 case 's':
1162 try {
1163 stemmer = Xapian::Stem(optarg);
1164 } catch (const Xapian::InvalidArgumentError &) {
1165 cerr << "Unknown stemming language '" << optarg << "'.\n";
1166 cerr << "Available language names are: "
1167 << Xapian::Stem::get_available_languages() << endl;
1168 return 1;
1170 break;
1174 argv += optind;
1175 argc -= optind;
1176 if (show_help || argc < 2) {
1177 cout << PROG_NAME " - " PROG_DESC "\n"
1178 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1179 "\n"
1180 "Creates or updates a Xapian database with the data from the input files listed\n"
1181 "on the command line. If no files are specified, data is read from stdin.\n"
1182 "\n"
1183 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1184 "format for INDEXER_SCRIPT.\n"
1185 "\n"
1186 "Options:\n"
1187 " -v, --verbose display additional messages to aid debugging\n"
1188 " --overwrite create the database anew (the default is to update if\n"
1189 " the database already exists)\n";
1190 print_stemmer_help("");
1191 print_help_and_version_help("");
1192 exit(show_help ? 0 : 1);
1195 parse_index_script(argv[1]);
1197 // Open the database. If another process is currently updating the
1198 // database, wait for the lock to become available.
1199 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1200 Xapian::WritableDatabase database(argv[0], flags);
1202 Xapian::TermGenerator indexer;
1203 indexer.set_stemmer(stemmer);
1204 // Set the database for spellings to be added to by the "spell" action.
1205 indexer.set_database(database);
1207 addcount = 0;
1208 repcount = 0;
1209 delcount = 0;
1211 if (argc == 2) {
1212 // Read from stdin.
1213 index_file("<stdin>", cin, database, indexer);
1214 } else {
1215 // Read file(s) listed on the command line.
1216 for (int i = 2; i < argc; ++i) {
1217 ifstream stream(argv[i]);
1218 if (stream) {
1219 index_file(argv[i], stream, database, indexer);
1220 } else {
1221 cerr << "Can't open file " << argv[i] << endl;
1226 cout << "records (added, replaced, deleted) = (" << addcount << ", "
1227 << repcount << ", " << delcount << ")" << endl;
1228 } catch (const Xapian::Error &error) {
1229 cerr << "Exception: " << error.get_description() << endl;
1230 exit(1);
1231 } catch (const std::bad_alloc &) {
1232 cerr << "Exception: std::bad_alloc" << endl;
1233 exit(1);
1234 } catch (...) {
1235 cerr << "Unknown Exception" << endl;
1236 exit(1);