scriptindex: Support multiple index action arguments
[xapian.git] / xapian-applications / omega / scriptindex.cc
blob5f4327527c70eeb31eb401afe63818098890bd09
1 /* scriptindex.cc
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Sam Liddicott
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include <xapian.h>
28 #include <algorithm>
29 #include <fstream>
30 #include <iostream>
31 #include <list>
32 #include <map>
33 #include <string>
34 #include <vector>
35 #include <cstring>
37 #include <cstdlib>
38 #include "safeerrno.h"
39 #include <cstdio>
40 #include <ctime>
42 #include "commonhelp.h"
43 #include "hashterm.h"
44 #include "loadfile.h"
45 #include "myhtmlparse.h"
46 #include "str.h"
47 #include "stringutils.h"
48 #include "timegm.h"
49 #include "utf8truncate.h"
50 #include "utils.h"
51 #include "values.h"
53 #include "gnu_getopt.h"
55 using namespace std;
57 #define PROG_NAME "scriptindex"
58 #define PROG_DESC "index arbitrary data as described by an index script"
// Set from the -v/--verbose command line option in main(); when true,
// index_file() reports each add/replace/delete and the final commit on stdout.
60 static bool verbose;
// Counts of documents added, replaced and deleted.  Reset in main() before
// indexing starts, incremented in index_file(), and printed by main() once
// all input has been processed.
61 static int addcount;
62 static int repcount;
63 static int delcount;
65 inline bool
66 prefix_needs_colon(const string & prefix, unsigned ch)
68 if (!C_isupper(ch) && ch != ':') return false;
69 string::size_type len = prefix.length();
70 return (len > 1 && prefix[len - 1] != ':');
// Printable name for each Action::type code.  This table is indexed by the
// enum value, so the entries must stay in the same order as Action::type.
const char * action_names[] = {
    // Internal codes.
    "bad",
    "new",
    // Codes for actions which can appear in an index script.
    "boolean",
    "date",
    "field",
    "hash",
    "hextobin",
    "index",
    "indexnopos",
    "load",
    "lower",
    "parsedate",
    "spell",
    "truncate",
    "unhtml",
    "unique",
    "value",
    "valuenumeric",
    "valuepacked",
    "weight"
};
// Debugging aid: write Action A to stdout as "name(string_arg,num_arg)".
#define DUMP_ACTION(A) \
    cout << action_names[(A).get_action()] \
         << "(" << (A).get_string_arg() \
         << "," << (A).get_num_arg() \
         << ")" << endl
/** A single step in the list of actions applied to a field's value.
 *
 *  Each Action carries an action code plus an optional string argument and
 *  an optional integer argument.
 */
class Action {
  public:
    /** Action codes.
     *
     *  The numeric values are used to index action_names, so the order here
     *  must match that table.
     */
    enum type {
        BAD,
        NEW,
        BOOLEAN,
        DATE,
        FIELD,
        HASH,
        HEXTOBIN,
        INDEX,
        INDEXNOPOS,
        LOAD,
        LOWER,
        PARSEDATE,
        SPELL,
        TRUNCATE,
        UNHTML,
        UNIQUE,
        VALUE,
        VALUENUMERIC,
        VALUEPACKED,
        WEIGHT
    };

  private:
    type action;
    int num_arg;
    std::string string_arg;

  public:
    /// Action with no argument: empty string argument, integer argument 0.
    Action(type action_)
        : action(action_), num_arg(0), string_arg() { }

    /** Action with a string argument.
     *
     *  The integer argument is set to the result of atoi() on @a arg.
     */
    Action(type action_, const std::string & arg)
        : action(action_), num_arg(std::atoi(arg.c_str())), string_arg(arg) { }

    /// Action with independently specified string and integer arguments.
    Action(type action_, const std::string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg) { }

    /// The action code.
    type get_action() const { return action; }

    /// The integer argument.
    int get_num_arg() const { return num_arg; }

    /// Replace the integer argument.
    void set_num_arg(int num) { num_arg = num; }

    /// The string argument.
    const std::string & get_string_arg() const { return string_arg; }
};
/** Warn on stderr that an index action in the script has no effect.
 *
 *  @param file    Name of the index script.
 *  @param line    Line number within the script.
 *  @param pos     Offset within the line, or std::string::npos to omit it.
 *  @param action  Name of the action being reported.
 *
 *  The first call in a run also emits a one-off note that actions are
 *  executed from left to right.
 */
static void
report_useless_action(const std::string &file, size_t line, size_t pos,
                      const std::string &action)
{
    // Location: "file:line" with an optional ":pos" suffix.
    std::cerr << file << ':' << line;
    if (pos != std::string::npos) {
        std::cerr << ':' << pos;
    }
    std::cerr << ": Warning: Index action '" << action << "' has no effect"
              << std::endl;

    // The explanatory note is only worth showing once.
    static bool note_already_shown = false;
    if (note_already_shown) return;
    note_already_shown = true;
    std::cerr << file << ':' << line
              << ": Warning: Note that actions are executed from left to right"
              << std::endl;
}
// Maps each field name to the list of actions to apply to that field's value.
// Filled in by parse_index_script() and consulted by index_file().  When more
// than one script line names the same field, the action lists are joined with
// an Action::NEW entry between them.
126 static map<string, vector<Action> > index_spec;
128 static void
129 parse_index_script(const string &filename)
131 ifstream script(filename.c_str());
132 if (!script.is_open()) {
133 cerr << filename << ": " << strerror(errno) << endl;
134 exit(1);
136 string line;
137 size_t line_no = 0;
138 bool had_unique = false;
139 while (getline(script, line)) {
140 ++line_no;
141 vector<string> fields;
142 vector<Action> actions;
143 string::const_iterator i, j;
144 const string &s = line;
145 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
146 if (i == s.end() || *i == '#') continue;
147 while (true) {
148 if (!C_isalnum(*i)) {
149 cerr << filename << ':' << line_no
150 << ": field name must start with alphanumeric" << endl;
151 exit(1);
153 j = find_if(i, s.end(),
154 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
155 fields.push_back(string(i, j));
156 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
157 if (i == s.end()) break;
158 if (*i == ':') {
159 ++i;
160 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
161 break;
163 if (i == j) {
164 cerr << filename << ':' << line_no
165 << ": bad character '" << *j << "' in fieldname" << endl;
166 exit(1);
169 Xapian::termcount weight = 1;
170 size_t useless_weight_pos = string::npos;
171 map<string, Action::type> boolmap;
172 j = i;
173 while (j != s.end()) {
174 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
175 string action(s, j - s.begin(), i - j);
176 Action::type code = Action::BAD;
177 unsigned min_args = 0, max_args = 0;
178 bool takes_integer_argument = false;
179 if (!action.empty()) {
180 switch (action[0]) {
181 case 'b':
182 if (action == "boolean") {
183 code = Action::BOOLEAN;
184 max_args = 1;
186 break;
187 case 'd':
188 if (action == "date") {
189 code = Action::DATE;
190 min_args = max_args = 1;
192 break;
193 case 'f':
194 if (action == "field") {
195 code = Action::FIELD;
196 max_args = 1;
198 break;
199 case 'h':
200 if (action == "hash") {
201 code = Action::HASH;
202 max_args = 1;
203 takes_integer_argument = true;
204 } else if (action == "hextobin") {
205 code = Action::HEXTOBIN;
207 break;
208 case 'i':
209 if (action == "index") {
210 code = Action::INDEX;
211 max_args = 1;
212 } else if (action == "indexnopos") {
213 code = Action::INDEXNOPOS;
214 max_args = 1;
216 break;
217 case 'l':
218 if (action == "lower") {
219 code = Action::LOWER;
220 } else if (action == "load") {
221 code = Action::LOAD;
223 break;
224 case 'p':
225 if (action == "parsedate") {
226 code = Action::PARSEDATE;
227 min_args = max_args = 1;
229 break;
230 case 's':
231 if (action == "spell") {
232 code = Action::SPELL;
234 break;
235 case 't':
236 if (action == "truncate") {
237 code = Action::TRUNCATE;
238 min_args = max_args = 1;
239 takes_integer_argument = true;
241 break;
242 case 'u':
243 if (action == "unhtml") {
244 code = Action::UNHTML;
245 } else if (action == "unique") {
246 code = Action::UNIQUE;
247 min_args = max_args = 1;
249 break;
250 case 'v':
251 if (action == "value") {
252 code = Action::VALUE;
253 min_args = max_args = 1;
254 takes_integer_argument = true;
255 } else if (action == "valuenumeric") {
256 code = Action::VALUENUMERIC;
257 min_args = max_args = 1;
258 takes_integer_argument = true;
259 } else if (action == "valuepacked") {
260 code = Action::VALUEPACKED;
261 min_args = max_args = 1;
262 takes_integer_argument = true;
264 break;
265 case 'w':
266 if (action == "weight") {
267 code = Action::WEIGHT;
268 min_args = max_args = 1;
269 takes_integer_argument = true;
271 break;
274 if (code == Action::BAD) {
275 cerr << filename << ':' << line_no
276 << ": Unknown index action '" << action << "'" << endl;
277 exit(1);
279 auto i_after_action = i;
280 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
282 if (i != s.end() && *i == '=') {
283 if (i != i_after_action) {
284 cerr << filename << ':' << line_no
285 << ": warning: putting spaces between the action and "
286 "'=' is deprecated." << endl;
289 if (max_args == 0) {
290 cerr << filename << ':' << line_no
291 << ": Index action '" << action
292 << "' doesn't take an argument" << endl;
293 exit(1);
296 ++i;
297 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
298 if (i != j) {
299 cerr << filename << ':' << line_no
300 << ": warning: putting spaces between '=' and the "
301 "argument is deprecated." << endl;
304 vector<string> vals;
305 while (true) {
306 if (j != s.end() && *j == '"') {
307 // Quoted argument.
308 ++j;
309 i = find(j, s.end(), '"');
310 if (i == s.end()) {
311 cerr << filename << ':' << line_no
312 << ": No closing quote" << endl;
313 exit(1);
315 vals.emplace_back(j, i);
316 ++i;
317 if (i == s.end() || C_isspace(*i)) break;
318 if (*i != ',') {
319 cerr << filename << ':' << line_no
320 << ": Unexpected character '" << *i
321 << "' after closing quote" << endl;
322 exit(1);
324 ++i;
325 } else if (max_args > 1) {
326 // Unquoted argument, split on comma.
327 i = find_if(j, s.end(),
328 [](char ch) {
329 return C_isspace(ch) || ch == ',';
331 vals.emplace_back(j, i);
332 if (*i != ',') break;
333 ++i;
334 } else {
335 // Unquoted argument, including any commas.
336 i = find_if(j, s.end(),
337 [](char ch) { return C_isspace(ch); });
338 vals.emplace_back(j, i);
339 break;
342 if (vals.size() == max_args) {
343 cerr << filename << ':' << line_no
344 << ": Index action '" << action
345 << "' takes at most " << max_args << " arguments"
346 << endl;
347 exit(1);
351 if (vals.size() < min_args) {
352 if (min_args == max_args) {
353 cerr << filename << ':' << line_no
354 << ": Index action '" << action
355 << "' requires " << min_args << " arguments"
356 << endl;
357 exit(1);
359 cerr << filename << ':' << line_no
360 << ": Index action '" << action
361 << "' requires at least " << min_args << " arguments"
362 << endl;
363 exit(1);
366 string val;
367 if (!vals.empty()) {
368 val = vals.front();
371 if (takes_integer_argument) {
372 if (val.find('.') != string::npos) {
373 cerr << filename << ':' << line_no
374 << ": Warning: Index action '" << action
375 << "' takes an integer argument" << endl;
378 switch (code) {
379 case Action::INDEX:
380 case Action::INDEXNOPOS:
381 actions.emplace_back(code, val, weight);
382 useless_weight_pos = string::npos;
383 break;
384 case Action::WEIGHT:
385 // We don't push an Action for WEIGHT - instead we
386 // store it ready to use in the INDEX and INDEXNOPOS
387 // Actions.
388 weight = atoi(val.c_str());
389 if (useless_weight_pos != string::npos) {
390 report_useless_action(filename, line_no,
391 useless_weight_pos, action);
393 useless_weight_pos = j - s.begin();
394 break;
395 case Action::TRUNCATE:
396 if (!actions.empty() &&
397 actions.back().get_action() == Action::LOAD) {
398 /* Turn "load truncate=n" into "load" with
399 * num_arg n, so that we don't needlessly
400 * allocate memory and read data we're just
401 * going to ignore.
403 actions.pop_back();
404 code = Action::LOAD;
406 actions.emplace_back(code, val);
407 break;
408 case Action::UNIQUE:
409 if (had_unique) {
410 cerr << filename << ':' << line_no
411 << ": Index action 'unique' used more than "
412 "once" << endl;
413 exit(1);
415 had_unique = true;
416 if (boolmap.find(val) == boolmap.end())
417 boolmap[val] = Action::UNIQUE;
418 actions.emplace_back(code, val);
419 break;
420 case Action::HASH: {
421 actions.emplace_back(code, val);
422 auto& obj = actions.back();
423 auto max_length = obj.get_num_arg();
424 if (max_length < 6) {
425 cerr << filename << ':' << line_no
426 << ": Index action 'hash' takes an integer "
427 "argument which must be at least 6" << endl;
428 exit(1);
430 break;
432 case Action::BOOLEAN:
433 boolmap[val] = Action::BOOLEAN;
434 /* FALLTHRU */
435 default:
436 actions.emplace_back(code, val);
438 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
439 } else {
440 if (min_args > 0) {
441 if (min_args == max_args) {
442 cerr << filename << ':' << line_no
443 << ": Index action '" << action
444 << "' requires " << min_args << " arguments"
445 << endl;
446 exit(1);
448 cerr << filename << ':' << line_no
449 << ": Index action '" << action
450 << "' requires at least " << min_args << " arguments"
451 << endl;
452 exit(1);
454 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
455 useless_weight_pos = string::npos;
456 actions.emplace_back(code, "", weight);
457 } else if (code == Action::HASH) {
458 actions.emplace_back(code, "", MAX_SAFE_TERM_LENGTH - 1);
459 } else {
460 actions.emplace_back(code);
463 j = i;
466 if (useless_weight_pos != string::npos) {
467 report_useless_action(filename, line_no, useless_weight_pos,
468 "weight");
471 while (!actions.empty()) {
472 bool done = true;
473 Action::type action = actions.back().get_action();
474 switch (action) {
475 case Action::HASH:
476 case Action::HEXTOBIN:
477 case Action::LOWER:
478 case Action::PARSEDATE:
479 case Action::SPELL:
480 case Action::TRUNCATE:
481 case Action::UNHTML:
482 done = false;
483 report_useless_action(filename, line_no, string::npos,
484 action_names[action]);
485 actions.pop_back();
486 break;
487 default:
488 break;
490 if (done) break;
493 map<string, Action::type>::const_iterator boolpfx;
494 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
495 if (boolpfx->second == Action::UNIQUE) {
496 cerr << filename << ':' << line_no
497 << ": Warning: Index action 'unique=" << boolpfx->first
498 << "' without 'boolean=" << boolpfx->first << "'" << endl;
499 static bool given_doesnt_imply_boolean_warning = false;
500 if (!given_doesnt_imply_boolean_warning) {
501 given_doesnt_imply_boolean_warning = true;
502 cerr << filename << ':' << line_no
503 << ": Warning: Note 'unique' doesn't implicitly add "
504 "a boolean term" << endl;
509 vector<string>::const_iterator field;
510 for (field = fields.begin(); field != fields.end(); ++field) {
511 vector<Action> &v = index_spec[*field];
512 if (v.empty()) {
513 if (fields.size() == 1) {
514 // Optimise common case where there's only one fieldname
515 // for a list of actions.
516 v = std::move(actions);
517 } else {
518 v = actions;
520 } else {
521 v.emplace_back(Action::NEW);
522 v.insert(v.end(), actions.begin(), actions.end());
527 if (index_spec.empty()) {
528 cerr << filename << ": No rules found in index script" << endl;
529 exit(1);
533 static void
534 index_file(const char *fname, istream &stream,
535 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
537 string line;
538 size_t line_no = 0;
539 while (!stream.eof() && getline(stream, line)) {
540 ++line_no;
541 Xapian::Document doc;
542 indexer.set_document(doc);
543 Xapian::docid docid = 0;
544 map<string, list<string> > fields;
545 bool seen_content = false;
546 while (!line.empty()) {
547 // Cope with files from MS Windows (\r\n end of lines).
548 // Trim multiple \r characters, since that seems the best way
549 // to handle that case.
550 string::size_type last = line.find_last_not_of('\r');
551 if (last == string::npos) break;
552 line.resize(last + 1);
554 string::size_type eq = line.find('=');
555 if (eq == string::npos && !line.empty()) {
556 cerr << fname << ':' << line_no << ": expected = somewhere "
557 "in this line" << endl;
558 // FIXME: die or what?
560 string field(line, 0, eq);
561 string value(line, eq + 1, string::npos);
562 while (getline(stream, line)) {
563 ++line_no;
564 if (line.empty() || line[0] != '=') break;
565 // Cope with files from MS Windows (\r\n end of lines).
566 // Trim multiple \r characters, since that seems the best way
567 // to handle that case.
568 last = line.find_last_not_of('\r');
569 // line[0] == '=', so last != string::npos.
570 // Replace the '=' with a '\n' so we don't have to use substr.
571 line[0] = '\n';
572 line.resize(last + 1);
573 value += line;
576 // Default to not indexing spellings.
577 indexer.set_flags(Xapian::TermGenerator::flags(0));
579 const vector<Action> &v = index_spec[field];
580 string old_value = value;
581 vector<Action>::const_iterator i;
582 bool this_field_is_content = true;
583 for (i = v.begin(); i != v.end(); ++i) {
584 switch (i->get_action()) {
585 case Action::BAD:
586 abort();
587 case Action::NEW:
588 value = old_value;
589 // We're processing the same field again - give it a
590 // reprieve.
591 this_field_is_content = true;
592 break;
593 case Action::FIELD:
594 if (!value.empty()) {
595 string f = i->get_string_arg();
596 if (f.empty()) f = field;
597 // replace newlines with spaces
598 string s = value;
599 string::size_type j = 0;
600 while ((j = s.find('\n', j)) != string::npos)
601 s[j] = ' ';
602 fields[f].push_back(s);
604 break;
605 case Action::INDEX:
606 indexer.index_text(value,
607 i->get_num_arg(),
608 i->get_string_arg());
609 break;
610 case Action::INDEXNOPOS:
611 // No positional information so phrase searching
612 // won't work. However, the database will use much
613 // less diskspace.
614 indexer.index_text_without_positions(value,
615 i->get_num_arg(),
616 i->get_string_arg());
617 break;
618 case Action::BOOLEAN: {
619 // Do nothing if there's no text.
620 if (value.empty()) break;
622 string term = i->get_string_arg();
623 if (prefix_needs_colon(term, value[0])) term += ':';
624 term += value;
626 doc.add_boolean_term(term);
627 break;
629 case Action::HASH: {
630 unsigned int max_length = i->get_num_arg();
631 if (value.length() > max_length)
632 value = hash_long_term(value, max_length);
633 break;
635 case Action::HEXTOBIN: {
636 size_t len = value.length();
637 if (len & 1) {
638 cerr << "hextobin: input must have even length"
639 << endl;
640 } else {
641 string output;
642 output.reserve(len / 2);
643 for (size_t j = 0; j < len; j += 2) {
644 char a = value[j];
645 char b = value[j + 1];
646 if (!C_isxdigit(a) || !C_isxdigit(b)) {
647 cerr << "hextobin: input must be all hex "
648 "digits" << endl;
649 goto badhex;
651 char r = (hex_digit(a) << 4) | hex_digit(b);
652 output.push_back(r);
654 value = std::move(output);
656 badhex:
657 break;
659 case Action::LOWER:
660 value = Xapian::Unicode::tolower(value);
661 break;
662 case Action::LOAD: {
663 bool truncated = false;
664 // FIXME: Use NOATIME if we own the file or are root.
665 if (!load_file(value, i->get_num_arg(), NOCACHE,
666 value, truncated)) {
667 cerr << "Couldn't load file '" << value << "': "
668 << strerror(errno) << endl;
669 value.resize(0);
671 if (!truncated) break;
673 /* FALLTHRU */
674 case Action::TRUNCATE:
675 utf8_truncate(value, i->get_num_arg());
676 break;
677 case Action::SPELL:
678 indexer.set_flags(indexer.FLAG_SPELLING);
679 break;
680 case Action::UNHTML: {
681 MyHtmlParser p;
682 try {
683 // Default HTML character set is latin 1, though
684 // not specifying one is deprecated these days.
685 p.parse_html(value, "iso-8859-1", false);
686 } catch (const string & newcharset) {
687 p.reset();
688 p.parse_html(value, newcharset, true);
690 if (p.indexing_allowed)
691 value = p.dump;
692 else
693 value = "";
694 break;
696 case Action::UNIQUE: {
697 // If there's no text, just issue a warning.
698 if (value.empty()) {
699 cerr << fname << ':' << line_no
700 << ": Ignoring UNIQUE action on empty text"
701 << endl;
702 break;
705 // Ensure that the value of this field is unique.
706 // If a record already exists with the same value,
707 // it will be replaced with the new record.
709 // Unique fields aren't considered content - if
710 // there are no other fields in the document, the
711 // document is to be deleted.
712 this_field_is_content = false;
714 // Argument is the prefix to add to the field value
715 // to get the unique term.
716 string t = i->get_string_arg();
717 if (prefix_needs_colon(t, value[0])) t += ':';
718 t += value;
719 again:
720 try {
721 Xapian::PostingIterator p = database.postlist_begin(t);
722 if (p != database.postlist_end(t)) {
723 docid = *p;
725 } catch (const Xapian::Error &e) {
726 // Hmm, what happened?
727 cerr << "Caught exception in UNIQUE!" << endl;
728 cerr << "E: " << e.get_description() << endl;
729 database.commit();
730 goto again;
732 break;
734 case Action::VALUE:
735 if (!value.empty())
736 doc.add_value(i->get_num_arg(), value);
737 break;
738 case Action::VALUENUMERIC: {
739 if (value.empty()) break;
740 char * end;
741 double dbl = strtod(value.c_str(), &end);
742 if (*end) {
743 cerr << fname << ':' << line_no << ": Warning: "
744 "Trailing characters in VALUENUMERIC: '"
745 << value << "'" << endl;
747 doc.add_value(i->get_num_arg(),
748 Xapian::sortable_serialise(dbl));
749 break;
751 case Action::VALUEPACKED: {
752 uint32_t word = 0;
753 if (value.empty() || !C_isdigit(value[0])) {
754 // strtoul() accepts leading whitespace and negated
755 // values, neither of which we want to allow.
756 errno = EINVAL;
757 } else {
758 errno = 0;
759 char* q;
760 word = strtoul(value.c_str(), &q, 10);
761 if (!errno && *q != '\0') {
762 // Trailing characters after converted value.
763 errno = EINVAL;
766 if (errno) {
767 cerr << fname << ':' << line_no << ": Warning: "
768 "valuepacked \"" << value << "\" ";
769 if (errno == ERANGE) {
770 cerr << "out of range";
771 } else {
772 cerr << "not an unsigned integer";
774 cerr << endl;
776 int valueslot = i->get_num_arg();
777 doc.add_value(valueslot, int_to_binary_string(word));
778 break;
780 case Action::DATE: {
781 const string & type = i->get_string_arg();
782 string yyyymmdd;
783 if (type == "unix") {
784 time_t t = atoi(value.c_str());
785 struct tm *tm = localtime(&t);
786 int y = tm->tm_year + 1900;
787 int m = tm->tm_mon + 1;
788 yyyymmdd = date_to_string(y, m, tm->tm_mday);
789 } else if (type == "yyyymmdd") {
790 if (value.length() == 8) yyyymmdd = value;
792 if (yyyymmdd.empty()) break;
793 // Date (YYYYMMDD)
794 doc.add_boolean_term("D" + yyyymmdd);
795 yyyymmdd.resize(6);
796 // Month (YYYYMM)
797 doc.add_boolean_term("M" + yyyymmdd);
798 yyyymmdd.resize(4);
799 // Year (YYYY)
800 doc.add_boolean_term("Y" + yyyymmdd);
801 break;
803 case Action::PARSEDATE: {
804 string dateformat = i->get_string_arg();
805 struct tm tm;
806 memset(&tm, 0, sizeof(tm));
807 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
808 if (ret == NULL) {
809 cerr << fname << ':' << line_no << ": Warning: "
810 "\"" << value << "\" doesn't match format "
811 "\"" << dateformat << '\"' << endl;
812 break;
815 if (*ret != '\0') {
816 cerr << fname << ':' << line_no << ": Warning: "
817 "\"" << value << "\" not fully matched by "
818 "format \"" << dateformat << "\" "
819 "(\"" << ret << "\" left over) but "
820 "indexing anyway" << endl;
823 value = str(timegm(&tm));
824 break;
826 default:
827 /* Empty default case to avoid "unhandled enum value"
828 * warnings. */
829 break;
832 if (this_field_is_content) seen_content = true;
833 if (stream.eof()) break;
836 // If we haven't seen any fields (other than unique identifiers)
837 // the document is to be deleted.
838 if (!seen_content) {
839 if (docid) {
840 database.delete_document(docid);
841 if (verbose) cout << "Del: " << docid << endl;
842 delcount ++;
844 } else {
845 string data;
846 for (auto&& i : fields) {
847 for (auto&& field_val : i.second) {
848 data += i.first;
849 data += '=';
850 data += field_val;
851 data += '\n';
855 // Put the data in the document
856 doc.set_data(data);
858 // Add the document to the database
859 if (docid) {
860 try {
861 database.replace_document(docid, doc);
862 if (verbose) cout << "Replace: " << docid << endl;
863 repcount ++;
864 } catch (const Xapian::Error &e) {
865 cerr << "E: " << e.get_description() << endl;
866 // Possibly the document was deleted by another
867 // process in the meantime...?
868 docid = database.add_document(doc);
869 cerr << "Replace failed, adding as new: " << docid << endl;
871 } else {
872 docid = database.add_document(doc);
873 if (verbose) cout << "Add: " << docid << endl;
874 addcount ++;
879 // Commit after each file to make sure all changes from that file make it
880 // in.
881 if (verbose) cout << "Committing: " << endl;
882 database.commit();
886 main(int argc, char **argv)
887 try {
888 // If the database already exists, default to updating not overwriting.
889 int database_mode = Xapian::DB_CREATE_OR_OPEN;
890 verbose = false;
891 Xapian::Stem stemmer("english");
893 static const struct option longopts[] = {
894 { "help", no_argument, NULL, 'h' },
895 { "version", no_argument, NULL, 'V' },
896 { "stemmer", required_argument, NULL, 's' },
897 { "overwrite", no_argument, NULL, 'o' },
898 { "verbose", no_argument, NULL, 'v' },
899 { 0, 0, NULL, 0 }
902 bool more = true, show_help = false;
903 while (more) {
904 switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
905 case EOF:
906 more = false;
907 break;
908 default:
909 case 'h': // --help
910 show_help = true;
911 more = false;
912 break;
913 case 'V': // --version
914 print_package_info(PROG_NAME);
915 return 0;
916 case 'o': // --overwrite
917 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
918 break;
919 case 'v':
920 verbose = true;
921 break;
922 case 's':
923 try {
924 stemmer = Xapian::Stem(optarg);
925 } catch (const Xapian::InvalidArgumentError &) {
926 cerr << "Unknown stemming language '" << optarg << "'.\n";
927 cerr << "Available language names are: "
928 << Xapian::Stem::get_available_languages() << endl;
929 return 1;
931 break;
935 argv += optind;
936 argc -= optind;
937 if (show_help || argc < 2) {
938 cout << PROG_NAME " - " PROG_DESC "\n"
939 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
940 "\n"
941 "Creates or updates a Xapian database with the data from the input files listed\n"
942 "on the command line. If no files are specified, data is read from stdin.\n"
943 "\n"
944 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
945 "format for INDEXER_SCRIPT.\n"
946 "\n"
947 "Options:\n"
948 " -v, --verbose display additional messages to aid debugging\n"
949 " --overwrite create the database anew (the default is to update if\n"
950 " the database already exists)\n";
951 print_stemmer_help("");
952 print_help_and_version_help("");
953 exit(show_help ? 0 : 1);
956 parse_index_script(argv[1]);
958 // Open the database. If another process is currently updating the
959 // database, wait for the lock to become available.
960 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
961 Xapian::WritableDatabase database(argv[0], flags);
963 Xapian::TermGenerator indexer;
964 indexer.set_stemmer(stemmer);
965 // Set the database for spellings to be added to by the "spell" action.
966 indexer.set_database(database);
968 addcount = 0;
969 repcount = 0;
970 delcount = 0;
972 if (argc == 2) {
973 // Read from stdin.
974 index_file("<stdin>", cin, database, indexer);
975 } else {
976 // Read file(s) listed on the command line.
977 for (int i = 2; i < argc; ++i) {
978 ifstream stream(argv[i]);
979 if (stream) {
980 index_file(argv[i], stream, database, indexer);
981 } else {
982 cerr << "Can't open file " << argv[i] << endl;
987 cout << "records (added, replaced, deleted) = (" << addcount << ", "
988 << repcount << ", " << delcount << ")" << endl;
989 } catch (const Xapian::Error &error) {
990 cerr << "Exception: " << error.get_description() << endl;
991 exit(1);
992 } catch (const std::bad_alloc &) {
993 cerr << "Exception: std::bad_alloc" << endl;
994 exit(1);
995 } catch (...) {
996 cerr << "Unknown Exception" << endl;
997 exit(1);