scriptindex: Add parsedate and valuepacked actions
[xapian.git] / xapian-applications / omega / scriptindex.cc
blob0278f63920a6f3fdea2d91acc5de62a388e92225
/* scriptindex.cc
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001 Sam Liddicott
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
24 #include <config.h>
26 #include <xapian.h>
28 #include <algorithm>
29 #include <fstream>
30 #include <iostream>
31 #include <list>
32 #include <map>
33 #include <string>
34 #include <vector>
35 #include <cstring>
37 #include <cstdlib>
38 #include "safeerrno.h"
39 #include <cstdio>
40 #include <ctime>
42 #include "commonhelp.h"
43 #include "hashterm.h"
44 #include "loadfile.h"
45 #include "myhtmlparse.h"
46 #include "str.h"
47 #include "stringutils.h"
48 #include "timegm.h"
49 #include "utf8truncate.h"
50 #include "utils.h"
51 #include "values.h"
53 #include "gnu_getopt.h"
55 using namespace std;
// Program name and one-line description used in the usage message and
// version output.
#define PROG_NAME "scriptindex"
#define PROG_DESC "index arbitrary data as described by an index script"

// Set by -v/--verbose: report each add/replace/delete and each commit.
static bool verbose;

// Running totals, reported once all the input has been processed.
static int addcount;    // Documents added as new.
static int repcount;    // Existing documents replaced.
static int delcount;    // Documents deleted.
65 inline bool
66 prefix_needs_colon(const string & prefix, unsigned ch)
68 if (!C_isupper(ch) && ch != ':') return false;
69 string::size_type len = prefix.length();
70 return (len > 1 && prefix[len - 1] != ':');
// Printable name for each Action::type value.  This array is indexed by the
// enum value, so the entries must stay in the same order as the enumerators.
const char * action_names[] = {
    "bad",
    "new",
    "boolean",
    "date",
    "field",
    "hash",
    "index",
    "indexnopos",
    "load",
    "lower",
    "parsedate",
    "spell",
    "truncate",
    "unhtml",
    "unique",
    "value",
    "valuenumeric",
    "valuepacked",
    "weight"
};
// For debugging: write Action A to stdout as "name(string_arg,num_arg)".
#define DUMP_ACTION(A) \
    cout << action_names[(A).get_action()] \
         << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" \
         << endl
/** One step in the list of actions applied to a field's value.
 *
 *  Stores the action code together with a string argument and an integer
 *  argument; what (if anything) each argument means depends on the action.
 */
class Action {
  public:
    /// Action codes.  Used to index action_names[], so keep the order in step.
    enum type {
        BAD, NEW,
        BOOLEAN, DATE, FIELD, HASH, INDEX, INDEXNOPOS, LOAD, LOWER,
        PARSEDATE, SPELL, TRUNCATE, UNHTML, UNIQUE, VALUE,
        VALUENUMERIC, VALUEPACKED, WEIGHT
    };

  private:
    type action;
    int num_arg;
    std::string string_arg;

  public:
    /// Action without an argument: empty string argument, integer argument 0.
    Action(type action_) : action(action_), num_arg(0) { }

    /// Action with a string argument; the integer argument is atoi() of it.
    Action(type action_, const std::string & arg)
        : action(action_), num_arg(std::atoi(arg.c_str())), string_arg(arg) { }

    /// Action with independently specified string and integer arguments.
    Action(type action_, const std::string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg) { }

    type get_action() const { return action; }

    int get_num_arg() const { return num_arg; }

    const std::string & get_string_arg() const { return string_arg; }
};
/** Warn on stdout that an action in the index script has no effect.
 *
 *  @param file    Filename of the index script.
 *  @param line    Line number in the index script.
 *  @param pos     Position to report after the line number, or string::npos
 *                 to report just the line number.
 *  @param action  Name of the action to report.
 *
 *  The first call also emits a one-off note that actions are executed from
 *  left to right.
 */
static void
report_useless_action(const std::string &file, size_t line, size_t pos,
                      const std::string &action)
{
    std::cout << file << ':' << line;
    if (pos != std::string::npos) {
        std::cout << ':' << pos;
    }
    std::cout << ": Warning: Index action '" << action << "' has no effect"
              << std::endl;

    // The explanatory note is only worth giving once per run.
    static bool given_left_to_right_warning = false;
    if (given_left_to_right_warning) return;
    given_left_to_right_warning = true;
    std::cout << file << ':' << line
              << ": Warning: Note that actions are executed from left to right"
              << std::endl;
}
125 static map<string, vector<Action> > index_spec;
127 static void
128 parse_index_script(const string &filename)
130 ifstream script(filename.c_str());
131 if (!script.is_open()) {
132 cout << filename << ": " << strerror(errno) << endl;
133 exit(1);
135 string line;
136 size_t line_no = 0;
137 bool had_unique = false;
138 while (getline(script, line)) {
139 ++line_no;
140 vector<string> fields;
141 vector<Action> actions;
142 string::const_iterator i, j;
143 const string &s = line;
144 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
145 if (i == s.end() || *i == '#') continue;
146 while (true) {
147 if (!C_isalnum(*i)) {
148 cout << filename << ':' << line_no
149 << ": field name must start with alphanumeric" << endl;
150 exit(1);
152 j = find_if(i, s.end(),
153 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
154 fields.push_back(string(i, j));
155 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
156 if (i == s.end()) break;
157 if (*i == ':') {
158 ++i;
159 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
160 break;
162 if (i == j) {
163 cout << filename << ':' << line_no
164 << ": bad character '" << *j << "' in fieldname" << endl;
165 exit(1);
168 Xapian::termcount weight = 1;
169 size_t useless_weight_pos = string::npos;
170 map<string, Action::type> boolmap;
171 j = i;
172 while (j != s.end()) {
173 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
174 string action(s, j - s.begin(), i - j);
175 Action::type code = Action::BAD;
176 enum {NO, OPT, YES} arg = NO;
177 bool takes_integer_argument = false;
178 if (!action.empty()) {
179 switch (action[0]) {
180 case 'b':
181 if (action == "boolean") {
182 code = Action::BOOLEAN;
183 arg = OPT;
185 break;
186 case 'd':
187 if (action == "date") {
188 code = Action::DATE;
189 arg = YES;
191 break;
192 case 'f':
193 if (action == "field") {
194 code = Action::FIELD;
195 arg = OPT;
197 break;
198 case 'h':
199 if (action == "hash") {
200 code = Action::HASH;
201 arg = OPT;
202 takes_integer_argument = true;
204 break;
205 case 'i':
206 if (action == "index") {
207 code = Action::INDEX;
208 arg = OPT;
209 } else if (action == "indexnopos") {
210 code = Action::INDEXNOPOS;
211 arg = OPT;
213 break;
214 case 'l':
215 if (action == "lower") {
216 code = Action::LOWER;
217 } else if (action == "load") {
218 code = Action::LOAD;
220 break;
221 case 'p':
222 if (action == "parsedate") {
223 code = Action::PARSEDATE;
224 arg = YES;
226 break;
227 case 's':
228 if (action == "spell") {
229 code = Action::SPELL;
231 break;
232 case 't':
233 if (action == "truncate") {
234 code = Action::TRUNCATE;
235 arg = YES;
236 takes_integer_argument = true;
238 break;
239 case 'u':
240 if (action == "unhtml") {
241 code = Action::UNHTML;
242 } else if (action == "unique") {
243 code = Action::UNIQUE;
244 arg = YES;
246 break;
247 case 'v':
248 if (action == "value") {
249 code = Action::VALUE;
250 arg = YES;
251 takes_integer_argument = true;
252 } else if (action == "valuenumeric") {
253 code = Action::VALUENUMERIC;
254 arg = YES;
255 takes_integer_argument = true;
256 } else if (action == "valuepacked") {
257 code = Action::VALUEPACKED;
258 arg = YES;
259 takes_integer_argument = true;
261 break;
262 case 'w':
263 if (action == "weight") {
264 code = Action::WEIGHT;
265 arg = YES;
266 takes_integer_argument = true;
268 break;
271 if (code == Action::BAD) {
272 cout << filename << ':' << line_no
273 << ": Unknown index action '" << action << "'" << endl;
274 exit(1);
276 auto i_after_action = i;
277 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
279 if (i != s.end() && *i == '=') {
280 if (i != i_after_action) {
281 cout << filename << ':' << line_no
282 << ": warning: putting spaces between the action and "
283 "'=' is deprecated." << endl;
286 if (arg == NO) {
287 cout << filename << ':' << line_no
288 << ": Index action '" << action
289 << "' doesn't take an argument" << endl;
290 exit(1);
292 ++i;
293 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
294 if (i != j) {
295 cout << filename << ':' << line_no
296 << ": warning: putting spaces between '=' and the "
297 "argument is deprecated." << endl;
299 string val;
300 if (j != s.end() && *j == '"') {
301 // Quoted argument.
302 ++j;
303 i = find(j, s.end(), '"');
304 if (i == s.end()) {
305 cout << filename << ':' << line_no << ": No closing quote" << endl;
306 exit(1);
308 val.assign(j, i);
309 ++i;
310 } else {
311 // Unquoted argument.
312 i = find_if(j, s.end(), [](char ch) { return C_isspace(ch); });
313 val.assign(j, i);
315 if (takes_integer_argument) {
316 if (val.find('.') != string::npos) {
317 cout << filename << ':' << line_no
318 << ": Warning: Index action '" << action
319 << "' takes an integer argument" << endl;
322 switch (code) {
323 case Action::INDEX:
324 case Action::INDEXNOPOS:
325 actions.push_back(Action(code, val, weight));
326 useless_weight_pos = string::npos;
327 break;
328 case Action::WEIGHT:
329 // We don't push an Action for WEIGHT - instead we
330 // store it ready to use in the INDEX and INDEXNOPOS
331 // Actions.
332 weight = atoi(val.c_str());
333 if (useless_weight_pos != string::npos) {
334 report_useless_action(filename, line_no,
335 useless_weight_pos, action);
337 useless_weight_pos = j - s.begin();
338 break;
339 case Action::TRUNCATE:
340 if (!actions.empty() &&
341 actions.back().get_action() == Action::LOAD) {
342 /* Turn "load truncate=n" into "load" with
343 * num_arg n, so that we don't needlessly
344 * allocate memory and read data we're just
345 * going to ignore.
347 actions.pop_back();
348 code = Action::LOAD;
350 actions.push_back(Action(code, val));
351 break;
352 case Action::UNIQUE:
353 if (had_unique) {
354 cout << filename << ':' << line_no
355 << ": Index action 'unique' used more than "
356 "once" << endl;
357 exit(1);
359 had_unique = true;
360 if (boolmap.find(val) == boolmap.end())
361 boolmap[val] = Action::UNIQUE;
362 actions.push_back(Action(code, val));
363 break;
364 case Action::BOOLEAN:
365 boolmap[val] = Action::BOOLEAN;
366 /* FALLTHRU */
367 default:
368 actions.push_back(Action(code, val));
370 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
371 } else {
372 if (arg == YES) {
373 cout << filename << ':' << line_no
374 << ": Index action '" << action
375 << "' must have an argument" << endl;
376 exit(1);
378 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
379 useless_weight_pos = string::npos;
380 actions.push_back(Action(code, "", weight));
381 } else {
382 actions.push_back(Action(code));
385 j = i;
388 if (useless_weight_pos != string::npos) {
389 report_useless_action(filename, line_no, useless_weight_pos,
390 "weight");
393 while (!actions.empty()) {
394 bool done = true;
395 Action::type action = actions.back().get_action();
396 switch (action) {
397 case Action::HASH:
398 case Action::LOWER:
399 case Action::SPELL:
400 case Action::TRUNCATE:
401 case Action::UNHTML:
402 done = false;
403 report_useless_action(filename, line_no, string::npos,
404 action_names[action]);
405 actions.pop_back();
406 break;
407 default:
408 break;
410 if (done) break;
413 map<string, Action::type>::const_iterator boolpfx;
414 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
415 if (boolpfx->second == Action::UNIQUE) {
416 cout << filename << ':' << line_no
417 << ": Warning: Index action 'unique=" << boolpfx->first
418 << "' without 'boolean=" << boolpfx->first << "'" << endl;
419 static bool given_doesnt_imply_boolean_warning = false;
420 if (!given_doesnt_imply_boolean_warning) {
421 given_doesnt_imply_boolean_warning = true;
422 cout << filename << ':' << line_no
423 << ": Warning: Note 'unique' doesn't implicitly add "
424 "a boolean term" << endl;
429 vector<string>::const_iterator field;
430 for (field = fields.begin(); field != fields.end(); ++field) {
431 vector<Action> &v = index_spec[*field];
432 if (v.empty()) {
433 v = actions;
434 } else {
435 v.push_back(Action(Action::NEW));
436 v.insert(v.end(), actions.begin(), actions.end());
441 if (index_spec.empty()) {
442 cout << filename << ": No rules found in index script" << endl;
443 exit(1);
447 static bool
448 index_file(const char *fname, istream &stream,
449 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
451 string line;
452 size_t line_no = 0;
453 while (!stream.eof() && getline(stream, line)) {
454 ++line_no;
455 Xapian::Document doc;
456 indexer.set_document(doc);
457 Xapian::docid docid = 0;
458 map<string, list<string> > fields;
459 bool seen_content = false;
460 while (!line.empty()) {
461 // Cope with files from MS Windows (\r\n end of lines).
462 // Trim multiple \r characters, since that seems the best way
463 // to handle that case.
464 string::size_type last = line.find_last_not_of('\r');
465 if (last == string::npos) break;
466 line.resize(last + 1);
468 string::size_type eq = line.find('=');
469 if (eq == string::npos && !line.empty()) {
470 cout << fname << ':' << line_no << ": expected = somewhere "
471 "in this line" << endl;
472 // FIXME: die or what?
474 string field(line, 0, eq);
475 string value(line, eq + 1, string::npos);
476 while (getline(stream, line)) {
477 ++line_no;
478 if (line.empty() || line[0] != '=') break;
479 // Cope with files from MS Windows (\r\n end of lines).
480 // Trim multiple \r characters, since that seems the best way
481 // to handle that case.
482 last = line.find_last_not_of('\r');
483 // line[0] == '=', so last != string::npos.
484 // Replace the '=' with a '\n' so we don't have to use substr.
485 line[0] = '\n';
486 line.resize(last + 1);
487 value += line;
490 // Default to not indexing spellings.
491 indexer.set_flags(Xapian::TermGenerator::flags(0));
493 const vector<Action> &v = index_spec[field];
494 string old_value = value;
495 vector<Action>::const_iterator i;
496 bool this_field_is_content = true;
497 for (i = v.begin(); i != v.end(); ++i) {
498 switch (i->get_action()) {
499 case Action::BAD:
500 abort();
501 case Action::NEW:
502 value = old_value;
503 // We're processing the same field again - give it a
504 // reprieve.
505 this_field_is_content = true;
506 break;
507 case Action::FIELD:
508 if (!value.empty()) {
509 string f = i->get_string_arg();
510 if (f.empty()) f = field;
511 // replace newlines with spaces
512 string s = value;
513 string::size_type j = 0;
514 while ((j = s.find('\n', j)) != string::npos)
515 s[j] = ' ';
516 fields[f].push_back(s);
518 break;
519 case Action::INDEX:
520 indexer.index_text(value,
521 i->get_num_arg(),
522 i->get_string_arg());
523 break;
524 case Action::INDEXNOPOS:
525 // No positional information so phrase searching
526 // won't work. However, the database will use much
527 // less diskspace.
528 indexer.index_text_without_positions(value,
529 i->get_num_arg(),
530 i->get_string_arg());
531 break;
532 case Action::BOOLEAN: {
533 // Do nothing if there's no text.
534 if (value.empty()) break;
536 string term = i->get_string_arg();
537 if (prefix_needs_colon(term, value[0])) term += ':';
538 term += value;
540 doc.add_boolean_term(term);
541 break;
543 case Action::HASH: {
544 unsigned int max_length = i->get_num_arg();
545 if (max_length == 0)
546 max_length = MAX_SAFE_TERM_LENGTH - 1;
547 if (value.length() > max_length)
548 value = hash_long_term(value, max_length);
549 break;
551 case Action::LOWER:
552 value = Xapian::Unicode::tolower(value);
553 break;
554 case Action::LOAD: {
555 bool truncated = false;
556 // FIXME: Use NOATIME if we own the file or are root.
557 if (!load_file(value, i->get_num_arg(), NOCACHE,
558 value, truncated)) {
559 cerr << "Couldn't load file '" << value << "': "
560 << strerror(errno) << endl;
561 value.resize(0);
563 if (!truncated) break;
565 /* FALLTHRU */
566 case Action::TRUNCATE:
567 utf8_truncate(value, i->get_num_arg());
568 break;
569 case Action::SPELL:
570 indexer.set_flags(indexer.FLAG_SPELLING);
571 break;
572 case Action::UNHTML: {
573 MyHtmlParser p;
574 try {
575 // Default HTML character set is latin 1, though
576 // not specifying one is deprecated these days.
577 p.parse_html(value, "iso-8859-1", false);
578 } catch (const string & newcharset) {
579 p.reset();
580 p.parse_html(value, newcharset, true);
582 if (p.indexing_allowed)
583 value = p.dump;
584 else
585 value = "";
586 break;
588 case Action::UNIQUE: {
589 // If there's no text, just issue a warning.
590 if (value.empty()) {
591 cout << fname << ':' << line_no
592 << ": Ignoring UNIQUE action on empty text"
593 << endl;
594 break;
597 // Ensure that the value of this field is unique.
598 // If a record already exists with the same value,
599 // it will be replaced with the new record.
601 // Unique fields aren't considered content - if
602 // there are no other fields in the document, the
603 // document is to be deleted.
604 this_field_is_content = false;
606 // Argument is the prefix to add to the field value
607 // to get the unique term.
608 string t = i->get_string_arg();
609 if (prefix_needs_colon(t, value[0])) t += ':';
610 t += value;
611 again:
612 try {
613 Xapian::PostingIterator p = database.postlist_begin(t);
614 if (p != database.postlist_end(t)) {
615 docid = *p;
617 } catch (const Xapian::Error &e) {
618 // Hmm, what happened?
619 cout << "Caught exception in UNIQUE!" << endl;
620 cout << "E: " << e.get_description() << endl;
621 database.commit();
622 goto again;
624 break;
626 case Action::VALUE:
627 if (!value.empty())
628 doc.add_value(i->get_num_arg(), value);
629 break;
630 case Action::VALUENUMERIC: {
631 if (value.empty()) break;
632 char * end;
633 double dbl = strtod(value.c_str(), &end);
634 if (*end) {
635 cout << fname << ':' << line_no << ": Warning: "
636 "Trailing characters in VALUENUMERIC: '"
637 << value << "'" << endl;
639 doc.add_value(i->get_num_arg(),
640 Xapian::sortable_serialise(dbl));
641 break;
643 case Action::VALUEPACKED: {
644 uint32_t word = 0;
645 if (value.empty() || !C_isdigit(value[0])) {
646 // strtoul() accepts leading whitespace and negated
647 // values, neither of which we want to allow.
648 errno = EINVAL;
649 } else {
650 errno = 0;
651 char* q;
652 word = strtoul(value.c_str(), &q, 10);
653 if (!errno && *q != '\0') {
654 // Trailing characters after converted value.
655 errno = EINVAL;
658 if (errno) {
659 cout << fname << ':' << line_no << ": Warning: "
660 "valuepacked \"" << value << "\" ";
661 if (errno == ERANGE) {
662 cout << "out of range";
663 } else {
664 cout << "not an unsigned integer";
666 cout << endl;
668 int valueslot = i->get_num_arg();
669 doc.add_value(valueslot, int_to_binary_string(word));
670 break;
672 case Action::DATE: {
673 const string & type = i->get_string_arg();
674 string yyyymmdd;
675 if (type == "unix") {
676 time_t t = atoi(value.c_str());
677 struct tm *tm = localtime(&t);
678 int y = tm->tm_year + 1900;
679 int m = tm->tm_mon + 1;
680 yyyymmdd = date_to_string(y, m, tm->tm_mday);
681 } else if (type == "yyyymmdd") {
682 if (value.length() == 8) yyyymmdd = value;
684 if (yyyymmdd.empty()) break;
685 // Date (YYYYMMDD)
686 doc.add_boolean_term("D" + yyyymmdd);
687 yyyymmdd.resize(6);
688 // Month (YYYYMM)
689 doc.add_boolean_term("M" + yyyymmdd);
690 yyyymmdd.resize(4);
691 // Year (YYYY)
692 doc.add_boolean_term("Y" + yyyymmdd);
693 break;
695 case Action::PARSEDATE: {
696 string dateformat = i->get_string_arg();
697 struct tm tm;
698 memset(&tm, 0, sizeof(tm));
699 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
700 if (ret == NULL) {
701 cout << fname << ':' << line_no << ": Warning: "
702 "\"" << value << "\" doesn't match format "
703 "\"" << dateformat << '\"' << endl;
704 break;
707 if (*ret != '\0') {
708 cout << fname << ':' << line_no << ": Warning: "
709 "\"" << value << "\" not fully matched by "
710 "format \"" << dateformat << "\" "
711 "(\"" << ret << "\" left over) but "
712 "indexing anyway" << endl;
715 value = str(timegm(&tm));
716 break;
718 default:
719 /* Empty default case to avoid "unhandled enum value"
720 * warnings. */
721 break;
724 if (this_field_is_content) seen_content = true;
725 if (stream.eof()) break;
728 // If we haven't seen any fields (other than unique identifiers)
729 // the document is to be deleted.
730 if (!seen_content) {
731 if (docid) {
732 database.delete_document(docid);
733 if (verbose) cout << "Del: " << docid << endl;
734 delcount ++;
736 } else {
737 string data;
738 for (auto&& i : fields) {
739 for (auto&& field_val : i.second) {
740 data += i.first;
741 data += '=';
742 data += field_val;
743 data += '\n';
747 // Put the data in the document
748 doc.set_data(data);
750 // Add the document to the database
751 if (docid) {
752 try {
753 database.replace_document(docid, doc);
754 if (verbose) cout << "Replace: " << docid << endl;
755 repcount ++;
756 } catch (const Xapian::Error &e) {
757 cout << "E: " << e.get_description() << endl;
758 // Possibly the document was deleted by another
759 // process in the meantime...?
760 docid = database.add_document(doc);
761 cout << "Replace failed, adding as new: " << docid << endl;
763 } else {
764 docid = database.add_document(doc);
765 if (verbose) cout << "Add: " << docid << endl;
766 addcount ++;
771 // Commit after each file to make sure all changes from that file make it
772 // in.
773 if (verbose) cout << "Committing: " << endl;
774 database.commit();
776 return true;
780 main(int argc, char **argv)
781 try {
782 // If the database already exists, default to updating not overwriting.
783 int database_mode = Xapian::DB_CREATE_OR_OPEN;
784 verbose = false;
785 Xapian::Stem stemmer("english");
787 static const struct option longopts[] = {
788 { "help", no_argument, NULL, 'h' },
789 { "version", no_argument, NULL, 'V' },
790 { "stemmer", required_argument, NULL, 's' },
791 { "overwrite", no_argument, NULL, 'o' },
792 { "verbose", no_argument, NULL, 'v' },
793 { 0, 0, NULL, 0 }
796 bool more = true, show_help = false;
797 while (more) {
798 switch (gnu_getopt_long(argc, argv, "vs:hV", longopts, NULL)) {
799 case EOF:
800 more = false;
801 break;
802 default:
803 case 'h': // --help
804 show_help = true;
805 more = false;
806 break;
807 case 'V': // --version
808 print_package_info(PROG_NAME);
809 return 0;
810 case 'o': // --overwrite
811 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
812 break;
813 case 'v':
814 verbose = true;
815 break;
816 case 's':
817 try {
818 stemmer = Xapian::Stem(optarg);
819 } catch (const Xapian::InvalidArgumentError &) {
820 cerr << "Unknown stemming language '" << optarg << "'.\n";
821 cerr << "Available language names are: "
822 << Xapian::Stem::get_available_languages() << endl;
823 return 1;
825 break;
829 argv += optind;
830 argc -= optind;
831 if (show_help || argc < 2) {
832 cout << PROG_NAME " - " PROG_DESC "\n"
833 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
834 "\n"
835 "Creates or updates a Xapian database with the data from the input files listed\n"
836 "on the command line. If no files are specified, data is read from stdin.\n"
837 "\n"
838 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
839 "format for INDEXER_SCRIPT.\n"
840 "\n"
841 "Options:\n"
842 " -v, --verbose display additional messages to aid debugging\n"
843 " --overwrite create the database anew (the default is to update if\n"
844 " the database already exists)\n";
845 print_stemmer_help("");
846 print_help_and_version_help("");
847 exit(show_help ? 0 : 1);
850 parse_index_script(argv[1]);
852 // Open the database. If another process is currently updating the
853 // database, wait for the lock to become available.
854 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
855 Xapian::WritableDatabase database(argv[0], flags);
857 Xapian::TermGenerator indexer;
858 indexer.set_stemmer(stemmer);
859 // Set the database for spellings to be added to by the "spell" action.
860 indexer.set_database(database);
862 addcount = 0;
863 repcount = 0;
864 delcount = 0;
866 if (argc == 2) {
867 // Read from stdin.
868 index_file("<stdin>", cin, database, indexer);
869 } else {
870 // Read file(s) listed on the command line.
871 for (int i = 2; i < argc; ++i) {
872 ifstream stream(argv[i]);
873 if (stream) {
874 index_file(argv[i], stream, database, indexer);
875 } else {
876 cout << "Can't open file " << argv[i] << endl;
881 cout << "records (added, replaced, deleted) = (" << addcount << ", "
882 << repcount << ", " << delcount << ")" << endl;
883 } catch (const Xapian::Error &error) {
884 cout << "Exception: " << error.get_description() << endl;
885 exit(1);
886 } catch (const std::bad_alloc &) {
887 cout << "Exception: std::bad_alloc" << endl;
888 exit(1);
889 } catch (...) {
890 cout << "Unknown Exception" << endl;
891 exit(1);