3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Sam Liddicott
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
38 #include "safeerrno.h"
42 #include "commonhelp.h"
45 #include "myhtmlparse.h"
47 #include "stringutils.h"
49 #include "utf8truncate.h"
53 #include "gnu_getopt.h"
// Program name: passed to print_package_info() and spliced into the usage
// text that main() prints.
57 #define PROG_NAME "scriptindex"
// One-line program description, printed after the name in the usage text.
58 #define PROG_DESC "index arbitrary data as described by an index script"
// Report whether a colon separator has to be appended to prefix before a
// term whose first character is ch.
//
// prefix: term prefix taken from an action argument.
// ch:     first character of the text that will follow the prefix.
//
// Returns true only when ch is a colon or satisfies C_isupper() (declared
// elsewhere; presumably an ASCII upper-case test - TODO confirm), prefix is
// longer than one character, and prefix does not already end in a colon.
66 prefix_needs_colon(const string
& prefix
, unsigned ch
)
// Any other leading character never needs a separator.
68 if (!C_isupper(ch
) && ch
!= ':') return false;
69 string::size_type len
= prefix
.length();
// Prefixes of at most one character, and prefixes that already end in a
// colon, are left alone.
70 return (len
> 1 && prefix
[len
- 1] != ':');
// Printable names of the index actions, looked up by the value returned from
// Action::get_action() (see DUMP_ACTION and the useless-action warning in
// parse_index_script).
// NOTE(review): the visible entries follow the same order as the Action
// enumerators BOOLEAN..WEIGHT; keep the two lists in step when adding an
// action.  Entries for the internal BAD and NEW codes are not visible in this
// excerpt - confirm they precede these.
73 const char * action_names
[] = {
75 "boolean", "date", "field", "hash", "index", "indexnopos", "load", "lower",
76 "parsedate", "spell", "truncate", "unhtml", "unique", "value",
77 "valuenumeric", "valuepacked", "weight"
// Debugging aid: write action A to cout in the form name(string_arg,num_arg)
// followed by a newline.
81 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
// Action codes that can be named in an index script.  The order matches the
// visible entries of action_names.
87 BOOLEAN
, DATE
, FIELD
, HASH
, INDEX
, INDEXNOPOS
, LOAD
, LOWER
,
88 PARSEDATE
, SPELL
, TRUNCATE
, UNHTML
, UNIQUE
, VALUE
,
89 VALUENUMERIC
, VALUEPACKED
, WEIGHT
// Build an action that has no argument; the numeric argument is set to 0 and
// the string argument is left default-constructed.
96 Action(type action_
) : action(action_
), num_arg(0) { }
// Build an action from a string argument.  The numeric argument is derived
// from the same text with atoi(), so it is 0 when the text does not start
// with a number.
97 Action(type action_
, const string
& arg
)
98 : action(action_
), string_arg(arg
) {
99 num_arg
= atoi(string_arg
.c_str());
// Build an action with independent string and numeric arguments
// (parse_index_script uses this to attach the current weight to INDEX and
// INDEXNOPOS).
101 Action(type action_
, const string
& arg
, int num
)
102 : action(action_
), num_arg(num
), string_arg(arg
) { }
// Accessors for the action code, numeric argument and string argument.
103 type
get_action() const { return action
; }
104 int get_num_arg() const { return num_arg
; }
105 const string
& get_string_arg() const { return string_arg
; }
// Warn on cout that an index action has no effect.
//
// file:   name of the index script, used as the message prefix.
// line:   line number within the script.
// pos:    position within the line; omitted from the message when it equals
//         string::npos.
// action: name of the offending action as written in the message.
//
// The first call also prints a one-off note that actions run from left to
// right; a function-local static flag suppresses it on later calls.
109 report_useless_action(const string
&file
, size_t line
, size_t pos
,
110 const string
&action
)
// Message prefix: file:line, plus :pos when a position is known.
112 cout
<< file
<< ':' << line
;
113 if (pos
!= string::npos
) cout
<< ':' << pos
;
114 cout
<< ": Warning: Index action '" << action
<< "' has no effect" << endl
;
// Explain the ordering rule once per run only.
116 static bool given_left_to_right_warning
= false;
117 if (!given_left_to_right_warning
) {
118 given_left_to_right_warning
= true;
119 cout
<< file
<< ':' << line
120 << ": Warning: Note that actions are executed from left to right"
// Maps each field name to the ordered list of actions to run on that field.
// Filled in by parse_index_script() and consulted by index_file().
125 static map
<string
, vector
<Action
> > index_spec
;
// Parse the index script named by filename and fill in index_spec.
//
// Lines that are blank, or whose first non-space character is a hash sign,
// are skipped.  Every other line starts with field names (alphanumerics and
// underscores, beginning with an alphanumeric) followed by a list of index
// actions, each of which may carry an argument written as action=value or
// with the value in double quotes.  The parsed actions are appended to
// index_spec for every field named on the line.
//
// Diagnostics are written to cout prefixed with filename:line.
128 parse_index_script(const string
&filename
)
// Open the script; a failure is reported with strerror(errno).
130 ifstream
script(filename
.c_str());
131 if (!script
.is_open()) {
132 cout
<< filename
<< ": " << strerror(errno
) << endl
;
// Presumably records that a unique action has already been seen; a repeat is
// reported further down - the test itself is not visible in this excerpt.
137 bool had_unique
= false;
// Process the script one line at a time.
138 while (getline(script
, line
)) {
// Field names and actions collected from this line.
140 vector
<string
> fields
;
141 vector
<Action
> actions
;
142 string::const_iterator i
, j
;
143 const string
&s
= line
;
// Skip leading whitespace, then ignore blank lines and comment lines.
144 i
= find_if(s
.begin(), s
.end(), [](char ch
) { return !C_isspace(ch
); });
145 if (i
== s
.end() || *i
== '#') continue;
147 if (!C_isalnum(*i
)) {
148 cout
<< filename
<< ':' << line_no
149 << ": field name must start with alphanumeric" << endl
;
// The field name runs up to the first character that is neither alphanumeric
// nor an underscore.
152 j
= find_if(i
, s
.end(),
153 [](char ch
) { return !C_isalnum(ch
) && ch
!= '_'; });
154 fields
.push_back(string(i
, j
));
155 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
156 if (i
== s
.end()) break;
159 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
163 cout
<< filename
<< ':' << line_no
164 << ": bad character '" << *j
<< "' in fieldname" << endl
;
// Weight handed to INDEX and INDEXNOPOS actions; changed by a weight action.
168 Xapian::termcount weight
= 1;
// Offset in the line of a weight action that no index or indexnopos action
// has consumed yet, or string::npos when there is none.
169 size_t useless_weight_pos
= string::npos
;
// Prefixes used with boolean and unique on this line, so that a unique with
// no matching boolean can be warned about below.
170 map
<string
, Action::type
> boolmap
;
// Parse the actions: j marks the start of an action name and i the first
// character after it.
172 while (j
!= s
.end()) {
173 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isalnum(ch
); });
174 string
action(s
, j
- s
.begin(), i
- j
);
175 Action::type code
= Action::BAD
;
// Presumably whether the action takes no argument, an optional one or a
// required one - the assignments are not visible in this excerpt.
176 enum {NO
, OPT
, YES
} arg
= NO
;
// Set for actions whose argument should be a whole number (checked below).
177 bool takes_integer_argument
= false;
// Map the action name onto its code.
178 if (!action
.empty()) {
181 if (action
== "boolean") {
182 code
= Action::BOOLEAN
;
187 if (action
== "date") {
193 if (action
== "field") {
194 code
= Action::FIELD
;
199 if (action
== "hash") {
202 takes_integer_argument
= true;
206 if (action
== "index") {
207 code
= Action::INDEX
;
209 } else if (action
== "indexnopos") {
210 code
= Action::INDEXNOPOS
;
215 if (action
== "lower") {
216 code
= Action::LOWER
;
217 } else if (action
== "load") {
222 if (action
== "parsedate") {
223 code
= Action::PARSEDATE
;
228 if (action
== "spell") {
229 code
= Action::SPELL
;
233 if (action
== "truncate") {
234 code
= Action::TRUNCATE
;
236 takes_integer_argument
= true;
240 if (action
== "unhtml") {
241 code
= Action::UNHTML
;
242 } else if (action
== "unique") {
243 code
= Action::UNIQUE
;
248 if (action
== "value") {
249 code
= Action::VALUE
;
251 takes_integer_argument
= true;
252 } else if (action
== "valuenumeric") {
253 code
= Action::VALUENUMERIC
;
255 takes_integer_argument
= true;
256 } else if (action
== "valuepacked") {
257 code
= Action::VALUEPACKED
;
259 takes_integer_argument
= true;
263 if (action
== "weight") {
264 code
= Action::WEIGHT
;
266 takes_integer_argument
= true;
// A name that matched nothing above is reported as unknown.
271 if (code
== Action::BAD
) {
272 cout
<< filename
<< ':' << line_no
273 << ": Unknown index action '" << action
<< "'" << endl
;
// Remember where the action name ended so that whitespace in front of the
// equals sign can be detected and flagged as deprecated.
276 auto i_after_action
= i
;
277 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
279 if (i
!= s
.end() && *i
== '=') {
280 if (i
!= i_after_action
) {
281 cout
<< filename
<< ':' << line_no
282 << ": warning: putting spaces between the action and "
283 "'=' is deprecated." << endl
;
287 cout
<< filename
<< ':' << line_no
288 << ": Index action '" << action
289 << "' doesn't take an argument" << endl
;
// Locate the start of the argument, skipping any whitespace.
293 j
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
295 cout
<< filename
<< ':' << line_no
296 << ": warning: putting spaces between '=' and the "
297 "argument is deprecated." << endl
;
// Quoted argument: look for the closing double quote.
300 if (j
!= s
.end() && *j
== '"') {
303 i
= find(j
, s
.end(), '"');
305 cout
<< filename
<< ':' << line_no
<< ": No closing quote" << endl
;
311 // Unquoted argument.
312 i
= find_if(j
, s
.end(), [](char ch
) { return C_isspace(ch
); });
// Whole-number arguments: warn when the value contains a decimal point.
315 if (takes_integer_argument
) {
316 if (val
.find('.') != string::npos
) {
317 cout
<< filename
<< ':' << line_no
318 << ": Warning: Index action '" << action
319 << "' takes an integer argument" << endl
;
// Indexing actions consume the pending weight, so it is no longer useless.
324 case Action::INDEXNOPOS
:
325 actions
.push_back(Action(code
, val
, weight
));
326 useless_weight_pos
= string::npos
;
329 // We don't push an Action for WEIGHT - instead we
330 // store it ready to use in the INDEX and INDEXNOPOS
332 weight
= atoi(val
.c_str());
// A previous weight that was never used is reported before recording the
// position of this one.
333 if (useless_weight_pos
!= string::npos
) {
334 report_useless_action(filename
, line_no
,
335 useless_weight_pos
, action
);
337 useless_weight_pos
= j
- s
.begin();
339 case Action::TRUNCATE
:
340 if (!actions
.empty() &&
341 actions
.back().get_action() == Action::LOAD
) {
342 /* Turn "load truncate=n" into "load" with
343 * num_arg n, so that we don't needlessly
344 * allocate memory and read data we're just
350 actions
.push_back(Action(code
, val
));
354 cout
<< filename
<< ':' << line_no
355 << ": Index action 'unique' used more than "
360 if (boolmap
.find(val
) == boolmap
.end())
361 boolmap
[val
] = Action::UNIQUE
;
362 actions
.push_back(Action(code
, val
));
364 case Action::BOOLEAN
:
365 boolmap
[val
] = Action::BOOLEAN
;
368 actions
.push_back(Action(code
, val
));
370 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
373 cout
<< filename
<< ':' << line_no
374 << ": Index action '" << action
375 << "' must have an argument" << endl
;
378 if (code
== Action::INDEX
|| code
== Action::INDEXNOPOS
) {
379 useless_weight_pos
= string::npos
;
380 actions
.push_back(Action(code
, "", weight
));
382 actions
.push_back(Action(code
));
// A weight that no later index or indexnopos action used has no effect.
388 if (useless_weight_pos
!= string::npos
) {
389 report_useless_action(filename
, line_no
, useless_weight_pos
,
// Inspect the actions at the end of the list; the ones matched by the case
// labels here are reported as having no effect.
393 while (!actions
.empty()) {
395 Action::type action
= actions
.back().get_action();
400 case Action::TRUNCATE
:
403 report_useless_action(filename
, line_no
, string::npos
,
404 action_names
[action
]);
// Warn about every prefix that was used with unique but never with boolean.
413 map
<string
, Action::type
>::const_iterator boolpfx
;
414 for (boolpfx
= boolmap
.begin(); boolpfx
!= boolmap
.end(); ++boolpfx
) {
415 if (boolpfx
->second
== Action::UNIQUE
) {
416 cout
<< filename
<< ':' << line_no
417 << ": Warning: Index action 'unique=" << boolpfx
->first
418 << "' without 'boolean=" << boolpfx
->first
<< "'" << endl
;
419 static bool given_doesnt_imply_boolean_warning
= false;
420 if (!given_doesnt_imply_boolean_warning
) {
421 given_doesnt_imply_boolean_warning
= true;
422 cout
<< filename
<< ':' << line_no
423 << ": Warning: Note 'unique' doesn't implicitly add "
424 "a boolean term" << endl
;
// Append the parsed actions to the spec of each field named on this line.
429 vector
<string
>::const_iterator field
;
430 for (field
= fields
.begin(); field
!= fields
.end(); ++field
) {
431 vector
<Action
> &v
= index_spec
[*field
];
// NOTE(review): the condition guarding this NEW marker is not visible here;
// index_file treats NEW as the start of another pass over the same field.
435 v
.push_back(Action(Action::NEW
));
436 v
.insert(v
.end(), actions
.begin(), actions
.end());
// A script that produced no rules at all is reported.
441 if (index_spec
.empty()) {
442 cout
<< filename
<< ": No rules found in index script" << endl
;
// Index the records read from stream into database.
//
// fname:    name used to prefix diagnostics (main passes <stdin> or a path).
// stream:   input.  A record is a run of field=value lines ended by an empty
//           line; a line starting with an equals sign continues the previous
//           value.
// database: receives the documents through replace_document, add_document
//           and delete_document.
// indexer:  term generator, pointed at each new document in turn.
//
// For every field the actions registered in index_spec are applied in order.
448 index_file(const char *fname
, istream
&stream
,
449 Xapian::WritableDatabase
&database
, Xapian::TermGenerator
&indexer
)
// One iteration per record.
453 while (!stream
.eof() && getline(stream
, line
)) {
// Start a fresh document for this record.
455 Xapian::Document doc
;
456 indexer
.set_document(doc
);
457 Xapian::docid docid
= 0;
// Field values collected by field actions, keyed by output field name.
458 map
<string
, list
<string
> > fields
;
459 bool seen_content
= false;
// One iteration per field of the record; an empty line ends the record.
460 while (!line
.empty()) {
461 // Cope with files from MS Windows (\r\n end of lines).
462 // Trim multiple \r characters, since that seems the best way
463 // to handle that case.
464 string::size_type last
= line
.find_last_not_of('\r');
465 if (last
== string::npos
) break;
466 line
.resize(last
+ 1);
// Split the line at the first equals sign into field name and value.
// NOTE(review): when there is no equals sign, eq is npos, so eq + 1 wraps to
// 0 and both field and value become the whole line.
468 string::size_type eq
= line
.find('=');
469 if (eq
== string::npos
&& !line
.empty()) {
470 cout
<< fname
<< ':' << line_no
<< ": expected = somewhere "
471 "in this line" << endl
;
472 // FIXME: die or what?
474 string
field(line
, 0, eq
);
475 string
value(line
, eq
+ 1, string::npos
);
// Gather continuation lines (those starting with an equals sign).
476 while (getline(stream
, line
)) {
478 if (line
.empty() || line
[0] != '=') break;
479 // Cope with files from MS Windows (\r\n end of lines).
480 // Trim multiple \r characters, since that seems the best way
481 // to handle that case.
482 last
= line
.find_last_not_of('\r');
483 // line[0] == '=', so last != string::npos.
484 // Replace the '=' with a '\n' so we don't have to use substr.
486 line
.resize(last
+ 1);
490 // Default to not indexing spellings.
491 indexer
.set_flags(Xapian::TermGenerator::flags(0));
// Look up the actions for this field.
// NOTE(review): operator[] on the map inserts an empty entry for a field
// that has no rules in the index script.
493 const vector
<Action
> &v
= index_spec
[field
];
494 string old_value
= value
;
495 vector
<Action
>::const_iterator i
;
496 bool this_field_is_content
= true;
// Apply each action registered for this field, in order.
497 for (i
= v
.begin(); i
!= v
.end(); ++i
) {
498 switch (i
->get_action()) {
503 // We're processing the same field again - give it a
505 this_field_is_content
= true;
// Store the value as a document field named by the action argument, or by
// the input field name when no argument was given.
508 if (!value
.empty()) {
509 string f
= i
->get_string_arg();
510 if (f
.empty()) f
= field
;
511 // replace newlines with spaces
513 string::size_type j
= 0;
514 while ((j
= s
.find('\n', j
)) != string::npos
)
516 fields
[f
].push_back(s
);
// Index the text with positional information; the string argument is passed
// through as the last parameter.
520 indexer
.index_text(value
,
522 i
->get_string_arg());
524 case Action::INDEXNOPOS
:
525 // No positional information so phrase searching
526 // won't work. However, the database will use much
528 indexer
.index_text_without_positions(value
,
530 i
->get_string_arg());
532 case Action::BOOLEAN
: {
533 // Do nothing if there's no text.
534 if (value
.empty()) break;
// The action argument is the term prefix; a colon is appended when
// prefix_needs_colon() says the value could otherwise run into it.
536 string term
= i
->get_string_arg();
537 if (prefix_needs_colon(term
, value
[0])) term
+= ':';
540 doc
.add_boolean_term(term
);
// Values longer than max_length are replaced by the result of
// hash_long_term().
544 unsigned int max_length
= i
->get_num_arg();
546 max_length
= MAX_SAFE_TERM_LENGTH
- 1;
547 if (value
.length() > max_length
)
548 value
= hash_long_term(value
, max_length
);
// Lower-case the value.
552 value
= Xapian::Unicode::tolower(value
);
// load_file() is handed the numeric argument, which the script parser sets
// from a following truncate action.
555 bool truncated
= false;
556 // FIXME: Use NOATIME if we own the file or are root.
557 if (!load_file(value
, i
->get_num_arg(), NOCACHE
,
559 cerr
<< "Couldn't load file '" << value
<< "': "
560 << strerror(errno
) << endl
;
563 if (!truncated
) break;
566 case Action::TRUNCATE
:
567 utf8_truncate(value
, i
->get_num_arg());
// Turn on spelling data generation for later indexing of this field.
570 indexer
.set_flags(indexer
.FLAG_SPELLING
);
572 case Action::UNHTML
: {
575 // Default HTML character set is latin 1, though
576 // not specifying one is deprecated these days.
577 p
.parse_html(value
, "iso-8859-1", false);
// The parser throws the newly found charset name as a string; parse again
// with that charset.
578 } catch (const string
& newcharset
) {
580 p
.parse_html(value
, newcharset
, true);
582 if (p
.indexing_allowed
)
588 case Action::UNIQUE
: {
589 // If there's no text, just issue a warning.
591 cout
<< fname
<< ':' << line_no
592 << ": Ignoring UNIQUE action on empty text"
597 // Ensure that the value of this field is unique.
598 // If a record already exists with the same value,
599 // it will be replaced with the new record.
601 // Unique fields aren't considered content - if
602 // there are no other fields in the document, the
603 // document is to be deleted.
604 this_field_is_content
= false;
606 // Argument is the prefix to add to the field value
607 // to get the unique term.
608 string t
= i
->get_string_arg();
609 if (prefix_needs_colon(t
, value
[0])) t
+= ':';
// Look for an existing document carrying the unique term.
613 Xapian::PostingIterator p
= database
.postlist_begin(t
);
614 if (p
!= database
.postlist_end(t
)) {
617 } catch (const Xapian::Error
&e
) {
618 // Hmm, what happened?
619 cout
<< "Caught exception in UNIQUE!" << endl
;
620 cout
<< "E: " << e
.get_description() << endl
;
// Store the raw value in the slot given by the numeric argument.
628 doc
.add_value(i
->get_num_arg(), value
);
630 case Action::VALUENUMERIC
: {
631 if (value
.empty()) break;
// Parse with strtod() and store the sortable_serialise() form.
633 double dbl
= strtod(value
.c_str(), &end
);
635 cout
<< fname
<< ':' << line_no
<< ": Warning: "
636 "Trailing characters in VALUENUMERIC: '"
637 << value
<< "'" << endl
;
639 doc
.add_value(i
->get_num_arg(),
640 Xapian::sortable_serialise(dbl
));
643 case Action::VALUEPACKED
: {
645 if (value
.empty() || !C_isdigit(value
[0])) {
646 // strtoul() accepts leading whitespace and negated
647 // values, neither of which we want to allow.
652 word
= strtoul(value
.c_str(), &q
, 10);
653 if (!errno
&& *q
!= '\0') {
654 // Trailing characters after converted value.
659 cout
<< fname
<< ':' << line_no
<< ": Warning: "
660 "valuepacked \"" << value
<< "\" ";
661 if (errno
== ERANGE
) {
662 cout
<< "out of range";
664 cout
<< "not an unsigned integer";
// Store the parsed number in binary form in the requested slot.
668 int valueslot
= i
->get_num_arg();
669 doc
.add_value(valueslot
, int_to_binary_string(word
));
// Date action: the string argument selects the input format.
673 const string
& type
= i
->get_string_arg();
675 if (type
== "unix") {
// NOTE(review): atoi() limits the timestamp to the range of int, and the
// pointer returned by localtime() is used without a null check.
676 time_t t
= atoi(value
.c_str());
677 struct tm
*tm
= localtime(&t
);
678 int y
= tm
->tm_year
+ 1900;
679 int m
= tm
->tm_mon
+ 1;
680 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
681 } else if (type
== "yyyymmdd") {
682 if (value
.length() == 8) yyyymmdd
= value
;
684 if (yyyymmdd
.empty()) break;
// Add D, M and Y prefixed boolean terms built from yyyymmdd.
686 doc
.add_boolean_term("D" + yyyymmdd
);
689 doc
.add_boolean_term("M" + yyyymmdd
);
692 doc
.add_boolean_term("Y" + yyyymmdd
);
695 case Action::PARSEDATE
: {
// Parse the value with strptime() using the format given as the argument,
// then replace the value with the timegm() result rendered by str().
696 string dateformat
= i
->get_string_arg();
698 memset(&tm
, 0, sizeof(tm
));
699 auto ret
= strptime(value
.c_str(), dateformat
.c_str(), &tm
);
701 cout
<< fname
<< ':' << line_no
<< ": Warning: "
702 "\"" << value
<< "\" doesn't match format "
703 "\"" << dateformat
<< '\"' << endl
;
708 cout
<< fname
<< ':' << line_no
<< ": Warning: "
709 "\"" << value
<< "\" not fully matched by "
710 "format \"" << dateformat
<< "\" "
711 "(\"" << ret
<< "\" left over) but "
712 "indexing anyway" << endl
;
715 value
= str(timegm(&tm
));
719 /* Empty default case to avoid "unhandled enum value"
724 if (this_field_is_content
) seen_content
= true;
725 if (stream
.eof()) break;
728 // If we haven't seen any fields (other than unique identifiers)
729 // the document is to be deleted.
732 database
.delete_document(docid
);
733 if (verbose
) cout
<< "Del: " << docid
<< endl
;
// Walk the collected field values for this record.
738 for (auto&& i
: fields
) {
739 for (auto&& field_val
: i
.second
) {
747 // Put the data in the document
750 // Add the document to the database
753 database
.replace_document(docid
, doc
);
754 if (verbose
) cout
<< "Replace: " << docid
<< endl
;
756 } catch (const Xapian::Error
&e
) {
757 cout
<< "E: " << e
.get_description() << endl
;
758 // Possibly the document was deleted by another
759 // process in the meantime...?
760 docid
= database
.add_document(doc
);
761 cout
<< "Replace failed, adding as new: " << docid
<< endl
;
764 docid
= database
.add_document(doc
);
765 if (verbose
) cout
<< "Add: " << docid
<< endl
;
771 // Commit after each file to make sure all changes from that file make it
773 if (verbose
) cout
<< "Committing: " << endl
;
// Entry point: parse the command-line options, load the index script, open
// the database, then index standard input or each input file named on the
// command line, and finally print the added, replaced and deleted counts.
// Exceptions are reported on cout.
780 main(int argc
, char **argv
)
782 // If the database already exists, default to updating not overwriting.
783 int database_mode
= Xapian::DB_CREATE_OR_OPEN
;
// Default stemmer; replaced when the stemmer option is given.
785 Xapian::Stem
stemmer("english");
787 static const struct option longopts
[] = {
788 { "help", no_argument
, NULL
, 'h' },
789 { "version", no_argument
, NULL
, 'V' },
790 { "stemmer", required_argument
, NULL
, 's' },
791 { "overwrite", no_argument
, NULL
, 'o' },
792 { "verbose", no_argument
, NULL
, 'v' },
796 bool more
= true, show_help
= false;
// NOTE(review): overwrite is available as a long option only; the short
// option string below has no o.
798 switch (gnu_getopt_long(argc
, argv
, "vs:hV", longopts
, NULL
)) {
807 case 'V': // --version
808 print_package_info(PROG_NAME
);
810 case 'o': // --overwrite
811 database_mode
= Xapian::DB_CREATE_OR_OVERWRITE
;
// An unknown stemming language is reported on cerr together with the list of
// available languages.
818 stemmer
= Xapian::Stem(optarg
);
819 } catch (const Xapian::InvalidArgumentError
&) {
820 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n";
821 cerr
<< "Available language names are: "
822 << Xapian::Stem::get_available_languages() << endl
;
// Print the usage text when help was requested or fewer than two arguments
// are available; the exit status is 0 for explicit help and 1 otherwise.
831 if (show_help
|| argc
< 2) {
832 cout
<< PROG_NAME
" - " PROG_DESC
"\n"
833 "Usage: " PROG_NAME
" [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
835 "Creates or updates a Xapian database with the data from the input files listed\n"
836 "on the command line. If no files are specified, data is read from stdin.\n"
838 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
839 "format for INDEXER_SCRIPT.\n"
842 " -v, --verbose display additional messages to aid debugging\n"
843 " --overwrite create the database anew (the default is to update if\n"
844 " the database already exists)\n";
845 print_stemmer_help("");
846 print_help_and_version_help("");
847 exit(show_help
? 0 : 1);
// NOTE(review): the script is read from argv[1] and the database opened from
// argv[0], which assumes argv and argc were advanced past the options; that
// adjustment is not visible in this excerpt - confirm.
850 parse_index_script(argv
[1]);
852 // Open the database. If another process is currently updating the
853 // database, wait for the lock to become available.
854 auto flags
= database_mode
| Xapian::DB_RETRY_LOCK
;
855 Xapian::WritableDatabase
database(argv
[0], flags
);
857 Xapian::TermGenerator indexer
;
858 indexer
.set_stemmer(stemmer
);
859 // Set the database for spellings to be added to by the "spell" action.
860 indexer
.set_database(database
);
// Index records from standard input (per the usage text, when no input files
// were given).
868 index_file("<stdin>", cin
, database
, indexer
);
870 // Read file(s) listed on the command line.
871 for (int i
= 2; i
< argc
; ++i
) {
872 ifstream
stream(argv
[i
]);
874 index_file(argv
[i
], stream
, database
, indexer
);
876 cout
<< "Can't open file " << argv
[i
] << endl
;
// Summary of what was done.
881 cout
<< "records (added, replaced, deleted) = (" << addcount
<< ", "
882 << repcount
<< ", " << delcount
<< ")" << endl
;
// Failures are reported on cout.
883 } catch (const Xapian::Error
&error
) {
884 cout
<< "Exception: " << error
.get_description() << endl
;
886 } catch (const std::bad_alloc
&) {
887 cout
<< "Exception: std::bad_alloc" << endl
;
890 cout
<< "Unknown Exception" << endl
;