3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Sam Liddicott
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
35 #include <unordered_set>
44 #include "commonhelp.h"
47 #include "myhtmlparse.h"
49 #include "stringutils.h"
51 #include "utf8truncate.h"
55 #include "gnu_getopt.h"
59 #define PROG_NAME "scriptindex"
60 #define PROG_DESC "index arbitrary data as described by an index script"
68 prefix_needs_colon(const string
& prefix
, unsigned ch
)
70 if (!C_isupper(ch
) && ch
!= ':') return false;
71 string::size_type len
= prefix
.length();
72 return (len
> 1 && prefix
[len
- 1] != ':');
75 const char * action_names
[] = {
77 "boolean", "date", "field", "hash", "hextobin", "index", "indexnopos",
78 "load", "lower", "parsedate", "spell", "split", "truncate", "unhtml",
79 "unique", "value", "valuenumeric", "valuepacked", "weight"
83 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
89 BOOLEAN
, DATE
, FIELD
, HASH
, HEXTOBIN
, INDEX
, INDEXNOPOS
, LOAD
, LOWER
,
90 PARSEDATE
, SPELL
, SPLIT
, TRUNCATE
, UNHTML
, UNIQUE
, VALUE
,
91 VALUENUMERIC
, VALUEPACKED
, WEIGHT
93 enum { SPLIT_NONE
, SPLIT_DEDUP
, SPLIT_SORT
};
98 // Offset into indexscript line.
101 Action(type action_
, size_t pos_
)
102 : action(action_
), num_arg(0), pos(pos_
) { }
103 Action(type action_
, size_t pos_
, const string
& arg
)
104 : action(action_
), string_arg(arg
), pos(pos_
) {
105 num_arg
= atoi(string_arg
.c_str());
107 Action(type action_
, size_t pos_
, const string
& arg
, int num
)
108 : action(action_
), num_arg(num
), string_arg(arg
), pos(pos_
) { }
109 type
get_action() const { return action
; }
110 int get_num_arg() const { return num_arg
; }
111 void set_num_arg(int num
) { num_arg
= num
; }
112 const string
& get_string_arg() const { return string_arg
; }
113 size_t get_pos() const { return pos
; }
/// Kind of diagnostic being reported via report_location().
enum diag_type {
    DIAG_ERROR,
    DIAG_WARN,
    DIAG_NOTE
};
119 report_location(enum diag_type type
,
120 const string
& filename
,
122 size_t pos
= string::npos
)
128 if (pos
!= string::npos
) {
129 // The first column is numbered 1.
130 cerr
<< ':' << pos
+ 1;
137 cerr
<< ": warning: ";
146 report_useless_action(const string
&file
, size_t line
, size_t pos
,
147 const string
&action
)
149 report_location(DIAG_WARN
, file
, line
, pos
);
150 cerr
<< "Index action '" << action
<< "' has no effect" << endl
;
152 static bool given_left_to_right_warning
= false;
153 if (!given_left_to_right_warning
) {
154 given_left_to_right_warning
= true;
155 report_location(DIAG_NOTE
, file
, line
, pos
);
156 cerr
<< "Actions are executed from left to right" << endl
;
160 static map
<string
, vector
<Action
>> index_spec
;
163 parse_index_script(const string
&filename
)
165 ifstream
script(filename
.c_str());
166 if (!script
.is_open()) {
167 report_location(DIAG_ERROR
, filename
);
168 cerr
<< strerror(errno
) << endl
;
173 bool had_unique
= false;
174 while (getline(script
, line
)) {
176 vector
<string
> fields
;
177 vector
<Action
> actions
;
178 string::const_iterator i
, j
;
179 const string
&s
= line
;
180 i
= find_if(s
.begin(), s
.end(), [](char ch
) { return !C_isspace(ch
); });
181 if (i
== s
.end() || *i
== '#') continue;
183 if (!C_isalnum(*i
)) {
184 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
185 cerr
<< "field name must start with alphanumeric" << endl
;
188 j
= find_if(i
, s
.end(),
189 [](char ch
) { return !C_isalnum(ch
) && ch
!= '_'; });
190 fields
.push_back(string(i
, j
));
191 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
192 if (i
== s
.end()) break;
195 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
199 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
200 cerr
<< "bad character '" << *i
<< "' in fieldname" << endl
;
204 Xapian::termcount weight
= 1;
205 size_t useless_weight_pos
= string::npos
;
206 map
<string
, Action::type
> boolmap
;
208 while (j
!= s
.end()) {
209 size_t action_pos
= j
- s
.begin();
210 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isalnum(ch
); });
211 string
action(s
, j
- s
.begin(), i
- j
);
212 Action::type code
= Action::BAD
;
213 unsigned min_args
= 0, max_args
= 0;
214 bool takes_integer_argument
= false;
215 if (!action
.empty()) {
218 if (action
== "boolean") {
219 code
= Action::BOOLEAN
;
224 if (action
== "date") {
226 min_args
= max_args
= 1;
230 if (action
== "field") {
231 code
= Action::FIELD
;
236 if (action
== "hash") {
239 takes_integer_argument
= true;
240 } else if (action
== "hextobin") {
241 code
= Action::HEXTOBIN
;
245 if (action
== "index") {
246 code
= Action::INDEX
;
248 } else if (action
== "indexnopos") {
249 code
= Action::INDEXNOPOS
;
254 if (action
== "lower") {
255 code
= Action::LOWER
;
256 } else if (action
== "load") {
261 if (action
== "parsedate") {
262 code
= Action::PARSEDATE
;
263 min_args
= max_args
= 1;
267 if (action
== "spell") {
268 code
= Action::SPELL
;
269 } else if (action
== "split") {
270 code
= Action::SPLIT
;
276 if (action
== "truncate") {
277 code
= Action::TRUNCATE
;
278 min_args
= max_args
= 1;
279 takes_integer_argument
= true;
283 if (action
== "unhtml") {
284 code
= Action::UNHTML
;
285 } else if (action
== "unique") {
286 code
= Action::UNIQUE
;
287 min_args
= max_args
= 1;
291 if (action
== "value") {
292 code
= Action::VALUE
;
293 min_args
= max_args
= 1;
294 takes_integer_argument
= true;
295 } else if (action
== "valuenumeric") {
296 code
= Action::VALUENUMERIC
;
297 min_args
= max_args
= 1;
298 takes_integer_argument
= true;
299 } else if (action
== "valuepacked") {
300 code
= Action::VALUEPACKED
;
301 min_args
= max_args
= 1;
302 takes_integer_argument
= true;
306 if (action
== "weight") {
307 code
= Action::WEIGHT
;
308 min_args
= max_args
= 1;
309 takes_integer_argument
= true;
314 if (code
== Action::BAD
) {
315 report_location(DIAG_ERROR
, filename
, line_no
, action_pos
);
316 cerr
<< "Unknown index action '" << action
<< "'" << endl
;
319 auto i_after_action
= i
;
320 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
322 if (i
!= s
.end() && *i
== '=') {
323 if (i
!= i_after_action
) {
324 report_location(DIAG_WARN
, filename
, line_no
,
325 i_after_action
- s
.begin());
326 cerr
<< "putting spaces between the action and '=' is "
327 "deprecated." << endl
;
331 report_location(DIAG_ERROR
, filename
, line_no
,
333 cerr
<< "Index action '" << action
334 << "' doesn't take an argument" << endl
;
339 j
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
341 report_location(DIAG_WARN
, filename
, line_no
,
343 cerr
<< "putting spaces between '=' and the argument is "
344 "deprecated." << endl
;
349 if (j
!= s
.end() && *j
== '"') {
354 i
= find_if(j
, s
.end(),
356 return ch
== '"' || ch
== '\\';
359 report_location(DIAG_ERROR
, filename
, line_no
,
361 cerr
<< "No closing quote" << endl
;
371 report_location(DIAG_ERROR
, filename
, line_no
,
373 cerr
<< "Bad escaping in quoted action argument"
402 if (!C_isxdigit(ch1
) ||
405 ch
= hex_digit(ch1
) << 4 |
415 vals
.emplace_back(std::move(arg
));
416 if (i
== s
.end() || C_isspace(*i
)) break;
418 report_location(DIAG_ERROR
, filename
, line_no
,
420 cerr
<< "Unexpected character '" << *i
421 << "' after closing quote" << endl
;
425 } else if (max_args
> 1) {
426 // Unquoted argument, split on comma.
427 i
= find_if(j
, s
.end(),
429 return C_isspace(ch
) || ch
== ',';
431 vals
.emplace_back(j
, i
);
432 if (*i
!= ',') break;
435 // Unquoted argument, including any commas.
436 i
= find_if(j
, s
.end(),
437 [](char ch
) { return C_isspace(ch
); });
438 vals
.emplace_back(j
, i
);
443 if (vals
.size() == max_args
) {
444 report_location(DIAG_ERROR
, filename
, line_no
,
446 cerr
<< "Index action '" << action
447 << "' takes at most " << max_args
<< " arguments"
453 if (vals
.size() < min_args
) {
454 report_location(DIAG_ERROR
, filename
, line_no
,
456 if (min_args
== max_args
) {
457 cerr
<< "Index action '" << action
458 << "' requires " << min_args
<< " arguments"
462 cerr
<< "Index action '" << action
463 << "' requires at least " << min_args
<< " arguments"
473 if (takes_integer_argument
) {
474 auto dot
= val
.find('.');
475 if (dot
!= string::npos
) {
476 report_location(DIAG_WARN
, filename
, line_no
,
477 j
- s
.begin() + dot
);
478 cerr
<< "Index action '" << action
479 << "' takes an integer argument" << endl
;
484 case Action::INDEXNOPOS
:
485 actions
.emplace_back(code
, action_pos
, val
, weight
);
486 useless_weight_pos
= string::npos
;
489 // We don't push an Action for WEIGHT - instead we
490 // store it ready to use in the INDEX and INDEXNOPOS
492 weight
= atoi(val
.c_str());
493 if (useless_weight_pos
!= string::npos
) {
494 report_useless_action(filename
, line_no
,
495 useless_weight_pos
, action
);
497 useless_weight_pos
= action_pos
;
499 case Action::SPLIT
: {
501 report_location(DIAG_ERROR
, filename
, line_no
);
502 cerr
<< "Split delimiter can't be empty" << endl
;
505 int operation
= Action::SPLIT_NONE
;
506 if (vals
.size() >= 2) {
507 if (vals
[1] == "dedup") {
508 operation
= Action::SPLIT_DEDUP
;
509 } else if (vals
[1] == "sort") {
510 operation
= Action::SPLIT_SORT
;
511 } else if (vals
[1] == "none") {
512 operation
= Action::SPLIT_NONE
;
514 report_location(DIAG_ERROR
, filename
, line_no
);
515 cerr
<< "Bad split operation '" << vals
[1]
520 actions
.emplace_back(code
, action_pos
, val
, operation
);
523 case Action::TRUNCATE
:
524 if (!actions
.empty() &&
525 actions
.back().get_action() == Action::LOAD
) {
526 /* Turn "load truncate=n" into "load" with
527 * num_arg n, so that we don't needlessly
528 * allocate memory and read data we're just
534 actions
.emplace_back(code
, action_pos
, val
);
538 report_location(DIAG_ERROR
, filename
, line_no
,
540 cerr
<< "Index action 'unique' used more than once"
545 if (boolmap
.find(val
) == boolmap
.end())
546 boolmap
[val
] = Action::UNIQUE
;
547 actions
.emplace_back(code
, action_pos
, val
);
550 actions
.emplace_back(code
, action_pos
, val
);
551 auto& obj
= actions
.back();
552 auto max_length
= obj
.get_num_arg();
553 if (max_length
< 6) {
554 report_location(DIAG_ERROR
, filename
, line_no
,
555 obj
.get_pos() + 4 + 1);
556 cerr
<< "Index action 'hash' takes an integer "
557 "argument which must be at least 6" << endl
;
562 case Action::BOOLEAN
:
563 boolmap
[val
] = Action::BOOLEAN
;
566 actions
.emplace_back(code
, action_pos
, val
);
568 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
571 report_location(DIAG_ERROR
, filename
, line_no
,
572 i_after_action
- s
.begin());
573 if (min_args
== max_args
) {
574 cerr
<< "Index action '" << action
<< "' requires "
575 << min_args
<< " arguments" << endl
;
578 cerr
<< "Index action '" << action
<< "' requires at least "
579 << min_args
<< " arguments" << endl
;
582 if (code
== Action::INDEX
|| code
== Action::INDEXNOPOS
) {
583 useless_weight_pos
= string::npos
;
584 actions
.emplace_back(code
, action_pos
, "", weight
);
585 } else if (code
== Action::HASH
) {
586 actions
.emplace_back(code
, action_pos
, "",
587 MAX_SAFE_TERM_LENGTH
- 1);
589 actions
.emplace_back(code
, action_pos
);
595 if (useless_weight_pos
!= string::npos
) {
596 report_useless_action(filename
, line_no
, useless_weight_pos
,
600 while (!actions
.empty()) {
602 Action::type action
= actions
.back().get_action();
605 case Action::HEXTOBIN
:
607 case Action::PARSEDATE
:
609 case Action::TRUNCATE
:
612 report_useless_action(filename
, line_no
,
613 actions
.back().get_pos(),
614 action_names
[action
]);
623 map
<string
, Action::type
>::const_iterator boolpfx
;
624 for (boolpfx
= boolmap
.begin(); boolpfx
!= boolmap
.end(); ++boolpfx
) {
625 if (boolpfx
->second
== Action::UNIQUE
) {
626 report_location(DIAG_WARN
, filename
, line_no
);
627 cerr
<< "Index action 'unique=" << boolpfx
->first
628 << "' without 'boolean=" << boolpfx
->first
<< "'" << endl
;
629 static bool given_doesnt_imply_boolean_warning
= false;
630 if (!given_doesnt_imply_boolean_warning
) {
631 given_doesnt_imply_boolean_warning
= true;
632 report_location(DIAG_NOTE
, filename
, line_no
);
633 cerr
<< "'unique' doesn't implicitly add a boolean term"
639 vector
<string
>::const_iterator field
;
640 for (field
= fields
.begin(); field
!= fields
.end(); ++field
) {
641 vector
<Action
> &v
= index_spec
[*field
];
643 if (fields
.size() == 1) {
644 // Optimise common case where there's only one fieldname
645 // for a list of actions.
646 v
= std::move(actions
);
651 v
.emplace_back(Action::NEW
, string::npos
);
652 v
.insert(v
.end(), actions
.begin(), actions
.end());
657 if (index_spec
.empty()) {
658 report_location(DIAG_ERROR
, filename
, line_no
);
659 cerr
<< "No rules found in index script" << endl
;
665 run_actions(vector
<Action
>::const_iterator action_it
,
666 vector
<Action
>::const_iterator action_end
,
667 Xapian::WritableDatabase
& database
,
668 Xapian::TermGenerator
& indexer
,
669 const string
& old_value
,
670 bool& this_field_is_content
, Xapian::Document
& doc
,
671 map
<string
, list
<string
>>& fields
,
672 string
& field
, const char* fname
,
673 size_t line_no
, Xapian::docid
& docid
)
675 string value
= old_value
;
676 while (action_it
!= action_end
) {
677 auto& action
= *action_it
++;
678 switch (action
.get_action()) {
683 // We're processing the same field again - give it a reprieve.
684 this_field_is_content
= true;
687 if (!value
.empty()) {
688 string f
= action
.get_string_arg();
689 if (f
.empty()) f
= field
;
690 // replace newlines with spaces
692 string::size_type j
= 0;
693 while ((j
= s
.find('\n', j
)) != string::npos
)
695 fields
[f
].push_back(s
);
699 indexer
.index_text(value
,
700 action
.get_num_arg(),
701 action
.get_string_arg());
703 case Action::INDEXNOPOS
:
704 // No positional information so phrase searching won't work.
705 // However, the database will use much less diskspace.
706 indexer
.index_text_without_positions(value
,
707 action
.get_num_arg(),
708 action
.get_string_arg());
710 case Action::BOOLEAN
: {
711 // Do nothing if there's no text.
712 if (value
.empty()) break;
714 string term
= action
.get_string_arg();
715 if (prefix_needs_colon(term
, value
[0])) term
+= ':';
718 doc
.add_boolean_term(term
);
722 unsigned int max_length
= action
.get_num_arg();
723 if (value
.length() > max_length
)
724 value
= hash_long_term(value
, max_length
);
727 case Action::HEXTOBIN
: {
728 size_t len
= value
.length();
730 report_location(DIAG_ERROR
, fname
, line_no
);
731 cerr
<< "hextobin: input must have even length"
735 output
.reserve(len
/ 2);
736 for (size_t j
= 0; j
< len
; j
+= 2) {
738 char b
= value
[j
+ 1];
739 if (!C_isxdigit(a
) || !C_isxdigit(b
)) {
740 report_location(DIAG_ERROR
, fname
, line_no
);
741 cerr
<< "hextobin: input must be all hex "
745 char r
= (hex_digit(a
) << 4) | hex_digit(b
);
748 value
= std::move(output
);
754 value
= Xapian::Unicode::tolower(value
);
757 bool truncated
= false;
758 // FIXME: Use NOATIME if we own the file or are root.
759 if (!load_file(value
, action
.get_num_arg(), NOCACHE
,
761 report_location(DIAG_ERROR
, fname
, line_no
);
762 cerr
<< "Couldn't load file '" << value
<< "': "
763 << strerror(errno
) << endl
;
766 if (!truncated
) break;
769 case Action::TRUNCATE
:
770 utf8_truncate(value
, action
.get_num_arg());
773 indexer
.set_flags(indexer
.FLAG_SPELLING
);
775 case Action::SPLIT
: {
776 // Execute actions on the split up to the first NEW, if any.
777 vector
<Action
>::const_iterator split_end
= action_it
;
778 while (split_end
!= action_end
&&
779 split_end
->get_action() != Action::NEW
) {
785 } else if (action
.get_num_arg() != Action::SPLIT_SORT
) {
786 // Generate split as we consume it.
787 const string
& delimiter
= action
.get_string_arg();
789 unique_ptr
<unordered_set
<string
>> seen
;
790 if (action
.get_num_arg() == Action::SPLIT_DEDUP
) {
791 seen
.reset(new unordered_set
<string
>);
794 if (delimiter
.size() == 1) {
795 // Special case for common single character delimiter.
796 char ch
= delimiter
[0];
797 string::size_type i
= 0;
799 string::size_type j
= value
.find(ch
, i
);
801 string
val(value
, i
, j
- i
);
802 if (!seen
.get() || seen
->insert(val
).second
) {
803 run_actions(action_it
, split_end
,
806 this_field_is_content
, doc
,
808 field
, fname
, line_no
,
812 if (j
== string::npos
) break;
816 string::size_type i
= 0;
818 string::size_type j
= value
.find(delimiter
, i
);
820 string
val(value
, i
, j
- i
);
821 if (!seen
.get() || seen
->insert(val
).second
) {
822 run_actions(action_it
, split_end
,
825 this_field_is_content
, doc
,
827 field
, fname
, line_no
,
831 if (j
== string::npos
) break;
832 i
= j
+ delimiter
.size();
836 vector
<string
> split_values
;
837 const string
& delimiter
= action
.get_string_arg();
838 if (delimiter
.size() == 1) {
839 // Special case for common single character delimiter.
840 char ch
= delimiter
[0];
841 string::size_type i
= 0;
843 string::size_type j
= value
.find(ch
, i
);
845 split_values
.emplace_back(value
, i
, j
- i
);
847 if (j
== string::npos
) break;
851 string::size_type i
= 0;
853 string::size_type j
= value
.find(delimiter
, i
);
855 split_values
.emplace_back(value
, i
, j
- i
);
857 if (j
== string::npos
) break;
858 i
= j
+ delimiter
.size();
862 sort(split_values
.begin(), split_values
.end());
864 for (auto&& val
: split_values
) {
865 run_actions(action_it
, split_end
,
866 database
, indexer
, val
,
867 this_field_is_content
, doc
, fields
,
868 field
, fname
, line_no
,
873 action_it
= split_end
;
876 case Action::UNHTML
: {
879 // Default HTML character set is latin 1, though
880 // not specifying one is deprecated these days.
881 p
.parse_html(value
, "iso-8859-1", false);
882 } catch (const string
& newcharset
) {
884 p
.parse_html(value
, newcharset
, true);
886 if (p
.indexing_allowed
)
892 case Action::UNIQUE
: {
893 // If there's no text, just issue a warning.
895 report_location(DIAG_WARN
, fname
, line_no
);
896 cerr
<< "Ignoring UNIQUE action on empty text"
901 // Ensure that the value of this field is unique.
902 // If a record already exists with the same value,
903 // it will be replaced with the new record.
905 // Unique fields aren't considered content - if
906 // there are no other fields in the document, the
907 // document is to be deleted.
908 this_field_is_content
= false;
910 // Argument is the prefix to add to the field value
911 // to get the unique term.
912 string t
= action
.get_string_arg();
913 if (prefix_needs_colon(t
, value
[0])) t
+= ':';
915 Xapian::PostingIterator p
= database
.postlist_begin(t
);
916 if (p
!= database
.postlist_end(t
)) {
923 doc
.add_value(action
.get_num_arg(), value
);
925 case Action::VALUENUMERIC
: {
926 if (value
.empty()) break;
928 double dbl
= strtod(value
.c_str(), &end
);
930 report_location(DIAG_WARN
, fname
, line_no
);
931 cerr
<< "Trailing characters in VALUENUMERIC: '"
932 << value
<< "'" << endl
;
934 doc
.add_value(action
.get_num_arg(),
935 Xapian::sortable_serialise(dbl
));
938 case Action::VALUEPACKED
: {
940 if (value
.empty() || !C_isdigit(value
[0])) {
941 // strtoul() accepts leading whitespace and negated
942 // values, neither of which we want to allow.
947 word
= strtoul(value
.c_str(), &q
, 10);
948 if (!errno
&& *q
!= '\0') {
949 // Trailing characters after converted value.
954 report_location(DIAG_WARN
, fname
, line_no
);
955 cerr
<< "valuepacked \"" << value
<< "\" ";
956 if (errno
== ERANGE
) {
957 cerr
<< "out of range";
959 cerr
<< "not an unsigned integer";
963 int valueslot
= action
.get_num_arg();
964 doc
.add_value(valueslot
, int_to_binary_string(word
));
968 const string
& type
= action
.get_string_arg();
970 if (type
== "unix") {
971 time_t t
= atoi(value
.c_str());
972 struct tm
*tm
= localtime(&t
);
973 int y
= tm
->tm_year
+ 1900;
974 int m
= tm
->tm_mon
+ 1;
975 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
976 } else if (type
== "yyyymmdd") {
977 if (value
.length() == 8) yyyymmdd
= value
;
979 if (yyyymmdd
.empty()) break;
981 doc
.add_boolean_term("D" + yyyymmdd
);
984 doc
.add_boolean_term("M" + yyyymmdd
);
987 doc
.add_boolean_term("Y" + yyyymmdd
);
990 case Action::PARSEDATE
: {
991 string dateformat
= action
.get_string_arg();
993 memset(&tm
, 0, sizeof(tm
));
994 auto ret
= strptime(value
.c_str(), dateformat
.c_str(), &tm
);
996 report_location(DIAG_WARN
, fname
, line_no
);
997 cerr
<< "\"" << value
<< "\" doesn't match format "
998 "\"" << dateformat
<< '\"' << endl
;
1003 report_location(DIAG_WARN
, fname
, line_no
);
1004 cerr
<< "\"" << value
<< "\" not fully matched by "
1005 "format \"" << dateformat
<< "\" "
1006 "(\"" << ret
<< "\" left over) but "
1007 "indexing anyway" << endl
;
1010 value
= str(timegm(&tm
));
1014 /* Empty default case to avoid "unhandled enum value"
1023 index_file(const char *fname
, istream
&stream
,
1024 Xapian::WritableDatabase
&database
, Xapian::TermGenerator
&indexer
)
1028 while (!stream
.eof() && getline(stream
, line
)) {
1030 Xapian::Document doc
;
1031 indexer
.set_document(doc
);
1032 Xapian::docid docid
= 0;
1033 map
<string
, list
<string
>> fields
;
1034 bool seen_content
= false;
1035 while (!line
.empty()) {
1036 // Cope with files from MS Windows (\r\n end of lines).
1037 // Trim multiple \r characters, since that seems the best way
1038 // to handle that case.
1039 string::size_type last
= line
.find_last_not_of('\r');
1040 if (last
== string::npos
) break;
1041 line
.resize(last
+ 1);
1043 string::size_type eq
= line
.find('=');
1044 if (eq
== string::npos
&& !line
.empty()) {
1045 report_location(DIAG_ERROR
, fname
, line_no
, line
.size());
1046 cerr
<< "expected = somewhere in this line" << endl
;
1047 // FIXME: die or what?
1049 string
field(line
, 0, eq
);
1050 string
value(line
, eq
+ 1, string::npos
);
1051 while (getline(stream
, line
)) {
1053 if (line
.empty() || line
[0] != '=') break;
1054 // Cope with files from MS Windows (\r\n end of lines).
1055 // Trim multiple \r characters, since that seems the best way
1056 // to handle that case.
1057 last
= line
.find_last_not_of('\r');
1058 // line[0] == '=', so last != string::npos.
1059 // Replace the '=' with a '\n' so we don't have to use substr.
1061 line
.resize(last
+ 1);
1065 // Default to not indexing spellings.
1066 indexer
.set_flags(Xapian::TermGenerator::flags(0));
1068 bool this_field_is_content
= true;
1069 const vector
<Action
>& v
= index_spec
[field
];
1070 run_actions(v
.begin(), v
.end(),
1071 database
, indexer
, value
,
1072 this_field_is_content
, doc
, fields
,
1073 field
, fname
, line_no
,
1075 if (this_field_is_content
) seen_content
= true;
1076 if (stream
.eof()) break;
1079 // If we haven't seen any fields (other than unique identifiers)
1080 // the document is to be deleted.
1081 if (!seen_content
) {
1083 database
.delete_document(docid
);
1084 if (verbose
) cout
<< "Del: " << docid
<< endl
;
1089 for (auto&& i
: fields
) {
1090 for (auto&& field_val
: i
.second
) {
1098 // Put the data in the document
1101 // Add the document to the database
1103 database
.replace_document(docid
, doc
);
1104 if (verbose
) cout
<< "Replace: " << docid
<< endl
;
1107 docid
= database
.add_document(doc
);
1108 if (verbose
) cout
<< "Add: " << docid
<< endl
;
1114 // Commit after each file to make sure all changes from that file make it
1116 if (verbose
) cout
<< "Committing: " << endl
;
1121 main(int argc
, char **argv
)
1123 // If the database already exists, default to updating not overwriting.
1124 int database_mode
= Xapian::DB_CREATE_OR_OPEN
;
1126 Xapian::Stem
stemmer("english");
1128 constexpr auto NO_ARG
= no_argument
;
1129 constexpr auto REQ_ARG
= required_argument
;
1130 static const struct option longopts
[] = {
1131 { "help", NO_ARG
, NULL
, 'h' },
1132 { "version", NO_ARG
, NULL
, 'V' },
1133 { "stemmer", REQ_ARG
, NULL
, 's' },
1134 { "overwrite", NO_ARG
, NULL
, 'o' },
1135 { "verbose", NO_ARG
, NULL
, 'v' },
1139 bool more
= true, show_help
= false;
1141 switch (gnu_getopt_long(argc
, argv
, "vs:hV", longopts
, NULL
)) {
1150 case 'V': // --version
1151 print_package_info(PROG_NAME
);
1153 case 'o': // --overwrite
1154 database_mode
= Xapian::DB_CREATE_OR_OVERWRITE
;
1161 stemmer
= Xapian::Stem(optarg
);
1162 } catch (const Xapian::InvalidArgumentError
&) {
1163 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n";
1164 cerr
<< "Available language names are: "
1165 << Xapian::Stem::get_available_languages() << endl
;
1174 if (show_help
|| argc
< 2) {
1175 cout
<< PROG_NAME
" - " PROG_DESC
"\n"
1176 "Usage: " PROG_NAME
" [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1178 "Creates or updates a Xapian database with the data from the input files listed\n"
1179 "on the command line. If no files are specified, data is read from stdin.\n"
1181 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1182 "format for INDEXER_SCRIPT.\n"
1185 " -v, --verbose display additional messages to aid debugging\n"
1186 " --overwrite create the database anew (the default is to update if\n"
1187 " the database already exists)\n";
1188 print_stemmer_help("");
1189 print_help_and_version_help("");
1190 exit(show_help
? 0 : 1);
1193 parse_index_script(argv
[1]);
1195 // Open the database. If another process is currently updating the
1196 // database, wait for the lock to become available.
1197 auto flags
= database_mode
| Xapian::DB_RETRY_LOCK
;
1198 Xapian::WritableDatabase
database(argv
[0], flags
);
1200 Xapian::TermGenerator indexer
;
1201 indexer
.set_stemmer(stemmer
);
1202 // Set the database for spellings to be added to by the "spell" action.
1203 indexer
.set_database(database
);
1211 index_file("<stdin>", cin
, database
, indexer
);
1213 // Read file(s) listed on the command line.
1214 for (int i
= 2; i
< argc
; ++i
) {
1215 ifstream
stream(argv
[i
]);
1217 index_file(argv
[i
], stream
, database
, indexer
);
1219 cerr
<< "Can't open file " << argv
[i
] << endl
;
1224 cout
<< "records (added, replaced, deleted) = (" << addcount
<< ", "
1225 << repcount
<< ", " << delcount
<< ")" << endl
;
1226 } catch (const Xapian::Error
&error
) {
1227 cerr
<< "Exception: " << error
.get_description() << endl
;
1229 } catch (const std::bad_alloc
&) {
1230 cerr
<< "Exception: std::bad_alloc" << endl
;
1233 cerr
<< "Unknown Exception" << endl
;