1 /* omindex.cc: index static documents into the omega db
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2005 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017 Olly Betts
7 * Copyright 2009 Frank J Bruzzaniti
8 * Copyright 2012 Mihai Bivol
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
33 #include <sys/types.h>
34 #include "safeunistd.h"
38 #include "safefcntl.h"
46 #include "commonhelp.h"
49 #include "index_file.h"
53 #include "stringutils.h"
54 #include "urlencode.h"
56 #include "gnu_getopt.h"
60 #define PROG_NAME "omindex"
61 #define PROG_DESC "Index static website data via the filesystem"
63 #define TITLE_SIZE 128
64 #define SAMPLE_SIZE 512
66 static bool follow_symlinks
= false;
67 static off_t max_size
= 0;
68 static std::string pretty_max_size
;
69 static bool verbose
= false;
70 static double sleep_before_opendir
= 0;
73 static string url_start_path
;
76 static vector
<pair
<const char*, const char*>> mime_patterns
;
80 p_notalnum(unsigned int c
)
86 index_file(const string
&file
, const string
&url
, DirectoryIterator
& d
,
87 map
<string
, string
>& mime_map
)
92 if (urlterm
.length() > MAX_SAFE_TERM_LENGTH
)
93 urlterm
= hash_long_term(urlterm
, MAX_SAFE_TERM_LENGTH
);
95 const char* leafname
= d
.leafname();
99 for (auto&& i
: mime_patterns
) {
100 if (fnmatch(i
.first
, leafname
, 0) == 0) {
101 if (strcmp(i
.second
, "ignore") == 0)
103 if (strcmp(i
.second
, "skip") == 0) {
104 string m
= "Leafname '";
106 m
+= "' matches pattern: ";
108 skip(urlterm
, file
.substr(root
.size()), m
,
109 d
.get_size(), d
.get_mtime());
119 const char * dot_ptr
= strrchr(leafname
, '.');
121 ext
.assign(dot_ptr
+ 1);
122 if (ext
.size() > max_ext_len
)
126 if (mimetype
.empty()) {
127 mimetype
= mimetype_from_ext(mime_map
, ext
);
128 if (mimetype
== "ignore") {
130 } else if (mimetype
== "skip") {
131 // Ignore mimetype, skipped mimetype should not be quietly ignored.
132 string m
= "skipping extension '";
135 skip(urlterm
, file
.substr(root
.size()), m
,
136 d
.get_size(), d
.get_mtime());
141 // Check the file size.
142 off_t size
= d
.get_size();
144 skip(urlterm
, file
.substr(root
.size()), "Zero-sized file",
145 size
, d
.get_mtime(), SKIP_VERBOSE_ONLY
);
149 if (max_size
> 0 && size
> max_size
) {
150 skip(urlterm
, file
.substr(root
.size()),
151 "Larger than size limit of " + pretty_max_size
,
157 // If we didn't get the mime type from the extension, call libmagic to get
159 if (mimetype
.empty()) {
160 mimetype
= d
.get_magic_mimetype();
161 if (mimetype
.empty()) {
162 skip(urlterm
, file
.substr(root
.size()), "Unknown extension and unrecognised format",
163 d
.get_size(), d
.get_mtime(), SKIP_SHOW_FILENAME
);
169 cout
<< "Indexing \"" << file
.substr(root
.size()) << "\" as "
170 << mimetype
<< " ... ";
172 Xapian::Document new_doc
;
174 // Use `file` as the basis, as we don't want URL encoding in these terms,
175 // but need to switch over the initial part so we get `/~olly/foo/bar` not
176 // `/home/olly/public_html/foo/bar`.
177 string
path_term("P");
178 path_term
+= url_start_path
;
179 path_term
.append(file
, root
.size(), string::npos
);
182 while ((i
= path_term
.rfind('/')) > 1 && i
!= string::npos
) {
184 if (path_term
.length() > MAX_SAFE_TERM_LENGTH
) {
185 new_doc
.add_boolean_term(hash_long_term(path_term
, MAX_SAFE_TERM_LENGTH
));
187 new_doc
.add_boolean_term(path_term
);
191 index_mimetype(file
, urlterm
, url
, ext
, mimetype
, d
, new_doc
, string());
195 index_directory(const string
&path
, const string
&url_
, size_t depth_limit
,
196 map
<string
, string
>& mime_map
)
199 cout
<< "[Entering directory \"" << path
.substr(root
.size()) << "\"]"
202 DirectoryIterator
d(follow_symlinks
);
204 // Crude workaround for MS-DFS share misbehaviour.
205 if (sleep_before_opendir
> 0.0)
206 RealTime::sleep(RealTime::now() + sleep_before_opendir
);
212 url_encode(url
, d
.leafname());
214 file
+= d
.leafname();
217 switch (d
.get_type()) {
218 case DirectoryIterator::DIRECTORY
: {
219 size_t new_limit
= depth_limit
;
221 if (--new_limit
== 0) continue;
225 index_directory(file
, url
, new_limit
, mime_map
);
228 case DirectoryIterator::REGULAR_FILE
:
229 index_file(file
, url
, d
, mime_map
);
232 skip("U" + url
, file
.substr(root
.size()), "Not a regular file",
233 d
.get_size(), d
.get_mtime(),
234 SKIP_VERBOSE_ONLY
| SKIP_SHOW_FILENAME
);
236 } catch (const FileNotFound
& e
) {
237 skip("U" + url
, file
.substr(root
.size()), "File removed during indexing",
238 d
.get_size(), d
.get_mtime(),
239 /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME
);
240 } catch (const std::string
& error
) {
241 skip("U" + url
, file
.substr(root
.size()), error
,
242 d
.get_size(), d
.get_mtime(), SKIP_SHOW_FILENAME
);
245 } catch (FileNotFound
) {
247 cout
<< "Directory \"" << path
.substr(root
.size()) << "\" "
248 "deleted during indexing" << endl
;
249 } catch (const std::string
& error
) {
250 cout
<< error
<< " - skipping directory "
251 "\"" << path
.substr(root
.size()) << "\"" << endl
;
258 // Don't want negative numbers, infinity, NaN, or hex numbers.
259 if (C_isdigit(p
[0]) && (p
[1] | 32) != 'x') {
260 double arg
= strtod(p
, &p
);
269 arg
*= (1024 * 1024);
273 arg
*= (1024 * 1024 * 1024);
285 main(int argc
, char **argv
)
287 // If overwrite is true, the database will be created anew even if it
289 bool overwrite
= false;
290 // If delete_removed_documents is true, delete any documents we don't see.
291 bool delete_removed_documents
= true;
292 // Retry files which we failed to index on a previous run?
293 bool retry_failed
= false;
294 bool use_ctime
= false;
295 bool spelling
= false;
296 bool skip_duplicates
= false;
297 bool ignore_exclusions
= false;
298 bool description_as_sample
= false;
300 size_t depth_limit
= 0;
301 size_t title_size
= TITLE_SIZE
;
302 size_t sample_size
= SAMPLE_SIZE
;
303 empty_body_type empty_body
= EMPTY_BODY_WARN
;
304 string site_term
, host_term
;
305 Xapian::Stem
stemmer("english");
307 enum { OPT_OPENDIR_SLEEP
= 256, OPT_SAMPLE
};
308 static const struct option longopts
[] = {
309 { "help", no_argument
, NULL
, 'h' },
310 { "version", no_argument
, NULL
, 'V' },
311 { "overwrite", no_argument
, NULL
, 'o' },
312 { "duplicates", required_argument
, NULL
, 'd' },
313 { "no-delete", no_argument
, NULL
, 'p' },
314 { "preserve-nonduplicates", no_argument
, NULL
, 'p' },
315 { "db", required_argument
, NULL
, 'D' },
316 { "url", required_argument
, NULL
, 'U' },
317 { "mime-type", required_argument
, NULL
, 'M' },
318 { "mime-type-match", required_argument
, NULL
, 'G' },
319 { "filter", required_argument
, NULL
, 'F' },
320 { "depth-limit",required_argument
, NULL
, 'l' },
321 { "follow", no_argument
, NULL
, 'f' },
322 { "ignore-exclusions", no_argument
, NULL
, 'i' },
323 { "stemmer", required_argument
, NULL
, 's' },
324 { "spelling", no_argument
, NULL
, 'S' },
325 { "verbose", no_argument
, NULL
, 'v' },
326 { "empty-docs", required_argument
, NULL
, 'e' },
327 { "max-size", required_argument
, NULL
, 'm' },
328 { "sample", required_argument
, NULL
, OPT_SAMPLE
},
329 { "sample-size",required_argument
, NULL
, 'E' },
330 { "title-size", required_argument
, NULL
, 'T' },
331 { "retry-failed", no_argument
, NULL
, 'R' },
332 { "opendir-sleep", required_argument
, NULL
, OPT_OPENDIR_SLEEP
},
333 { "track-ctime",no_argument
, NULL
, 'C' },
337 map
<string
, string
> mime_map
;
339 index_add_default_filters();
341 if (argc
== 2 && strcmp(argv
[1], "-v") == 0) {
342 // -v was the short option for --version in 1.2.3 and earlier, but
343 // now it is short for --verbose (for consistency with scriptindex)
344 // so if "-v" is the only option, translate it to "--version" for
345 // backwards compatibility.
346 argv
[1] = const_cast<char *>("--version");
351 while ((getopt_ret
= gnu_getopt_long(argc
, argv
, "hvd:D:U:M:F:l:s:pfRSVe:im:E:T:",
352 longopts
, NULL
)) != -1) {
353 switch (getopt_ret
) {
355 cout
<< PROG_NAME
" - " PROG_DESC
"\n\n"
356 "Usage: " PROG_NAME
" [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
358 "DIRECTORY is the directory to start indexing from.\n"
360 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
363 " -d, --duplicates set duplicate handling ('ignore' or 'replace')\n"
364 " -p, --no-delete skip the deletion of documents corresponding to\n"
365 " deleted files (--preserve-nonduplicates is a\n"
366 " deprecated alias for --no-delete)\n"
367 " -e, --empty-docs=ARG how to handle documents we extract no text from:\n"
368 " ARG can be index, warn (issue a diagnostic and\n"
369 " index), or skip. (default: warn)\n"
370 " -D, --db=DATABASE path to database to use\n"
371 " -U, --url=URL base url BASEDIR corresponds to (default: /)\n"
372 " -M, --mime-type=EXT:TYPE assume any file with extension EXT has MIME\n"
373 " Content-Type TYPE, instead of using libmagic\n"
374 " (empty TYPE removes any existing mapping for EXT;\n"
375 " other special TYPE values: 'ignore' and 'skip')\n"
376 " -G, --mime-type-match=GLOB:TYPE\n"
377 " assume any file with leaf name matching shell\n"
378 " wildcard pattern GLOB has MIME Content-Type TYPE\n"
379 " (special TYPE values: 'ignore' and 'skip')\n"
380 " -F, --filter=M[,[T][,C]]:CMD\n"
381 " process files with MIME Content-Type M using\n"
382 " command CMD, which produces output (on stdout or\n"
383 " in a temporary file) with format T (Content-Type\n"
384 " or file extension; currently txt (default), html\n"
385 " or svg) in character encoding C (default: UTF-8).\n"
386 " E.g. -Fapplication/octet-stream:'strings -n8'\n"
387 " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
388 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n"
389 " -f, --follow follow symbolic links\n"
390 " -i, --ignore-exclusions ignore meta robots tags and similar exclusions\n"
391 " -S, --spelling index data for spelling correction\n"
392 " -m, --max-size maximum size of file to index (in bytes or with a\n"
393 " suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
394 " (default: unlimited)\n"
395 " --sample=SOURCE what to use for the stored sample of text for\n"
396 " HTML documents - SOURCE can be 'body' or\n"
397 " 'description' (default: 'body')\n"
398 " -E, --sample-size=SIZE maximum size for the document text sample\n"
399 " (supports the same formats as --max-size).\n"
400 " (default: " STRINGIZE(SAMPLE_SIZE
) ")\n"
401 " -T, --title-size=SIZE maximum size for the document title\n"
402 " (supports the same formats as --max-size).\n"
403 " (default: " STRINGIZE(TITLE_SIZE
) ")\n"
404 " -R, --retry-failed retry files which omindex failed to extract text\n"
405 " from on a previous run\n"
406 " --opendir-sleep=SECS sleep for SECS seconds before opening each\n"
407 " directory - sleeping for 2 seconds seems to\n"
408 " reliably work around problems with indexing files\n"
409 " on Microsoft DFS shares.\n"
410 " -C, --track-ctime track each file's ctime so we can detect changes\n"
411 " to ownership or permissions.\n"
412 " -v, --verbose show more information about what is happening\n"
413 " --overwrite create the database anew (the default is to update\n"
414 " if the database already exists)" << endl
;
415 print_stemmer_help(" ");
416 print_help_and_version_help(" ");
420 print_package_info(PROG_NAME
);
422 case 'd': // how shall we handle duplicate documents?
425 skip_duplicates
= true;
428 skip_duplicates
= false;
433 if (strcmp(optarg
, "index") == 0) {
434 empty_body
= EMPTY_BODY_INDEX
;
435 } else if (strcmp(optarg
, "warn") == 0) {
436 empty_body
= EMPTY_BODY_WARN
;
437 } else if (strcmp(optarg
, "skip") == 0) {
438 empty_body
= EMPTY_BODY_SKIP
;
440 cerr
<< "Invalid --empty-docs value '" << optarg
<< "'\n"
441 "Valid values are index, warn, and skip." << endl
;
445 case 'p': // Keep documents even if the files have been removed.
446 delete_removed_documents
= false;
448 case 'l': { // Set recursion limit
449 int arg
= atoi(optarg
);
450 if (arg
< 0) arg
= 0;
451 depth_limit
= size_t(arg
);
454 case 'f': // Turn on following of symlinks
455 follow_symlinks
= true;
458 const char * s
= strrchr(optarg
, ':');
460 cerr
<< "Invalid MIME mapping '" << optarg
<< "'\n"
461 "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
462 "(or txt: to delete a default mapping)" << endl
;
466 // -Mtxt: results in an empty string, which effectively removes the
467 // default mapping for .txt files.
468 mime_map
[string(optarg
, s
- optarg
)] = string(s
+ 1);
469 max_ext_len
= max(max_ext_len
, strlen(s
+ 1));
473 const char * s
= strchr(optarg
, ':');
474 if (s
!= NULL
&& s
[1]) {
476 static_cast<const char *>(memchr(optarg
, ',', s
- optarg
));
477 string output_type
, output_charset
;
479 // Filter produces a specified content-type.
482 static_cast<const char *>(memchr(c
, ',', s
- c
));
484 output_type
.assign(c
, c2
- c
);
486 output_charset
.assign(c2
, s
- c2
);
488 output_type
.assign(c
, s
- c
);
491 if (output_type
.find('/') == string::npos
) {
492 map
<string
, string
>::const_iterator m
;
493 m
= mime_map
.find(output_type
);
494 if (m
!= mime_map
.end()) {
495 output_type
= m
->second
;
497 const char * r
= built_in_mime_map(output_type
);
498 if (r
) output_type
= r
;
501 if (output_type
!= "text/html" &&
502 output_type
!= "text/plain" &&
503 output_type
!= "image/svg+xml") {
504 cerr
<< "Currently only output types 'image/svg+xml', "
505 "'text/html' and 'text/plain' are supported."
513 const char * cmd
= s
+ 1;
514 // Analyse the command string to decide if it needs a shell.
515 bool use_shell
= command_needs_shell(cmd
);
516 index_command(string(optarg
, c
- optarg
),
517 Filter(string(cmd
), output_type
,
518 output_charset
, use_shell
));
520 cerr
<< "Invalid filter mapping '" << optarg
<< "'\n"
521 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or TYPE,EXT:COMMAND\n"
522 "e.g. 'application/octet-stream:strings -n8'"
534 case 'o': // --overwrite
538 ignore_exclusions
= true;
540 case 'R': // --retry-failed
545 stemmer
= Xapian::Stem(optarg
);
546 } catch (const Xapian::InvalidArgumentError
&) {
547 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n"
548 "Available language names are: "
549 << Xapian::Stem::get_available_languages() << endl
;
560 off_t arg
= parse_size(optarg
);
562 sample_size
= size_t(arg
);
565 cerr
<< PROG_NAME
": bad sample size '" << optarg
<< "'" << endl
;
569 off_t arg
= parse_size(optarg
);
571 title_size
= size_t(arg
);
574 cerr
<< PROG_NAME
": bad title size '" << optarg
<< "'" << endl
;
578 off_t size
= parse_size(optarg
);
582 // Set lsb to the lowest set bit in max_size.
583 off_t lsb
= max_size
& -max_size
;
584 if (lsb
>= off_t(1L << 30)) {
587 } else if (lsb
>= off_t(1L << 20)) {
590 } else if (lsb
>= off_t(1L << 10)) {
596 pretty_max_size
= str(size
);
597 pretty_max_size
+= suffix
;
600 cerr
<< PROG_NAME
": bad max size '" << optarg
<< "'" << endl
;
603 case OPT_OPENDIR_SLEEP
: {
604 // Don't want negative numbers, infinity, NaN, or hex numbers.
606 if (C_isdigit(p
[0]) && (p
[1] | 32) != 'x') {
607 sleep_before_opendir
= strtod(p
, &p
);
611 cerr
<< PROG_NAME
": bad --opendir-sleep argument: "
612 "'" << optarg
<< "'" << endl
;
616 if (strcmp(optarg
, "description") == 0) {
617 description_as_sample
= true;
618 } else if (strcmp(optarg
, "body") == 0) {
619 description_as_sample
= false;
621 cerr
<< "Invalid --sample value '" << optarg
<< "'\n"
622 "Valid values are body and description." << endl
;
630 char * s
= strrchr(optarg
, ':');
632 cerr
<< "Invalid MIME mapping '" << optarg
<< "'\n"
633 "Should be of the form GLOB:TYPE, e.g. *~:ignore"
638 cerr
<< "--mime-type-match isn't supported in this build because "
639 "the fnmatch() function wasn't found at configure time."
644 cerr
<< "--mime-type-match with an empty pattern can never "
648 if (memchr(optarg
, '/', s
- optarg
)) {
649 cerr
<< "--mime-type-match only matches against the leaf "
650 "filename so a pattern containing '/' can never match."
654 const char* type
= s
+ 1;
656 cerr
<< "--mime-type-match doesn't support an empty MIME type"
661 mime_patterns
.emplace_back(optarg
, type
);
665 case ':': // missing param
667 case '?': // unknown option: FIXME -> char
672 if (dbpath
.empty()) {
673 cerr
<< PROG_NAME
": you must specify a database with --db." << endl
;
676 if (baseurl
.empty()) {
677 cerr
<< PROG_NAME
": --url not specified, assuming '/'." << endl
;
679 // baseurl must end in a '/'.
680 if (!endswith(baseurl
, '/')) {
684 // Site term (omits the trailing slash):
686 site_term
.append(baseurl
, 0, baseurl
.size() - 1);
687 if (site_term
.size() > MAX_SAFE_TERM_LENGTH
)
688 site_term
= hash_long_term(site_term
, MAX_SAFE_TERM_LENGTH
);
690 // Host term, if the URL contains a hostname (omits any port number):
692 j
= find_if(baseurl
.begin(), baseurl
.end(), p_notalnum
) - baseurl
.begin();
693 if (j
> 0 && baseurl
.substr(j
, 3) == "://" && j
+ 3 < baseurl
.size()) {
695 // We must find a '/' - we ensured baseurl ended with a '/' above.
696 string::size_type k
= baseurl
.find('/', j
);
697 url_start_path
.assign(baseurl
, k
, string::npos
);
698 string::const_iterator l
;
699 l
= find(baseurl
.begin() + j
, baseurl
.begin() + k
, ':');
700 string::size_type host_len
= l
- baseurl
.begin() - j
;
702 host_term
.append(baseurl
, j
, host_len
);
703 // DNS hostname limit is 253.
704 if (host_term
.size() > MAX_SAFE_TERM_LENGTH
)
705 host_term
= hash_long_term(host_term
, MAX_SAFE_TERM_LENGTH
);
707 url_start_path
= baseurl
;
710 if (optind
>= argc
|| optind
+ 2 < argc
) {
711 cerr
<< PROG_NAME
": you must specify a directory to index.\n"
712 "Do this either as a single directory (corresponding to the base URL)\n"
713 "or two directories - the first corresponding to the base URL and the second\n"
714 "a subdirectory of that to index." << endl
;
719 if (!endswith(root
, '/')) {
722 if (optind
+ 2 == argc
) {
723 string start_url
= argv
[optind
+ 1];
724 if (startswith(start_url
, '/')) {
725 // Make relative to root.
726 if (!startswith(start_url
, root
)) {
727 cerr
<< PROG_NAME
": '" << argv
[optind
+ 1] << "' "
728 "is not a subdirectory of '" << argv
[optind
] << "'."
732 start_url
.erase(0, root
.size());
734 if (!endswith(start_url
, '/')) {
738 url_encode_path(baseurl
, start_url
);
743 index_init(dbpath
, stemmer
, root
, site_term
, host_term
, empty_body
,
744 (skip_duplicates
? DUP_SKIP
: DUP_CHECK_LAZILY
),
745 sample_size
, title_size
, max_ext_len
,
746 overwrite
, retry_failed
, delete_removed_documents
, verbose
,
747 use_ctime
, spelling
, ignore_exclusions
,
748 description_as_sample
);
749 index_directory(root
, baseurl
, depth_limit
, mime_map
);
750 index_handle_deletion();
753 } catch (const CommitAndExit
&e
) {
754 cout
<< "Exception: " << e
.what() << endl
;
755 cout
<< "Committing pending changes..." << endl
;
757 } catch (const Xapian::Error
&e
) {
758 cout
<< "Exception: " << e
.get_description() << endl
;
759 } catch (const exception
&e
) {
760 cout
<< "Exception: " << e
.what() << endl
;
761 } catch (const string
&s
) {
762 cout
<< "Exception: " << s
<< endl
;
763 } catch (const char *s
) {
764 cout
<< "Exception: " << s
<< endl
;
766 cout
<< "Caught unknown exception" << endl
;