Index Visio files using vsd2xhtml
[xapian.git] / xapian-applications / omega / omindex.cc
blob5dc42ebcfea583e15d640ceeb34806a640085dbc
1 /* omindex.cc: index static documents into the omega db
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2005 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017 Olly Betts
7 * Copyright 2009 Frank J Bruzzaniti
8 * Copyright 2012 Mihai Bivol
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <string>
31 #include <map>
33 #include <sys/types.h>
34 #include "safeunistd.h"
35 #include <cstdio>
36 #include <cstdlib>
37 #include <cstring>
38 #include "safefcntl.h"
40 #ifdef HAVE_FNMATCH
41 # include <fnmatch.h>
42 #endif
44 #include <xapian.h>
46 #include "commonhelp.h"
47 #include "diritor.h"
48 #include "hashterm.h"
49 #include "index_file.h"
50 #include "mime.h"
51 #include "realtime.h"
52 #include "str.h"
53 #include "stringutils.h"
54 #include "urlencode.h"
56 #include "gnu_getopt.h"
using namespace std;

// Program name and one-line description; both appear in the --help output
// and PROG_NAME prefixes most diagnostics written by main().
#define PROG_NAME "omindex"
#define PROG_DESC "Index static website data via the filesystem"

// Initial values for main()'s title_size and sample_size (also shown as the
// defaults in the --help text).
#define TITLE_SIZE 128
#define SAMPLE_SIZE 512

// Settings filled in by main() while parsing the command line.

// --follow: passed to the DirectoryIterator constructor.
static bool follow_symlinks = false;
// --verbose: enables the progress messages written to cout.
static bool verbose = false;
// --max-size: files larger than this are skipped; 0 disables the check.
static off_t max_size = 0;
// Human-readable rendering of max_size used in the skip message.
static string pretty_max_size;
// --opendir-sleep: seconds to sleep before opening each directory.
static double sleep_before_opendir = 0;

// Filesystem directory being indexed (always ends with '/').
static string root;
// Path part of the base URL, used as the start of the "P" path terms.
static string url_start_path;

#ifdef HAVE_FNMATCH
// (glob pattern, MIME type) pairs from --mime-type-match, in the order given.
static vector<pair<const char*, const char*>> mime_patterns;
#endif
79 inline static bool
80 p_notalnum(unsigned int c)
82 return !C_isalnum(c);
85 static void
86 index_file(const string &file, const string &url, DirectoryIterator & d,
87 map<string, string>& mime_map)
89 string urlterm("U");
90 urlterm += url;
92 if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
93 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
95 const char* leafname = d.leafname();
97 string mimetype;
98 #ifdef HAVE_FNMATCH
99 for (auto&& i : mime_patterns) {
100 if (fnmatch(i.first, leafname, 0) == 0) {
101 if (strcmp(i.second, "ignore") == 0)
102 return;
103 if (strcmp(i.second, "skip") == 0) {
104 string m = "Leafname '";
105 m += leafname;
106 m += "' matches pattern: ";
107 m += i.first;
108 skip(urlterm, file.substr(root.size()), m,
109 d.get_size(), d.get_mtime());
110 return;
112 mimetype = i.second;
113 break;
116 #endif
118 string ext;
119 const char * dot_ptr = strrchr(leafname, '.');
120 if (dot_ptr) {
121 ext.assign(dot_ptr + 1);
122 if (ext.size() > max_ext_len)
123 ext.resize(0);
126 if (mimetype.empty()) {
127 mimetype = mimetype_from_ext(mime_map, ext);
128 if (mimetype == "ignore") {
129 return;
130 } else if (mimetype == "skip") {
131 // Ignore mimetype, skipped mimetype should not be quietly ignored.
132 string m = "skipping extension '";
133 m += ext;
134 m += "'";
135 skip(urlterm, file.substr(root.size()), m,
136 d.get_size(), d.get_mtime());
137 return;
141 // Check the file size.
142 off_t size = d.get_size();
143 if (size == 0) {
144 skip(urlterm, file.substr(root.size()), "Zero-sized file",
145 size, d.get_mtime(), SKIP_VERBOSE_ONLY);
146 return;
149 if (max_size > 0 && size > max_size) {
150 skip(urlterm, file.substr(root.size()),
151 "Larger than size limit of " + pretty_max_size,
152 size, d.get_mtime(),
153 SKIP_VERBOSE_ONLY);
154 return;
157 // If we didn't get the mime type from the extension, call libmagic to get
158 // it.
159 if (mimetype.empty()) {
160 mimetype = d.get_magic_mimetype();
161 if (mimetype.empty()) {
162 skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format",
163 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
164 return;
168 if (verbose)
169 cout << "Indexing \"" << file.substr(root.size()) << "\" as "
170 << mimetype << " ... ";
172 Xapian::Document new_doc;
174 // Use `file` as the basis, as we don't want URL encoding in these terms,
175 // but need to switch over the initial part so we get `/~olly/foo/bar` not
176 // `/home/olly/public_html/foo/bar`.
177 string path_term("P");
178 path_term += url_start_path;
179 path_term.append(file, root.size(), string::npos);
181 size_t i;
182 while ((i = path_term.rfind('/')) > 1 && i != string::npos) {
183 path_term.resize(i);
184 if (path_term.length() > MAX_SAFE_TERM_LENGTH) {
185 new_doc.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH));
186 } else {
187 new_doc.add_boolean_term(path_term);
191 index_mimetype(file, urlterm, url, ext, mimetype, d, new_doc, string());
194 static void
195 index_directory(const string &path, const string &url_, size_t depth_limit,
196 map<string, string>& mime_map)
198 if (verbose)
199 cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
200 << endl;
202 DirectoryIterator d(follow_symlinks);
203 try {
204 // Crude workaround for MS-DFS share misbehaviour.
205 if (sleep_before_opendir > 0.0)
206 RealTime::sleep(RealTime::now() + sleep_before_opendir);
208 d.start(path);
210 while (d.next()) {
211 string url = url_;
212 url_encode(url, d.leafname());
213 string file = path;
214 file += d.leafname();
216 try {
217 switch (d.get_type()) {
218 case DirectoryIterator::DIRECTORY: {
219 size_t new_limit = depth_limit;
220 if (new_limit) {
221 if (--new_limit == 0) continue;
223 url += '/';
224 file += '/';
225 index_directory(file, url, new_limit, mime_map);
226 break;
228 case DirectoryIterator::REGULAR_FILE:
229 index_file(file, url, d, mime_map);
230 break;
231 default:
232 skip("U" + url, file.substr(root.size()), "Not a regular file",
233 d.get_size(), d.get_mtime(),
234 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
236 } catch (const FileNotFound & e) {
237 skip("U" + url, file.substr(root.size()), "File removed during indexing",
238 d.get_size(), d.get_mtime(),
239 /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
240 } catch (const std::string & error) {
241 skip("U" + url, file.substr(root.size()), error,
242 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
245 } catch (FileNotFound) {
246 if (verbose)
247 cout << "Directory \"" << path.substr(root.size()) << "\" "
248 "deleted during indexing" << endl;
249 } catch (const std::string & error) {
250 cout << error << " - skipping directory "
251 "\"" << path.substr(root.size()) << "\"" << endl;
255 static off_t
256 parse_size(char* p)
258 // Don't want negative numbers, infinity, NaN, or hex numbers.
259 if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
260 double arg = strtod(p, &p);
261 switch (*p) {
262 case '\0':
263 break;
264 case 'k': case 'K':
265 arg *= 1024;
266 ++p;
267 break;
268 case 'm': case 'M':
269 arg *= (1024 * 1024);
270 ++p;
271 break;
272 case 'g': case 'G':
273 arg *= (1024 * 1024 * 1024);
274 ++p;
275 break;
277 if (*p == '\0') {
278 return off_t(arg);
281 return -1;
285 main(int argc, char **argv)
287 // If overwrite is true, the database will be created anew even if it
288 // already exists.
289 bool overwrite = false;
290 // If delete_removed_documents is true, delete any documents we don't see.
291 bool delete_removed_documents = true;
292 // Retry files which we failed to index on a previous run?
293 bool retry_failed = false;
294 bool use_ctime = false;
295 bool spelling = false;
296 bool skip_duplicates = false;
297 bool ignore_exclusions = false;
298 bool description_as_sample = false;
299 string baseurl;
300 size_t depth_limit = 0;
301 size_t title_size = TITLE_SIZE;
302 size_t sample_size = SAMPLE_SIZE;
303 empty_body_type empty_body = EMPTY_BODY_WARN;
304 string site_term, host_term;
305 Xapian::Stem stemmer("english");
307 enum { OPT_OPENDIR_SLEEP = 256, OPT_SAMPLE };
308 static const struct option longopts[] = {
309 { "help", no_argument, NULL, 'h' },
310 { "version", no_argument, NULL, 'V' },
311 { "overwrite", no_argument, NULL, 'o' },
312 { "duplicates", required_argument, NULL, 'd' },
313 { "no-delete", no_argument, NULL, 'p' },
314 { "preserve-nonduplicates", no_argument, NULL, 'p' },
315 { "db", required_argument, NULL, 'D' },
316 { "url", required_argument, NULL, 'U' },
317 { "mime-type", required_argument, NULL, 'M' },
318 { "mime-type-match", required_argument, NULL, 'G' },
319 { "filter", required_argument, NULL, 'F' },
320 { "depth-limit",required_argument, NULL, 'l' },
321 { "follow", no_argument, NULL, 'f' },
322 { "ignore-exclusions", no_argument, NULL, 'i' },
323 { "stemmer", required_argument, NULL, 's' },
324 { "spelling", no_argument, NULL, 'S' },
325 { "verbose", no_argument, NULL, 'v' },
326 { "empty-docs", required_argument, NULL, 'e' },
327 { "max-size", required_argument, NULL, 'm' },
328 { "sample", required_argument, NULL, OPT_SAMPLE },
329 { "sample-size",required_argument, NULL, 'E' },
330 { "title-size", required_argument, NULL, 'T' },
331 { "retry-failed", no_argument, NULL, 'R' },
332 { "opendir-sleep", required_argument, NULL, OPT_OPENDIR_SLEEP },
333 { "track-ctime",no_argument, NULL, 'C' },
334 { 0, 0, NULL, 0 }
337 map<string, string> mime_map;
339 index_add_default_filters();
341 if (argc == 2 && strcmp(argv[1], "-v") == 0) {
342 // -v was the short option for --version in 1.2.3 and earlier, but
343 // now it is short for --verbose (for consistency with scriptindex)
344 // so if "-v" is the only option, translate it to "--version" for
345 // backwards compatibility.
346 argv[1] = const_cast<char *>("--version");
349 string dbpath;
350 int getopt_ret;
351 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfRSVe:im:E:T:",
352 longopts, NULL)) != -1) {
353 switch (getopt_ret) {
354 case 'h': {
355 cout << PROG_NAME " - " PROG_DESC "\n\n"
356 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
357 "\n"
358 "DIRECTORY is the directory to start indexing from.\n"
359 "\n"
360 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
361 "\n"
362 "Options:\n"
363 " -d, --duplicates set duplicate handling ('ignore' or 'replace')\n"
364 " -p, --no-delete skip the deletion of documents corresponding to\n"
365 " deleted files (--preserve-nonduplicates is a\n"
366 " deprecated alias for --no-delete)\n"
367 " -e, --empty-docs=ARG how to handle documents we extract no text from:\n"
368 " ARG can be index, warn (issue a diagnostic and\n"
369 " index), or skip. (default: warn)\n"
370 " -D, --db=DATABASE path to database to use\n"
371 " -U, --url=URL base url BASEDIR corresponds to (default: /)\n"
372 " -M, --mime-type=EXT:TYPE assume any file with extension EXT has MIME\n"
373 " Content-Type TYPE, instead of using libmagic\n"
374 " (empty TYPE removes any existing mapping for EXT;\n"
375 " other special TYPE values: 'ignore' and 'skip')\n"
376 " -G, --mime-type-match=GLOB:TYPE\n"
377 " assume any file with leaf name matching shell\n"
378 " wildcard pattern GLOB has MIME Content-Type TYPE\n"
379 " (special TYPE values: 'ignore' and 'skip')\n"
380 " -F, --filter=M[,[T][,C]]:CMD\n"
381 " process files with MIME Content-Type M using\n"
382 " command CMD, which produces output (on stdout or\n"
383 " in a temporary file) with format T (Content-Type\n"
384 " or file extension; currently txt (default), html\n"
385 " or svg) in character encoding C (default: UTF-8).\n"
386 " E.g. -Fapplication/octet-stream:'strings -n8'\n"
387 " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
388 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n"
389 " -f, --follow follow symbolic links\n"
390 " -i, --ignore-exclusions ignore meta robots tags and similar exclusions\n"
391 " -S, --spelling index data for spelling correction\n"
392 " -m, --max-size maximum size of file to index (in bytes or with a\n"
393 " suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
394 " (default: unlimited)\n"
395 " --sample=SOURCE what to use for the stored sample of text for\n"
396 " HTML documents - SOURCE can be 'body' or\n"
397 " 'description' (default: 'body')\n"
398 " -E, --sample-size=SIZE maximum size for the document text sample\n"
399 " (supports the same formats as --max-size).\n"
400 " (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
401 " -T, --title-size=SIZE maximum size for the document title\n"
402 " (supports the same formats as --max-size).\n"
403 " (default: " STRINGIZE(TITLE_SIZE) ")\n"
404 " -R, --retry-failed retry files which omindex failed to extract text\n"
405 " from on a previous run\n"
406 " --opendir-sleep=SECS sleep for SECS seconds before opening each\n"
407 " directory - sleeping for 2 seconds seems to\n"
408 " reliably work around problems with indexing files\n"
409 " on Microsoft DFS shares.\n"
410 " -C, --track-ctime track each file's ctime so we can detect changes\n"
411 " to ownership or permissions.\n"
412 " -v, --verbose show more information about what is happening\n"
413 " --overwrite create the database anew (the default is to update\n"
414 " if the database already exists)" << endl;
415 print_stemmer_help(" ");
416 print_help_and_version_help(" ");
417 return 0;
419 case 'V':
420 print_package_info(PROG_NAME);
421 return 0;
422 case 'd': // how shall we handle duplicate documents?
423 switch (optarg[0]) {
424 case 'i':
425 skip_duplicates = true;
426 break;
427 case 'r':
428 skip_duplicates = false;
429 break;
431 break;
432 case 'e':
433 if (strcmp(optarg, "index") == 0) {
434 empty_body = EMPTY_BODY_INDEX;
435 } else if (strcmp(optarg, "warn") == 0) {
436 empty_body = EMPTY_BODY_WARN;
437 } else if (strcmp(optarg, "skip") == 0) {
438 empty_body = EMPTY_BODY_SKIP;
439 } else {
440 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
441 "Valid values are index, warn, and skip." << endl;
442 return 1;
444 break;
445 case 'p': // Keep documents even if the files have been removed.
446 delete_removed_documents = false;
447 break;
448 case 'l': { // Set recursion limit
449 int arg = atoi(optarg);
450 if (arg < 0) arg = 0;
451 depth_limit = size_t(arg);
452 break;
454 case 'f': // Turn on following of symlinks
455 follow_symlinks = true;
456 break;
457 case 'M': {
458 const char * s = strrchr(optarg, ':');
459 if (s == NULL) {
460 cerr << "Invalid MIME mapping '" << optarg << "'\n"
461 "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
462 "(or txt: to delete a default mapping)" << endl;
463 return 1;
466 // -Mtxt: results in an empty string, which effectively removes the
467 // default mapping for .txt files.
468 mime_map[string(optarg, s - optarg)] = string(s + 1);
469 max_ext_len = max(max_ext_len, strlen(s + 1));
470 break;
472 case 'F': {
473 const char * s = strchr(optarg, ':');
474 if (s != NULL && s[1]) {
475 const char * c =
476 static_cast<const char *>(memchr(optarg, ',', s - optarg));
477 string output_type, output_charset;
478 if (c) {
479 // Filter produces a specified content-type.
480 ++c;
481 const char * c2 =
482 static_cast<const char *>(memchr(c, ',', s - c));
483 if (c2) {
484 output_type.assign(c, c2 - c);
485 ++c2;
486 output_charset.assign(c2, s - c2);
487 } else {
488 output_type.assign(c, s - c);
490 --c;
491 if (output_type.find('/') == string::npos) {
492 map<string, string>::const_iterator m;
493 m = mime_map.find(output_type);
494 if (m != mime_map.end()) {
495 output_type = m->second;
496 } else {
497 const char * r = built_in_mime_map(output_type);
498 if (r) output_type = r;
501 if (output_type != "text/html" &&
502 output_type != "text/plain" &&
503 output_type != "image/svg+xml") {
504 cerr << "Currently only output types 'image/svg+xml', "
505 "'text/html' and 'text/plain' are supported."
506 << endl;
507 return 1;
509 } else {
510 c = s;
513 const char * cmd = s + 1;
514 // Analyse the command string to decide if it needs a shell.
515 bool use_shell = command_needs_shell(cmd);
516 index_command(string(optarg, c - optarg),
517 Filter(string(cmd), output_type,
518 output_charset, use_shell));
519 } else {
520 cerr << "Invalid filter mapping '" << optarg << "'\n"
521 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or TYPE,EXT:COMMAND\n"
522 "e.g. 'application/octet-stream:strings -n8'"
523 << endl;
524 return 1;
526 break;
528 case 'D':
529 dbpath = optarg;
530 break;
531 case 'U':
532 baseurl = optarg;
533 break;
534 case 'o': // --overwrite
535 overwrite = true;
536 break;
537 case 'i':
538 ignore_exclusions = true;
539 break;
540 case 'R': // --retry-failed
541 retry_failed = true;
542 break;
543 case 's':
544 try {
545 stemmer = Xapian::Stem(optarg);
546 } catch (const Xapian::InvalidArgumentError &) {
547 cerr << "Unknown stemming language '" << optarg << "'.\n"
548 "Available language names are: "
549 << Xapian::Stem::get_available_languages() << endl;
550 return 1;
552 break;
553 case 'S':
554 spelling = true;
555 break;
556 case 'v':
557 verbose = true;
558 break;
559 case 'E': {
560 off_t arg = parse_size(optarg);
561 if (arg >= 0) {
562 sample_size = size_t(arg);
563 break;
565 cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
566 return 1;
568 case 'T': {
569 off_t arg = parse_size(optarg);
570 if (arg >= 0) {
571 title_size = size_t(arg);
572 break;
574 cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
575 return 1;
577 case 'm': {
578 off_t size = parse_size(optarg);
579 if (size >= 0) {
580 max_size = size;
581 const char * suffix;
582 // Set lsb to the lowest set bit in max_size.
583 off_t lsb = max_size & -max_size;
584 if (lsb >= off_t(1L << 30)) {
585 size >>= 30;
586 suffix = "GB";
587 } else if (lsb >= off_t(1L << 20)) {
588 size >>= 20;
589 suffix = "MB";
590 } else if (lsb >= off_t(1L << 10)) {
591 size >>= 10;
592 suffix = "KB";
593 } else {
594 suffix = "B";
596 pretty_max_size = str(size);
597 pretty_max_size += suffix;
598 break;
600 cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
601 return 1;
603 case OPT_OPENDIR_SLEEP: {
604 // Don't want negative numbers, infinity, NaN, or hex numbers.
605 char * p = optarg;
606 if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
607 sleep_before_opendir = strtod(p, &p);
608 if (*p == '\0')
609 break;
611 cerr << PROG_NAME": bad --opendir-sleep argument: "
612 "'" << optarg << "'" << endl;
613 return 1;
615 case OPT_SAMPLE:
616 if (strcmp(optarg, "description") == 0) {
617 description_as_sample = true;
618 } else if (strcmp(optarg, "body") == 0) {
619 description_as_sample = false;
620 } else {
621 cerr << "Invalid --sample value '" << optarg << "'\n"
622 "Valid values are body and description." << endl;
623 return 1;
625 break;
626 case 'C':
627 use_ctime = true;
628 break;
629 case 'G': {
630 char * s = strrchr(optarg, ':');
631 if (s == NULL) {
632 cerr << "Invalid MIME mapping '" << optarg << "'\n"
633 "Should be of the form GLOB:TYPE, e.g. *~:ignore"
634 << endl;
635 return 1;
637 #ifndef HAVE_FNMATCH
638 cerr << "--mime-type-match isn't supported in this build because "
639 "the fnmatch() function wasn't found at configure time."
640 << endl;
641 return 1;
642 #else
643 if (s == optarg) {
644 cerr << "--mime-type-match with an empty pattern can never "
645 "match." << endl;
646 return 1;
648 if (memchr(optarg, '/', s - optarg)) {
649 cerr << "--mime-type-match only matches against the leaf "
650 "filename so a pattern containing '/' can never match."
651 << endl;
652 return 1;
654 const char* type = s + 1;
655 if (*type == '\0') {
656 cerr << "--mime-type-match doesn't support an empty MIME type"
657 << endl;
658 return 1;
660 *s = '\0';
661 mime_patterns.emplace_back(optarg, type);
662 break;
663 #endif
665 case ':': // missing param
666 return 1;
667 case '?': // unknown option: FIXME -> char
668 return 1;
672 if (dbpath.empty()) {
673 cerr << PROG_NAME": you must specify a database with --db." << endl;
674 return 1;
676 if (baseurl.empty()) {
677 cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
679 // baseurl must end in a '/'.
680 if (!endswith(baseurl, '/')) {
681 baseurl += '/';
684 // Site term (omits the trailing slash):
685 site_term = "J";
686 site_term.append(baseurl, 0, baseurl.size() - 1);
687 if (site_term.size() > MAX_SAFE_TERM_LENGTH)
688 site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
690 // Host term, if the URL contains a hostname (omits any port number):
691 string::size_type j;
692 j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
693 if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
694 j += 3;
695 // We must find a '/' - we ensured baseurl ended with a '/' above.
696 string::size_type k = baseurl.find('/', j);
697 url_start_path.assign(baseurl, k, string::npos);
698 string::const_iterator l;
699 l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
700 string::size_type host_len = l - baseurl.begin() - j;
701 host_term = "H";
702 host_term.append(baseurl, j, host_len);
703 // DNS hostname limit is 253.
704 if (host_term.size() > MAX_SAFE_TERM_LENGTH)
705 host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
706 } else {
707 url_start_path = baseurl;
710 if (optind >= argc || optind + 2 < argc) {
711 cerr << PROG_NAME": you must specify a directory to index.\n"
712 "Do this either as a single directory (corresponding to the base URL)\n"
713 "or two directories - the first corresponding to the base URL and the second\n"
714 "a subdirectory of that to index." << endl;
715 return 1;
718 root = argv[optind];
719 if (!endswith(root, '/')) {
720 root += '/';
722 if (optind + 2 == argc) {
723 string start_url = argv[optind + 1];
724 if (startswith(start_url, '/')) {
725 // Make relative to root.
726 if (!startswith(start_url, root)) {
727 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
728 "is not a subdirectory of '" << argv[optind] << "'."
729 << endl;
730 return 1;
732 start_url.erase(0, root.size());
734 if (!endswith(start_url, '/')) {
735 start_url += '/';
737 root += start_url;
738 url_encode_path(baseurl, start_url);
741 int exitcode = 1;
742 try {
743 index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
744 (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
745 sample_size, title_size, max_ext_len,
746 overwrite, retry_failed, delete_removed_documents, verbose,
747 use_ctime, spelling, ignore_exclusions,
748 description_as_sample);
749 index_directory(root, baseurl, depth_limit, mime_map);
750 index_handle_deletion();
751 index_commit();
752 exitcode = 0;
753 } catch (const CommitAndExit &e) {
754 cout << "Exception: " << e.what() << endl;
755 cout << "Committing pending changes..." << endl;
756 index_commit();
757 } catch (const Xapian::Error &e) {
758 cout << "Exception: " << e.get_description() << endl;
759 } catch (const exception &e) {
760 cout << "Exception: " << e.what() << endl;
761 } catch (const string &s) {
762 cout << "Exception: " << s << endl;
763 } catch (const char *s) {
764 cout << "Exception: " << s << endl;
765 } catch (...) {
766 cout << "Caught unknown exception" << endl;
769 index_done();
771 return exitcode;