xapian-applications/omega/omindex.cc

   1 /* omindex.cc: index static documents into the omega db
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001,2005 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017 Olly Betts
   7  * Copyright 2009 Frank J Bruzzaniti
   8  * Copyright 2012 Mihai Bivol
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <string>
  32
  33 #include <sys/types.h>
  34 #include "safeunistd.h"
  35 #include <cstdio>
  36 #include <cstdlib>
  37 #include <cstring>
  38 #include "safefcntl.h"
  39
  40 #ifdef HAVE_FNMATCH
  41 # include <fnmatch.h>
  42 #endif
  43
  44 #include <xapian.h>
  45
  46 #include "commonhelp.h"
  47 #include "diritor.h"
  48 #include "hashterm.h"
  49 #include "index_file.h"
  50 #include "mime.h"
  51 #include "realtime.h"
  52 #include "str.h"
  53 #include "stringutils.h"
  54 #include "urlencode.h"
  55
  56 #include "gnu_getopt.h"
  57
  58 using namespace std;
  59
  60 #define PROG_NAME "omindex"
  61 #define PROG_DESC "Index static website data via the filesystem"
  62
  63 #define TITLE_SIZE 128
  64 #define SAMPLE_SIZE 512
  65
  66 static bool follow_symlinks = false;
  67 static off_t max_size = 0;
  68 static std::string pretty_max_size;
  69 static bool verbose = false;
  70 static double sleep_before_opendir = 0;
  71
  72 static string root;
  73 static string url_start_path;
  74
  75 #ifdef HAVE_FNMATCH
  76 static vector<pair<const char*, const char*>> mime_patterns;
  77 #endif
  78
  79 inline static bool
  80 p_notalnum(unsigned int c)
  81 {
  82     return !C_isalnum(c);
  83 }
  84
  85 static void
  86 index_file(const string &file, const string &url, DirectoryIterator & d,
  87            map<string, string>& mime_map)
  88 {
  89     string urlterm("U");
  90     urlterm += url;
  91
  92     if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
  93         urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
  94
  95     const char* leafname = d.leafname();
  96
  97     string mimetype;
  98 #ifdef HAVE_FNMATCH
  99     for (auto&& i : mime_patterns) {
 100         if (fnmatch(i.first, leafname, 0) == 0) {
 101             if (strcmp(i.second, "ignore") == 0)
 102                 return;
 103             if (strcmp(i.second, "skip") == 0) {
 104                 string m = "Leafname '";
 105                 m += leafname;
 106                 m += "' matches pattern: ";
 107                 m += i.first;
 108                 skip(urlterm, file.substr(root.size()), m,
 109                      d.get_size(), d.get_mtime());
 110                 return;
 111             }
 112             mimetype = i.second;
 113             break;
 114         }
 115     }
 116 #endif
 117
 118     string ext;
 119     const char * dot_ptr = strrchr(leafname, '.');
 120     if (dot_ptr) {
 121         ext.assign(dot_ptr + 1);
 122         if (ext.size() > max_ext_len)
 123             ext.resize(0);
 124     }
 125
 126     if (mimetype.empty()) {
 127         mimetype = mimetype_from_ext(mime_map, ext);
 128         if (mimetype == "ignore") {
 129             return;
 130         } else if (mimetype == "skip") {
 131             // Ignore mimetype, skipped mimetype should not be quietly ignored.
 132             string m = "skipping extension '";
 133             m += ext;
 134             m += "'";
 135             skip(urlterm, file.substr(root.size()), m,
 136                  d.get_size(), d.get_mtime());
 137             return;
 138         }
 139     }
 140
 141     // Check the file size.
 142     off_t size = d.get_size();
 143     if (size == 0) {
 144         skip(urlterm, file.substr(root.size()), "Zero-sized file",
 145              size, d.get_mtime(), SKIP_VERBOSE_ONLY);
 146         return;
 147     }
 148
 149     if (max_size > 0 && size > max_size) {
 150         skip(urlterm, file.substr(root.size()),
 151              "Larger than size limit of " + pretty_max_size,
 152              size, d.get_mtime(),
 153              SKIP_VERBOSE_ONLY);
 154         return;
 155     }
 156
 157     // If we didn't get the mime type from the extension, call libmagic to get
 158     // it.
 159     if (mimetype.empty()) {
 160         mimetype = d.get_magic_mimetype();
 161         if (mimetype.empty()) {
 162             skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format",
 163                  d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
 164             return;
 165         }
 166     }
 167
 168     if (verbose)
 169         cout << "Indexing \"" << file.substr(root.size()) << "\" as "
 170              << mimetype << " ... ";
 171
 172     Xapian::Document new_doc;
 173
 174     // Use `file` as the basis, as we don't want URL encoding in these terms,
 175     // but need to switch over the initial part so we get `/~olly/foo/bar` not
 176     // `/home/olly/public_html/foo/bar`.
 177     string path_term("P");
 178     path_term += url_start_path;
 179     path_term.append(file, root.size(), string::npos);
 180
 181     size_t i;
 182     while ((i = path_term.rfind('/')) > 1 && i != string::npos) {
 183         path_term.resize(i);
 184         if (path_term.length() > MAX_SAFE_TERM_LENGTH) {
 185             new_doc.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH));
 186         } else {
 187             new_doc.add_boolean_term(path_term);
 188         }
 189     }
 190
 191     index_mimetype(file, urlterm, url, ext, mimetype, d, new_doc, string());
 192 }
 193
 194 static void
 195 index_directory(const string &path, const string &url_, size_t depth_limit,
 196                 map<string, string>& mime_map)
 197 {
 198     if (verbose)
 199         cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
 200              << endl;
 201
 202     DirectoryIterator d(follow_symlinks);
 203     try {
 204         // Crude workaround for MS-DFS share misbehaviour.
 205         if (sleep_before_opendir > 0.0)
 206             RealTime::sleep(RealTime::now() + sleep_before_opendir);
 207
 208         d.start(path);
 209
 210         while (d.next()) {
 211             string url = url_;
 212             url_encode(url, d.leafname());
 213             string file = path;
 214             file += d.leafname();
 215
 216             try {
 217                 switch (d.get_type()) {
 218                     case DirectoryIterator::DIRECTORY: {
 219                         size_t new_limit = depth_limit;
 220                         if (new_limit) {
 221                             if (--new_limit == 0) continue;
 222                         }
 223                         url += '/';
 224                         file += '/';
 225                         index_directory(file, url, new_limit, mime_map);
 226                         break;
 227                     }
 228                     case DirectoryIterator::REGULAR_FILE:
 229                         index_file(file, url, d, mime_map);
 230                         break;
 231                     default:
 232                         skip("U" + url, file.substr(root.size()), "Not a regular file",
 233                              d.get_size(), d.get_mtime(),
 234                              SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
 235                 }
 236             } catch (const FileNotFound & e) {
 237                 skip("U" + url, file.substr(root.size()), "File removed during indexing",
 238                      d.get_size(), d.get_mtime(),
 239                      /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
 240             } catch (const std::string & error) {
 241                 skip("U" + url, file.substr(root.size()), error,
 242                      d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
 243             }
 244         }
 245     } catch (FileNotFound) {
 246         if (verbose)
 247             cout << "Directory \"" << path.substr(root.size()) << "\" "
 248                     "deleted during indexing" << endl;
 249     } catch (const std::string & error) {
 250         cout << error << " - skipping directory "
 251                 "\"" << path.substr(root.size()) << "\"" << endl;
 252     }
 253 }
 254
 255 static off_t
 256 parse_size(char* p)
 257 {
 258     // Don't want negative numbers, infinity, NaN, or hex numbers.
 259     if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 260         double arg = strtod(p, &p);
 261         switch (*p) {
 262             case '\0':
 263                 break;
 264             case 'k': case 'K':
 265                 arg *= 1024;
 266                 ++p;
 267                 break;
 268             case 'm': case 'M':
 269                 arg *= (1024 * 1024);
 270                 ++p;
 271                 break;
 272             case 'g': case 'G':
 273                 arg *= (1024 * 1024 * 1024);
 274                 ++p;
 275                 break;
 276         }
 277         if (*p == '\0') {
 278             return off_t(arg);
 279         }
 280     }
 281     return -1;
 282 }
 283
 284 int
 285 main(int argc, char **argv)
 286 {
 287     // If overwrite is true, the database will be created anew even if it
 288     // already exists.
 289     bool overwrite = false;
 290     // If delete_removed_documents is true, delete any documents we don't see.
 291     bool delete_removed_documents = true;
 292     // Retry files which we failed to index on a previous run?
 293     bool retry_failed = false;
 294     bool use_ctime = false;
 295     bool spelling = false;
 296     bool skip_duplicates = false;
 297     bool ignore_exclusions = false;
 298     bool description_as_sample = false;
 299     string baseurl;
 300     size_t depth_limit = 0;
 301     size_t title_size = TITLE_SIZE;
 302     size_t sample_size = SAMPLE_SIZE;
 303     empty_body_type empty_body = EMPTY_BODY_WARN;
 304     string site_term, host_term;
 305     Xapian::Stem stemmer("english");
 306
 307     enum { OPT_OPENDIR_SLEEP = 256, OPT_SAMPLE };
 308     static const struct option longopts[] = {
 309         { "help",       no_argument,            NULL, 'h' },
 310         { "version",    no_argument,            NULL, 'V' },
 311         { "overwrite",  no_argument,            NULL, 'o' },
 312         { "duplicates", required_argument,      NULL, 'd' },
 313         { "no-delete",  no_argument,            NULL, 'p' },
 314         { "preserve-nonduplicates",     no_argument,    NULL, 'p' },
 315         { "db",         required_argument,      NULL, 'D' },
 316         { "url",        required_argument,      NULL, 'U' },
 317         { "mime-type",  required_argument,      NULL, 'M' },
 318         { "mime-type-match", required_argument, NULL, 'G' },
 319         { "filter",     required_argument,      NULL, 'F' },
 320         { "depth-limit",required_argument,      NULL, 'l' },
 321         { "follow",     no_argument,            NULL, 'f' },
 322         { "ignore-exclusions",  no_argument,    NULL, 'i' },
 323         { "stemmer",    required_argument,      NULL, 's' },
 324         { "spelling",   no_argument,            NULL, 'S' },
 325         { "verbose",    no_argument,            NULL, 'v' },
 326         { "empty-docs", required_argument,      NULL, 'e' },
 327         { "max-size",   required_argument,      NULL, 'm' },
 328         { "sample",     required_argument,      NULL, OPT_SAMPLE },
 329         { "sample-size",required_argument,      NULL, 'E' },
 330         { "title-size", required_argument,      NULL, 'T' },
 331         { "retry-failed",       no_argument,    NULL, 'R' },
 332         { "opendir-sleep",      required_argument,      NULL, OPT_OPENDIR_SLEEP },
 333         { "track-ctime",no_argument,            NULL, 'C' },
 334         { 0, 0, NULL, 0 }
 335     };
 336
 337     map<string, string> mime_map;
 338
 339     index_add_default_filters();
 340
 341     if (argc == 2 && strcmp(argv[1], "-v") == 0) {
 342         // -v was the short option for --version in 1.2.3 and earlier, but
 343         // now it is short for --verbose (for consistency with scriptindex)
 344         // so if "-v" is the only option, translate it to "--version" for
 345         // backwards compatibility.
 346         argv[1] = const_cast<char *>("--version");
 347     }
 348
 349     string dbpath;
 350     int getopt_ret;
 351     while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfRSVe:im:E:T:",
 352                                          longopts, NULL)) != -1) {
 353         switch (getopt_ret) {
 354         case 'h': {
 355             cout << PROG_NAME " - " PROG_DESC "\n\n"
 356 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
 357 "\n"
 358 "DIRECTORY is the directory to start indexing from.\n"
 359 "\n"
 360 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
 361 "\n"
 362 "Options:\n"
 363 "  -d, --duplicates          set duplicate handling ('ignore' or 'replace')\n"
 364 "  -p, --no-delete           skip the deletion of documents corresponding to\n"
 365 "                            deleted files (--preserve-nonduplicates is a\n"
 366 "                            deprecated alias for --no-delete)\n"
 367 "  -e, --empty-docs=ARG      how to handle documents we extract no text from:\n"
 368 "                            ARG can be index, warn (issue a diagnostic and\n"
 369 "                            index), or skip.  (default: warn)\n"
 370 "  -D, --db=DATABASE         path to database to use\n"
 371 "  -U, --url=URL             base url BASEDIR corresponds to (default: /)\n"
 372 "  -M, --mime-type=EXT:TYPE  assume any file with extension EXT has MIME\n"
 373 "                            Content-Type TYPE, instead of using libmagic\n"
 374 "                            (empty TYPE removes any existing mapping for EXT;\n"
 375 "                            other special TYPE values: 'ignore' and 'skip')\n"
 376 "  -G, --mime-type-match=GLOB:TYPE\n"
 377 "                            assume any file with leaf name matching shell\n"
 378 "                            wildcard pattern GLOB has MIME Content-Type TYPE\n"
 379 "                            (special TYPE values: 'ignore' and 'skip')\n"
 380 "  -F, --filter=M[,[T][,C]]:CMD\n"
 381 "                            process files with MIME Content-Type M using\n"
 382 "                            command CMD, which produces output (on stdout or\n"
 383 "                            in a temporary file) with format T (Content-Type\n"
 384 "                            or file extension; currently txt (default), html\n"
 385 "                            or svg) in character encoding C (default: UTF-8).\n"
 386 "                            E.g. -Fapplication/octet-stream:'strings -n8'\n"
 387 "                            or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
 388 "  -l, --depth-limit=LIMIT   set recursion limit (0 = unlimited)\n"
 389 "  -f, --follow              follow symbolic links\n"
 390 "  -i, --ignore-exclusions   ignore meta robots tags and similar exclusions\n"
 391 "  -S, --spelling            index data for spelling correction\n"
 392 "  -m, --max-size            maximum size of file to index (in bytes or with a\n"
 393 "                            suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
 394 "                            (default: unlimited)\n"
 395 "      --sample=SOURCE       what to use for the stored sample of text for\n"
 396 "                            HTML documents - SOURCE can be 'body' or\n"
 397 "                            'description' (default: 'body')\n"
 398 "  -E, --sample-size=SIZE    maximum size for the document text sample\n"
 399 "                            (supports the same formats as --max-size).\n"
 400 "                            (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
 401 "  -T, --title-size=SIZE     maximum size for the document title\n"
 402 "                            (supports the same formats as --max-size).\n"
 403 "                            (default: " STRINGIZE(TITLE_SIZE) ")\n"
 404 "  -R, --retry-failed        retry files which omindex failed to extract text\n"
 405 "                            from on a previous run\n"
 406 "      --opendir-sleep=SECS  sleep for SECS seconds before opening each\n"
 407 "                            directory - sleeping for 2 seconds seems to\n"
 408 "                            reliably work around problems with indexing files\n"
 409 "                            on Microsoft DFS shares.\n"
 410 "  -C, --track-ctime         track each file's ctime so we can detect changes\n"
 411 "                            to ownership or permissions.\n"
 412 "  -v, --verbose             show more information about what is happening\n"
 413 "      --overwrite           create the database anew (the default is to update\n"
 414 "                            if the database already exists)" << endl;
 415             print_stemmer_help("      ");
 416             print_help_and_version_help("      ");
 417             return 0;
 418         }
 419         case 'V':
 420             print_package_info(PROG_NAME);
 421             return 0;
 422         case 'd': // how shall we handle duplicate documents?
 423             switch (optarg[0]) {
 424             case 'i':
 425                 skip_duplicates = true;
 426                 break;
 427             case 'r':
 428                 skip_duplicates = false;
 429                 break;
 430             }
 431             break;
 432         case 'e':
 433             if (strcmp(optarg, "index") == 0) {
 434                 empty_body = EMPTY_BODY_INDEX;
 435             } else if (strcmp(optarg, "warn") == 0) {
 436                 empty_body = EMPTY_BODY_WARN;
 437             } else if (strcmp(optarg, "skip") == 0) {
 438                 empty_body = EMPTY_BODY_SKIP;
 439             } else {
 440                 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
 441                         "Valid values are index, warn, and skip." << endl;
 442                 return 1;
 443             }
 444             break;
 445         case 'p': // Keep documents even if the files have been removed.
 446             delete_removed_documents = false;
 447             break;
 448         case 'l': { // Set recursion limit
 449             int arg = atoi(optarg);
 450             if (arg < 0) arg = 0;
 451             depth_limit = size_t(arg);
 452             break;
 453         }
 454         case 'f': // Turn on following of symlinks
 455             follow_symlinks = true;
 456             break;
 457         case 'M': {
 458             const char * s = strrchr(optarg, ':');
 459             if (s == NULL) {
 460                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 461                         "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
 462                         "(or txt: to delete a default mapping)" << endl;
 463                 return 1;
 464             }
 465
 466             // -Mtxt: results in an empty string, which effectively removes the
 467             // default mapping for .txt files.
 468             mime_map[string(optarg, s - optarg)] = string(s + 1);
 469             max_ext_len = max(max_ext_len, strlen(s + 1));
 470             break;
 471         }
 472         case 'F': {
 473             const char * s = strchr(optarg, ':');
 474             if (s != NULL && s[1]) {
 475                 const char * c =
 476                     static_cast<const char *>(memchr(optarg, ',', s - optarg));
 477                 string output_type, output_charset;
 478                 if (c) {
 479                     // Filter produces a specified content-type.
 480                     ++c;
 481                     const char * c2 =
 482                         static_cast<const char *>(memchr(c, ',', s - c));
 483                     if (c2) {
 484                         output_type.assign(c, c2 - c);
 485                         ++c2;
 486                         output_charset.assign(c2, s - c2);
 487                     } else {
 488                         output_type.assign(c, s - c);
 489                     }
 490                     --c;
 491                     if (output_type.find('/') == string::npos) {
 492                         map<string, string>::const_iterator m;
 493                         m = mime_map.find(output_type);
 494                         if (m != mime_map.end()) {
 495                             output_type = m->second;
 496                         } else {
 497                             const char * r = built_in_mime_map(output_type);
 498                             if (r) output_type = r;
 499                         }
 500                     }
 501                     if (output_type != "text/html" &&
 502                         output_type != "text/plain" &&
 503                         output_type != "image/svg+xml") {
 504                         cerr << "Currently only output types 'image/svg+xml', "
 505                                 "'text/html' and 'text/plain' are supported."
 506                              << endl;
 507                         return 1;
 508                     }
 509                 } else {
 510                     c = s;
 511                 }
 512
 513                 const char * cmd = s + 1;
 514                 // Analyse the command string to decide if it needs a shell.
 515                 bool use_shell = command_needs_shell(cmd);
 516                 index_command(string(optarg, c - optarg),
 517                               Filter(string(cmd), output_type,
 518                                      output_charset, use_shell));
 519             } else {
 520                 cerr << "Invalid filter mapping '" << optarg << "'\n"
 521                         "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or TYPE,EXT:COMMAND\n"
 522                         "e.g. 'application/octet-stream:strings -n8'"
 523                      << endl;
 524                 return 1;
 525             }
 526             break;
 527         }
 528         case 'D':
 529             dbpath = optarg;
 530             break;
 531         case 'U':
 532             baseurl = optarg;
 533             break;
 534         case 'o': // --overwrite
 535             overwrite = true;
 536             break;
 537         case 'i':
 538             ignore_exclusions = true;
 539             break;
 540         case 'R': // --retry-failed
 541             retry_failed = true;
 542             break;
 543         case 's':
 544             try {
 545                 stemmer = Xapian::Stem(optarg);
 546             } catch (const Xapian::InvalidArgumentError &) {
 547                 cerr << "Unknown stemming language '" << optarg << "'.\n"
 548                         "Available language names are: "
 549                      << Xapian::Stem::get_available_languages() << endl;
 550                 return 1;
 551             }
 552             break;
 553         case 'S':
 554             spelling = true;
 555             break;
 556         case 'v':
 557             verbose = true;
 558             break;
 559         case 'E': {
 560             off_t arg = parse_size(optarg);
 561             if (arg >= 0) {
 562                 sample_size = size_t(arg);
 563                 break;
 564             }
 565             cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
 566             return 1;
 567         }
 568         case 'T': {
 569             off_t arg = parse_size(optarg);
 570             if (arg >= 0) {
 571                 title_size = size_t(arg);
 572                 break;
 573             }
 574             cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
 575             return 1;
 576         }
 577         case 'm': {
 578             off_t size = parse_size(optarg);
 579             if (size >= 0) {
 580                 max_size = size;
 581                 const char * suffix;
 582                 // Set lsb to the lowest set bit in max_size.
 583                 off_t lsb = max_size & -max_size;
 584                 if (lsb >= off_t(1L << 30)) {
 585                     size >>= 30;
 586                     suffix = "GB";
 587                 } else if (lsb >= off_t(1L << 20)) {
 588                     size >>= 20;
 589                     suffix = "MB";
 590                 } else if (lsb >= off_t(1L << 10)) {
 591                     size >>= 10;
 592                     suffix = "KB";
 593                 } else {
 594                     suffix = "B";
 595                 }
 596                 pretty_max_size = str(size);
 597                 pretty_max_size += suffix;
 598                 break;
 599             }
 600             cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
 601             return 1;
 602         }
 603         case OPT_OPENDIR_SLEEP: {
 604             // Don't want negative numbers, infinity, NaN, or hex numbers.
 605             char * p = optarg;
 606             if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 607                 sleep_before_opendir = strtod(p, &p);
 608                 if (*p == '\0')
 609                     break;
 610             }
 611             cerr << PROG_NAME": bad --opendir-sleep argument: "
 612                  "'" << optarg << "'" << endl;
 613             return 1;
 614         }
 615         case OPT_SAMPLE:
 616             if (strcmp(optarg, "description") == 0) {
 617                 description_as_sample = true;
 618             } else if (strcmp(optarg, "body") == 0) {
 619                 description_as_sample = false;
 620             } else {
 621                 cerr << "Invalid --sample value '" << optarg << "'\n"
 622                         "Valid values are body and description." << endl;
 623                 return 1;
 624             }
 625             break;
 626         case 'C':
 627             use_ctime = true;
 628             break;
 629         case 'G': {
 630             char * s = strrchr(optarg, ':');
 631             if (s == NULL) {
 632                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 633                         "Should be of the form GLOB:TYPE, e.g. *~:ignore"
 634                      << endl;
 635                 return 1;
 636             }
 637 #ifndef HAVE_FNMATCH
 638             cerr << "--mime-type-match isn't supported in this build because "
 639                     "the fnmatch() function wasn't found at configure time."
 640                  << endl;
 641             return 1;
 642 #else
 643             if (s == optarg) {
 644                 cerr << "--mime-type-match with an empty pattern can never "
 645                         "match." << endl;
 646                 return 1;
 647             }
 648             if (memchr(optarg, '/', s - optarg)) {
 649                 cerr << "--mime-type-match only matches against the leaf "
 650                         "filename so a pattern containing '/' can never match."
 651                      << endl;
 652                 return 1;
 653             }
 654             const char* type = s + 1;
 655             if (*type == '\0') {
 656                 cerr << "--mime-type-match doesn't support an empty MIME type"
 657                      << endl;
 658                 return 1;
 659             }
 660             *s = '\0';
 661             mime_patterns.emplace_back(optarg, type);
 662             break;
 663 #endif
 664         }
 665         case ':': // missing param
 666             return 1;
 667         case '?': // unknown option: FIXME -> char
 668             return 1;
 669         }
 670     }
 671
 672     if (dbpath.empty()) {
 673         cerr << PROG_NAME": you must specify a database with --db." << endl;
 674         return 1;
 675     }
 676     if (baseurl.empty()) {
 677         cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
 678     }
 679     // baseurl must end in a '/'.
 680     if (!endswith(baseurl, '/')) {
 681         baseurl += '/';
 682     }
 683
 684     // Site term (omits the trailing slash):
 685     site_term = "J";
 686     site_term.append(baseurl, 0, baseurl.size() - 1);
 687     if (site_term.size() > MAX_SAFE_TERM_LENGTH)
 688         site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
 689
 690     // Host term, if the URL contains a hostname (omits any port number):
 691     string::size_type j;
 692     j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
 693     if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
 694         j += 3;
 695         // We must find a '/' - we ensured baseurl ended with a '/' above.
 696         string::size_type k = baseurl.find('/', j);
 697         url_start_path.assign(baseurl, k, string::npos);
 698         string::const_iterator l;
 699         l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
 700         string::size_type host_len = l - baseurl.begin() - j;
 701         host_term = "H";
 702         host_term.append(baseurl, j, host_len);
 703         // DNS hostname limit is 253.
 704         if (host_term.size() > MAX_SAFE_TERM_LENGTH)
 705             host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
 706     } else {
 707         url_start_path = baseurl;
 708     }
 709
 710     if (optind >= argc || optind + 2 < argc) {
 711         cerr << PROG_NAME": you must specify a directory to index.\n"
 712 "Do this either as a single directory (corresponding to the base URL)\n"
 713 "or two directories - the first corresponding to the base URL and the second\n"
 714 "a subdirectory of that to index." << endl;
 715         return 1;
 716     }
 717
 718     root = argv[optind];
 719     if (!endswith(root, '/')) {
 720         root += '/';
 721     }
 722     if (optind + 2 == argc) {
 723         string start_url = argv[optind + 1];
 724         if (startswith(start_url, '/')) {
 725             // Make relative to root.
 726             if (!startswith(start_url, root)) {
 727                 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
 728                     "is not a subdirectory of '" << argv[optind] << "'."
 729                      << endl;
 730                 return 1;
 731             }
 732             start_url.erase(0, root.size());
 733         }
 734         if (!endswith(start_url, '/')) {
 735             start_url += '/';
 736         }
 737         root += start_url;
 738         url_encode_path(baseurl, start_url);
 739     }
 740
 741     int exitcode = 1;
 742     try {
 743         index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
 744                    (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
 745                    sample_size, title_size, max_ext_len,
 746                    overwrite, retry_failed, delete_removed_documents, verbose,
 747                    use_ctime, spelling, ignore_exclusions,
 748                    description_as_sample);
 749         index_directory(root, baseurl, depth_limit, mime_map);
 750         index_handle_deletion();
 751         index_commit();
 752         exitcode = 0;
 753     } catch (const CommitAndExit &e) {
 754         cout << "Exception: " << e.what() << endl;
 755         cout << "Committing pending changes..." << endl;
 756         index_commit();
 757     } catch (const Xapian::Error &e) {
 758         cout << "Exception: " << e.get_description() << endl;
 759     } catch (const exception &e) {
 760         cout << "Exception: " << e.what() << endl;
 761     } catch (const string &s) {
 762         cout << "Exception: " << s << endl;
 763     } catch (const char *s) {
 764         cout << "Exception: " << s << endl;
 765     } catch (...) {
 766         cout << "Caught unknown exception" << endl;
 767     }
 768
 769     index_done();
 770
 771     return exitcode;
 772 }