[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / index_file.cc
blob0183d008612d9be56ebb14a4d9d188b1ee244efa
1 /** @file
2 * @brief Handle indexing a document from a file
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2023 Olly Betts
8 * Copyright 2009 Frank J Bruzzaniti
9 * Copyright 2012 Mihai Bivol
10 * Copyright 2019 Bruno Baruffaldi
11 * Copyright 2020 Parth Kapadia
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License as
15 * published by the Free Software Foundation; either version 2 of the
16 * License, or (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 * USA
29 #include <config.h>
31 #include "index_file.h"
33 #include <algorithm>
34 #include <iostream>
35 #include <limits>
36 #include <string>
37 #include <map>
38 #include <vector>
40 #include <sys/types.h>
41 #include "safeunistd.h"
42 #include <cassert>
43 #include <cerrno>
44 #include <cstdio>
45 #include <cstdlib>
46 #include <cstring>
47 #include "safefcntl.h"
48 #include <ctime>
50 #include <xapian.h>
52 #include "abiwordparser.h"
53 #include "append_filename_arg.h"
54 #include "atomparser.h"
55 #include "datetime.h"
56 #include "diritor.h"
57 #include "failed.h"
58 #include "hashterm.h"
59 #include "htmlparser.h"
60 #include "md5wrap.h"
61 #include "mimemap.h"
62 #include "msxmlparser.h"
63 #include "opendocmetaparser.h"
64 #include "opendocparser.h"
65 #include "pkglibbindir.h"
66 #include "runfilter.h"
67 #include "sample.h"
68 #include "str.h"
69 #include "stringutils.h"
70 #include "svgparser.h"
71 #include "tmpdir.h"
72 #include "utf8convert.h"
73 #include "values.h"
74 #include "worker.h"
75 #include "xlsxparser.h"
76 #include "xpsparser.h"
78 using namespace std;
// --- File-scope indexing state ---------------------------------------------
// Set up by index_init() and shared by the indexing routines in this file.

// The database being written, and the term generator used to index text.
80 static Xapian::WritableDatabase db;
81 static Xapian::TermGenerator indexer;

// Bookkeeping for spotting documents which vanished since the last run:
// how many old documents haven't been seen yet, the highest docid in the
// old database, and a per-docid "seen this run" flag (sized so that
// old_lastdocid is a valid subscript - see index_init()).
83 static Xapian::doccount old_docs_not_seen;
84 static Xapian::docid old_lastdocid;
85 static vector<bool> updated;

// Option flags copied from index_init()'s arguments.
87 static bool verbose;
88 static bool retry_failed;
89 static bool use_ctime;
90 static dup_action_type dup_action;
91 static bool ignore_exclusions;
92 static bool description_as_sample;
93 static bool date_terms;

// Upper bound on the last-altered value already in the database (taken
// from the value slot's upper bound in index_init()); used by the
// DUP_CHECK_LAZILY logic in index_check_existing().
95 static time_t last_altered_max;

// Size limits: generated sample length, title length, and the longest
// filename extension we'll consider.
96 static size_t sample_size;
97 static size_t title_size;
98 static size_t max_ext_len;

// What to do with documents whose extracted body is empty.
100 static empty_body_type empty_body;

// Base directory being indexed, plus site/host boolean terms to add.
102 static string root;
103 static string site_term, host_term;

// Persistent record of files which previously failed to index.
105 static Failed failed;

// Map from MIME type (or wildcard pattern such as "text/*") to the Filter
// used to handle it.  Not static: also used from outside this file.
107 map<string, Filter> commands;
109 static void
110 mark_as_seen(Xapian::docid did)
112 if (usual(did < updated.size() && !updated[did])) {
113 updated[did] = true;
114 --old_docs_not_seen;
118 void
119 skip(const string& urlterm, const string& context, const string& msg,
120 off_t size, time_t last_mod, unsigned flags)
122 failed.add(urlterm, last_mod, size);
124 if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
125 if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
126 cout << context << ": ";
129 cout << "Skipping - " << msg << endl;
132 static void
133 skip_cmd_failed(const string& urlterm, const string& context, const string& cmd,
134 off_t size, time_t last_mod)
136 skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
139 static void
140 skip_meta_tag(const string& urlterm, const string& context,
141 off_t size, time_t last_mod)
143 skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
146 static void
147 skip_unknown_mimetype(const string& urlterm, const string& context,
148 const string& mimetype, off_t size, time_t last_mod)
150 skip(urlterm, context, "unknown MIME type '" + mimetype + "'",
151 size, last_mod);
// Register the default set of Worker assistant processes used to extract
// text.  Each MIME type is mapped (via index_library()) to a helper binary
// which is only built when the corresponding third-party library was found
// at configure time - hence the HAVE_* conditionals.
154 void
155 index_add_default_libraries()
157 #if defined HAVE_POPPLER
158 Worker* omindex_poppler = new Worker("omindex_poppler");
159 index_library("application/pdf", omindex_poppler);
160 #endif
161 #if defined HAVE_LIBEBOOK
162 Worker* omindex_libebook = new Worker("omindex_libebook");
163 index_library("application/vnd.palm", omindex_libebook);
164 index_library("application/x-fictionbook+xml", omindex_libebook);
165 index_library("application/x-zip-compressed-fb2", omindex_libebook);
166 index_library("application/x-sony-bbeb", omindex_libebook);
167 index_library("application/x-tcr-ebook", omindex_libebook);
168 index_library("application/x-qioo-ebook", omindex_libebook);
169 #endif
170 #if defined HAVE_LIBETONYEK
171 Worker* omindex_libetonyek = new Worker("omindex_libetonyek");
172 index_library("application/vnd.apple.keynote", omindex_libetonyek);
173 index_library("application/vnd.apple.pages", omindex_libetonyek);
174 index_library("application/vnd.apple.numbers", omindex_libetonyek);
175 #endif
176 #if defined HAVE_LIBGEPUB
177 Worker* omindex_libgepub = new Worker("omindex_libgepub");
178 index_library("application/epub+zip", omindex_libgepub);
179 #endif
180 #if defined HAVE_TESSERACT
181 Worker* omindex_tesseract = new Worker("omindex_tesseract");
182 index_library("image/gif", omindex_tesseract);
183 index_library("image/jpeg", omindex_tesseract);
184 index_library("image/png", omindex_tesseract);
185 index_library("image/webp", omindex_tesseract);
186 index_library("image/tiff", omindex_tesseract);
187 index_library("image/x-portable-bitmap", omindex_tesseract);
188 index_library("image/x-portable-graymap", omindex_tesseract);
189 index_library("image/x-portable-anymap", omindex_tesseract);
190 index_library("image/x-portable-pixmap", omindex_tesseract);
191 #endif
192 #if defined HAVE_GMIME
193 Worker* omindex_gmime = new Worker("omindex_gmime");
194 index_library("message/rfc822", omindex_gmime);
195 index_library("message/news", omindex_gmime);
196 #endif
197 #if defined HAVE_LIBARCHIVE
// One worker handles all the zip-container document formats (XPS,
// OpenDocument, OpenXML, StarOffice/OpenOffice).
198 Worker* omindex_libarchive = new Worker("omindex_libarchive");
199 index_library("application/oxps", omindex_libarchive);
200 index_library("application/vnd.ms-xpsdocument", omindex_libarchive);
201 index_library("application/vnd.oasis.opendocument.text",
202 omindex_libarchive);
203 index_library("application/vnd.oasis.opendocument.spreadsheet",
204 omindex_libarchive);
205 index_library("application/vnd.oasis.opendocument.presentation",
206 omindex_libarchive);
207 index_library("application/vnd.oasis.opendocument.graphics",
208 omindex_libarchive);
209 index_library("application/vnd.oasis.opendocument.chart",
210 omindex_libarchive);
211 index_library("application/vnd.oasis.opendocument.formula",
212 omindex_libarchive);
213 index_library("application/vnd.oasis.opendocument.database",
214 omindex_libarchive);
215 index_library("application/vnd.oasis.opendocument.image",
216 omindex_libarchive);
217 index_library("application/vnd.oasis.opendocument.text-master",
218 omindex_libarchive);
219 index_library("application/vnd.oasis.opendocument.text-template",
220 omindex_libarchive);
221 index_library("application/vnd.oasis.opendocument.spreadsheet-template",
222 omindex_libarchive);
223 index_library("application/vnd.oasis.opendocument.presentation-template",
224 omindex_libarchive);
225 index_library("application/vnd.oasis.opendocument.graphics-template",
226 omindex_libarchive);
227 index_library("application/vnd.oasis.opendocument.chart-template",
228 omindex_libarchive);
229 index_library("application/vnd.oasis.opendocument.formula-template",
230 omindex_libarchive);
231 index_library("application/vnd.oasis.opendocument.image-template",
232 omindex_libarchive);
233 index_library("application/vnd.oasis.opendocument.text-web",
234 omindex_libarchive);
235 index_library("application/vnd.sun.xml.calc",
236 omindex_libarchive);
237 index_library("application/vnd.sun.xml.calc.template",
238 omindex_libarchive);
239 index_library("application/vnd.sun.xml.draw",
240 omindex_libarchive);
241 index_library("application/vnd.sun.xml.draw.template",
242 omindex_libarchive);
243 index_library("application/vnd.sun.xml.impress",
244 omindex_libarchive);
245 index_library("application/vnd.sun.xml.impress.template",
246 omindex_libarchive);
247 index_library("application/vnd.sun.xml.math",
248 omindex_libarchive);
249 index_library("application/vnd.sun.xml.writer",
250 omindex_libarchive);
251 index_library("application/vnd.sun.xml.writer.global",
252 omindex_libarchive);
253 index_library("application/vnd.sun.xml.writer.template",
254 omindex_libarchive);
255 index_library("application/vnd.openxmlformats-officedocument."
256 "wordprocessingml.document", omindex_libarchive);
257 index_library("application/vnd.openxmlformats-officedocument."
258 "wordprocessingml.template", omindex_libarchive);
259 index_library("application/vnd.openxmlformats-officedocument."
260 "spreadsheetml.sheet", omindex_libarchive);
261 index_library("application/vnd.openxmlformats-officedocument."
262 "spreadsheetml.template", omindex_libarchive);
263 index_library("application/vnd.openxmlformats-officedocument."
264 "presentationml.presentation", omindex_libarchive);
265 index_library("application/vnd.openxmlformats-officedocument."
266 "presentationml.slideshow", omindex_libarchive);
267 index_library("application/vnd.openxmlformats-officedocument."
268 "presentationml.template", omindex_libarchive);
269 #endif
270 #if defined HAVE_LIBABW
271 Worker* omindex_libabw = new Worker("omindex_libabw");
272 index_library("application/x-abiword", omindex_libabw);
273 index_library("application/x-abiword-compressed", omindex_libabw);
274 #endif
275 #if defined HAVE_LIBCDR
276 Worker* omindex_libcdr = new Worker("omindex_libcdr");
277 index_library("image/x-coreldraw", omindex_libcdr);
278 #endif
279 #if defined HAVE_LIBEXTRACTOR
// Audio and video container formats.
280 Worker* omindex_libextractor = new Worker("omindex_libextractor");
281 index_library("video/mpeg", omindex_libextractor);
282 index_library("video/x-flv", omindex_libextractor);
283 index_library("video/x-msvideo", omindex_libextractor);
284 index_library("video/x-ms-asf", omindex_libextractor);
285 index_library("video/quicktime", omindex_libextractor);
286 index_library("video/ogg", omindex_libextractor);
287 index_library("audio/flac", omindex_libextractor);
288 index_library("audio/mpeg", omindex_libextractor);
289 index_library("audio/ogg", omindex_libextractor);
290 index_library("audio/x-wav", omindex_libextractor);
291 index_library("audio/x-mod", omindex_libextractor);
292 index_library("audio/x-s3m", omindex_libextractor);
293 #endif
294 #if defined HAVE_LIBMWAW
295 Worker* omindex_libmwaw = new Worker("omindex_libmwaw");
296 index_library("application/clarisworks", omindex_libmwaw);
297 index_library("image/x-pict", omindex_libmwaw);
298 #endif
// Register the default external command-line filters, keyed by MIME type.
// Each Filter holds the command to run, optionally the MIME type and
// charset of its output, and flags describing how input is supplied to it
// (stdin pipe, seekable /dev/stdin, etc).
301 void
302 index_add_default_filters()
304 // Command needs to be run using /bin/sh.
305 auto USE_SHELL = Filter::USE_SHELL;
306 // Currently none of these commands needs USE_SHELL.
307 (void)USE_SHELL;
308 // Input should be piped to stdin.
309 auto PIPE_IN = Filter::PIPE_IN;
310 // Filename can be /dev/stdin (which must be seekable).
311 auto SEEK_DEV_STDIN = Filter::SEEK_DEV_STDIN;
312 // Filename can be /dev/stdin (which can be a pipe).
313 auto PIPE_DEV_STDIN = Filter::PIPE_DEV_STDIN;
314 index_command("application/msword",
315 Filter("antiword -mUTF-8.txt -", PIPE_IN));
316 index_command("application/vnd.ms-excel",
317 Filter("xls2csv -c' ' -q0 -dutf-8", PIPE_DEV_STDIN));
318 index_command("application/vnd.ms-powerpoint",
319 Filter("catppt -dutf-8", PIPE_DEV_STDIN));
320 // Looking at the source of wpd2html and wpd2text I think both output
321 // UTF-8, but it's hard to be sure without sample Unicode .wpd files
322 // as they don't seem to be at all well documented.
323 index_command("application/vnd.wordperfect",
324 Filter("wpd2text", SEEK_DEV_STDIN));
325 // wps2text produces UTF-8 output from the sample files I've tested.
326 index_command("application/vnd.ms-works",
327 Filter("wps2text", SEEK_DEV_STDIN));
328 // Output is UTF-8 according to "man djvutxt". Generally this seems to
329 // be true, though some examples from djvu.org generate isolated byte
330 // 0x95 in a context which suggests it might be intended to be a bullet
331 // (as it is in CP1252).
332 index_command("image/vnd.djvu", Filter("djvutxt -", PIPE_IN));
333 index_command("text/markdown",
334 Filter("markdown", "text/html", PIPE_IN));
335 // The --text option unhelpfully converts all non-ASCII characters to "?"
336 // so we use --html instead, which produces HTML entities. The --nopict
337 // option suppresses exporting picture files as pictNNNN.wmf in the current
338 // directory. Note that this option was ignored in some older versions,
339 // but it was fixed in unrtf 0.20.4.
340 index_command("application/rtf",
341 Filter("unrtf --nopict --html 2>/dev/null", "text/html",
342 PIPE_IN));
343 index_command("text/rtf",
344 Filter("unrtf --nopict --html 2>/dev/null", "text/html",
345 PIPE_IN));
346 index_command("text/x-rst",
347 Filter("rst2html", "text/html", PIPE_IN));
348 index_command("application/x-mspublisher",
349 Filter("pub2xhtml", "text/html", SEEK_DEV_STDIN));
// Some helpers live in our pkglibbin directory rather than on PATH.
350 index_command("application/vnd.ms-outlook",
351 Filter(get_pkglibbindir() + "/outlookmsg2html",
352 "text/html", SEEK_DEV_STDIN));
353 index_command("application/vnd.ms-visio.drawing",
354 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
355 index_command("application/vnd.ms-visio.stencil",
356 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
357 index_command("application/vnd.ms-visio.template",
358 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
359 index_command("application/vnd.visio",
360 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
361 // pod2text's output character set doesn't seem to be documented, but from
362 // inspecting the source it looks like it's probably iso-8859-1. We need
363 // to pass "--errors=stderr" or else minor POD formatting errors cause a
364 // file not to be indexed.
365 index_command("text/x-perl",
366 Filter("pod2text --errors=stderr",
367 "text/plain", "iso-8859-1", PIPE_IN));
368 // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
369 // appearing as single ligatures. For European languages, it's actually
370 // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
371 // now until we handle Unicode "compatibility decompositions".
372 index_command("application/x-dvi",
373 Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", PIPE_IN));
374 // Simplistic - ought to look in index.rdf files for filename and character
375 // set.
376 index_command("application/x-maff",
377 Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
378 SEEK_DEV_STDIN));
379 index_command("application/x-mimearchive",
380 Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
381 PIPE_DEV_STDIN));
382 index_command("message/news",
383 Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
384 PIPE_DEV_STDIN));
385 index_command("message/rfc822",
386 Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
387 PIPE_DEV_STDIN));
388 index_command("text/vcard",
389 Filter(get_pkglibbindir() + "/vcard2text", PIPE_DEV_STDIN));
390 index_command("application/vnd.apple.keynote",
391 Filter("key2text", SEEK_DEV_STDIN));
392 index_command("application/vnd.apple.numbers",
393 Filter("numbers2text", SEEK_DEV_STDIN));
394 index_command("application/vnd.apple.pages",
395 Filter("pages2text", SEEK_DEV_STDIN));
// Initialise the file-scope indexing state and open the database.
//
// Opens (or creates/overwrites) the database at dbpath, copies the settings
// into the file-scope variables above, configures the term generator, the
// external-filter machinery and the table of past failures, and decides
// whether previously-failed files should be retried on this run.
398 void
399 index_init(const string& dbpath, const Xapian::Stem& stemmer,
400 const string& root_, const string& site_term_,
401 const string& host_term_,
402 empty_body_type empty_body_, dup_action_type dup_action_,
403 size_t sample_size_, size_t title_size_, size_t max_ext_len_,
404 bool overwrite, bool retry_failed_,
405 bool delete_removed_documents, bool verbose_, bool use_ctime_,
406 bool spelling, bool ignore_exclusions_, bool description_as_sample_,
407 bool date_terms_)
409 root = root_;
410 site_term = site_term_;
411 host_term = host_term_;
412 empty_body = empty_body_;
413 dup_action = dup_action_;
414 sample_size = sample_size_;
415 title_size = title_size_;
416 max_ext_len = max_ext_len_;
417 verbose = verbose_;
418 use_ctime = use_ctime_;
419 ignore_exclusions = ignore_exclusions_;
420 description_as_sample = description_as_sample_;
421 date_terms = date_terms_;
// When updating an existing database, note what it already holds so we can
// later tell which old documents weren't seen on this run.
423 if (!overwrite) {
424 db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
425 old_docs_not_seen = db.get_doccount();
426 // Handle an initially empty database exactly the same way as when
427 // overwrite is true.
428 if (old_docs_not_seen != 0) {
429 old_lastdocid = db.get_lastdocid();
430 if (delete_removed_documents) {
431 // + 1 so that old_lastdocid is a valid subscript.
432 updated.resize(old_lastdocid + 1);
// Seed last_altered_max from the value slot's upper bound so
// DUP_CHECK_LAZILY can cheaply spot definitely-new files.
434 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
435 string ubound = db.get_value_upper_bound(slot);
436 if (!ubound.empty())
437 last_altered_max = binary_string_to_int(ubound);
439 } else {
440 db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
// Only wire the term generator to the database when spelling data is
// wanted, as that's what FLAG_SPELLING writes to.
443 if (spelling) {
444 indexer.set_database(db);
445 indexer.set_flags(indexer.FLAG_SPELLING);
447 indexer.set_stemmer(stemmer);
449 runfilter_init();
451 failed.init(db);
453 if (overwrite) {
454 // There are no failures to retry, so setting this flag doesn't
455 // change the outcome, but does mean we avoid the overhead of
456 // checking for a previous failure.
457 retry_failed = true;
458 } else if (retry_failed_) {
459 failed.clear();
460 retry_failed = true;
461 } else {
462 // If there are no existing failures, setting this flag doesn't
463 // change the outcome, but does mean we avoid the overhead of
464 // checking for a previous failure.
465 retry_failed = failed.empty();
/** Extract the value of one field from a line of pdfinfo output.
 *
 *  @param p      Start of the line.
 *  @param end    End of the line (exclusive).
 *  @param out    Assigned the field's value if the line matches; left
 *                untouched otherwise (including when the value is empty).
 *  @param field  Field name including the trailing colon, e.g. "Author:".
 *  @param len    Length of @a field in bytes.
 *
 *  Spaces padding the value after the colon are skipped, and a trailing
 *  '\r' (from CRLF line endings) is dropped.
 */
static void
parse_pdfinfo_field(const char* p, const char* end, std::string& out,
		    const char* field, std::size_t len)
{
    // The line must be strictly longer than the field prefix and start
    // with it.
    if (static_cast<std::size_t>(end - p) <= len ||
	std::memcmp(p, field, len) != 0)
	return;
    p += len;
    // Skip the space padding pdfinfo emits after the colon.
    while (p != end && *p == ' ')
	++p;
    if (p == end)
	return;
    // Drop a trailing CR; if that leaves an empty value, don't assign.
    if (end[-1] == '\r' && --end == p)
	return;
    out.assign(p, end - p);
}
// Convenience wrapper: FIELD must be a string literal; appends the ':' and
// computes the prefix length at compile time.
482 #define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
483 parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)

// Parse the full output of pdfinfo, extracting the Author, Keywords,
// Pages, Subject and Title fields.  Output parameters for fields which
// aren't present are left untouched.
485 static void
486 parse_pdf_metainfo(const string& pdfinfo, string& author, string& title,
487 string& keywords, string& topic, int& pages)
489 const char* p = pdfinfo.data();
490 const char* end = p + pdfinfo.size();
491 while (p != end) {
// Find the end of the current line ('\n' or end of input).
492 const char* start = p;
493 p = static_cast<const char*>(memchr(p, '\n', end - p));
494 const char* eol;
495 if (p) {
496 eol = p;
497 ++p;
498 } else {
499 p = eol = end;
// Dispatch on the first character so we only try the field names which
// could possibly match this line.
501 switch (*start) {
502 case 'A':
503 PARSE_PDFINFO_FIELD(start, eol, author, "Author");
504 break;
505 case 'K':
506 PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
507 break;
508 case 'P': {
509 string s;
510 PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
511 if (!s.empty())
512 pages = atoi(s.c_str());
513 break;
515 case 'S':
516 PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
517 break;
518 case 'T':
519 PARSE_PDFINFO_FIELD(start, eol, title, "Title");
520 break;
// Extract PDF metadata by running pdfinfo with the document supplied on
// stdin (via the already-open file descriptor fd).  A ReadError from the
// filter is deliberately swallowed so the document still gets indexed,
// just without its metadata.
525 static void
526 get_pdf_metainfo(int fd, string& author, string& title,
527 string& keywords, string& topic, int& pages)
529 try {
530 string pdfinfo;
531 run_filter(fd, "pdfinfo -enc UTF-8 -", false, &pdfinfo);
532 parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
533 } catch (const ReadError&) {
534 // It's probably best to index the document even if pdfinfo fails.
// Overload taking a filename: runs pdfinfo on the named file rather than
// feeding it via stdin.  As above, a ReadError is swallowed so the
// document is still indexed without metadata.
538 static void
539 get_pdf_metainfo(const string& file, string& author, string& title,
540 string& keywords, string& topic, int& pages)
542 try {
543 string cmd = "pdfinfo -enc UTF-8";
544 append_filename_argument(cmd, file);
545 parse_pdf_metainfo(stdout_to_string(cmd, false),
546 author, title, keywords, topic, pages);
547 } catch (const ReadError&) {
548 // It's probably best to index the document even if pdfinfo fails.
// Build a plain-text sample from CSV data: quoting is stripped ('""'
// inside quotes means a literal '"'), commas outside quotes and runs of
// whitespace become single spaces, and the result is truncated to at most
// sample_size bytes - at the last word boundary if that doesn't lose more
// than half the sample, with "..." appended to show truncation.
552 static void
553 generate_sample_from_csv(const string& csv_data, string& sample)
555 // Add 3 to allow for a 4 byte utf-8 sequence being appended when
556 // output is sample_size - 1 bytes long. Use csv_data.size() if smaller
557 // since the user might reasonably set sample_size really high.
558 sample.reserve(min(sample_size + 3, csv_data.size()));
559 size_t last_word_end = 0;
560 bool in_space = true;
561 bool in_quotes = false;
// Walk the input one Unicode character at a time.
562 for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
563 unsigned ch = *i;
565 if (!in_quotes) {
566 // If not already in double quotes, '"' starts quoting and
567 // ',' starts a new field.
568 if (ch == '"') {
569 in_quotes = true;
570 continue;
572 if (ch == ',')
573 ch = ' ';
574 } else if (ch == '"') {
575 // In double quotes, '"' either ends double quotes, or
576 // if followed by another '"', means a literal '"'.
577 if (++i == Xapian::Utf8Iterator())
578 break;
579 ch = *i;
580 if (ch != '"') {
581 in_quotes = false;
582 if (ch == ',')
583 ch = ' ';
// Collapse whitespace (and non-breaking space) runs to one space,
// remembering where the last complete word ended for truncation.
587 if (ch <= ' ' || ch == 0xa0) {
588 // FIXME: if all the whitespace characters between two
589 // words are 0xa0 (non-breaking space) then perhaps we
590 // should output 0xa0.
591 if (in_space)
592 continue;
593 last_word_end = sample.size();
594 sample += ' ';
595 in_space = true;
596 } else {
597 Xapian::Unicode::append_utf8(sample, ch);
598 in_space = false;
601 if (sample.size() >= sample_size) {
602 // Need to truncate sample.
603 if (last_word_end <= sample_size / 2) {
604 // Monster word! We'll have to just split it.
605 sample.replace(sample_size - 3, string::npos, "...", 3);
606 } else {
607 sample.replace(last_word_end, string::npos, " ...", 4);
609 break;
// Check whether urlterm is already indexed and doesn't need reindexing.
//
// Returns true if the document should be skipped under the current
// dup_action (also marking it as seen); did is set to the existing docid
// when one is found.  Returns false when the file needs (re)indexing or
// when dup_action is DUP_REPLACE (no case below handles it, so the switch
// falls through to return false).
614 static bool
615 index_check_existing(const string& urlterm, time_t last_altered,
616 Xapian::docid& did)
618 switch (dup_action) {
619 case DUP_SKIP: {
620 Xapian::PostingIterator p = db.postlist_begin(urlterm);
621 if (p != db.postlist_end(urlterm)) {
622 if (verbose)
623 cout << "already indexed, not updating" << endl;
624 did = *p;
625 mark_as_seen(did);
626 return true;
628 break;
630 case DUP_CHECK_LAZILY: {
631 // If last_altered > last_altered_max, we know for sure that the
632 // file is new or updated.
633 if (last_altered > last_altered_max) {
634 return false;
637 Xapian::PostingIterator p = db.postlist_begin(urlterm);
638 if (p != db.postlist_end(urlterm)) {
639 did = *p;
640 Xapian::Document doc = db.get_document(did);
641 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
642 string value = doc.get_value(slot);
643 time_t old_last_altered = binary_string_to_int(value);
644 if (last_altered <= old_last_altered) {
645 if (verbose)
646 cout << "already indexed" << endl;
647 // The docid should be in updated - the only valid
648 // exception is if the URL was long and hashed to the
649 // same URL as an existing document indexed in the same
650 // batch.
651 mark_as_seen(did);
652 return true;
655 break;
658 return false;
661 void
662 index_remove_failed_entry(const string& urlterm)
664 failed.del(urlterm);
// Add or update the document for urlterm in the database, marking it as
// seen and reporting "added"/"updated" when verbose.
//
// did is non-zero when index_check_existing() already located an existing
// document for this urlterm; doc is the fully-built document to store.
667 void
668 index_add_document(const string& urlterm, time_t last_altered,
669 Xapian::docid did, const Xapian::Document& doc)
671 if (dup_action != DUP_SKIP) {
672 // If this document has already been indexed, update the existing
673 // entry.
674 if (did) {
675 // We already found out the document id above.
676 db.replace_document(did, doc);
677 } else if (last_altered <= last_altered_max) {
678 // We checked for the UID term and didn't find it.
679 did = db.add_document(doc);
680 } else {
// Newer than anything already indexed, but the postlist wasn't
// checked, so replace by urlterm (adds if absent).
681 did = db.replace_document(urlterm, doc);
683 mark_as_seen(did);
684 if (verbose) {
// Docids above the old database's last docid must be newly added.
685 if (did <= old_lastdocid) {
686 cout << "updated" << endl;
687 } else {
688 cout << "added" << endl;
691 } else {
692 // If this were a duplicate, we'd have skipped it above.
693 db.add_document(doc);
694 if (verbose)
695 cout << "added" << endl;
699 void
700 index_mimetype(const string& file, const string& urlterm, const string& url,
701 const string& ext,
702 string mimetype,
703 DirectoryIterator& d,
704 string pathterm,
705 string record)
707 string context(file, root.size(), string::npos);
709 // FIXME: We could be cleverer here and check mtime too when use_ctime is
710 // set - if the ctime has changed but the mtime is unchanged, we can just
711 // update the existing Document and avoid having to re-extract text, etc.
712 time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();
714 Xapian::docid did = 0;
715 if (index_check_existing(urlterm, last_altered, did))
716 return;
718 if (!retry_failed) {
719 // We only store and check the mtime (last modified) - a change to the
720 // metadata won't generally cause a previous failure to now work
721 // (FIXME: except permissions).
722 time_t failed_last_mod;
723 off_t failed_size;
724 if (failed.contains(urlterm, failed_last_mod, failed_size)) {
725 if (d.get_mtime() <= failed_last_mod &&
726 d.get_size() == failed_size) {
727 if (verbose)
728 cout << "failed to extract text on earlier run" << endl;
729 return;
731 // The file has changed, so remove the entry for it. If it fails
732 // again on this attempt, we'll add a new one.
733 failed.del(urlterm);
737 // If we didn't get the mime type from the extension, call libmagic to get
738 // it.
739 if (mimetype.empty()) {
740 mimetype = d.get_magic_mimetype();
741 if (mimetype.empty()) {
742 skip(urlterm, file.substr(root.size()),
743 "Unknown extension and unrecognised format",
744 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
745 return;
749 if (verbose)
750 cout << "Indexing \"" << file.substr(root.size()) << "\" as "
751 << mimetype << " ... " << flush;
753 // Use `file` as the basis, as we don't want URL encoding in these terms,
754 // but need to switch over the initial part so we get `/~olly/foo/bar` not
755 // `/home/olly/public_html/foo/bar`.
756 Xapian::Document newdocument;
757 size_t j;
758 while ((j = pathterm.rfind('/')) > 1 && j != string::npos) {
759 pathterm.resize(j);
760 if (pathterm.length() > MAX_SAFE_TERM_LENGTH) {
761 string term_hash = hash_long_term(pathterm, MAX_SAFE_TERM_LENGTH);
762 newdocument.add_boolean_term(term_hash);
763 } else {
764 newdocument.add_boolean_term(pathterm);
768 string author, title, sample, keywords, topic, dump;
769 string to, cc, bcc, message_id;
770 string md5;
771 time_t created = time_t(-1);
772 int pages = -1;
774 map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
775 if (cmd_it == commands.end()) {
776 size_t slash = mimetype.find('/');
777 if (slash != string::npos) {
778 string wildtype(mimetype, 0, slash + 2);
779 wildtype[slash + 1] = '*';
780 cmd_it = commands.find(wildtype);
781 if (cmd_it == commands.end()) {
782 cmd_it = commands.find("*/*");
785 if (cmd_it == commands.end()) {
786 cmd_it = commands.find("*");
789 try {
790 if (cmd_it != commands.end() && cmd_it->second.worker) {
791 // Use a worker process to extract the content.
792 Worker* wrk = cmd_it->second.worker;
793 int r = wrk->extract(file, mimetype, dump, title, keywords, author,
794 to, cc, bcc, message_id, pages, created);
795 if (r != 0) {
796 string msg = wrk->get_error();
797 assert(!msg.empty());
798 skip(urlterm, context, msg, d.get_size(), d.get_mtime());
799 if (r < 0) {
800 // Hard failure - don't try this filter again for this run.
801 string filter_entry;
802 if (cmd_it != commands.end()) {
803 filter_entry = cmd_it->first;
804 } else {
805 filter_entry = mimetype;
807 commands[filter_entry] = Filter();
809 return;
811 } else if (cmd_it != commands.end()) {
812 // Easy "run a command and read text or HTML from stdout or a
813 // temporary file" cases.
814 auto& filter = cmd_it->second;
815 string cmd = filter.cmd;
816 if (cmd.empty()) {
817 skip(urlterm, context, "required filter not installed",
818 d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
819 return;
821 if (cmd == "false") {
822 // Allow setting 'false' as a filter to mean that a MIME type
823 // should be quietly ignored.
824 string m = "ignoring MIME type '";
825 m += cmd_it->first;
826 m += "'";
827 skip(urlterm, context, m, d.get_size(), d.get_mtime(),
828 SKIP_VERBOSE_ONLY);
829 return;
831 bool use_shell = filter.use_shell();
832 bool input_on_stdin = filter.input_on_stdin();
833 bool substituted = false;
834 string tmpout;
835 size_t pcent = 0;
836 while (true) {
837 pcent = cmd.find('%', pcent);
838 if (pcent >= cmd.size() - 1)
839 break;
840 switch (cmd[pcent + 1]) {
841 case '%': // %% -> %.
842 cmd.erase(++pcent, 1);
843 break;
844 case 'f': { // %f -> escaped filename.
845 substituted = true;
846 if (filter.dev_stdin()) {
847 cmd.replace(pcent, 2, "/dev/stdin",
848 CONST_STRLEN("/dev/stdin"));
849 break;
851 string tail(cmd, pcent + 2);
852 cmd.resize(pcent);
853 // Suppress the space append_filename_argument()
854 // usually adds before the argument - the command
855 // string either includes one, or won't expect one
856 // (e.g. --input=%f).
857 append_filename_argument(cmd, file, false);
858 pcent = cmd.size();
859 cmd += tail;
860 break;
862 case 't': { // %t -> temporary output file.
863 if (tmpout.empty()) {
864 // Use a temporary file with a suitable extension
865 // in case the command cares, and for more helpful
866 // error messages from the command.
867 if (filter.output_type == "text/html") {
868 tmpout = get_tmpfile("tmp.html");
869 } else if (filter.output_type == "image/svg+xml") {
870 tmpout = get_tmpfile("tmp.svg");
871 } else {
872 tmpout = get_tmpfile("tmp.txt");
875 substituted = true;
876 string tail(cmd, pcent + 2);
877 cmd.resize(pcent);
878 // Suppress the space append_filename_argument()
879 // usually adds before the argument - the command
880 // string either includes one, or won't expect one
881 // (e.g. --output=%t).
882 append_filename_argument(cmd, tmpout, false);
883 pcent = cmd.size();
884 cmd += tail;
885 break;
887 default:
888 // Leave anything else alone for now.
889 pcent += 2;
890 break;
893 if (!substituted && cmd != "true") {
894 if (input_on_stdin) {
895 if (filter.dev_stdin()) {
896 cmd += " /dev/stdin";
898 } else {
899 // If no %f, append the filename to the command.
900 append_filename_argument(cmd, file);
903 try {
904 if (!tmpout.empty()) {
905 // Output in temporary file.
906 if (input_on_stdin) {
907 run_filter(d.get_fd(), cmd, use_shell);
908 } else {
909 run_filter(cmd, use_shell);
911 if (!load_file(tmpout, dump, NOCACHE)) {
912 throw ReadError("Couldn't read output file");
914 unlink(tmpout.c_str());
915 } else if (cmd == "true") {
916 // Ignore the file's contents, just index metadata from the
917 // filing system.
918 } else {
919 // Output on stdout.
920 if (input_on_stdin) {
921 run_filter(d.get_fd(), cmd, use_shell, &dump);
922 } else {
923 run_filter(cmd, use_shell, &dump);
926 const string& charset = filter.output_charset;
927 if (filter.output_type == "text/html") {
928 HtmlParser p;
929 p.ignore_metarobots();
930 p.description_as_sample = description_as_sample;
931 try {
932 p.parse(dump, charset, false);
933 } catch (const string& newcharset) {
934 p.reset();
935 p.ignore_metarobots();
936 p.description_as_sample = description_as_sample;
937 p.parse(dump, newcharset, true);
938 } catch (const ReadError&) {
939 skip_cmd_failed(urlterm, context, cmd,
940 d.get_size(), d.get_mtime());
941 return;
943 dump = p.dump;
944 title = p.title;
945 keywords = p.keywords;
946 topic = p.topic;
947 sample = p.sample;
948 author = p.author;
949 created = p.created;
950 } else if (filter.output_type == "image/svg+xml") {
951 SvgParser svgparser;
952 svgparser.parse(dump);
953 dump = svgparser.dump;
954 title = svgparser.title;
955 keywords = svgparser.keywords;
956 // FIXME: topic = svgparser.topic;
957 author = svgparser.author;
958 } else if (!charset.empty()) {
959 convert_to_utf8(dump, charset);
961 } catch (const ReadError&) {
962 skip_cmd_failed(urlterm, context, cmd,
963 d.get_size(), d.get_mtime());
964 return;
966 } else if (mimetype == "text/html" || mimetype == "text/x-php") {
967 const string& text = d.file_to_string();
968 HtmlParser p;
969 if (ignore_exclusions) p.ignore_metarobots();
970 p.description_as_sample = description_as_sample;
971 try {
972 // Default HTML character set is latin 1, though not specifying
973 // one is deprecated these days.
974 p.parse(text, "iso-8859-1", false);
975 } catch (const string& newcharset) {
976 p.reset();
977 if (ignore_exclusions) p.ignore_metarobots();
978 p.description_as_sample = description_as_sample;
979 p.parse(text, newcharset, true);
981 if (!p.indexing_allowed) {
982 skip_meta_tag(urlterm, context,
983 d.get_size(), d.get_mtime());
984 return;
986 dump = p.dump;
987 title = p.title;
988 keywords = p.keywords;
989 topic = p.topic;
990 sample = p.sample;
991 author = p.author;
992 created = p.created;
993 md5_string(text, md5);
994 } else if (mimetype == "text/plain") {
995 // Currently we assume that text files are UTF-8 unless they have a
996 // byte-order mark.
997 dump = d.file_to_string();
998 md5_string(dump, md5);
1000 // Look for Byte-Order Mark (BOM).
1001 if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
1002 // UTF-16 in big-endian/little-endian order - we just convert
1003 // it as "UTF-16" and let the conversion handle the BOM as that
1004 // way we avoid the copying overhead of erasing 2 bytes from
1005 // the start of dump.
1006 convert_to_utf8(dump, "UTF-16");
1007 } else if (startswith(dump, "\xef\xbb\xbf")) {
1008 // UTF-8 with stupid Windows not-the-byte-order mark.
1009 dump.erase(0, 3);
1010 } else {
1011 // FIXME: What charset is the file? Look at contents?
1013 } else if (mimetype == "application/pdf") {
1014 const char* cmd = "pdftotext -enc UTF-8 - -";
1015 try {
1016 run_filter(d.get_fd(), cmd, false, &dump);
1017 } catch (const ReadError&) {
1018 skip_cmd_failed(urlterm, context, cmd,
1019 d.get_size(), d.get_mtime());
1020 return;
1022 get_pdf_metainfo(d.get_fd(), author, title, keywords, topic, pages);
1023 } else if (mimetype == "application/postscript") {
1024 // There simply doesn't seem to be a Unicode capable PostScript to
1025 // text converter (e.g. pstotext always outputs ISO-8859-1). The
1026 // only solution seems to be to convert via PDF using ps2pdf and
1027 // then pdftotext. This gives plausible looking UTF-8 output for
1028 // some Chinese PostScript files I found using Google. It also has
1029 // the benefit of allowing us to extract meta information from
1030 // PostScript files.
1031 string tmpfile = get_tmpfile("tmp.pdf");
1032 if (tmpfile.empty()) {
1033 // FIXME: should this be fatal? Or disable indexing postscript?
1034 string msg = "Couldn't create temporary directory (";
1035 msg += strerror(errno);
1036 msg += ")";
1037 skip(urlterm, context, msg,
1038 d.get_size(), d.get_mtime());
1039 return;
1041 string cmd = "ps2pdf -";
1042 append_filename_argument(cmd, tmpfile);
1043 try {
1044 run_filter(d.get_fd(), cmd, false);
1045 cmd = "pdftotext -enc UTF-8";
1046 append_filename_argument(cmd, tmpfile);
1047 cmd += " -";
1048 run_filter(cmd, false, &dump);
1049 } catch (const ReadError&) {
1050 skip_cmd_failed(urlterm, context, cmd,
1051 d.get_size(), d.get_mtime());
1052 unlink(tmpfile.c_str());
1053 return;
1054 } catch (...) {
1055 unlink(tmpfile.c_str());
1056 throw;
1058 try {
1059 get_pdf_metainfo(tmpfile, author, title, keywords, topic,
1060 pages);
1061 } catch (...) {
1062 unlink(tmpfile.c_str());
1063 throw;
1065 unlink(tmpfile.c_str());
1066 } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
1067 startswith(mimetype, "application/vnd.oasis.opendocument."))
1069 // Inspired by http://mjr.towers.org.uk/comp/sxw2text
1070 string cmd = "unzip -p";
1071 append_filename_argument(cmd, file);
1072 cmd += " content.xml ; unzip -p";
1073 append_filename_argument(cmd, file);
1074 cmd += " styles.xml";
1075 try {
1076 OpenDocParser parser;
1077 parser.parse(stdout_to_string(cmd, true));
1078 dump = parser.dump;
1079 } catch (const ReadError&) {
1080 skip_cmd_failed(urlterm, context, cmd,
1081 d.get_size(), d.get_mtime());
1082 return;
1085 cmd = "unzip -p";
1086 append_filename_argument(cmd, file);
1087 cmd += " meta.xml";
1088 try {
1089 OpenDocMetaParser metaparser;
1090 metaparser.parse(stdout_to_string(cmd, false));
1091 title = metaparser.title;
1092 keywords = metaparser.keywords;
1093 // FIXME: topic = metaparser.topic;
1094 sample = metaparser.sample;
1095 author = metaparser.author;
1096 pages = metaparser.pages;
1097 } catch (const ReadError&) {
1098 // It's probably best to index the document even if this fails.
1100 } else if (startswith(mimetype,
1101 "application/vnd.openxmlformats-officedocument."))
1103 const char* args = NULL;
1104 string tail(mimetype, 46);
1105 if (startswith(tail, "wordprocessingml.")) {
1106 // unzip returns exit code 11 if a file to extract wasn't found
1107 // which we want to ignore, because there may be no headers or
1108 // no footers.
1109 args = " word/document.xml"
1110 " 'word/header*.xml'"
1111 " 'word/footer*.xml'"
1112 " 2>/dev/null";
1113 } else if (startswith(tail, "spreadsheetml.")) {
1114 // Extract the shared string table first, so our parser can
1115 // grab those ready for parsing the sheets which will reference
1116 // the shared strings.
1117 string cmd = "unzip -p";
1118 append_filename_argument(cmd, file);
1119 cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; "
1120 "unzip -p";
1121 append_filename_argument(cmd, file);
1122 cmd += " xl/worksheets/sheet\\*.xml";
1123 try {
1124 XlsxParser parser;
1125 parser.parse(stdout_to_string(cmd, true));
1126 dump = parser.dump;
1127 } catch (const ReadError&) {
1128 skip_cmd_failed(urlterm, context, cmd,
1129 d.get_size(), d.get_mtime());
1130 return;
1132 } else if (startswith(tail, "presentationml.")) {
1133 // unzip returns exit code 11 if a file to extract wasn't found
1134 // which we want to ignore, because there may be no notesSlides
1135 // or comments.
1136 args = " 'ppt/slides/slide*.xml'"
1137 " 'ppt/notesSlides/notesSlide*.xml'"
1138 " 'ppt/comments/comment*.xml'"
1139 " 2>/dev/null";
1140 } else {
1141 // Don't know how to index this type.
1142 skip_unknown_mimetype(urlterm, context, mimetype,
1143 d.get_size(), d.get_mtime());
1144 return;
1147 if (args) {
1148 string cmd = "unzip -p";
1149 append_filename_argument(cmd, file);
1150 cmd += args;
1151 try {
1152 MSXmlParser xmlparser;
1153 // Treat exit status 11 from unzip as success - this is
1154 // what we get if one of the listed filenames to extract
1155 // doesn't match anything in the zip file.
1156 xmlparser.parse(stdout_to_string(cmd, false, 11));
1157 dump = xmlparser.dump;
1158 } catch (const ReadError&) {
1159 skip_cmd_failed(urlterm, context, cmd,
1160 d.get_size(), d.get_mtime());
1161 return;
1165 string cmd = "unzip -p";
1166 append_filename_argument(cmd, file);
1167 cmd += " docProps/core.xml";
1168 try {
1169 OpenDocMetaParser metaparser;
1170 metaparser.parse(stdout_to_string(cmd, false));
1171 title = metaparser.title;
1172 keywords = metaparser.keywords;
1173 // FIXME: topic = metaparser.topic;
1174 sample = metaparser.sample;
1175 author = metaparser.author;
1176 } catch (const ReadError&) {
1177 // It's probably best to index the document even if this fails.
1179 } else if (mimetype == "application/x-abiword") {
1180 AbiwordParser abiwordparser;
1181 const string& text = d.file_to_string();
1182 abiwordparser.parse(text);
1183 dump = abiwordparser.dump;
1184 md5_string(text, md5);
1185 } else if (mimetype == "application/x-abiword-compressed") {
1186 AbiwordParser abiwordparser;
1187 abiwordparser.parse(d.gzfile_to_string());
1188 dump = abiwordparser.dump;
1189 } else if (mimetype == "application/oxps" ||
1190 mimetype == "application/vnd.ms-xpsdocument") {
1191 string cmd = "unzip -p";
1192 append_filename_argument(cmd, file);
1193 cmd += " 'Documents/*/Pages/*.fpage'";
1194 try {
1195 XpsParser xpsparser;
1196 run_filter(cmd, false, &dump);
1197 xpsparser.parse(dump);
1198 dump = xpsparser.dump;
1199 } catch (const ReadError&) {
1200 skip_cmd_failed(urlterm, context, cmd,
1201 d.get_size(), d.get_mtime());
1202 return;
1205 cmd = "unzip -p";
1206 append_filename_argument(cmd, file);
1207 cmd += " docProps/core.xml";
1208 try {
1209 OpenDocMetaParser metaparser;
1210 metaparser.parse(stdout_to_string(cmd, false));
1211 title = metaparser.title;
1212 keywords = metaparser.keywords;
1213 // FIXME: topic = metaparser.topic;
1214 sample = metaparser.sample;
1215 author = metaparser.author;
1216 } catch (const ReadError&) {
1217 // Ignore errors as not all XPS files contain this file.
1219 } else if (mimetype == "text/csv") {
1220 // Currently we assume that text files are UTF-8 unless they have a
1221 // byte-order mark.
1222 dump = d.file_to_string();
1223 md5_string(dump, md5);
1225 // Look for Byte-Order Mark (BOM).
1226 if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
1227 // UTF-16 in big-endian/little-endian order - we just convert
1228 // it as "UTF-16" and let the conversion handle the BOM as that
1229 // way we avoid the copying overhead of erasing 2 bytes from
1230 // the start of dump.
1231 convert_to_utf8(dump, "UTF-16");
1232 } else if (startswith(dump, "\xef\xbb\xbf")) {
1233 // UTF-8 with stupid Windows not-the-byte-order mark.
1234 dump.erase(0, 3);
1235 } else {
1236 // FIXME: What charset is the file? Look at contents?
1239 generate_sample_from_csv(dump, sample);
1240 } else if (mimetype == "image/svg+xml") {
1241 SvgParser svgparser;
1242 const string& text = d.file_to_string();
1243 md5_string(text, md5);
1244 svgparser.parse(text);
1245 dump = svgparser.dump;
1246 title = svgparser.title;
1247 keywords = svgparser.keywords;
1248 // FIXME: topic = svgparser.topic;
1249 author = svgparser.author;
1250 } else if (mimetype == "image/svg+xml-compressed") {
1251 SvgParser svgparser;
1252 const string& text = d.gzfile_to_string();
1253 svgparser.parse(text);
1254 dump = svgparser.dump;
1255 title = svgparser.title;
1256 keywords = svgparser.keywords;
1257 // FIXME: topic = svgparser.topic;
1258 author = svgparser.author;
1259 } else if (mimetype == "application/vnd.debian.binary-package" ||
1260 mimetype == "application/x-debian-package") {
1261 const char* cmd = "dpkg-deb -f - Description";
1262 string desc;
1263 run_filter(d.get_fd(), cmd, false, &desc);
1264 // First line is short description, which we use as the title.
1265 string::size_type idx = desc.find('\n');
1266 title.assign(desc, 0, idx);
1267 if (idx != string::npos) {
1268 dump.assign(desc, idx + 1, string::npos);
1270 } else if (mimetype == "application/x-redhat-package-manager" ||
1271 mimetype == "application/x-rpm") {
1272 string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
1273 append_filename_argument(cmd, file);
1274 string desc;
1275 run_filter(cmd, false, &desc);
1276 // First line is summary, which we use as the title.
1277 string::size_type idx = desc.find('\n');
1278 title.assign(desc, 0, idx);
1279 if (idx != string::npos) {
1280 dump.assign(desc, idx + 1, string::npos);
1282 } else if (mimetype == "application/atom+xml") {
1283 AtomParser atomparser;
1284 const string& text = d.file_to_string();
1285 md5_string(text, md5);
1286 atomparser.parse(text);
1287 dump = atomparser.dump;
1288 title = atomparser.title;
1289 keywords = atomparser.keywords;
1290 // FIXME: topic = atomparser.topic;
1291 author = atomparser.author;
1292 } else {
1293 // Don't know how to index this type.
1294 skip_unknown_mimetype(urlterm, context, mimetype,
1295 d.get_size(), d.get_mtime());
1296 return;
1299 // Compute the MD5 of the file if we haven't already.
1300 if (md5.empty() && !d.md5(md5)) {
1301 if (errno == ENOENT || errno == ENOTDIR) {
1302 skip(urlterm, context, "File removed during indexing",
1303 d.get_size(), d.get_mtime(),
1304 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1305 } else {
1306 skip(urlterm, context,
1307 "failed to read file to calculate MD5 checksum",
1308 d.get_size(), d.get_mtime());
1310 return;
1313 // Remove any trailing formfeeds, so we don't consider them when
1314 // considering if we extracted any text (e.g. pdftotext outputs a
1315 // formfeed between each page, even for blank pages).
1317 // If dump contain only formfeeds, then trim_end will be string::npos
1318 // and ++trim_end will be 0, which is the correct new size.
1319 string::size_type trim_end = dump.find_last_not_of('\f');
1320 if (UNSIGNED_OVERFLOW_OK(++trim_end) != dump.size())
1321 dump.resize(trim_end);
1323 if (dump.empty()) {
1324 switch (empty_body) {
1325 case EMPTY_BODY_INDEX:
1326 break;
1327 case EMPTY_BODY_WARN:
1328 cout << "no text extracted from document body, "
1329 "but indexing metadata anyway" << endl;
1330 break;
1331 case EMPTY_BODY_SKIP:
1332 skip(urlterm, context,
1333 "no text extracted from document body",
1334 d.get_size(), d.get_mtime());
1335 return;
1339 // Produce a sample
1340 if (sample.empty()) {
1341 sample = generate_sample(dump, sample_size, "...", " ...");
1342 } else {
1343 sample = generate_sample(sample, sample_size, "...", " ...");
1346 // Put the data in the document
1347 if (record.empty()) {
1348 record = "url=";
1349 } else {
1350 record += "\nurl=";
1352 record += url;
1353 record += "\nsample=";
1354 record += sample;
1355 if (!title.empty()) {
1356 record += "\ncaption=";
1357 record += generate_sample(title, title_size, "...", " ...");
1359 if (!author.empty()) {
1360 record += "\nauthor=";
1361 record += author;
1363 if (!to.empty()) {
1364 record += "\nto=";
1365 record += to;
1367 if (!cc.empty()) {
1368 record += "\ncc=";
1369 record += cc;
1371 if (!bcc.empty()) {
1372 record += "\nbcc=";
1373 record += bcc;
1375 if (!message_id.empty()) {
1376 record += "\nmsgid=";
1377 record += message_id;
1379 record += "\ntype=";
1380 record += mimetype;
1381 time_t mtime = d.get_mtime();
1382 if (mtime != static_cast<time_t>(-1)) {
1383 record += "\nmodtime=";
1384 record += str(mtime);
1386 if (created != static_cast<time_t>(-1)) {
1387 record += "\ncreated=";
1388 record += str(created);
1390 if (pages >= 0) {
1391 record += "\npages=";
1392 record += str(pages);
1394 off_t size = d.get_size();
1395 record += "\nsize=";
1396 record += str(size);
1397 newdocument.set_data(record);
1399 // Index the title, document text, keywords and topic.
1400 indexer.set_document(newdocument);
1401 if (!title.empty()) {
1402 indexer.index_text(title, 5, "S");
1403 indexer.increase_termpos(100);
1405 if (!dump.empty()) {
1406 indexer.index_text(dump);
1408 if (!keywords.empty()) {
1409 indexer.increase_termpos(100);
1410 indexer.index_text(keywords);
1412 if (!topic.empty()) {
1413 indexer.increase_termpos(100);
1414 indexer.index_text(topic, 1, "B");
1416 // Index the leafname of the file.
1418 indexer.increase_termpos(100);
1419 string leaf = d.leafname();
1420 string::size_type dot = leaf.find_last_of('.');
1421 if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
1422 leaf.resize(dot);
1423 indexer.index_text(leaf, 1, "F");
1425 // Also index with underscores and ampersands replaced by spaces.
1426 bool modified = false;
1427 string::size_type rep = 0;
1428 while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
1429 leaf[rep++] = ' ';
1430 modified = true;
1432 if (modified) {
1433 indexer.increase_termpos(100);
1434 indexer.index_text(leaf, 1, "F");
1438 if (!author.empty()) {
1439 indexer.increase_termpos(100);
1440 indexer.index_text(author, 1, "A");
1443 if (!to.empty()) {
1444 indexer.increase_termpos(100);
1445 indexer.index_text(to, 1, "XTO");
1448 if (!cc.empty()) {
1449 indexer.increase_termpos(100);
1450 indexer.index_text(cc, 1, "XCC");
1453 if (!bcc.empty()) {
1454 indexer.increase_termpos(100);
1455 indexer.index_text(bcc, 1, "XBCC");
1458 if (!message_id.empty()) {
1459 newdocument.add_boolean_term("XMID:" + message_id);
1462 // mimeType:
1463 newdocument.add_boolean_term("T" + mimetype);
1465 newdocument.add_boolean_term(site_term);
1467 if (!host_term.empty())
1468 newdocument.add_boolean_term(host_term);
1470 if (date_terms) {
1471 struct tm* tm = localtime(&mtime);
1472 string date_term = "D";
1473 date_term += date_to_string(tm->tm_year + 1900,
1474 tm->tm_mon + 1,
1475 tm->tm_mday);
1476 newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
1477 date_term.resize(7);
1478 date_term[0] = 'M';
1479 newdocument.add_boolean_term(date_term); // Month (YYYYMM)
1480 date_term.resize(5);
1481 date_term[0] = 'Y';
1482 newdocument.add_boolean_term(date_term); // Year (YYYY)
1485 newdocument.add_boolean_term(urlterm); // Url
1487 // Add mtime as a value to allow "sort by date".
1488 newdocument.add_value(VALUE_LASTMOD,
1489 int_to_binary_string(uint32_t(mtime)));
1490 if (use_ctime) {
1491 // Add ctime as a value to track modifications.
1492 time_t ctime = d.get_ctime();
1493 newdocument.add_value(VALUE_CTIME,
1494 int_to_binary_string(uint32_t(ctime)));
1497 // Add MD5 as a value to allow duplicate documents to be collapsed
1498 // together.
1499 newdocument.add_value(VALUE_MD5, md5);
1501 // Add the file size as a value to allow "sort by size" and size ranges.
1502 newdocument.add_value(VALUE_SIZE,
1503 Xapian::sortable_serialise(size));
1505 if (created != static_cast<time_t>(-1)) {
1506 // Add created time as a value to allow "sort by created date".
1507 newdocument.add_value(VALUE_CREATED,
1508 int_to_binary_string(uint32_t(created)));
1511 bool inc_tag_added = false;
1512 if (d.is_other_readable()) {
1513 inc_tag_added = true;
1514 newdocument.add_boolean_term("I*");
1515 } else if (d.is_group_readable()) {
1516 const char* group = d.get_group();
1517 if (group) {
1518 newdocument.add_boolean_term(string("I#") + group);
1521 const char* owner = d.get_owner();
1522 if (owner) {
1523 newdocument.add_boolean_term(string("O") + owner);
1524 if (!inc_tag_added && d.is_owner_readable())
1525 newdocument.add_boolean_term(string("I@") + owner);
1528 string ext_term("E");
1529 for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
1530 char ch = *i;
1531 if (ch >= 'A' && ch <= 'Z')
1532 ch |= 32;
1533 ext_term += ch;
1535 newdocument.add_boolean_term(ext_term);
1537 index_add_document(urlterm, last_altered, did, newdocument);
1538 } catch (const ReadError&) {
1539 skip(urlterm, context, string("can't read file: ") + strerror(errno),
1540 d.get_size(), d.get_mtime());
1541 } catch (const NoSuchFilter&) {
1542 string filter_entry;
1543 if (cmd_it != commands.end()) {
1544 filter_entry = cmd_it->first;
1545 } else {
1546 filter_entry = mimetype;
1548 string m = "Filter for \"";
1549 m += filter_entry;
1550 m += "\" not installed";
1551 skip(urlterm, context, m, d.get_size(), d.get_mtime());
1552 commands[filter_entry] = Filter();
1553 } catch (const FileNotFound&) {
1554 skip(urlterm, context, "File removed during indexing",
1555 d.get_size(), d.get_mtime(),
1556 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1557 } catch (const std::string& error) {
1558 skip(urlterm, context, error, d.get_size(), d.get_mtime());
1559 } catch (const std::bad_alloc&) {
1560 // Attempt to flag the file as failed and commit changes, though that
1561 // might fail too if we're low on memory rather than being asked to
1562 // allocate a ludicrous amount.
1563 skip(urlterm, context, "Out of memory trying to extract text from file",
1564 d.get_size(), d.get_mtime(),
1565 SKIP_SHOW_FILENAME);
1566 throw CommitAndExit("Caught std::bad_alloc", "");
1570 void
1571 index_handle_deletion()
1573 if (updated.empty() || old_docs_not_seen == 0) return;
1575 if (verbose) {
1576 cout << "Deleting " << old_docs_not_seen
1577 << " old documents which weren't found" << endl;
1579 Xapian::PostingIterator alldocs = db.postlist_begin(string());
1580 Xapian::docid did = *alldocs;
1581 while (did < updated.size()) {
1582 if (!updated[did]) {
1583 alldocs.skip_to(did);
1584 if (alldocs == db.postlist_end(string()))
1585 break;
1586 if (*alldocs != did) {
1587 // Document #did didn't exist before we started.
1588 did = *alldocs;
1589 continue;
1591 db.delete_document(did);
1592 if (--old_docs_not_seen == 0)
1593 break;
1595 ++did;
1599 void
1600 index_commit()
1602 db.commit();
1605 void
1606 index_done()
1608 // If we created a temporary directory then delete it.
1609 remove_tmpdir();