/** @file index_file.cc
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include "safeerrno.h"
#include <ctime>

#include <xapian.h>

#include "append_filename_arg.h"
#include "atomparse.h"
#include "diritor.h"
#include "failed.h"
#include "md5wrap.h"
#include "metaxmlparse.h"
#include "mimemap.h"
#include "msxmlparse.h"
#include "myhtmlparse.h"
#include "opendocparse.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparse.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "utils.h"
#include "values.h"
#include "xmlparse.h"
#include "xlsxparse.h"
#include "xpsxmlparse.h"

using namespace std;

static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;

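// Mark a document from a previous run as still present, so that
// index_handle_deletion() below won't delete it.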
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}

void
skip(const string & urlterm, const string & context, const string & msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}

static void
skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string & urlterm, const string & context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string & urlterm, const string & context,
                      const string & mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
}

void
index_add_default_filters()
{
    index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", false));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", false));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect", Filter("wpd2text", false));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works", Filter("wps2text", false));
    // Output is UTF-8 according to "man djvutxt". Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1250).
    index_command("image/vnd.djvu", Filter("djvutxt", false));
    index_command("text/markdown", Filter("markdown", "text/html", false));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities. The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the current
    // directory. Note that this option was ignored in some older versions,
    // but it was fixed in unrtf 0.20.4.
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/x-rst", Filter("rst2html", "text/html", false));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", false));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
                         false));
    // pod2text's output character set doesn't seem to be documented, but from
    // inspecting the source it looks like it's probably iso-8859-1.
    index_command("text/x-perl",
                  Filter("pod2text", "text/plain", "iso-8859-1", false));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures. For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
    // Simplistic - ought to look in index.rdf files for filename and character
    // set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         false));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         false));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", false));
}

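// Additional filters can be registered the same way. For example (a
// hypothetical MIME type and converter, shown only to illustrate the
// pattern):
//
//   index_command("application/x-foo", Filter("foo2text", false));
//
// Setting the command to "false" quietly ignores a MIME type, as handled in
// index_mimetype() below.
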
void
index_init(const string & dbpath, const Xapian::Stem & stemmer,
           const string & root_, const string & site_term_,
           const string & host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        old_lastdocid = db.get_lastdocid();
        if (delete_removed_documents) {
            // + 1 so that old_lastdocid is a valid subscript.
            updated.resize(old_lastdocid + 1);
        }
        try {
            Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
            string ubound = db.get_value_upper_bound(slot);
            if (!ubound.empty())
                last_altered_max = binary_string_to_int(ubound);
        } catch (const Xapian::UnimplementedError &) {
            numeric_limits<time_t> n;
            last_altered_max = n.max();
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }

    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}

static void
parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)

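// For example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") expands to
// parse_pdfinfo_field((start), (eol), (author), "Author:", 7), which matches
// a pdfinfo output line such as "Author:  Jane Doe" (an illustrative value)
// and assigns "Jane Doe" to author.
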
static void
get_pdf_metainfo(const string & file, string & author, string & title,
                 string & keywords, string & topic)
{
    try {
        string cmd = "pdfinfo -enc UTF-8";
        append_filename_argument(cmd, file);
        string pdfinfo = stdout_to_string(cmd, false);

        const char * p = pdfinfo.data();
        const char * end = p + pdfinfo.size();
        while (p != end) {
            const char * start = p;
            p = static_cast<const char *>(memchr(p, '\n', end - p));
            const char * eol;
            if (p) {
                eol = p;
                ++p;
            } else {
                p = eol = end;
            }
            switch (*start) {
                case 'A':
                    PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                    break;
                case 'K':
                    PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                    break;
                case 'S':
                    PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                    break;
                case 'T':
                    PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                    break;
            }
        }
    } catch (ReadError) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
generate_sample_from_csv(const string & csv_data, string & sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long. Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word! We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}

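// Worked example: the CSV row
//
//   "Smith, John",42
//
// produces the sample text 'Smith, John 42' - commas inside double quotes
// are kept, while the unquoted field separator becomes a space.
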
static bool
index_check_existing(const string & urlterm, time_t last_altered,
                     Xapian::docid & did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}

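// Add a new document or update an existing one. If did is non-zero,
// index_check_existing() above already found the existing entry, so we can
// replace by docid without another term lookup.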
void
index_add_document(const string & urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document & doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}

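// Extract text and metadata from file (which has MIME type mimetype), build
// up a Xapian document and add it to (or update it in) the database. Any
// "field=value" lines already in record are kept in the document data.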
void
index_mimetype(const string & file, const string & urlterm, const string & url,
               const string & ext,
               const string & mimetype, DirectoryIterator & d,
               Xapian::Document & newdocument,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it. If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    if (verbose) cout << flush;

    string author, title, sample, keywords, topic, dump;
    string md5;
    time_t created = time_t(-1);

    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
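    // E.g. for "application/vnd.ms-excel" the fallback order is:
    // "application/vnd.ms-excel", then "application/*", "*/*" and "*".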
    try {
        if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            string cmd = cmd_it->second.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = cmd_it->second.use_shell();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, file);
                        // Remove the space append_filename_argument() adds before
                        // the argument - the command string either includes one,
                        // or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (cmd_it->second.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, tmpout);
                        // Remove the space append_filename_argument() adds before
                        // the argument - the command string either includes one,
                        // or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
            if (!substituted && cmd != "true") {
                // If no %f, append the filename to the command.
                append_filename_argument(cmd, file);
            }
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    (void)stdout_to_string(cmd, use_shell);
                    if (!load_file(tmpout, dump)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from the
                    // filing system.
                } else {
                    // Output on stdout.
                    dump = stdout_to_string(cmd, use_shell);
                }
                const string & charset = cmd_it->second.output_charset;
                if (cmd_it->second.output_type == "text/html") {
                    MyHtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse_html(dump, charset, false);
                    } catch (const string & newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse_html(dump, newcharset, true);
                    } catch (ReadError) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string & text = d.file_to_string();
            MyHtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not specifying
                // one is deprecated these days.
                p.parse_html(text, "iso-8859-1", false);
            } catch (const string & newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse_html(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file? Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            string cmd = "pdftotext -enc UTF-8";
            append_filename_argument(cmd, file);
            cmd += " -";
            try {
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(file, author, title, keywords, topic);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript to
            // text converter (e.g. pstotext always outputs ISO-8859-1). The
            // only solution seems to be to convert via PDF using ps2pdf and
            // then pdftotext. This gives plausible looking UTF-8 output for
            // some Chinese PostScript files I found using Google. It also has
            // the benefit of allowing us to extract meta information from
            // PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal? Or disable indexing postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            string cmd = "ps2pdf";
            append_filename_argument(cmd, file);
            append_filename_argument(cmd, tmpfile);
            try {
                (void)stdout_to_string(cmd, false);
                cmd = "pdftotext -enc UTF-8";
                append_filename_argument(cmd, tmpfile);
                cmd += " -";
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
            const char * args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no headers or
                // no footers.
                args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will reference
                // the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no notesSlides
                // or comments.
                args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (mimetype == "application/x-abiword") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            const string & text = d.file_to_string();
            xmlparser.parse_xml(text);
            dump = xmlparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            xmlparser.parse_xml(d.gzfile_to_string());
            dump = xmlparser.dump;
        } else if (mimetype == "application/vnd.ms-xpsdocument") {
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " 'Documents/1/Pages/*.fpage'";
            try {
                XpsXmlParser xpsparser;
                dump = stdout_to_string(cmd, false);
                // Look for Byte-Order Mark (BOM).
                if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                    // UTF-16 in big-endian/little-endian order - we just
                    // convert it as "UTF-16" and let the conversion handle the
                    // BOM as that way we avoid the copying overhead of erasing
                    // 2 bytes from the start of dump.
                    convert_to_utf8(dump, "UTF-16");
                }
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file? Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            string cmd("dpkg-deb -f");
            append_filename_argument(cmd, file);
            cmd += " Description";
            const string & desc = stdout_to_string(cmd, false);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            const string & desc = stdout_to_string(cmd, false);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }

        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && md5_file(file, md5, d.try_noatime()) == 0) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context, "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }

        // Remove any trailing formfeeds, so we don't count them when deciding
        // whether we extracted any text (e.g. pdftotext outputs a formfeed
        // between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be string::npos
        // and ++trim_end will be 0, which is the correct new size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (++trim_end != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context, "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }

        // Produce a sample
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }

        // Put the data in the document
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);

        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }

        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");
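            // E.g. "report.pdf" (with max_ext_len >= 3) is indexed here as
            // just "report", with the F prefix.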

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }

        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        struct tm *tm = localtime(&mtime);
        string date_term = "D" + date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
        newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
        date_term.resize(7);
        date_term[0] = 'M';
        newdocument.add_boolean_term(date_term); // Month (YYYYMM)
        date_term.resize(5);
        date_term[0] = 'Y';
        newdocument.add_boolean_term(date_term); // Year (YYYY)
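        // E.g. a file last modified on 25 Mar 2017 gets the boolean terms
        // "D20170325", "M201703" and "Y2017".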

        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));

        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char * group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }

        const char * owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
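        // E.g. extension "PDF" produces the boolean term "Epdf" - ASCII
        // letters are lowercased by setting bit 0x20.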

        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (ReadError) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (NoSuchFilter) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (FileNotFound) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string & error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    }
}

void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}

void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}