xapian-applications/omega/index_file.cc

   1 /** @file index_file.cc
   2  * @brief Handle indexing a document from a file
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001,2005 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
   8  * Copyright 2009 Frank J Bruzzaniti
   9  * Copyright 2012 Mihai Bivol
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
  29 #include "index_file.h"
  30
  31 #include <algorithm>
  32 #include <iostream>
  33 #include <limits>
  34 #include <string>
  35 #include <map>
  36 #include <vector>
  37
  38 #include <sys/types.h>
  39 #include "safeunistd.h"
  40 #include <cerrno>
  41 #include <cstdio>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include "safefcntl.h"
  45 #include <ctime>
  46
  47 #include <xapian.h>
  48
  49 #include "append_filename_arg.h"
  50 #include "atomparse.h"
  51 #include "diritor.h"
  52 #include "failed.h"
  53 #include "md5wrap.h"
  54 #include "metaxmlparse.h"
  55 #include "mimemap.h"
  56 #include "msxmlparse.h"
  57 #include "myhtmlparse.h"
  58 #include "opendocparse.h"
  59 #include "pkglibbindir.h"
  60 #include "runfilter.h"
  61 #include "sample.h"
  62 #include "str.h"
  63 #include "stringutils.h"
  64 #include "svgparse.h"
  65 #include "tmpdir.h"
  66 #include "utf8convert.h"
  67 #include "utils.h"
  68 #include "values.h"
  69 #include "xmlparse.h"
  70 #include "xlsxparse.h"
  71 #include "xpsxmlparse.h"
  72
  73 using namespace std;
  74
  75 static Xapian::WritableDatabase db;
  76 static Xapian::TermGenerator indexer;
  77
  78 static Xapian::doccount old_docs_not_seen;
  79 static Xapian::docid old_lastdocid;
  80 static vector<bool> updated;
  81
  82 static bool verbose;
  83 static bool retry_failed;
  84 static bool use_ctime;
  85 static dup_action_type dup_action;
  86 static bool ignore_exclusions;
  87 static bool description_as_sample;
  88 static bool date_terms;
  89
  90 static time_t last_altered_max;
  91 static size_t sample_size;
  92 static size_t title_size;
  93 static size_t max_ext_len;
  94
  95 static empty_body_type empty_body;
  96
  97 static string root;
  98 static string site_term, host_term;
  99
 100 static Failed failed;
 101
 102 map<string, Filter> commands;
 103
 104 static void
 105 mark_as_seen(Xapian::docid did)
 106 {
 107     if (usual(did < updated.size() && !updated[did])) {
 108         updated[did] = true;
 109         --old_docs_not_seen;
 110     }
 111 }
 112
 113 void
 114 skip(const string & urlterm, const string & context, const string & msg,
 115      off_t size, time_t last_mod, unsigned flags)
 116 {
 117     failed.add(urlterm, last_mod, size);
 118
 119     if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
 120         if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
 121         cout << context << ": ";
 122     }
 123
 124     cout << "Skipping - " << msg << endl;
 125 }
 126
 127 static void
 128 skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
 129                 off_t size, time_t last_mod)
 130 {
 131     skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
 132 }
 133
 134 static void
 135 skip_meta_tag(const string & urlterm, const string & context,
 136               off_t size, time_t last_mod)
 137 {
 138     skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
 139 }
 140
 141 static void
 142 skip_unknown_mimetype(const string & urlterm, const string & context,
 143                       const string & mimetype, off_t size, time_t last_mod)
 144 {
 145     skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
 146 }
 147
 148 void
 149 index_add_default_filters()
 150 {
 151     index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
 152     index_command("application/vnd.ms-excel",
 153                   Filter("xls2csv -c' ' -q0 -dutf-8", false));
 154     index_command("application/vnd.ms-powerpoint",
 155                   Filter("catppt -dutf-8", false));
 156     // Looking at the source of wpd2html and wpd2text I think both output
 157     // UTF-8, but it's hard to be sure without sample Unicode .wpd files
 158     // as they don't seem to be at all well documented.
 159     index_command("application/vnd.wordperfect", Filter("wpd2text", false));
 160     // wps2text produces UTF-8 output from the sample files I've tested.
 161     index_command("application/vnd.ms-works", Filter("wps2text", false));
 162     // Output is UTF-8 according to "man djvutxt".  Generally this seems to
 163     // be true, though some examples from djvu.org generate isolated byte
 164     // 0x95 in a context which suggests it might be intended to be a bullet
 165     // (as it is in CP1250).
 166     index_command("image/vnd.djvu", Filter("djvutxt", false));
 167     index_command("text/markdown", Filter("markdown", "text/html", false));
 168     // The --text option unhelpfully converts all non-ASCII characters to "?"
 169     // so we use --html instead, which produces HTML entities.  The --nopict
 170     // option suppresses exporting picture files as pictNNNN.wmf in the current
 171     // directory.  Note that this option was ignored in some older versions,
 172     // but it was fixed in unrtf 0.20.4.
 173     index_command("text/rtf",
 174                   Filter("unrtf --nopict --html 2>/dev/null", "text/html",
 175                          false));
 176     index_command("text/x-rst", Filter("rst2html", "text/html", false));
 177     index_command("application/x-mspublisher",
 178                   Filter("pub2xhtml", "text/html", false));
 179     index_command("application/vnd.ms-outlook",
 180                   Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
 181                          false));
 182     index_command("application/vnd.ms-visio.drawing",
 183                   Filter("vsd2xhtml", "image/svg+xml", false));
 184     index_command("application/vnd.ms-visio.stencil",
 185                   Filter("vsd2xhtml", "image/svg+xml", false));
 186     index_command("application/vnd.ms-visio.template",
 187                   Filter("vsd2xhtml", "image/svg+xml", false));
 188     index_command("application/vnd.visio",
 189                   Filter("vsd2xhtml", "image/svg+xml", false));
 190     // pod2text's output character set doesn't seem to be documented, but from
 191     // inspecting the source it looks like it's probably iso-8859-1.  We need
 192     // to pass "--errors=stderr" or else minor POD formatting errors cause a
 193     // file not to be indexed.
 194     index_command("text/x-perl",
 195                   Filter("pod2text --errors=stderr",
 196                          "text/plain", "iso-8859-1", false));
 197     // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
 198     // appearing as single ligatures.  For European languages, it's actually
 199     // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
 200     // now until we handle Unicode "compatibility decompositions".
 201     index_command("application/x-dvi",
 202                   Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
 203     // Simplistic - ought to look in index.rdf files for filename and character
 204     // set.
 205     index_command("application/x-maff",
 206                   Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
 207                          false));
 208     index_command("application/x-mimearchive",
 209                   Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
 210                          false));
 211     index_command("message/news",
 212                   Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
 213                          false));
 214     index_command("message/rfc822",
 215                   Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
 216                          false));
 217     index_command("text/vcard",
 218                   Filter(get_pkglibbindir() + "/vcard2text", false));
 219     index_command("application/vnd.apply.keynote",
 220                   Filter("key2text", false));
 221     index_command("application/vnd.apply.numbers",
 222                   Filter("numbers2text", false));
 223     index_command("application/vnd.apply.pages",
 224                   Filter("pages2text", false));
 225 }
 226
 227 void
 228 index_init(const string & dbpath, const Xapian::Stem & stemmer,
 229            const string & root_, const string & site_term_,
 230            const string & host_term_,
 231            empty_body_type empty_body_, dup_action_type dup_action_,
 232            size_t sample_size_, size_t title_size_, size_t max_ext_len_,
 233            bool overwrite, bool retry_failed_,
 234            bool delete_removed_documents, bool verbose_, bool use_ctime_,
 235            bool spelling, bool ignore_exclusions_, bool description_as_sample_,
 236            bool date_terms_)
 237 {
 238     root = root_;
 239     site_term = site_term_;
 240     host_term = host_term_;
 241     empty_body = empty_body_;
 242     dup_action = dup_action_;
 243     sample_size = sample_size_;
 244     title_size = title_size_;
 245     max_ext_len = max_ext_len_;
 246     verbose = verbose_;
 247     use_ctime = use_ctime_;
 248     ignore_exclusions = ignore_exclusions_;
 249     description_as_sample = description_as_sample_;
 250     date_terms = date_terms_;
 251
 252     if (!overwrite) {
 253         db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
 254         old_docs_not_seen = db.get_doccount();
 255         // Handle an initially empty database exactly the same way as when
 256         // overwrite is true.
 257         if (old_docs_not_seen != 0) {
 258             old_lastdocid = db.get_lastdocid();
 259             if (delete_removed_documents) {
 260                 // + 1 so that old_lastdocid is a valid subscript.
 261                 updated.resize(old_lastdocid + 1);
 262             }
 263             try {
 264                 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
 265                 string ubound = db.get_value_upper_bound(slot);
 266                 if (!ubound.empty())
 267                     last_altered_max = binary_string_to_int(ubound);
 268             } catch (const Xapian::UnimplementedError &) {
 269                 numeric_limits<time_t> n;
 270                 last_altered_max = n.max();
 271             }
 272         }
 273     } else {
 274         db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
 275     }
 276
 277     if (spelling) {
 278         indexer.set_database(db);
 279         indexer.set_flags(indexer.FLAG_SPELLING);
 280     }
 281     indexer.set_stemmer(stemmer);
 282
 283     runfilter_init();
 284
 285     failed.init(db);
 286
 287     if (overwrite) {
 288         // There are no failures to retry, so setting this flag doesn't
 289         // change the outcome, but does mean we avoid the overhead of
 290         // checking for a previous failure.
 291         retry_failed = true;
 292     } else if (retry_failed_) {
 293         failed.clear();
 294         retry_failed = true;
 295     } else {
 296         // If there are no existing failures, setting this flag doesn't
 297         // change the outcome, but does mean we avoid the overhead of
 298         // checking for a previous failure.
 299         retry_failed = failed.empty();
 300     }
 301 }
 302
 303 static void
 304 parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
 305 {
 306     if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
 307         p += len;
 308         while (p != end && *p == ' ')
 309             ++p;
 310         if (p != end && (end[-1] != '\r' || --end != p))
 311             out.assign(p, end - p);
 312     }
 313 }
 314
 315 #define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
 316     parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
 317
 318 static void
 319 get_pdf_metainfo(const string & file, string &author, string &title,
 320                  string &keywords, string &topic, int& pages)
 321 {
 322     try {
 323         string cmd = "pdfinfo -enc UTF-8";
 324         append_filename_argument(cmd, file);
 325         string pdfinfo = stdout_to_string(cmd, false);
 326
 327         const char * p = pdfinfo.data();
 328         const char * end = p + pdfinfo.size();
 329         while (p != end) {
 330             const char * start = p;
 331             p = static_cast<const char *>(memchr(p, '\n', end - p));
 332             const char * eol;
 333             if (p) {
 334                 eol = p;
 335                 ++p;
 336             } else {
 337                 p = eol = end;
 338             }
 339             switch (*start) {
 340                 case 'A':
 341                     PARSE_PDFINFO_FIELD(start, eol, author, "Author");
 342                     break;
 343                 case 'K':
 344                     PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
 345                     break;
 346                 case 'P': {
 347                     string s;
 348                     PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
 349                     if (!s.empty())
 350                         pages = atoi(s.c_str());
 351                     break;
 352                 }
 353                 case 'S':
 354                     PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
 355                     break;
 356                 case 'T':
 357                     PARSE_PDFINFO_FIELD(start, eol, title, "Title");
 358                     break;
 359             }
 360         }
 361     } catch (ReadError) {
 362         // It's probably best to index the document even if pdfinfo fails.
 363     }
 364 }
 365
 366 static void
 367 generate_sample_from_csv(const string & csv_data, string & sample)
 368 {
 369     // Add 3 to allow for a 4 byte utf-8 sequence being appended when
 370     // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
 371     // since the user might reasonably set sample_size really high.
 372     sample.reserve(min(sample_size + 3, csv_data.size()));
 373     size_t last_word_end = 0;
 374     bool in_space = true;
 375     bool in_quotes = false;
 376     for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
 377         unsigned ch = *i;
 378
 379         if (!in_quotes) {
 380             // If not already in double quotes, '"' starts quoting and
 381             // ',' starts a new field.
 382             if (ch == '"') {
 383                 in_quotes = true;
 384                 continue;
 385             }
 386             if (ch == ',')
 387                 ch = ' ';
 388         } else if (ch == '"') {
 389             // In double quotes, '"' either ends double quotes, or
 390             // if followed by another '"', means a literal '"'.
 391             if (++i == Xapian::Utf8Iterator())
 392                 break;
 393             ch = *i;
 394             if (ch != '"') {
 395                 in_quotes = false;
 396                 if (ch == ',')
 397                     ch = ' ';
 398             }
 399         }
 400
 401         if (ch <= ' ' || ch == 0xa0) {
 402             // FIXME: if all the whitespace characters between two
 403             // words are 0xa0 (non-breaking space) then perhaps we
 404             // should output 0xa0.
 405             if (in_space)
 406                 continue;
 407             last_word_end = sample.size();
 408             sample += ' ';
 409             in_space = true;
 410         } else {
 411             Xapian::Unicode::append_utf8(sample, ch);
 412             in_space = false;
 413         }
 414
 415         if (sample.size() >= sample_size) {
 416             // Need to truncate sample.
 417             if (last_word_end <= sample_size / 2) {
 418                 // Monster word!  We'll have to just split it.
 419                 sample.replace(sample_size - 3, string::npos, "...", 3);
 420             } else {
 421                 sample.replace(last_word_end, string::npos, " ...", 4);
 422             }
 423             break;
 424         }
 425     }
 426 }
 427
 428 static bool
 429 index_check_existing(const string & urlterm, time_t last_altered,
 430                      Xapian::docid & did)
 431 {
 432     switch (dup_action) {
 433         case DUP_SKIP: {
 434             Xapian::PostingIterator p = db.postlist_begin(urlterm);
 435             if (p != db.postlist_end(urlterm)) {
 436                 if (verbose)
 437                     cout << "already indexed, not updating" << endl;
 438                 did = *p;
 439                 mark_as_seen(did);
 440                 return true;
 441             }
 442             break;
 443         }
 444         case DUP_CHECK_LAZILY: {
 445             // If last_altered > last_altered_max, we know for sure that the
 446             // file is new or updated.
 447             if (last_altered > last_altered_max) {
 448                 return false;
 449             }
 450
 451             Xapian::PostingIterator p = db.postlist_begin(urlterm);
 452             if (p != db.postlist_end(urlterm)) {
 453                 did = *p;
 454                 Xapian::Document doc = db.get_document(did);
 455                 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
 456                 string value = doc.get_value(slot);
 457                 time_t old_last_altered = binary_string_to_int(value);
 458                 if (last_altered <= old_last_altered) {
 459                     if (verbose)
 460                         cout << "already indexed" << endl;
 461                     // The docid should be in updated - the only valid
 462                     // exception is if the URL was long and hashed to the
 463                     // same URL as an existing document indexed in the same
 464                     // batch.
 465                     mark_as_seen(did);
 466                     return true;
 467                 }
 468             }
 469             break;
 470         }
 471     }
 472     return false;
 473 }
 474
 475 void
 476 index_remove_failed_entry(const string& urlterm)
 477 {
 478     failed.del(urlterm);
 479 }
 480
 481 void
 482 index_add_document(const string & urlterm, time_t last_altered,
 483                    Xapian::docid did, const Xapian::Document & doc)
 484 {
 485     if (dup_action != DUP_SKIP) {
 486         // If this document has already been indexed, update the existing
 487         // entry.
 488         if (did) {
 489             // We already found out the document id above.
 490             db.replace_document(did, doc);
 491         } else if (last_altered <= last_altered_max) {
 492             // We checked for the UID term and didn't find it.
 493             did = db.add_document(doc);
 494         } else {
 495             did = db.replace_document(urlterm, doc);
 496         }
 497         mark_as_seen(did);
 498         if (verbose) {
 499             if (did <= old_lastdocid) {
 500                 cout << "updated" << endl;
 501             } else {
 502                 cout << "added" << endl;
 503             }
 504         }
 505     } else {
 506         // If this were a duplicate, we'd have skipped it above.
 507         db.add_document(doc);
 508         if (verbose)
 509             cout << "added" << endl;
 510     }
 511 }
 512
 513 void
 514 index_mimetype(const string & file, const string & urlterm, const string & url,
 515                const string & ext,
 516                const string &mimetype, DirectoryIterator &d,
 517                Xapian::Document & newdocument,
 518                string record)
 519 {
 520     string context(file, root.size(), string::npos);
 521
 522     // FIXME: We could be cleverer here and check mtime too when use_ctime is
 523     // set - if the ctime has changed but the mtime is unchanged, we can just
 524     // update the existing Document and avoid having to re-extract text, etc.
 525     time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();
 526
 527     Xapian::docid did = 0;
 528     if (index_check_existing(urlterm, last_altered, did))
 529         return;
 530
 531     if (!retry_failed) {
 532         // We only store and check the mtime (last modified) - a change to the
 533         // metadata won't generally cause a previous failure to now work
 534         // (FIXME: except permissions).
 535         time_t failed_last_mod;
 536         off_t failed_size;
 537         if (failed.contains(urlterm, failed_last_mod, failed_size)) {
 538             if (d.get_mtime() <= failed_last_mod &&
 539                 d.get_size() == failed_size) {
 540                 if (verbose)
 541                     cout << "failed to extract text on earlier run" << endl;
 542                 return;
 543             }
 544             // The file has changed, so remove the entry for it.  If it fails
 545             // again on this attempt, we'll add a new one.
 546             failed.del(urlterm);
 547         }
 548     }
 549
 550     if (verbose) cout << flush;
 551
 552     string author, title, sample, keywords, topic, dump;
 553     string md5;
 554     time_t created = time_t(-1);
 555     int pages = -1;
 556
 557     map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
 558     if (cmd_it == commands.end()) {
 559         size_t slash = mimetype.find('/');
 560         if (slash != string::npos) {
 561             string wildtype(mimetype, 0, slash + 2);
 562             wildtype[slash + 1] = '*';
 563             cmd_it = commands.find(wildtype);
 564             if (cmd_it == commands.end()) {
 565                 cmd_it = commands.find("*/*");
 566             }
 567         }
 568         if (cmd_it == commands.end()) {
 569             cmd_it = commands.find("*");
 570         }
 571     }
 572     try {
 573         if (cmd_it != commands.end()) {
 574             // Easy "run a command and read text or HTML from stdout or a
 575             // temporary file" cases.
 576             string cmd = cmd_it->second.cmd;
 577             if (cmd.empty()) {
 578                 skip(urlterm, context, "required filter not installed",
 579                      d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
 580                 return;
 581             }
 582             if (cmd == "false") {
 583                 // Allow setting 'false' as a filter to mean that a MIME type
 584                 // should be quietly ignored.
 585                 string m = "ignoring MIME type '";
 586                 m += cmd_it->first;
 587                 m += "'";
 588                 skip(urlterm, context, m, d.get_size(), d.get_mtime(),
 589                      SKIP_VERBOSE_ONLY);
 590                 return;
 591             }
 592             bool use_shell = cmd_it->second.use_shell();
 593             bool substituted = false;
 594             string tmpout;
 595             size_t pcent = 0;
 596             while (true) {
 597                 pcent = cmd.find('%', pcent);
 598                 if (pcent >= cmd.size() - 1)
 599                     break;
 600                 switch (cmd[pcent + 1]) {
 601                     case '%': // %% -> %.
 602                         cmd.erase(++pcent, 1);
 603                         break;
 604                     case 'f': { // %f -> escaped filename.
 605                         substituted = true;
 606                         string tail(cmd, pcent + 2);
 607                         cmd.resize(pcent);
 608                         append_filename_argument(cmd, file);
 609                         // Remove the space append_filename_argument() adds before
 610                         // the argument - the command string either includes one,
 611                         // or won't expect one (e.g. --input=%f).
 612                         cmd.erase(pcent, 1);
 613                         pcent = cmd.size();
 614                         cmd += tail;
 615                         break;
 616                     }
 617                     case 't': { // %t -> temporary output file.
 618                         if (tmpout.empty()) {
 619                             // Use a temporary file with a suitable extension
 620                             // in case the command cares, and for more helpful
 621                             // error messages from the command.
 622                             if (cmd_it->second.output_type == "text/html") {
 623                                 tmpout = get_tmpfile("tmp.html");
 624                             } else if (cmd_it->second.output_type == "image/svg+xml") {
 625                                 tmpout = get_tmpfile("tmp.svg");
 626                             } else {
 627                                 tmpout = get_tmpfile("tmp.txt");
 628                             }
 629                         }
 630                         substituted = true;
 631                         string tail(cmd, pcent + 2);
 632                         cmd.resize(pcent);
 633                         append_filename_argument(cmd, tmpout);
 634                         // Remove the space append_filename_argument() adds before
 635                         // the argument - the command string either includes one,
 636                         // or won't expect one (e.g. --input=%f).
 637                         cmd.erase(pcent, 1);
 638                         pcent = cmd.size();
 639                         cmd += tail;
 640                         break;
 641                     }
 642                     default:
 643                         // Leave anything else alone for now.
 644                         pcent += 2;
 645                         break;
 646                 }
 647             }
 648             if (!substituted && cmd != "true") {
 649                 // If no %f, append the filename to the command.
 650                 append_filename_argument(cmd, file);
 651             }
 652             try {
 653                 if (!tmpout.empty()) {
 654                     // Output in temporary file.
 655                     (void)stdout_to_string(cmd, use_shell);
 656                     if (!load_file(tmpout, dump)) {
 657                         throw ReadError("Couldn't read output file");
 658                     }
 659                     unlink(tmpout.c_str());
 660                 } else if (cmd == "true") {
 661                     // Ignore the file's contents, just index metadata from the
 662                     // filing system.
 663                 } else {
 664                     // Output on stdout.
 665                     dump = stdout_to_string(cmd, use_shell);
 666                 }
 667                 const string & charset = cmd_it->second.output_charset;
 668                 if (cmd_it->second.output_type == "text/html") {
 669                     MyHtmlParser p;
 670                     p.ignore_metarobots();
 671                     p.description_as_sample = description_as_sample;
 672                     try {
 673                         p.parse_html(dump, charset, false);
 674                     } catch (const string & newcharset) {
 675                         p.reset();
 676                         p.ignore_metarobots();
 677                         p.description_as_sample = description_as_sample;
 678                         p.parse_html(dump, newcharset, true);
 679                     } catch (ReadError) {
 680                         skip_cmd_failed(urlterm, context, cmd,
 681                                         d.get_size(), d.get_mtime());
 682                         return;
 683                     }
 684                     dump = p.dump;
 685                     title = p.title;
 686                     keywords = p.keywords;
 687                     topic = p.topic;
 688                     sample = p.sample;
 689                     author = p.author;
 690                     created = p.created;
 691                 } else if (cmd_it->second.output_type == "image/svg+xml") {
 692                     SvgParser svgparser;
 693                     svgparser.parse(dump);
 694                     dump = svgparser.dump;
 695                     title = svgparser.title;
 696                     keywords = svgparser.keywords;
 697                     // FIXME: topic = svgparser.topic;
 698                     author = svgparser.author;
 699                 } else if (!charset.empty()) {
 700                     convert_to_utf8(dump, charset);
 701                 }
 702             } catch (ReadError) {
 703                 skip_cmd_failed(urlterm, context, cmd,
 704                                 d.get_size(), d.get_mtime());
 705                 return;
 706             }
 707         } else if (mimetype == "text/html" || mimetype == "text/x-php") {
 708             const string & text = d.file_to_string();
 709             MyHtmlParser p;
 710             if (ignore_exclusions) p.ignore_metarobots();
 711             p.description_as_sample = description_as_sample;
 712             try {
 713                 // Default HTML character set is latin 1, though not specifying
 714                 // one is deprecated these days.
 715                 p.parse_html(text, "iso-8859-1", false);
 716             } catch (const string & newcharset) {
 717                 p.reset();
 718                 if (ignore_exclusions) p.ignore_metarobots();
 719                 p.description_as_sample = description_as_sample;
 720                 p.parse_html(text, newcharset, true);
 721             }
 722             if (!p.indexing_allowed) {
 723                 skip_meta_tag(urlterm, context,
 724                               d.get_size(), d.get_mtime());
 725                 return;
 726             }
 727             dump = p.dump;
 728             title = p.title;
 729             keywords = p.keywords;
 730             topic = p.topic;
 731             sample = p.sample;
 732             author = p.author;
 733             created = p.created;
 734             md5_string(text, md5);
 735         } else if (mimetype == "text/plain") {
 736             // Currently we assume that text files are UTF-8 unless they have a
 737             // byte-order mark.
 738             dump = d.file_to_string();
 739             md5_string(dump, md5);
 740
 741             // Look for Byte-Order Mark (BOM).
 742             if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
 743                 // UTF-16 in big-endian/little-endian order - we just convert
 744                 // it as "UTF-16" and let the conversion handle the BOM as that
 745                 // way we avoid the copying overhead of erasing 2 bytes from
 746                 // the start of dump.
 747                 convert_to_utf8(dump, "UTF-16");
 748             } else if (startswith(dump, "\xef\xbb\xbf")) {
 749                 // UTF-8 with stupid Windows not-the-byte-order mark.
 750                 dump.erase(0, 3);
 751             } else {
 752                 // FIXME: What charset is the file?  Look at contents?
 753             }
 754         } else if (mimetype == "application/pdf") {
 755             string cmd = "pdftotext -enc UTF-8";
 756             append_filename_argument(cmd, file);
 757             cmd += " -";
 758             try {
 759                 dump = stdout_to_string(cmd, false);
 760             } catch (ReadError) {
 761                 skip_cmd_failed(urlterm, context, cmd,
 762                                 d.get_size(), d.get_mtime());
 763                 return;
 764             }
 765             get_pdf_metainfo(file, author, title, keywords, topic, pages);
 766         } else if (mimetype == "application/postscript") {
 767             // There simply doesn't seem to be a Unicode capable PostScript to
 768             // text converter (e.g. pstotext always outputs ISO-8859-1).  The
 769             // only solution seems to be to convert via PDF using ps2pdf and
 770             // then pdftotext.  This gives plausible looking UTF-8 output for
 771             // some Chinese PostScript files I found using Google.  It also has
 772             // the benefit of allowing us to extract meta information from
 773             // PostScript files.
 774             string tmpfile = get_tmpfile("tmp.pdf");
 775             if (tmpfile.empty()) {
 776                 // FIXME: should this be fatal?  Or disable indexing postscript?
 777                 string msg = "Couldn't create temporary directory (";
 778                 msg += strerror(errno);
 779                 msg += ")";
 780                 skip(urlterm, context, msg,
 781                      d.get_size(), d.get_mtime());
 782                 return;
 783             }
 784             string cmd = "ps2pdf";
 785             append_filename_argument(cmd, file);
 786             append_filename_argument(cmd, tmpfile);
 787             try {
 788                 (void)stdout_to_string(cmd, false);
 789                 cmd = "pdftotext -enc UTF-8";
 790                 append_filename_argument(cmd, tmpfile);
 791                 cmd += " -";
 792                 dump = stdout_to_string(cmd, false);
 793             } catch (ReadError) {
 794                 skip_cmd_failed(urlterm, context, cmd,
 795                                 d.get_size(), d.get_mtime());
 796                 unlink(tmpfile.c_str());
 797                 return;
 798             } catch (...) {
 799                 unlink(tmpfile.c_str());
 800                 throw;
 801             }
 802             try {
 803                 get_pdf_metainfo(tmpfile, author, title, keywords, topic, pages);
 804             } catch (...) {
 805                 unlink(tmpfile.c_str());
 806                 throw;
 807             }
 808             unlink(tmpfile.c_str());
 809         } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
 810                    startswith(mimetype, "application/vnd.oasis.opendocument."))
 811         {
 812             // Inspired by http://mjr.towers.org.uk/comp/sxw2text
 813             string cmd = "unzip -p";
 814             append_filename_argument(cmd, file);
 815             cmd += " content.xml ; unzip -p";
 816             append_filename_argument(cmd, file);
 817             cmd += " styles.xml";
 818             try {
 819                 OpenDocParser parser;
 820                 parser.parse(stdout_to_string(cmd, true));
 821                 dump = parser.dump;
 822             } catch (ReadError) {
 823                 skip_cmd_failed(urlterm, context, cmd,
 824                                 d.get_size(), d.get_mtime());
 825                 return;
 826             }
 827
 828             cmd = "unzip -p";
 829             append_filename_argument(cmd, file);
 830             cmd += " meta.xml";
 831             try {
 832                 MetaXmlParser metaxmlparser;
 833                 metaxmlparser.parse(stdout_to_string(cmd, false));
 834                 title = metaxmlparser.title;
 835                 keywords = metaxmlparser.keywords;
 836                 // FIXME: topic = metaxmlparser.topic;
 837                 sample = metaxmlparser.sample;
 838                 author = metaxmlparser.author;
 839             } catch (ReadError) {
 840                 // It's probably best to index the document even if this fails.
 841             }
 842         } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
 843             const char * args = NULL;
 844             string tail(mimetype, 46);
 845             if (startswith(tail, "wordprocessingml.")) {
 846                 // unzip returns exit code 11 if a file to extract wasn't found
 847                 // which we want to ignore, because there may be no headers or
 848                 // no footers.
 849                 args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
 850             } else if (startswith(tail, "spreadsheetml.")) {
 851                 // Extract the shared string table first, so our parser can
 852                 // grab those ready for parsing the sheets which will reference
 853                 // the shared strings.
 854                 string cmd = "unzip -p";
 855                 append_filename_argument(cmd, file);
 856                 cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
 857                 append_filename_argument(cmd, file);
 858                 cmd += " xl/worksheets/sheet\\*.xml";
 859                 try {
 860                     XlsxParser parser;
 861                     parser.parse(stdout_to_string(cmd, true));
 862                     dump = parser.dump;
 863                 } catch (ReadError) {
 864                     skip_cmd_failed(urlterm, context, cmd,
 865                                     d.get_size(), d.get_mtime());
 866                     return;
 867                 }
 868             } else if (startswith(tail, "presentationml.")) {
 869                 // unzip returns exit code 11 if a file to extract wasn't found
 870                 // which we want to ignore, because there may be no notesSlides
 871                 // or comments.
 872                 args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
 873             } else {
 874                 // Don't know how to index this type.
 875                 skip_unknown_mimetype(urlterm, context, mimetype,
 876                                       d.get_size(), d.get_mtime());
 877                 return;
 878             }
 879
 880             if (args) {
 881                 string cmd = "unzip -p";
 882                 append_filename_argument(cmd, file);
 883                 cmd += args;
 884                 try {
 885                     MSXmlParser xmlparser;
 886                     // Treat exit status 11 from unzip as success - this is
 887                     // what we get if one of the listed filenames to extract
 888                     // doesn't match anything in the zip file.
 889                     xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
 890                     dump = xmlparser.dump;
 891                 } catch (ReadError) {
 892                     skip_cmd_failed(urlterm, context, cmd,
 893                                     d.get_size(), d.get_mtime());
 894                     return;
 895                 }
 896             }
 897
 898             string cmd = "unzip -p";
 899             append_filename_argument(cmd, file);
 900             cmd += " docProps/core.xml";
 901             try {
 902                 MetaXmlParser metaxmlparser;
 903                 metaxmlparser.parse(stdout_to_string(cmd, false));
 904                 title = metaxmlparser.title;
 905                 keywords = metaxmlparser.keywords;
 906                 // FIXME: topic = metaxmlparser.topic;
 907                 sample = metaxmlparser.sample;
 908                 author = metaxmlparser.author;
 909             } catch (ReadError) {
 910                 // It's probably best to index the document even if this fails.
 911             }
 912         } else if (mimetype == "application/x-abiword") {
 913             // FIXME: Implement support for metadata.
 914             XmlParser xmlparser;
 915             const string & text = d.file_to_string();
 916             xmlparser.parse_xml(text);
 917             dump = xmlparser.dump;
 918             md5_string(text, md5);
 919         } else if (mimetype == "application/x-abiword-compressed") {
 920             // FIXME: Implement support for metadata.
 921             XmlParser xmlparser;
 922             xmlparser.parse_xml(d.gzfile_to_string());
 923             dump = xmlparser.dump;
 924         } else if (mimetype == "application/vnd.ms-xpsdocument") {
 925             string cmd = "unzip -p";
 926             append_filename_argument(cmd, file);
 927             cmd += " 'Documents/1/Pages/*.fpage'";
 928             try {
 929                 XpsXmlParser xpsparser;
 930                 dump = stdout_to_string(cmd, false);
 931                 // Look for Byte-Order Mark (BOM).
 932                 if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
 933                     // UTF-16 in big-endian/little-endian order - we just
 934                     // convert it as "UTF-16" and let the conversion handle the
 935                     // BOM as that way we avoid the copying overhead of erasing
 936                     // 2 bytes from the start of dump.
 937                     convert_to_utf8(dump, "UTF-16");
 938                 }
 939                 xpsparser.parse(dump);
 940                 dump = xpsparser.dump;
 941             } catch (ReadError) {
 942                 skip_cmd_failed(urlterm, context, cmd,
 943                                 d.get_size(), d.get_mtime());
 944                 return;
 945             }
 946         } else if (mimetype == "text/csv") {
 947             // Currently we assume that text files are UTF-8 unless they have a
 948             // byte-order mark.
 949             dump = d.file_to_string();
 950             md5_string(dump, md5);
 951
 952             // Look for Byte-Order Mark (BOM).
 953             if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
 954                 // UTF-16 in big-endian/little-endian order - we just convert
 955                 // it as "UTF-16" and let the conversion handle the BOM as that
 956                 // way we avoid the copying overhead of erasing 2 bytes from
 957                 // the start of dump.
 958                 convert_to_utf8(dump, "UTF-16");
 959             } else if (startswith(dump, "\xef\xbb\xbf")) {
 960                 // UTF-8 with stupid Windows not-the-byte-order mark.
 961                 dump.erase(0, 3);
 962             } else {
 963                 // FIXME: What charset is the file?  Look at contents?
 964             }
 965
 966             generate_sample_from_csv(dump, sample);
 967         } else if (mimetype == "image/svg+xml") {
 968             SvgParser svgparser;
 969             const string & text = d.file_to_string();
 970             md5_string(text, md5);
 971             svgparser.parse(text);
 972             dump = svgparser.dump;
 973             title = svgparser.title;
 974             keywords = svgparser.keywords;
 975             // FIXME: topic = svgparser.topic;
 976             author = svgparser.author;
 977         } else if (mimetype == "application/vnd.debian.binary-package" ||
 978                    mimetype == "application/x-debian-package") {
 979             string cmd("dpkg-deb -f");
 980             append_filename_argument(cmd, file);
 981             cmd += " Description";
 982             const string & desc = stdout_to_string(cmd, false);
 983             // First line is short description, which we use as the title.
 984             string::size_type idx = desc.find('\n');
 985             title.assign(desc, 0, idx);
 986             if (idx != string::npos) {
 987                 dump.assign(desc, idx + 1, string::npos);
 988             }
 989         } else if (mimetype == "application/x-redhat-package-manager" ||
 990                    mimetype == "application/x-rpm") {
 991             string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
 992             append_filename_argument(cmd, file);
 993             const string & desc = stdout_to_string(cmd, false);
 994             // First line is summary, which we use as the title.
 995             string::size_type idx = desc.find('\n');
 996             title.assign(desc, 0, idx);
 997             if (idx != string::npos) {
 998                 dump.assign(desc, idx + 1, string::npos);
 999             }
1000         } else if (mimetype == "application/atom+xml") {
1001             AtomParser atomparser;
1002             const string & text = d.file_to_string();
1003             md5_string(text, md5);
1004             atomparser.parse(text);
1005             dump = atomparser.dump;
1006             title = atomparser.title;
1007             keywords = atomparser.keywords;
1008             // FIXME: topic = atomparser.topic;
1009             author = atomparser.author;
1010         } else {
1011             // Don't know how to index this type.
1012             skip_unknown_mimetype(urlterm, context, mimetype,
1013                                   d.get_size(), d.get_mtime());
1014             return;
1015         }
1016
1017         // Compute the MD5 of the file if we haven't already.
1018         if (md5.empty() && md5_file(file, md5, d.try_noatime()) == 0) {
1019             if (errno == ENOENT || errno == ENOTDIR) {
1020                 skip(urlterm, context, "File removed during indexing",
1021                      d.get_size(), d.get_mtime(),
1022                      SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1023             } else {
1024                 skip(urlterm, context, "failed to read file to calculate MD5 checksum",
1025                      d.get_size(), d.get_mtime());
1026             }
1027             return;
1028         }
1029
1030         // Remove any trailing formfeeds, so we don't consider them when
1031         // considering if we extracted any text (e.g. pdftotext outputs a
1032         // formfeed between each page, even for blank pages).
1033         //
1034         // If dump contains only formfeeds, then trim_end will be string::npos
1035         // and ++trim_end will be 0, which is the correct new size.
1036         string::size_type trim_end = dump.find_last_not_of('\f');
1037         if (++trim_end != dump.size())
1038             dump.resize(trim_end);
1039
1040         if (dump.empty()) {
1041             switch (empty_body) {
1042                 case EMPTY_BODY_INDEX:
1043                     break;
1044                 case EMPTY_BODY_WARN:
1045                     cout << "no text extracted from document body, "
1046                             "but indexing metadata anyway" << endl;
1047                     break;
1048                 case EMPTY_BODY_SKIP:
1049                     skip(urlterm, context, "no text extracted from document body",
1050                          d.get_size(), d.get_mtime());
1051                     return;
1052             }
1053         }
1054
1055         // Produce a sample
1056         if (sample.empty()) {
1057             sample = generate_sample(dump, sample_size, "...", " ...");
1058         } else {
1059             sample = generate_sample(sample, sample_size, "...", " ...");
1060         }
1061
1062         // Put the data in the document
1063         if (record.empty()) {
1064             record = "url=";
1065         } else {
1066             record += "\nurl=";
1067         }
1068         record += url;
1069         record += "\nsample=";
1070         record += sample;
1071         if (!title.empty()) {
1072             record += "\ncaption=";
1073             record += generate_sample(title, title_size, "...", " ...");
1074         }
1075         if (!author.empty()) {
1076             record += "\nauthor=";
1077             record += author;
1078         }
1079         record += "\ntype=";
1080         record += mimetype;
1081         time_t mtime = d.get_mtime();
1082         if (mtime != static_cast<time_t>(-1)) {
1083             record += "\nmodtime=";
1084             record += str(mtime);
1085         }
1086         if (created != static_cast<time_t>(-1)) {
1087             record += "\ncreated=";
1088             record += str(created);
1089         }
1090         if (pages >= 0) {
1091             record += "\npages=";
1092             record += str(pages);
1093         }
1094         off_t size = d.get_size();
1095         record += "\nsize=";
1096         record += str(size);
1097         newdocument.set_data(record);
1098
1099         // Index the title, document text, keywords and topic.
1100         indexer.set_document(newdocument);
1101         if (!title.empty()) {
1102             indexer.index_text(title, 5, "S");
1103             indexer.increase_termpos(100);
1104         }
1105         if (!dump.empty()) {
1106             indexer.index_text(dump);
1107         }
1108         if (!keywords.empty()) {
1109             indexer.increase_termpos(100);
1110             indexer.index_text(keywords);
1111         }
1112         if (!topic.empty()) {
1113             indexer.increase_termpos(100);
1114             indexer.index_text(topic, 1, "B");
1115         }
1116         // Index the leafname of the file.
1117         {
1118             indexer.increase_termpos(100);
1119             string leaf = d.leafname();
1120             string::size_type dot = leaf.find_last_of('.');
1121             if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
1122                 leaf.resize(dot);
1123             indexer.index_text(leaf, 1, "F");
1124
1125             // Also index with underscores and ampersands replaced by spaces.
1126             bool modified = false;
1127             string::size_type rep = 0;
1128             while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
1129                 leaf[rep++] = ' ';
1130                 modified = true;
1131             }
1132             if (modified) {
1133                 indexer.increase_termpos(100);
1134                 indexer.index_text(leaf, 1, "F");
1135             }
1136         }
1137
1138         if (!author.empty()) {
1139             indexer.increase_termpos(100);
1140             indexer.index_text(author, 1, "A");
1141         }
1142
1143         // mimeType:
1144         newdocument.add_boolean_term("T" + mimetype);
1145
1146         newdocument.add_boolean_term(site_term);
1147
1148         if (!host_term.empty())
1149             newdocument.add_boolean_term(host_term);
1150
1151         if (date_terms) {
1152             struct tm *tm = localtime(&mtime);
1153             string date_term = "D";
1154             date_term += date_to_string(tm->tm_year + 1900,
1155                                         tm->tm_mon + 1,
1156                                         tm->tm_mday);
1157             newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
1158             date_term.resize(7);
1159             date_term[0] = 'M';
1160             newdocument.add_boolean_term(date_term); // Month (YYYYMM)
1161             date_term.resize(5);
1162             date_term[0] = 'Y';
1163             newdocument.add_boolean_term(date_term); // Year (YYYY)
1164         }
1165
1166         newdocument.add_boolean_term(urlterm); // Url
1167
1168         // Add mtime as a value to allow "sort by date".
1169         newdocument.add_value(VALUE_LASTMOD,
1170                               int_to_binary_string(uint32_t(mtime)));
1171         if (use_ctime) {
1172             // Add ctime as a value to track modifications.
1173             time_t ctime = d.get_ctime();
1174             newdocument.add_value(VALUE_CTIME,
1175                                   int_to_binary_string(uint32_t(ctime)));
1176         }
1177
1178         // Add MD5 as a value to allow duplicate documents to be collapsed
1179         // together.
1180         newdocument.add_value(VALUE_MD5, md5);
1181
1182         // Add the file size as a value to allow "sort by size" and size ranges.
1183         newdocument.add_value(VALUE_SIZE,
1184                               Xapian::sortable_serialise(size));
1185
1186         bool inc_tag_added = false;
1187         if (d.is_other_readable()) {
1188             inc_tag_added = true;
1189             newdocument.add_boolean_term("I*");
1190         } else if (d.is_group_readable()) {
1191             const char * group = d.get_group();
1192             if (group) {
1193                 newdocument.add_boolean_term(string("I#") + group);
1194             }
1195         }
1196         const char * owner = d.get_owner();
1197         if (owner) {
1198             newdocument.add_boolean_term(string("O") + owner);
1199             if (!inc_tag_added && d.is_owner_readable())
1200                 newdocument.add_boolean_term(string("I@") + owner);
1201         }
1202
1203         string ext_term("E");
1204         for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
1205             char ch = *i;
1206             if (ch >= 'A' && ch <= 'Z')
1207                 ch |= 32;
1208             ext_term += ch;
1209         }
1210         newdocument.add_boolean_term(ext_term);
1211
1212         index_add_document(urlterm, last_altered, did, newdocument);
1213     } catch (ReadError) {
1214         skip(urlterm, context, string("can't read file: ") + strerror(errno),
1215              d.get_size(), d.get_mtime());
1216     } catch (NoSuchFilter) {
1217         string filter_entry;
1218         if (cmd_it != commands.end()) {
1219             filter_entry = cmd_it->first;
1220         } else {
1221             filter_entry = mimetype;
1222         }
1223         string m = "Filter for \"";
1224         m += filter_entry;
1225         m += "\" not installed";
1226         skip(urlterm, context, m, d.get_size(), d.get_mtime());
1227         commands[filter_entry] = Filter();
1228     } catch (FileNotFound) {
1229         skip(urlterm, context, "File removed during indexing",
1230              d.get_size(), d.get_mtime(),
1231              SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1232     } catch (const std::string & error) {
1233         skip(urlterm, context, error, d.get_size(), d.get_mtime());
1234     } catch (const std::bad_alloc&) {
1235         // Attempt to flag the file as failed and commit changes, though that
1236         // might fail too if we're low on memory rather than being asked to
1237         // allocate a ludicrous amount.
1238         skip(urlterm, context, "Out of memory trying to extract text from file",
1239              d.get_size(), d.get_mtime(),
1240              SKIP_SHOW_FILENAME);
1241         throw CommitAndExit("Caught std::bad_alloc", "");
1242     }
1243 }
1244
1245 void
1246 index_handle_deletion()
1247 {
         // Delete each document which still exists in the database but whose
         // entry in `updated` is false, stopping as soon as old_docs_not_seen
         // deletions have been made.  `updated` is indexed by docid here; it is
         // presumably filled in elsewhere as documents get (re)indexed - confirm
         // against the code which sets it.
         //
         // Nothing to do if `updated` is empty or there are no old documents
         // left unaccounted for.
1248     if (updated.empty() || old_docs_not_seen == 0) return;
1249
1250     if (verbose) {
1251         cout << "Deleting " << old_docs_not_seen << " old documents which weren't found" << endl;
1252     }
         // Per the Xapian API documentation, the posting list for the empty term
         // lists all the documents in the database.
1253     Xapian::PostingIterator alldocs = db.postlist_begin(string());
         // Begin at the first docid in that list.
1254     Xapian::docid did = *alldocs;
1255     while (did < updated.size()) {
1256         if (!updated[did]) {
                 // Position the iterator on did, or on the first existing docid
                 // after it if did itself isn't in the list.  Reaching the end
                 // means there are no more documents to consider.
1257             alldocs.skip_to(did);
1258             if (alldocs == db.postlist_end(string()))
1259                 break;
1260             if (*alldocs != did) {
1261                 // Document #did didn't exist before we started.
                     // Jump ahead to the next docid which does exist; the
                     // `continue` skips the ++did below so that the new did
                     // gets checked against `updated` too.
1262                 did = *alldocs;
1263                 continue;
1264             }
1265             db.delete_document(did);
                 // Stop early once every outstanding old document is deleted.
1266             if (--old_docs_not_seen == 0)
1267                 break;
1268         }
1269         ++did;
1270     }
1271 }
1272
1273 void
1274 index_commit()
1275 {
         // Commit pending changes to the database by forwarding to commit() on
         // the file-level `db` object (declared outside this part of the file).
1276     db.commit();
1277 }
1278
1279 void
1280 index_done()
1281 {
         // Clean up once indexing has finished.  The only step here is the call
         // to remove_tmpdir(), which is defined elsewhere.
1282     // If we created a temporary directory then delete it.
         // (Presumably this is the directory get_tmpfile() creates, e.g. for the
         // PostScript to PDF conversion - confirm against remove_tmpdir().)
1283     remove_tmpdir();
1284 }