Refactor handling of input files
xapian.git: xapian-applications/omega/index_file.cc
/** @file index_file.cc
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include <ctime>

#include <xapian.h>

#include "append_filename_arg.h"
#include "atomparse.h"
#include "diritor.h"
#include "failed.h"
#include "md5wrap.h"
#include "metaxmlparse.h"
#include "mimemap.h"
#include "msxmlparse.h"
#include "myhtmlparse.h"
#include "opendocparse.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparse.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "utils.h"
#include "values.h"
#include "xmlparse.h"
#include "xlsxparse.h"
#include "xpsxmlparse.h"

using namespace std;
static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;
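// The commands map above is the external filter table: it maps a MIME
// content-type to the Filter used to extract text from files of that type.
// It is populated via index_command() (see index_add_default_filters()
// below), and user-specified filter settings can add to or override the
// defaults.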
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}
void
skip(const string & urlterm, const string & context, const string & msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}

static void
skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string & urlterm, const string & context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string & urlterm, const string & context,
                      const string & mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
}
void
index_add_default_filters()
{
    index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", false));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", false));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect", Filter("wpd2text", false));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works", Filter("wps2text", false));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1250).
    index_command("image/vnd.djvu", Filter("djvutxt", false));
    index_command("text/markdown", Filter("markdown", "text/html", false));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the current
    // directory.  Note that this option was ignored in some older versions,
    // but it was fixed in unrtf 0.20.4.
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/x-rst", Filter("rst2html", "text/html", false));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", false));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
                         false));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    // pod2text's output character set doesn't seem to be documented, but from
    // inspecting the source it looks like it's probably iso-8859-1.  We need
    // to pass "--errors=stderr" or else minor POD formatting errors cause a
    // file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", false));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
    // Simplistic - ought to look in index.rdf files for filename and character
    // set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         false));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         false));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", false));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", false));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", false));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", false));
}
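// Note that a missing filter program isn't detected here: if running one
// fails with NoSuchFilter, index_mimetype() below reports 'Filter for "..."
// not installed' and replaces the entry with an empty Filter, so later files
// of the same type are skipped quietly.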
void
index_init(const string & dbpath, const Xapian::Stem & stemmer,
           const string & root_, const string & site_term_,
           const string & host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            try {
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string ubound = db.get_value_upper_bound(slot);
                if (!ubound.empty())
                    last_altered_max = binary_string_to_int(ubound);
            } catch (const Xapian::UnimplementedError &) {
                numeric_limits<time_t> n;
                last_altered_max = n.max();
            }
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}
static void
parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
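// For example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") passes the
// concatenated literal "Author:" and its length, so a pdfinfo output line
// such as "Author:         Jane Doe" (an illustrative value) assigns
// "Jane Doe" to author, dropping any trailing '\r' from CRLF line endings.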
static void
get_pdf_metainfo(const string & file, string &author, string &title,
                 string &keywords, string &topic, int& pages)
{
    try {
        string cmd = "pdfinfo -enc UTF-8";
        append_filename_argument(cmd, file);
        string pdfinfo = stdout_to_string(cmd, false);

        const char * p = pdfinfo.data();
        const char * end = p + pdfinfo.size();
        while (p != end) {
            const char * start = p;
            p = static_cast<const char *>(memchr(p, '\n', end - p));
            const char * eol;
            if (p) {
                eol = p;
                ++p;
            } else {
                p = eol = end;
            }
            switch (*start) {
                case 'A':
                    PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                    break;
                case 'K':
                    PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                    break;
                case 'P': {
                    string s;
                    PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                    if (!s.empty())
                        pages = atoi(s.c_str());
                    break;
                }
                case 'S':
                    PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                    break;
                case 'T':
                    PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                    break;
            }
        }
    } catch (ReadError) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}
static void
generate_sample_from_csv(const string & csv_data, string & sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
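// For illustration, the CSV row:
//     "Smith, John",42,"He said ""hi"""
// produces the sample text:
//     Smith, John 42 He said "hi"
// - commas inside quotes are kept, unquoted commas become spaces, and a
// doubled quote inside quotes becomes a literal '"'.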
static bool
index_check_existing(const string & urlterm, time_t last_altered,
                     Xapian::docid & did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}
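// Any other dup_action value falls straight through the switch above, so we
// return false and the caller reindexes the document unconditionally.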
void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}

void
index_add_document(const string & urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document & doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}
void
index_mimetype(const string & file, const string & urlterm, const string & url,
               const string & ext,
               const string &mimetype, DirectoryIterator &d,
               Xapian::Document & newdocument,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    if (verbose) cout << flush;
    string author, title, sample, keywords, topic, dump;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;

    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
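    // So for e.g. "image/png" the lookup order is "image/png", then
    // "image/*", then "*/*", then "*".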
    try {
        if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            string cmd = cmd_it->second.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = cmd_it->second.use_shell();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
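            // Perform %-substitutions on the filter command: "%%" becomes a
            // literal '%', "%f" the escaped input filename, and "%t" a
            // temporary output file.  E.g. the x-maff filter above,
            // "unzip -p %f '*/*.*htm*'", has %f replaced by the quoted path
            // of the file being indexed.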
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, file);
                        // Remove the space append_filename_argument() adds before
                        // the argument - the command string either includes one,
                        // or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (cmd_it->second.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (cmd_it->second.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, tmpout);
                        // Remove the space append_filename_argument() adds before
                        // the argument - the command string either includes one,
                        // or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
            if (!substituted && cmd != "true") {
                // If no %f, append the filename to the command.
                append_filename_argument(cmd, file);
            }
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    (void)stdout_to_string(cmd, use_shell);
                    if (!load_file(tmpout, dump, NOCACHE)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from the
                    // filing system.
                } else {
                    // Output on stdout.
                    dump = stdout_to_string(cmd, use_shell);
                }
                const string & charset = cmd_it->second.output_charset;
                if (cmd_it->second.output_type == "text/html") {
                    MyHtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse_html(dump, charset, false);
                    } catch (const string & newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse_html(dump, newcharset, true);
                    } catch (ReadError) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (cmd_it->second.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string & text = d.file_to_string();
            MyHtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not specifying
                // one is deprecated these days.
                p.parse_html(text, "iso-8859-1", false);
            } catch (const string & newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse_html(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            string cmd = "pdftotext -enc UTF-8";
            append_filename_argument(cmd, file);
            cmd += " -";
            try {
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(file, author, title, keywords, topic, pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript to
            // text converter (e.g. pstotext always outputs ISO-8859-1).  The
            // only solution seems to be to convert via PDF using ps2pdf and
            // then pdftotext.  This gives plausible looking UTF-8 output for
            // some Chinese PostScript files I found using Google.  It also has
            // the benefit of allowing us to extract meta information from
            // PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            string cmd = "ps2pdf";
            append_filename_argument(cmd, file);
            append_filename_argument(cmd, tmpfile);
            try {
                (void)stdout_to_string(cmd, false);
                cmd = "pdftotext -enc UTF-8";
                append_filename_argument(cmd, tmpfile);
                cmd += " -";
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic, pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
            const char * args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no headers or
                // no footers.
                args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will reference
                // the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no notesSlides
                // or comments.
                args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (mimetype == "application/x-abiword") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            const string & text = d.file_to_string();
            xmlparser.parse_xml(text);
            dump = xmlparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            xmlparser.parse_xml(d.gzfile_to_string());
            dump = xmlparser.dump;
        } else if (mimetype == "application/vnd.ms-xpsdocument") {
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " 'Documents/1/Pages/*.fpage'";
            try {
                XpsXmlParser xpsparser;
                dump = stdout_to_string(cmd, false);
                // Look for Byte-Order Mark (BOM).
                if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                    // UTF-16 in big-endian/little-endian order - we just
                    // convert it as "UTF-16" and let the conversion handle the
                    // BOM as that way we avoid the copying overhead of erasing
                    // 2 bytes from the start of dump.
                    convert_to_utf8(dump, "UTF-16");
                }
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            string cmd("dpkg-deb -f");
            append_filename_argument(cmd, file);
            cmd += " Description";
            const string & desc = stdout_to_string(cmd, false);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            const string & desc = stdout_to_string(cmd, false);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }

        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && !d.md5(md5)) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context, "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }

        // Remove any trailing formfeeds, so we don't consider them when
        // considering if we extracted any text (e.g. pdftotext outputs a
        // formfeed between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be string::npos
        // and ++trim_end will be 0, which is the correct new size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (++trim_end != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context, "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }
        // Produce a sample
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }

        // Put the data in the document
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);
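        // The document data is now a newline-separated list of key=value
        // fields, e.g. (with illustrative values):
        //
        //     url=http://example.org/report.pdf
        //     sample=Quarterly results ...
        //     caption=Q3 Report
        //     type=application/pdf
        //     modtime=1536309600
        //     size=123456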
        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }
        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }
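        // Prefixes used above: "S" for title terms (indexed with extra
        // weight 5), "B" for topic, "F" for filename and "A" for author.
        // Each increase_termpos(100) call leaves a gap in term positions
        // between fields so phrase searches can't match across a field
        // boundary.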
        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        struct tm *tm = localtime(&mtime);
        string date_term = "D" + date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
        newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
        date_term.resize(7);
        date_term[0] = 'M';
        newdocument.add_boolean_term(date_term); // Month (YYYYMM)
        date_term.resize(5);
        date_term[0] = 'Y';
        newdocument.add_boolean_term(date_term); // Year (YYYY)
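        // E.g. a file last modified on 2018-09-07 gets the boolean terms
        // D20180907, M201809 and Y2018, allowing date filtering at daily,
        // monthly and yearly granularity.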
        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));
        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char * group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char * owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }
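        // These "I" terms support filtering search results by who can read
        // the file: "I*" if it's world-readable, "I#<group>" if
        // group-readable, "I@<owner>" if owner-readable; "O<owner>" records
        // the file's owner.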
        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);

        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (ReadError) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (NoSuchFilter) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (FileNotFound) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string & error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}
void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}
void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}