/** @file index_file.cc
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include <ctime>

#include <xapian.h>

#include "append_filename_arg.h"
#include "atomparse.h"
#include "diritor.h"
#include "failed.h"
#include "md5wrap.h"
#include "metaxmlparse.h"
#include "mimemap.h"
#include "msxmlparse.h"
#include "myhtmlparse.h"
#include "opendocparse.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparse.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "utils.h"
#include "values.h"
#include "xmlparse.h"
#include "xlsxparse.h"
#include "xpsxmlparse.h"
using namespace std;

static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;
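
// State used to detect documents which have been deleted since the previous
// run: the number of old documents not yet seen, the highest docid in the
// old database, and a per-docid record of which old documents we've seen.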
static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;
static bool date_terms;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;
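
// Mark docid did as seen, so that index_handle_deletion() won't delete it.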
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}
void
skip(const string & urlterm, const string & context, const string & msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}
static void
skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string & urlterm, const string & context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string & urlterm, const string & context,
                      const string & mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
}
void
index_add_default_filters()
{
    index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", false));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", false));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect", Filter("wpd2text", false));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works", Filter("wps2text", false));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1250).
    index_command("image/vnd.djvu", Filter("djvutxt", false));
    index_command("text/markdown", Filter("markdown", "text/html", false));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the current
    // directory.  Note that this option was ignored in some older versions,
    // but it was fixed in unrtf 0.20.4.
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/x-rst", Filter("rst2html", "text/html", false));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", false));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
                         false));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    // pod2text's output character set doesn't seem to be documented, but from
    // inspecting the source it looks like it's probably iso-8859-1.  We need
    // to pass "--errors=stderr" or else minor POD formatting errors cause a
    // file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", false));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
    // Simplistic - ought to look in index.rdf files for filename and character
    // set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         false));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         false));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", false));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", false));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", false));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", false));
}
void
index_init(const string & dbpath, const Xapian::Stem & stemmer,
           const string & root_, const string & site_term_,
           const string & host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_,
           bool date_terms_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;
    date_terms = date_terms_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            try {
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string ubound = db.get_value_upper_bound(slot);
                if (!ubound.empty())
                    last_altered_max = binary_string_to_int(ubound);
            } catch (const Xapian::UnimplementedError &) {
                numeric_limits<time_t> n;
                last_altered_max = n.max();
            }
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}
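
// Parse one field from pdfinfo output, which is of the form "Field: value"
// with the value padded by spaces, e.g. (values here are illustrative):
//
//   Author:         Jane Doe
//   Pages:          42
//
// A trailing '\r' is stripped in case pdfinfo produces CRLF line endings.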
static void
parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
static void
get_pdf_metainfo(const string & file, string &author, string &title,
                 string &keywords, string &topic, int& pages)
{
    try {
        string cmd = "pdfinfo -enc UTF-8";
        append_filename_argument(cmd, file);
        string pdfinfo = stdout_to_string(cmd, false);

        const char * p = pdfinfo.data();
        const char * end = p + pdfinfo.size();
        while (p != end) {
            const char * start = p;
            p = static_cast<const char *>(memchr(p, '\n', end - p));
            const char * eol;
            if (p) {
                eol = p;
                ++p;
            } else {
                p = eol = end;
            }
            switch (*start) {
                case 'A':
                    PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                    break;
                case 'K':
                    PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                    break;
                case 'P': {
                    string s;
                    PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                    if (!s.empty())
                        pages = atoi(s.c_str());
                    break;
                }
                case 'S':
                    PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                    break;
                case 'T':
                    PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                    break;
            }
        }
    } catch (ReadError) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}
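
// Build a sample string from CSV data, flattening fields into
// space-separated words.  CSV-style quoting is handled, so for example
// the (illustrative) row:
//
//   a,"b,""c""",d
//
// produces the sample text:
//
//   a b,"c" d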
static void
generate_sample_from_csv(const string & csv_data, string & sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
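
// Check whether the document identified by urlterm is already indexed and
// up-to-date.  Returns true if the existing entry can be kept (the caller
// should skip reindexing); did is set to the existing docid if one is found.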
static bool
index_check_existing(const string & urlterm, time_t last_altered,
                     Xapian::docid & did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}
void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}
void
index_add_document(const string & urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document & doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}
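
// Extract text and metadata from a file whose MIME type has already been
// determined, and add or update the corresponding document in the database.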
void
index_mimetype(const string & file, const string & urlterm, const string & url,
               const string & ext,
               const string &mimetype, DirectoryIterator &d,
               Xapian::Document & newdocument,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;
    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    if (verbose) cout << flush;

    string author, title, sample, keywords, topic, dump;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;
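
    // Look up the filter command for this MIME type, falling back through
    // increasingly general patterns: the exact type, then "type/*", then
    // "*/*", then "*".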
    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
    try {
        if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            string cmd = cmd_it->second.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
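
            // Perform %-substitutions on the command: %f is replaced with
            // the (escaped) input filename, %t with a temporary output
            // filename, and %% with a literal '%'.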
            bool use_shell = cmd_it->second.use_shell();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, file);
                        // Remove the space append_filename_argument() adds
                        // before the argument - the command string either
                        // includes one, or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (cmd_it->second.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (cmd_it->second.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        append_filename_argument(cmd, tmpout);
                        // Remove the space append_filename_argument() adds
                        // before the argument - the command string either
                        // includes one, or won't expect one (e.g. --input=%f).
                        cmd.erase(pcent, 1);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
            if (!substituted && cmd != "true") {
                // If no %f, append the filename to the command.
                append_filename_argument(cmd, file);
            }
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    (void)stdout_to_string(cmd, use_shell);
                    if (!load_file(tmpout, dump)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from the
                    // filing system.
                } else {
                    // Output on stdout.
                    dump = stdout_to_string(cmd, use_shell);
                }
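
                // MyHtmlParser reports a character set specified by the
                // document itself by throwing it as a string exception, in
                // which case we reset the parser and reparse with that
                // charset.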
                const string & charset = cmd_it->second.output_charset;
                if (cmd_it->second.output_type == "text/html") {
                    MyHtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse_html(dump, charset, false);
                    } catch (const string & newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse_html(dump, newcharset, true);
                    } catch (ReadError) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (cmd_it->second.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string & text = d.file_to_string();
            MyHtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not specifying
                // one is deprecated these days.
                p.parse_html(text, "iso-8859-1", false);
            } catch (const string & newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse_html(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            string cmd = "pdftotext -enc UTF-8";
            append_filename_argument(cmd, file);
            cmd += " -";
            try {
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(file, author, title, keywords, topic, pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript to
            // text converter (e.g. pstotext always outputs ISO-8859-1).  The
            // only solution seems to be to convert via PDF using ps2pdf and
            // then pdftotext.  This gives plausible looking UTF-8 output for
            // some Chinese PostScript files I found using Google.  It also has
            // the benefit of allowing us to extract meta information from
            // PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            string cmd = "ps2pdf";
            append_filename_argument(cmd, file);
            append_filename_argument(cmd, tmpfile);
            try {
                (void)stdout_to_string(cmd, false);
                cmd = "pdftotext -enc UTF-8";
                append_filename_argument(cmd, tmpfile);
                cmd += " -";
                dump = stdout_to_string(cmd, false);
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic,
                                 pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
            const char * args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no headers or
                // no footers.
                args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will reference
                // the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no notesSlides
                // or comments.
                args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (ReadError) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (ReadError) {
                // It's probably best to index the document even if this fails.
            }
        } else if (mimetype == "application/x-abiword") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            const string & text = d.file_to_string();
            xmlparser.parse_xml(text);
            dump = xmlparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            xmlparser.parse_xml(d.gzfile_to_string());
            dump = xmlparser.dump;
        } else if (mimetype == "application/vnd.ms-xpsdocument") {
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " 'Documents/1/Pages/*.fpage'";
            try {
                XpsXmlParser xpsparser;
                dump = stdout_to_string(cmd, false);
                // Look for Byte-Order Mark (BOM).
                if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                    // UTF-16 in big-endian/little-endian order - we just
                    // convert it as "UTF-16" and let the conversion handle the
                    // BOM as that way we avoid the copying overhead of erasing
                    // 2 bytes from the start of dump.
                    convert_to_utf8(dump, "UTF-16");
                }
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (ReadError) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            string cmd("dpkg-deb -f");
            append_filename_argument(cmd, file);
            cmd += " Description";
            const string & desc = stdout_to_string(cmd, false);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            const string & desc = stdout_to_string(cmd, false);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }
        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && md5_file(file, md5, d.try_noatime()) == 0) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context, "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }
        // Remove any trailing formfeeds, so we don't consider them when
        // considering if we extracted any text (e.g. pdftotext outputs a
        // formfeed between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be string::npos
        // and ++trim_end will be 0, which is the correct new size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (++trim_end != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context, "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }
        // Produce a sample
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }
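
        // The document data is a series of newline-separated "key=value"
        // fields, e.g. (values here are illustrative):
        //
        //   url=http://example.org/doc.pdf
        //   sample=Extracted text ...
        //   caption=Document title
        //   type=application/pdf
        //   modtime=1514764800
        //   size=12345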
        // Put the data in the document
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);
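
        // Between fields, the term position is bumped by 100 to leave a gap
        // so that phrase searches don't match across field boundaries.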
        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }

        // Index the leafname of the file.
        indexer.increase_termpos(100);
        string leaf = d.leafname();
        string::size_type dot = leaf.find_last_of('.');
        if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
            leaf.resize(dot);
        indexer.index_text(leaf, 1, "F");

        // Also index with underscores and ampersands replaced by spaces.
        bool modified = false;
        string::size_type rep = 0;
        while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
            leaf[rep++] = ' ';
            modified = true;
        }
        if (modified) {
            indexer.increase_termpos(100);
            indexer.index_text(leaf, 1, "F");
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }
        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        if (date_terms) {
            struct tm *tm = localtime(&mtime);
            string date_term = "D";
            date_term += date_to_string(tm->tm_year + 1900,
                                        tm->tm_mon + 1,
                                        tm->tm_mday);
            newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
            date_term.resize(7);
            date_term[0] = 'M';
            newdocument.add_boolean_term(date_term); // Month (YYYYMM)
            date_term.resize(5);
            date_term[0] = 'Y';
            newdocument.add_boolean_term(date_term); // Year (YYYY)
        }
        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));
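
        // Permission terms, for filtering on who can read the file: "I*"
        // if world-readable, "I#<group>" if group-readable, "I@<owner>" if
        // owner-readable; "O<owner>" identifies the file's owner.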
        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char * group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char * owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (ReadError) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (NoSuchFilter) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (FileNotFound) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string & error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}
void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}
void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}