From 95639fc9f8e1340dd3bc0dfe9a680f0c5abfde1d Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Mon, 17 Sep 2018 11:04:21 +1200 Subject: [PATCH] Index Visio files using vsd2xhtml This includes a patch to extend --filter to support filters which produce SVG as output, since vsd2xhtml converts Visio files to SVG. --- xapian-applications/omega/docs/overview.rst | 13 ++++++++----- xapian-applications/omega/index_file.cc | 20 ++++++++++++++++++++ xapian-applications/omega/mimemap.tokens | 13 +++++++++++++ xapian-applications/omega/omindex.cc | 10 ++++++---- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst index bdae182a9..2c98919bf 100644 --- a/xapian-applications/omega/docs/overview.rst +++ b/xapian-applications/omega/docs/overview.rst @@ -282,6 +282,8 @@ other filters too - see below): * MS Outlook message (.msg) if perl with Email::Outlook::Message and HTML::Parser modules is available * MS Publisher documents (.pub) if pub2xhtml is available (comes with libmspub) +* MS Visio documents (.vsd, .vss, .vst, .vsw, .vsdx, .vssx, .vstx, + .vsdm, .vssm, .vstm) if vsd2xhtml is available (comes with libvisio) * AbiWord documents (.abw) * Compressed AbiWord documents (.zabw) * Rich Text Format documents (.rtf) if unrtf is available @@ -356,9 +358,10 @@ to. You can add support for additional MIME content types (or override existing ones) using the ``--filter`` option to specify a command to run. At present, -this command needs to produce output in either HTML or plain text format +this command needs to produce output in either HTML, SVG, or plain text format (as of 1.3.3, you can specify the character encoding that the output will be -in; in earlier versions, plain text output had to be UTF-8). +in; in earlier versions, plain text output had to be UTF-8). Support for SVG +output from external commands was added in 1.4.8. 
As of 1.3.3, the command can include certain placeholders which are substituted by omindex: @@ -373,9 +376,9 @@ by omindex: * Any ``%t`` in this command will be replaced with a filename in a temporary directory (suitably escaped to protect it from the shell, so don't put quotes around ``%t``). The extension of this filename will reflect the - expected output format (either ``.html`` or ``.txt``). If you don't use - ``%t`` in the command, then omindex will expect output on ``stdout`` (prior - to 1.3.3, output had to be on ``stdout``). + expected output format (either ``.html``, ``.svg`` or ``.txt``). If you + don't use ``%t`` in the command, then omindex will expect output on + ``stdout`` (prior to 1.3.3, output had to be on ``stdout``). * ``%%`` can be used should you need a literal ``%`` in the command. diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index 11e4376a3..389426692 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -179,6 +179,14 @@ index_add_default_filters() index_command("application/vnd.ms-outlook", Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html", false)); + index_command("application/vnd.ms-visio.drawing", + Filter("vsd2xhtml", "image/svg+xml", false)); + index_command("application/vnd.ms-visio.stencil", + Filter("vsd2xhtml", "image/svg+xml", false)); + index_command("application/vnd.ms-visio.template", + Filter("vsd2xhtml", "image/svg+xml", false)); + index_command("application/vnd.visio", + Filter("vsd2xhtml", "image/svg+xml", false)); // pod2text's output character set doesn't seem to be documented, but from // inspecting the source it looks like it's probably iso-8859-1. 
We need // to pass "--errors=stderr" or else minor POD formatting errors cause a @@ -208,6 +216,8 @@ index_add_default_filters() false)); index_command("text/vcard", Filter(get_pkglibbindir() + "/vcard2text", false)); + index_command("text/vcard", + Filter(get_pkglibbindir() + "/vcard2text", false)); } void @@ -589,6 +599,8 @@ index_mimetype(const string & file, const string & urlterm, const string & url, // error messages from the command. if (cmd_it->second.output_type == "text/html") { tmpout = get_tmpfile("tmp.html"); + } else if (cmd_it->second.output_type == "image/svg+xml") { + tmpout = get_tmpfile("tmp.svg"); } else { tmpout = get_tmpfile("tmp.txt"); } @@ -654,6 +666,14 @@ index_mimetype(const string & file, const string & urlterm, const string & url, sample = p.sample; author = p.author; created = p.created; + } else if (cmd_it->second.output_type == "image/svg+xml") { + SvgParser svgparser; + svgparser.parse(dump); + dump = svgparser.dump; + title = svgparser.title; + keywords = svgparser.keywords; + // FIXME: topic = svgparser.topic; + author = svgparser.author; } else if (!charset.empty()) { convert_to_utf8(dump, charset); } diff --git a/xapian-applications/omega/mimemap.tokens b/xapian-applications/omega/mimemap.tokens index 32a7d70eb..330e789fc 100644 --- a/xapian-applications/omega/mimemap.tokens +++ b/xapian-applications/omega/mimemap.tokens @@ -83,6 +83,9 @@ pptx application/vnd.openxmlformats-officedocument.presentationml.presentation ppsx application/vnd.openxmlformats-officedocument.presentationml.slideshow # PowerPoint 2007 template potx application/vnd.openxmlformats-officedocument.presentationml.template +vsdx application/vnd.ms-visio.drawing +vssx application/vnd.ms-visio.stencil +vstx application/vnd.ms-visio.template xps application/vnd.ms-xpsdocument # Macro-enabled variants - these appear to be the same formats as the @@ -102,6 +105,10 @@ pptm application/vnd.openxmlformats-officedocument.presentationml.presentation ppsm 
application/vnd.openxmlformats-officedocument.presentationml.slideshow # MS say: application/vnd.ms-powerpoint.presentation.macroEnabled.12 potm application/vnd.openxmlformats-officedocument.presentationml.template +# MS say: application/vnd.ms-visio.drawing.macroEnabled etc +vsdm application/vnd.ms-visio.drawing +vssm application/vnd.ms-visio.stencil +vstm application/vnd.ms-visio.template # Some other word processor formats: # Word template @@ -119,6 +126,12 @@ rtf text/rtf # actually saves RTF when asked to save as .doc, and Microsoft Word # quietly loads RTF files with a .doc extension), or plain-text. +# Visio: +vsd application/vnd.visio +vss application/vnd.visio +vst application/vnd.visio +vsw application/vnd.visio + # Other MS formats: xls application/vnd.ms-excel xlb application/vnd.ms-excel diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 80dbe068d..70244456a 100644 --- a/xapian-applications/omega/omindex.cc +++ b/xapian-applications/omega/omindex.cc @@ -381,8 +381,8 @@ main(int argc, char **argv) " process files with MIME Content-Type M using\n" " command CMD, which produces output (on stdout or\n" " in a temporary file) with format T (Content-Type\n" -" or file extension; currently txt (default) or\n" -" html) in character encoding C (default: UTF-8).\n" +" or file extension; currently txt (default), html\n" +" or svg) in character encoding C (default: UTF-8).\n" " E.g. -Fapplication/octet-stream:'strings -n8'\n" " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n" " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n" @@ -499,8 +499,10 @@ main(int argc, char **argv) } } if (output_type != "text/html" && - output_type != "text/plain") { - cerr << "Currently only output types 'text/html' and 'text/plain' are supported." + output_type != "text/plain" && + output_type != "image/svg+xml") { + cerr << "Currently only output types 'image/svg+xml', " + "'text/html' and 'text/plain' are supported." 
<< endl; return 1; } -- 2.11.4.GIT