xapian-applications/omega/handler_libarchive.cc

   1 /** @file
   2  * @brief Extract text and metadata using libarchive.
   3  */
   4 /* Copyright (C) 2020 Parth Kapadia
   5  * Copyright (C) 2022,2023 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22 #include <config.h>
  23 #include "handler.h"
  24
  25 #include "msxmlparser.h"
  26 #include "opendocmetaparser.h"
  27 #include "opendocparser.h"
  28 #include "stringutils.h"
  29 #include "xlsxparser.h"
  30 #include "xpsparser.h"
  31
  32 #include <archive.h>
  33 #include <archive_entry.h>
  34
  35 #define DEFAULT_BLOCK_SIZE 10240
  36
  37 using namespace std;
  38
  39 static void
  40 parse_metadata(const string& metadata)
  41 {
  42     OpenDocMetaParser metaparser;
  43     metaparser.parse(metadata);
  44     send_field(FIELD_TITLE, metaparser.title);
  45     send_field(FIELD_KEYWORDS, metaparser.keywords);
  46     send_field(FIELD_AUTHOR, metaparser.author);
  47     send_field_created_date(metaparser.created);
  48     send_field_page_count(metaparser.pages);
  49 }
  50
  51 static bool
  52 extract_opendoc(struct archive* archive_obj)
  53 {
  54     string styles;
  55     OpenDocParser parser;
  56
  57     struct archive_entry* entry;
  58     while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
  59         size_t total;
  60         ssize_t size;
  61         string pathname = archive_entry_pathname(entry);
  62         if (pathname == "content.xml") {
  63             total = archive_entry_size(entry);
  64             string content(total, '\0');
  65             size = archive_read_data(archive_obj, &content[0], total);
  66
  67             if (size <= 0) {
  68                 send_field(FIELD_ERROR, "Failed to read content.xml");
  69                 return false;
  70             }
  71             content.resize(size);
  72             parser.parse(content);
  73         } else if (pathname == "styles.xml") {
  74             total = archive_entry_size(entry);
  75             styles.resize(total);
  76             size = archive_read_data(archive_obj, &styles[0], total);
  77
  78             if (size <= 0) {
  79                 send_field(FIELD_ERROR, "Failed to read styles.xml");
  80                 return false;
  81             }
  82             styles.resize(size);
  83         } else if (pathname == "meta.xml") {
  84             total = archive_entry_size(entry);
  85             string metadata(total, '\0');
  86             size = archive_read_data(archive_obj, &metadata[0], total);
  87
  88             if (size > 0) {
  89                 // indexing file even if this fails
  90                 metadata.resize(size);
  91                 parse_metadata(metadata);
  92             }
  93         }
  94     }
  95
  96     // We want to parse styles.xml after content.xml, but they could be stored
  97     // in either order in the ZIP container.
  98     parser.parse(styles);
  99
 100     send_field(FIELD_BODY, parser.dump);
 101     return true;
 102 }
 103
 104 static bool
 105 extract_xlsx(struct archive* archive_obj)
 106 {
 107     int pages = 0;
 108     string sheets;
 109     XlsxParser parser;
 110
 111     struct archive_entry* entry;
 112     while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
 113         string pathname = archive_entry_pathname(entry);
 114         if (pathname == "xl/styles.xml" ||
 115             pathname == "xl/workbook.xml" ||
 116             pathname == "xl/sharedStrings.xml") {
 117             size_t total = archive_entry_size(entry);
 118             string shared_strings(total, '\0');
 119             ssize_t size = archive_read_data(archive_obj, &shared_strings[0],
 120                                              total);
 121
 122             if (size > 0) {
 123                 shared_strings.resize(size);
 124                 parser.parse(shared_strings);
 125             }
 126         } else if (startswith(pathname, "xl/worksheets/sheet")) {
 127             auto i = sheets.size();
 128             size_t total = archive_entry_size(entry);
 129             sheets.resize(i + total);
 130             ssize_t size = archive_read_data(archive_obj, &sheets[i], total);
 131
 132             if (size <= 0) {
 133                 send_field(FIELD_ERROR, "Failed to read " + pathname);
 134                 return false;
 135             }
 136             sheets.resize(i + size);
 137             ++pages;
 138         } else if (pathname == "docProps/core.xml") {
 139             size_t total = archive_entry_size(entry);
 140             string metadata(total, '\0');
 141             ssize_t size = archive_read_data(archive_obj, &metadata[0], total);
 142             if (size > 0) {
 143                 metadata.resize(size);
 144                 parse_metadata(metadata);
 145             }
 146         }
 147     }
 148     parser.parse(sheets);
 149     send_field(FIELD_BODY, parser.dump);
 150     send_field_page_count(pages);
 151     return true;
 152 }
 153
 154 static bool
 155 extract_msxml(struct archive* archive_obj,
 156               const string& tail)
 157 {
 158     size_t total;
 159     ssize_t size;
 160     struct archive_entry* entry;
 161     string content;
 162
 163     if (startswith(tail, "wordprocessingml.")) {
 164         while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
 165             string pathname = archive_entry_pathname(entry);
 166             if (pathname == "word/document.xml") {
 167                 auto i = content.size();
 168                 total = archive_entry_size(entry);
 169                 content.resize(i + total);
 170                 size = archive_read_data(archive_obj, &content[i], total);
 171
 172                 if (size <= 0) {
 173                     send_field(FIELD_ERROR, "Failed to read word/document.xml");
 174                     return false;
 175                 }
 176                 content.resize(i + size);
 177             } else if (startswith(pathname, "word/header") ||
 178                        startswith(pathname, "word/footer")) {
 179                 auto i = content.size();
 180                 total = archive_entry_size(entry);
 181                 content.resize(i + total);
 182                 size = archive_read_data(archive_obj, &content[i], total);
 183
 184                 if (size > 0) {
 185                     content.resize(i + size);
 186                 } else {
 187                     // Ignore this as header/footer may not be present
 188                     content.resize(i);
 189                 }
 190             } else if (pathname == "docProps/core.xml") {
 191                 // docProps/core.xml stores meta data
 192                 total = archive_entry_size(entry);
 193                 string metadata(total, '\0');
 194                 size = archive_read_data(archive_obj, &metadata[0], total);
 195                 if (size > 0) {
 196                     metadata.resize(size);
 197                     parse_metadata(metadata);
 198                 }
 199             }
 200         }
 201     } else if (startswith(tail, "presentationml.")) {
 202         int pages = 0;
 203         while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
 204             string pathname = archive_entry_pathname(entry);
 205             if (startswith(pathname, "ppt/slides/slide")) {
 206                 ++pages;
 207                 goto handle_pptx_content;
 208             } else if (startswith(pathname, "ppt/notesSlides/notesSlide") ||
 209                        startswith(pathname, "ppt/comments/comment")) {
 210 handle_pptx_content:
 211                 auto i = content.size();
 212                 total = archive_entry_size(entry);
 213                 content.resize(i + total);
 214                 size = archive_read_data(archive_obj, &content[i], total);
 215
 216                 if (size <= 0) {
 217                     send_field(FIELD_ERROR, "Failed to read " + pathname);
 218                     return false;
 219                 }
 220                 content.resize(i + size);
 221             } else if (pathname == "docProps/core.xml") {
 222                 total = archive_entry_size(entry);
 223                 string metadata(total, '\0');
 224                 size = archive_read_data(archive_obj, &metadata[0], total);
 225                 if (size > 0) {
 226                     metadata.resize(size);
 227                     parse_metadata(metadata);
 228                 }
 229             }
 230         }
 231         send_field_page_count(pages);
 232     }
 233
 234     MSXmlParser parser;
 235     parser.parse(content);
 236     send_field(FIELD_BODY, parser.dump);
 237     return true;
 238 }
 239
 240 static bool
 241 extract_xps(struct archive* archive_obj)
 242 {
 243     int pages = 0;
 244     string content;
 245     XpsParser parser;
 246
 247     struct archive_entry* entry;
 248     while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
 249         string pathname = archive_entry_pathname(entry);
 250         if (startswith(pathname, "Documents/") &&
 251             endswith(pathname, ".fpage") &&
 252             pathname.find("/Pages/") != string::npos) {
 253             size_t total = archive_entry_size(entry);
 254             content.resize(total);
 255             ssize_t size = archive_read_data(archive_obj, &content[0], total);
 256
 257             if (size <= 0) {
 258                 send_field(FIELD_ERROR, "Failed to read " + pathname);
 259                 return false;
 260             }
 261             content.resize(size);
 262             parser.parse(content);
 263             ++pages;
 264         } else if (pathname == "docProps/core.xml") {
 265             // If present, docProps/core.xml stores meta data.
 266             size_t total = archive_entry_size(entry);
 267             content.resize(total);
 268             ssize_t size = archive_read_data(archive_obj, &content[0], total);
 269             if (size > 0) {
 270                 content.resize(size);
 271                 parse_metadata(content);
 272             }
 273         }
 274     }
 275
 276     send_field(FIELD_BODY, parser.dump);
 277     send_field_page_count(pages);
 278     return true;
 279 }
 280
 281 bool
 282 initialise()
 283 {
 284     return true;
 285 }
 286
 287 void
 288 extract(const string& filename,
 289         const string& mimetype)
 290 {
 291     const char* file = filename.c_str();
 292     struct archive* archive_obj = archive_read_new();
 293     archive_read_support_format_zip(archive_obj);
 294     // Block size will be determined by libarchive automatically for
 295     // regular files. Specified block size will only be used for tape drives
 296     // 10240 is chosen as default size (20 records - 512 bytes each)
 297     int status_code = archive_read_open_filename(archive_obj, file,
 298                                                  DEFAULT_BLOCK_SIZE);
 299
 300     if (status_code != ARCHIVE_OK) {
 301         send_field(FIELD_ERROR, "Failed to open file");
 302         return;
 303     }
 304
 305     if (startswith(mimetype, "application/vnd.sun.xml.") ||
 306         startswith(mimetype, "application/vnd.oasis.opendocument.")) {
 307         if (!extract_opendoc(archive_obj))
 308             return;
 309     } else if (startswith(mimetype,
 310                           "application/vnd.openxmlformats-officedocument."))
 311     {
 312         string tail(mimetype, 46);
 313         if (startswith(tail, "spreadsheetml.")) {
 314             if (!extract_xlsx(archive_obj))
 315                 return;
 316         } else {
 317             if (!extract_msxml(archive_obj, tail))
 318                 return;
 319         }
 320     } else if (mimetype == "application/oxps" ||
 321                mimetype == "application/vnd.ms-xpsdocument") {
 322         if (!extract_xps(archive_obj))
 323             return;
 324     }
 325
 326     status_code = archive_read_free(archive_obj);
 327     if (status_code != ARCHIVE_OK) {
 328         send_field(FIELD_ERROR, archive_error_string(archive_obj));
 329         return;
 330     }
 331 }