xapian-applications/omega/opendocparser.cc

   1 /** @file
   2  * @brief Extract text from OpenDocument XML.
   3  */
   4 /* Copyright (C) 2012-2022 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "opendocparser.h"
  24
  25 #include <cstring>
  26
  27 #include "stringutils.h"
  28
  29 using namespace std;
  30
  31 bool
  32 OpenDocParser::opening_tag(const string& tag)
  33 {
  34     if (startswith(tag, "text:")) {
  35         const char* tail = tag.c_str() + 5;
  36         if (strcmp(tail, "p") == 0 ||
  37             strcmp(tail, "h") == 0 ||
  38             strcmp(tail, "line-break") == 0 ||
  39             strcmp(tail, "tab") == 0) {
  40             pending_space = true;
  41         }
  42     } else if (tag == "office:body") {
  43         indexing = true;
  44     } else if (tag == "style:style") {
  45         (void)get_attribute("style:master-page-name", master_page_name);
  46     } else if (tag == "style:master-page") {
  47         string n;
  48         if (get_attribute("style:name", n) && n == master_page_name)
  49             indexing = true;
  50     }
  51     return true;
  52 }
  53
  54 bool
  55 OpenDocParser::closing_tag(const string& tag)
  56 {
  57     if (!indexing)
  58         return true;
  59
  60     if (tag == "text:p" || tag == "text:h") {
  61         pending_space = true;
  62     } else if (tag == "office:body" || tag == "style:style") {
  63         indexing = false;
  64     }
  65     return true;
  66 }
  67
  68 void
  69 OpenDocParser::process_content(const string& content)
  70 {
  71     if (indexing && !content.empty()) {
  72         if (pending_space) {
  73             pending_space = false;
  74             if (!content.empty()) dump += ' ';
  75         }
  76         dump += content;
  77     }
  78 }