xapian-applications/omega/myhtmlparse.cc

   1 /* myhtmlparse.cc: subclass of HtmlParser for extracting text.
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation; either version 2 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  19  * USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "myhtmlparse.h"
  25
  26 #include "datetime.h"
  27 #include "keyword.h"
  28 #include "my-html-tok.h"
  29 #include "stringutils.h"
  30 #include "utf8convert.h"
  31
  32 #include <cstring>
  33
  34 using namespace std;
  35
  36 static const char whitespace[] = "_ \t\r\r\f";
  37
  38 inline void
  39 lowercase_string(string &str)
  40 {
  41     for (string::iterator i = str.begin(); i != str.end(); ++i) {
  42         *i = C_tolower(*i);
  43     }
  44 }
  45
  46 void
  47 MyHtmlParser::parse_html(const string &text, const string &charset_,
  48                          bool charset_from_meta_)
  49 {
  50     charset = charset_;
  51     charset_from_meta = charset_from_meta_;
  52     parse(text);
  53 }
  54
  55 void
  56 MyHtmlParser::process_text(const string &text)
  57 {
  58     if (!text.empty() && !in_script_tag && !in_style_tag) {
  59         string::size_type b = text.find_first_not_of(WHITESPACE);
  60         if (b && !pending_space) pending_space = SPACE;
  61         while (b != string::npos) {
  62             if (pending_space && !target->empty())
  63                 *target += whitespace[pending_space];
  64             string::size_type e = text.find_first_of(WHITESPACE, b);
  65             if (e == string::npos) {
  66                 target->append(text.data() + b, text.size() - b);
  67                 pending_space = 0;
  68                 return;
  69             }
  70             target->append(text.data() + b, e - b);
  71             pending_space = SPACE;
  72             b = text.find_first_not_of(WHITESPACE, e + 1);
  73         }
  74     }
  75 }
  76
  77 bool
  78 MyHtmlParser::opening_tag(const string &tag)
  79 {
  80     int k = keyword(tab, tag.data(), tag.size());
  81     if (k < 0)
  82         return true;
  83     pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
  84     switch (html_tag(k)) {
  85         case P:
  86             if (pending_space < PAGE) {
  87                 string style;
  88                 if (get_parameter("style", style)) {
  89                     // As produced by Libreoffice's HTML export:
  90                     if (style.find("page-break-before: always") != string::npos)
  91                         pending_space = PAGE;
  92                 }
  93             }
  94             break;
  95         case META: {
  96                 string content;
  97                 if (get_parameter("content", content)) {
  98                     string name;
  99                     if (get_parameter("name", name)) {
 100                         lowercase_string(name);
 101                         if (name == "description") {
 102                             convert_to_utf8(content, charset);
 103                             decode_entities(content);
 104                             if (description_as_sample && sample.empty()) {
 105                                 swap(sample, content);
 106                             } else {
 107                                 // If we're not using the description as the
 108                                 // sample, or for second and subsequent
 109                                 // descriptions, treat as keywords.
 110                                 if (keywords.empty()) {
 111                                     swap(keywords, content);
 112                                 } else {
 113                                     keywords += ' ';
 114                                     keywords += content;
 115                                 }
 116                             }
 117                         } else if (name == "keywords" ||
 118                                    name == "dcterms.subject" ||
 119                                    name == "dcterms.description") {
 120                             // LibreOffice HTML export puts "Subject" and
 121                             // "Keywords" into DCTERMS.subject, and "Comments"
 122                             // into DCTERMS.description.  Best option seems to
 123                             // be to treat all of these as keywords, i.e. just
 124                             // more text to index, but not show in/as the
 125                             // sample.
 126                             if (!keywords.empty()) keywords += ' ';
 127                             convert_to_utf8(content, charset);
 128                             decode_entities(content);
 129                             keywords += content;
 130                         } else if (name == "author" ||
 131                                    name == "dcterms.creator" ||
 132                                    name == "dcterms.contributor") {
 133                             // LibreOffice HTML export includes DCTERMS.creator
 134                             // and DCTERMS.contributor.
 135                             if (!author.empty()) author += ' ';
 136                             convert_to_utf8(content, charset);
 137                             decode_entities(content);
 138                             author += content;
 139                         } else if (name == "classification") {
 140                             if (!topic.empty()) topic += ' ';
 141                             convert_to_utf8(content, charset);
 142                             decode_entities(content);
 143                             topic += content;
 144                         } else if (!ignoring_metarobots && name == "robots") {
 145                             decode_entities(content);
 146                             lowercase_string(content);
 147                             if (content.find("none") != string::npos ||
 148                                 content.find("noindex") != string::npos) {
 149                                 indexing_allowed = false;
 150                                 return false;
 151                             }
 152                         } else if (name == "created" ||
 153                                    name == "dcterms.issued") {
 154                             created = parse_datetime(content);
 155                         }
 156                         break;
 157                     }
 158                     // If the current charset came from a meta tag, don't
 159                     // force reparsing again!
 160                     if (charset_from_meta) break;
 161                     string hdr;
 162                     if (get_parameter("http-equiv", hdr)) {
 163                         lowercase_string(hdr);
 164                         if (hdr == "content-type") {
 165                             lowercase_string(content);
 166                             size_t start = content.find("charset=");
 167                             if (start == string::npos) break;
 168                             start += 8;
 169                             if (start == content.size()) break;
 170                             size_t end = start;
 171                             if (content[start] != '"') {
 172                                 while (end < content.size()) {
 173                                     unsigned char ch = content[end];
 174                                     if (ch <= 32 || ch >= 127 ||
 175                                         strchr(";()<>@,:\\\"/[]?={}", ch))
 176                                         break;
 177                                     ++end;
 178                                 }
 179                             } else {
 180                                 ++start;
 181                                 ++end;
 182                                 while (end < content.size()) {
 183                                     unsigned char ch = content[end];
 184                                     if (ch == '"') break;
 185                                     if (ch == '\\') content.erase(end, 1);
 186                                     ++end;
 187                                 }
 188                             }
 189                             string newcharset(content, start, end - start);
 190                             if (charset != newcharset) {
 191                                 throw newcharset;
 192                             }
 193                         }
 194                     }
 195                     break;
 196                 }
 197                 if (charset_from_meta) break;
 198                 string newcharset;
 199                 if (get_parameter("charset", newcharset)) {
 200                     // HTML5 added: <meta charset="...">
 201                     lowercase_string(newcharset);
 202                     if (charset != newcharset) {
 203                         throw newcharset;
 204                     }
 205                 }
 206                 break;
 207             }
 208         case STYLE:
 209             in_style_tag = true;
 210             break;
 211         case SCRIPT:
 212             in_script_tag = true;
 213             break;
 214         case TITLE:
 215             target = &title;
 216             pending_space = 0;
 217             break;
 218         default:
 219             /* No action */
 220             break;
 221     }
 222     return true;
 223 }
 224
 225 bool
 226 MyHtmlParser::closing_tag(const string &tag)
 227 {
 228     int k = keyword(tab, tag.data(), tag.size());
 229     if (k < 0 || (token_space[k] & NOCLOSE))
 230         return true;
 231     pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
 232     switch (html_tag(k)) {
 233         case STYLE:
 234             in_style_tag = false;
 235             break;
 236         case SCRIPT:
 237             in_script_tag = false;
 238             break;
 239         case TITLE:
 240             target = &dump;
 241             pending_space = 0;
 242             break;
 243         default:
 244             /* No action */
 245             break;
 246     }
 247     return true;
 248 }