Make sure EOF is defined
[xapian.git] / xapian-applications / omega / myhtmlparse.cc
blobf1b495a8aadb52f9fefb93634e8f480ad248dd46
1 /* myhtmlparse.cc: subclass of HtmlParser for extracting text.
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "myhtmlparse.h"
26 #include "datetime.h"
27 #include "keyword.h"
28 #include "my-html-tok.h"
29 #include "stringutils.h"
30 #include "utf8convert.h"
32 #include <cstring>
34 using namespace std;
/** Whitespace character to emit for each pending_space level.
 *
 *  process_text() indexes this array with pending_space.  Entry 0 is only a
 *  placeholder: process_text() emits a character solely when pending_space
 *  is non-zero.
 *
 *  NOTE(review): which named level (SPACE, PAGE, ...) corresponds to which
 *  index depends on constants defined outside this file - confirm before
 *  changing the order.
 */
static const char whitespace[] = {
    '_', ' ', '\t', '\r', '\r', '\f', '\0'
};
38 inline void
39 lowercase_string(string &str)
41 for (string::iterator i = str.begin(); i != str.end(); ++i) {
42 *i = C_tolower(*i);
46 void
47 MyHtmlParser::parse_html(const string &text, const string &charset_,
48 bool charset_from_meta_)
50 charset = charset_;
51 charset_from_meta = charset_from_meta_;
52 parse(text);
55 void
56 MyHtmlParser::process_text(const string &text)
58 if (!text.empty() && !in_script_tag && !in_style_tag) {
59 string::size_type b = text.find_first_not_of(WHITESPACE);
60 if (b && !pending_space) pending_space = SPACE;
61 while (b != string::npos) {
62 if (pending_space && !target->empty())
63 *target += whitespace[pending_space];
64 string::size_type e = text.find_first_of(WHITESPACE, b);
65 if (e == string::npos) {
66 target->append(text.data() + b, text.size() - b);
67 pending_space = 0;
68 return;
70 target->append(text.data() + b, e - b);
71 pending_space = SPACE;
72 b = text.find_first_not_of(WHITESPACE, e + 1);
77 bool
78 MyHtmlParser::opening_tag(const string &tag)
80 int k = keyword(tab, tag.data(), tag.size());
81 if (k < 0)
82 return true;
83 pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
84 switch (html_tag(k)) {
85 case P:
86 if (pending_space < PAGE) {
87 string style;
88 if (get_parameter("style", style)) {
89 // As produced by Libreoffice's HTML export:
90 if (style.find("page-break-before: always") != string::npos)
91 pending_space = PAGE;
94 break;
95 case META: {
96 string content;
97 if (get_parameter("content", content)) {
98 string name;
99 if (get_parameter("name", name)) {
100 lowercase_string(name);
101 if (name == "description") {
102 convert_to_utf8(content, charset);
103 decode_entities(content);
104 if (description_as_sample && sample.empty()) {
105 swap(sample, content);
106 } else {
107 // If we're not using the description as the
108 // sample, or for second and subsequent
109 // descriptions, treat as keywords.
110 if (keywords.empty()) {
111 swap(keywords, content);
112 } else {
113 keywords += ' ';
114 keywords += content;
117 } else if (name == "keywords" ||
118 name == "dcterms.subject" ||
119 name == "dcterms.description") {
120 // LibreOffice HTML export puts "Subject" and
121 // "Keywords" into DCTERMS.subject, and "Comments"
122 // into DCTERMS.description. Best option seems to
123 // be to treat all of these as keywords, i.e. just
124 // more text to index, but not show in/as the
125 // sample.
126 if (!keywords.empty()) keywords += ' ';
127 convert_to_utf8(content, charset);
128 decode_entities(content);
129 keywords += content;
130 } else if (name == "author" ||
131 name == "dcterms.creator" ||
132 name == "dcterms.contributor") {
133 // LibreOffice HTML export includes DCTERMS.creator
134 // and DCTERMS.contributor.
135 if (!author.empty()) author += ' ';
136 convert_to_utf8(content, charset);
137 decode_entities(content);
138 author += content;
139 } else if (name == "classification") {
140 if (!topic.empty()) topic += ' ';
141 convert_to_utf8(content, charset);
142 decode_entities(content);
143 topic += content;
144 } else if (!ignoring_metarobots && name == "robots") {
145 decode_entities(content);
146 lowercase_string(content);
147 if (content.find("none") != string::npos ||
148 content.find("noindex") != string::npos) {
149 indexing_allowed = false;
150 return false;
152 } else if (name == "created" ||
153 name == "dcterms.issued") {
154 created = parse_datetime(content);
156 break;
158 // If the current charset came from a meta tag, don't
159 // force reparsing again!
160 if (charset_from_meta) break;
161 string hdr;
162 if (get_parameter("http-equiv", hdr)) {
163 lowercase_string(hdr);
164 if (hdr == "content-type") {
165 lowercase_string(content);
166 size_t start = content.find("charset=");
167 if (start == string::npos) break;
168 start += 8;
169 if (start == content.size()) break;
170 size_t end = start;
171 if (content[start] != '"') {
172 while (end < content.size()) {
173 unsigned char ch = content[end];
174 if (ch <= 32 || ch >= 127 ||
175 strchr(";()<>@,:\\\"/[]?={}", ch))
176 break;
177 ++end;
179 } else {
180 ++start;
181 ++end;
182 while (end < content.size()) {
183 unsigned char ch = content[end];
184 if (ch == '"') break;
185 if (ch == '\\') content.erase(end, 1);
186 ++end;
189 string newcharset(content, start, end - start);
190 if (charset != newcharset) {
191 throw newcharset;
195 break;
197 if (charset_from_meta) break;
198 string newcharset;
199 if (get_parameter("charset", newcharset)) {
200 // HTML5 added: <meta charset="...">
201 lowercase_string(newcharset);
202 if (charset != newcharset) {
203 throw newcharset;
206 break;
208 case STYLE:
209 in_style_tag = true;
210 break;
211 case SCRIPT:
212 in_script_tag = true;
213 break;
214 case TITLE:
215 target = &title;
216 pending_space = 0;
217 break;
218 default:
219 /* No action */
220 break;
222 return true;
225 bool
226 MyHtmlParser::closing_tag(const string &tag)
228 int k = keyword(tab, tag.data(), tag.size());
229 if (k < 0 || (token_space[k] & NOCLOSE))
230 return true;
231 pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
232 switch (html_tag(k)) {
233 case STYLE:
234 in_style_tag = false;
235 break;
236 case SCRIPT:
237 in_script_tag = false;
238 break;
239 case TITLE:
240 target = &dump;
241 pending_space = 0;
242 break;
243 default:
244 /* No action */
245 break;
247 return true;