1 /* myhtmlparse.cc: subclass of HtmlParser for extracting text.
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "myhtmlparse.h"
28 #include "my-html-tok.h"
29 #include "stringutils.h"
30 #include "utf8convert.h"
36 static const char whitespace
[] = "_ \t\r\r\f";
39 lowercase_string(string
&str
)
41 for (string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
47 MyHtmlParser::parse_html(const string
&text
, const string
&charset_
,
48 bool charset_from_meta_
)
51 charset_from_meta
= charset_from_meta_
;
56 MyHtmlParser::process_text(const string
&text
)
58 if (!text
.empty() && !in_script_tag
&& !in_style_tag
) {
59 string::size_type b
= text
.find_first_not_of(WHITESPACE
);
60 if (b
&& !pending_space
) pending_space
= SPACE
;
61 while (b
!= string::npos
) {
62 if (pending_space
&& !target
->empty())
63 *target
+= whitespace
[pending_space
];
64 string::size_type e
= text
.find_first_of(WHITESPACE
, b
);
65 if (e
== string::npos
) {
66 target
->append(text
.data() + b
, text
.size() - b
);
70 target
->append(text
.data() + b
, e
- b
);
71 pending_space
= SPACE
;
72 b
= text
.find_first_not_of(WHITESPACE
, e
+ 1);
78 MyHtmlParser::opening_tag(const string
&tag
)
80 int k
= keyword(tab
, tag
.data(), tag
.size());
83 pending_space
= max(pending_space
, (token_space
[k
] & TOKEN_SPACE_MASK
));
84 switch (html_tag(k
)) {
86 if (pending_space
< PAGE
) {
88 if (get_parameter("style", style
)) {
89 // As produced by Libreoffice's HTML export:
90 if (style
.find("page-break-before: always") != string::npos
)
97 if (get_parameter("content", content
)) {
99 if (get_parameter("name", name
)) {
100 lowercase_string(name
);
101 if (name
== "description") {
102 convert_to_utf8(content
, charset
);
103 decode_entities(content
);
104 if (description_as_sample
&& sample
.empty()) {
105 swap(sample
, content
);
107 // If we're not using the description as the
108 // sample, or for second and subsequent
109 // descriptions, treat as keywords.
110 if (keywords
.empty()) {
111 swap(keywords
, content
);
117 } else if (name
== "keywords" ||
118 name
== "dcterms.subject" ||
119 name
== "dcterms.description") {
120 // LibreOffice HTML export puts "Subject" and
121 // "Keywords" into DCTERMS.subject, and "Comments"
122 // into DCTERMS.description. Best option seems to
123 // be to treat all of these as keywords, i.e. just
124 // more text to index, but not show in/as the
126 if (!keywords
.empty()) keywords
+= ' ';
127 convert_to_utf8(content
, charset
);
128 decode_entities(content
);
130 } else if (name
== "author" ||
131 name
== "dcterms.creator" ||
132 name
== "dcterms.contributor") {
133 // LibreOffice HTML export includes DCTERMS.creator
134 // and DCTERMS.contributor.
135 if (!author
.empty()) author
+= ' ';
136 convert_to_utf8(content
, charset
);
137 decode_entities(content
);
139 } else if (name
== "classification") {
140 if (!topic
.empty()) topic
+= ' ';
141 convert_to_utf8(content
, charset
);
142 decode_entities(content
);
144 } else if (!ignoring_metarobots
&& name
== "robots") {
145 decode_entities(content
);
146 lowercase_string(content
);
147 if (content
.find("none") != string::npos
||
148 content
.find("noindex") != string::npos
) {
149 indexing_allowed
= false;
152 } else if (name
== "created" ||
153 name
== "dcterms.issued") {
154 created
= parse_datetime(content
);
158 // If the current charset came from a meta tag, don't
159 // force reparsing again!
160 if (charset_from_meta
) break;
162 if (get_parameter("http-equiv", hdr
)) {
163 lowercase_string(hdr
);
164 if (hdr
== "content-type") {
165 lowercase_string(content
);
166 size_t start
= content
.find("charset=");
167 if (start
== string::npos
) break;
169 if (start
== content
.size()) break;
171 if (content
[start
] != '"') {
172 while (end
< content
.size()) {
173 unsigned char ch
= content
[end
];
174 if (ch
<= 32 || ch
>= 127 ||
175 strchr(";()<>@,:\\\"/[]?={}", ch
))
182 while (end
< content
.size()) {
183 unsigned char ch
= content
[end
];
184 if (ch
== '"') break;
185 if (ch
== '\\') content
.erase(end
, 1);
189 string
newcharset(content
, start
, end
- start
);
190 if (charset
!= newcharset
) {
197 if (charset_from_meta
) break;
199 if (get_parameter("charset", newcharset
)) {
200 // HTML5 added: <meta charset="...">
201 lowercase_string(newcharset
);
202 if (charset
!= newcharset
) {
212 in_script_tag
= true;
226 MyHtmlParser::closing_tag(const string
&tag
)
228 int k
= keyword(tab
, tag
.data(), tag
.size());
229 if (k
< 0 || (token_space
[k
] & NOCLOSE
))
231 pending_space
= max(pending_space
, (token_space
[k
] & TOKEN_SPACE_MASK
));
232 switch (html_tag(k
)) {
234 in_style_tag
= false;
237 in_script_tag
= false;