1 /* htmlparse.cc: simple HTML parser for omega indexer
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Ananova Ltd
6 * Copyright 2002 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * -----END-LICENCE-----
28 #include "htmlparse.h"
// Definition of the HtmlParser::named_ents member: a map from a string key
// to an unsigned int value.  The HtmlParser constructor fills it (keyed by
// entity name) when it is empty, and decode_entities() looks names up in it.
35 map
<string
, unsigned int> HtmlParser::named_ents
;
// Character predicate taking a single char; parse_html() passes
// p_notwhitespace to find_if().
// NOTE(review): the return expression shown below is true for a character
// that is not alphanumeric and is neither '.' nor '-'.  That does not match
// the name "notwhitespace", so lines appear to be missing between the
// signature and this return statement -- confirm against the complete file
// before relying on this text.
// NOTE(review): isalnum() is given a plain char; a negative value (bytes
// >= 0x80 where char is signed) is undefined behaviour for the <ctype.h>
// functions -- consider converting to unsigned char first.
62 p_notwhitespace(char c
)
70 return !isalnum(c
) && c
!= '.' && c
!= '-';
/** Predicate for find_if(): is @a c whitespace or '>'?
 *
 *  parse_html() uses this to locate the end of an attribute value that is
 *  unquoted or has no closing quote.
 *
 *  @param c  Character to classify.
 *  @return   true if isspace() accepts @a c or @a c is '>'.
 *
 *  @a c is converted to unsigned char before the isspace() call: handing a
 *  negative value (any byte >= 0x80 where char is signed) to the <ctype.h>
 *  classification functions is undefined behaviour.
 */
inline static bool
p_whitespacegt(char c)
{
    return isspace(static_cast<unsigned char>(c)) || c == '>';
}
/** Predicate for find_if(): is @a c whitespace, '=' or '>'?
 *
 *  parse_html() uses this to locate the end of an attribute name inside a
 *  tag.
 *
 *  @param c  Character to classify.
 *  @return   true if isspace() accepts @a c, or @a c is '=' or '>'.
 *
 *  @a c is converted to unsigned char before the isspace() call: handing a
 *  negative value (any byte >= 0x80 where char is signed) to the <ctype.h>
 *  classification functions is undefined behaviour.
 */
inline static bool
p_whitespaceeqgt(char c)
{
    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
}
// Constructor.  Declares a static table (ents) whose entries pair an entity
// name (n) with an unsigned numeric value (v).  When named_ents is empty,
// entries from that table are stored into it, keyed by name.
// NOTE(review): the table rows and the loop that advances i are not visible
// in this excerpt; only the two commented-out rows below can be seen.
85 HtmlParser::HtmlParser()
87 static struct ent
{ const char *n
; unsigned int v
; } ents
[] = {
188 // iso8859-1 only for now { "OElig", 338 },
189 // ditto { "oelig", 339 },
// Only populate the map when it has no entries yet.
192 if (named_ents
.empty()) {
// i walks the ents table, starting at its first element.
193 struct ent
*i
= ents
;
// Record this entry: value v stored under the name n.
195 named_ents
[string(i
->n
)] = i
->v
;
// Decode entity references in s, modifying s in place.
//
// For each '&' found, the code below recognises three forms:
//   - '&' '#' then 'x'/'X': the run up to the first character accepted by
//     p_notxdigit is parsed as hexadecimal with sscanf("%x");
//   - '&' '#' otherwise: the run up to the first character accepted by
//     p_notdigit is parsed with atoi();
//   - anything else: the run up to the first character accepted by
//     p_notalnum is looked up in named_ents.
// A ';' immediately after the reference is included in the replaced span,
// and the whole span is replaced by the single character char(val).
//
// NOTE(review): braces, else-branches and some statements are missing from
// this excerpt, so the exact nesting of the steps below (and what happens
// when val stays 0) cannot be confirmed from here.
// NOTE(review): char(val) narrows the unsigned value to one char, so code
// points that do not fit in a char are not preserved.
202 HtmlParser::decode_entities(string
&s
)
204 // We need a const_iterator version of s.end() - otherwise the
205 // find() and find_if() templates don't work...
206 string::const_iterator amp
= s
.begin(), s_end
= s
.end();
// Each iteration handles one '&' found at or after amp.
207 while ((amp
= find(amp
, s_end
, '&')) != s_end
) {
// val holds the decoded value; it stays 0 unless a branch below sets it.
208 unsigned int val
= 0;
// p starts just past the '&'; end will mark the end of the reference.
209 string::const_iterator end
, p
= amp
+ 1;
// Numeric reference: the character after '&' is '#'.
210 if (p
!= s_end
&& *p
== '#') {
// An 'x' in either case selects the hexadecimal form.
// NOTE(review): tolower() is given a plain char; a negative value (bytes
// >= 0x80 where char is signed) is undefined behaviour -- consider
// converting to unsigned char first.
212 if (p
!= s_end
&& tolower(*p
) == 'x') {
// Hexadecimal digits run up to the first character p_notxdigit accepts.
215 end
= find_if(p
, s_end
, p_notxdigit
);
216 sscanf(s
.substr(p
- s
.begin(), end
- p
).c_str(), "%x", &val
);
// Decimal digits run up to the first character p_notdigit accepts.
219 end
= find_if(p
, s_end
, p_notdigit
);
220 val
= atoi(s
.substr(p
- s
.begin(), end
- p
).c_str());
// Named reference: the name runs up to the first character p_notalnum
// accepts.
223 end
= find_if(p
, s_end
, p_notalnum
);
224 string code
= s
.substr(p
- s
.begin(), end
- p
);
// Look the name up; val is only assigned when the name is present.
225 map
<string
, unsigned int>::const_iterator i
;
226 i
= named_ents
.find(code
);
227 if (i
!= named_ents
.end()) val
= i
->second
;
// Include a ';' that directly follows the reference.
229 if (end
< s_end
&& *end
== ';') end
++;
// Save the position of '&' as an index so it survives the modification,
// then replace the span [amp, end) with the single character char(val).
231 string::size_type amp_pos
= amp
- s
.begin();
232 s
.replace(amp_pos
, end
- amp
, 1u, char(val
));
234 // We've modified the string, so the iterators are no longer
// guaranteed to be valid; amp is re-derived from the saved index and
// positioned just past the replacement character.
236 amp
= s
.begin() + amp_pos
+ 1;
// Scan body as HTML.
//
// As far as the visible code shows:
//   - the text between tags is extracted and passed through
//     decode_entities();
//   - "<!--" comments, other "<!" SGML declarations and "<?" (PHP) sections
//     are skipped;
//   - for other tags the tag name is extracted, attribute names and values
//     are read (values may be quoted with '"' or '\'', or unquoted), the
//     first occurrence of an attribute name is kept in Param, and
//     opening_tag(tag, Param) is called.
//
// NOTE(review): many lines (loop headers, braces, else-branches, variable
// declarations such as ch, quote, name and value, and whatever is done with
// the decoded text and with closing tags) are missing from this excerpt;
// confirm details against the complete file.
244 HtmlParser::parse_html(const string
&body
)
// Attributes collected for the tag currently being parsed.
246 map
<string
,string
> Param
;
// start marks the beginning of the not-yet-processed part of body.
247 string::const_iterator start
= body
.begin();
251 // Skip through until we find an HTML tag, a comment, or the end of
252 // document. Ignore isolated occurrences of `<' which don't start
254 string::const_iterator p
= start
;
257 p
= find(p
, body
.end(), '<');
258 if (p
== body
.end()) break;
260 // tag, closing tag, comment (or SGML declaration), or PHP
261 if (isalpha(ch
) || ch
== '/' || ch
== '!' || ch
== '?') break;
266 // process text up to start of tag
268 string text
= body
.substr(start
- body
.begin(), p
- start
);
269 decode_entities(text
);
273 if (p
== body
.end()) break;
277 if (start
== body
.end()) break;
280 if (++start
== body
.end()) break;
281 if (++start
== body
.end()) break;
282 // comment or SGML declaration
// Two consecutive '-' characters: treat as the start of a comment.
283 if (*(start
- 1) == '-' && *start
== '-') {
284 start
= find(start
+ 1, body
.end(), '>');
285 // unterminated comment swallows rest of document
286 // (like NS, but unlike MSIE iirc)
287 if (start
== body
.end()) break;
// Advance p from one '>' to the next until the two characters before it
// are both '-' (i.e. "-->"), or the end of body is reached.
291 while (p
!= body
.end() && (*(p
- 1) != '-' || *(p
- 2) != '-'))
292 p
= find(p
+ 1, body
.end(), '>');
294 // If we found --> skip to there, otherwise
295 // skip to the first > we found (as Netscape does)
296 if (p
!= body
.end()) start
= p
;
298 // just an SGML declaration, perhaps giving the DTD - ignore it
299 start
= find(start
- 1, body
.end(), '>');
300 if (start
== body
.end()) break;
303 } else if (*start
== '?') {
304 if (++start
== body
.end()) break;
305 // PHP - swallow until ?> or EOF
306 start
= find(start
+ 1, body
.end(), '>');
// Keep moving to the next '>' until one is directly preceded by '?'.
309 while (start
!= body
.end() && *(start
- 1) != '?')
310 start
= find(start
+ 1, body
.end(), '>');
312 // unterminated PHP swallows rest of document (rather arbitrarily
313 // but it avoids polluting the database when things go wrong)
314 if (start
!= body
.end()) ++start
;
316 // opening or closing tag
321 start
= find_if(start
+ 1, body
.end(), p_notwhitespace
);
// The tag name ends at the first character p_nottag accepts.
325 start
= find_if(start
, body
.end(), p_nottag
);
327 string tag
= body
.substr(p
- body
.begin(), start
- p
);
329 // convert tagname to lowercase
330 for (string::iterator i
= tag
.begin(); i
!= tag
.end(); i
++)
337 /* ignore any bogus parameters on closing tags */
338 p
= find(start
, body
.end(), '>');
339 if (p
== body
.end()) break;
// Attribute loop: runs until the closing '>' of the tag or end of body.
343 while (start
< body
.end() && *start
!= '>') {
// The attribute name ends at whitespace, '=' or '>'.
346 p
= find_if(start
, body
.end(), p_whitespaceeqgt
);
348 name
= body
.substr(start
- body
.begin(), p
- start
);
350 p
= find_if(p
, body
.end(), p_notwhitespace
);
// An '=' introduces a value for this attribute.
353 if (start
!= body
.end() && *start
== '=') {
356 start
= find_if(start
+ 1, body
.end(), p_notwhitespace
);
// Quoted value: search for the matching closing quote character.
361 if (quote
== '"' || quote
== '\'') {
363 p
= find(start
, body
.end(), quote
);
366 if (p
== body
.end()) {
367 // unquoted or no closing quote
368 p
= find_if(start
, body
.end(), p_whitespacegt
);
370 value
= body
.substr(start
- body
.begin(), p
- start
);
372 start
= find_if(p
, body
.end(), p_notwhitespace
);
374 value
= body
.substr(start
- body
.begin(), p
- start
);
379 // convert parameter name to lowercase
381 for (i
= name
.begin(); i
!= name
.end(); i
++)
383 // in case of multiple entries, use the first
384 // (as Netscape does)
385 if (Param
.find(name
) == Param
.end())
// Hand the tag name and its collected attributes to opening_tag().
390 opening_tag(tag
, Param
);
// Step past the '>' that closes the tag, if there is one.
393 if (start
!= body
.end() && *start
== '>') ++start
;