2 * @brief subclass of XmlParser for extracting text from HTML.
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002-2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 #ifndef OMEGA_INCLUDED_HTMLPARSER_H
24 #define OMEGA_INCLUDED_HTMLPARSER_H
26 #include "xmlparser.h"
// Characters treated as whitespace: space, tab, newline and carriage return.
//
// FIXME: Should \xa0 be part of this set?  It is the non-breaking space in
// iso-8859-1, but not in every charset.  Perhaps a span consisting entirely
// of \xa0 should also be collapsed to a single character.
#define WHITESPACE " \t\n\r"
35 class HtmlParser
: public XmlParser
{
// Boolean parser state.  Every flag carries its default as an in-class
// initialiser; indexing_allowed is the only one that starts out true.
// NOTE(review): what each flag controls is not visible in this header -
// confirm against the implementation before relying on the names.
bool pending_space{false};
bool in_script_tag{false};
bool in_style_tag{false};
bool indexing_allowed{true};
// Switched on by ignore_metarobots().
bool ignoring_metarobots{false};
bool charset_from_meta{false};
bool description_as_sample{false};
// String members, each default-constructed (i.e. initially empty).
// The constructor points `target` at `dump`.
// NOTE(review): presumably these hold the text extracted for the
// correspondingly named parts of the document - confirm in the
// implementation.
std::string title;
std::string sample;
std::string keywords;
std::string dump;
std::string author;
std::string topic;
// Starts out as -1 converted to time_t.
// NOTE(review): presumably -1 is a "no creation time known" sentinel -
// confirm against the code that reads this member.
time_t created{static_cast<time_t>(-1)};
/// Declaration only; the definition is not in the visible part of this
/// header.
///
/// @param content  Passed by const reference, so it is not modified here.
///
/// NOTE(review): presumably a hook invoked by the XmlParser base with text
/// found between tags - confirm against xmlparser.h.
void process_content(const std::string& content);
/// Declaration only; the definition is not in the visible part of this
/// header.
///
/// @param tag  Passed by const reference.
///             NOTE(review): presumably the element name - confirm.
/// @return     The meaning of the bool is not visible here; check the
///             XmlParser contract in xmlparser.h.
bool opening_tag(const std::string& tag);
/// Declaration only; the definition is not in the visible part of this
/// header.
///
/// @param tag  Passed by const reference.
///             NOTE(review): presumably the element name - confirm.
/// @return     The meaning of the bool is not visible here; check the
///             XmlParser contract in xmlparser.h.
bool closing_tag(const std::string& tag);
/// Declaration only; the definition is not in the visible part of this
/// header.
///
/// @param text                A non-owning view of the input.
/// @param charset_            Passed by const reference.
///                            NOTE(review): presumably the character set to
///                            assume for `text` - confirm.
/// @param charset_from_meta_  The trailing underscore keeps it distinct from
///                            the charset_from_meta member.
///                            NOTE(review): presumably copied into that
///                            member - confirm in the definition.
void parse(
    std::string_view text,
    const std::string& charset_,
    bool charset_from_meta_);
54 void ignore_metarobots() { ignoring_metarobots
= true; }
56 HtmlParser() : XmlParser(false), target(&dump
) { }
59 pending_space
= false;
60 in_script_tag
= false;
62 indexing_allowed
= true;
63 ignoring_metarobots
= false;
64 charset_from_meta
= false;
65 description_as_sample
= false;
77 #endif // OMEGA_INCLUDED_HTMLPARSER_H