Fix masking of bits in serialised query
[xapian.git] / xapian-applications / omega / myhtmlparse.h
blob607e8f76d158808a537324122be4434efad2d5fb
1 /* myhtmlparse.h: subclass of HtmlParser for extracting text
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002,2003,2004,2006,2008,2010,2011,2012,2013 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
20 */
22 #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
23 #define OMEGA_INCLUDED_MYHTMLPARSE_H
25 #include "htmlparse.h"
27 // Space, tab, newline and carriage return.  FIXME: Should we also include
28 // \xa0?  It is a non-breaking space in iso-8859-1, but not in all charsets.
29 // Perhaps a span consisting entirely of \xa0 should become a single \xa0?
30 #define WHITESPACE " \t\n\r"
32 class MyHtmlParser : public HtmlParser {
33 public:
34 int pending_space;
35 bool in_script_tag;
36 bool in_style_tag;
37 bool indexing_allowed;
38 bool ignoring_metarobots;
39 bool charset_from_meta;
40 string title, sample, keywords, dump, author, topic;
41 time_t created;
42 string * target;
44 void process_text(const string &text);
45 bool opening_tag(const string &tag);
46 bool closing_tag(const string &tag);
47 void parse_html(const string &text, const string &charset_,
48 bool charset_from_meta_);
49 void ignore_metarobots() { ignoring_metarobots = true; }
50 MyHtmlParser() :
51 pending_space(0),
52 in_script_tag(false),
53 in_style_tag(false),
54 indexing_allowed(true),
55 ignoring_metarobots(false),
56 charset_from_meta(false),
57 created(time_t(-1)),
58 target(&dump) { }
60 void reset() {
61 pending_space = 0;
62 in_script_tag = false;
63 in_style_tag = false;
64 indexing_allowed = true;
65 ignoring_metarobots = false;
66 charset_from_meta = false;
67 title.resize(0);
68 sample.resize(0);
69 keywords.resize(0);
70 dump.resize(0);
71 author.resize(0);
72 topic.resize(0);
73 created = time_t(-1);
74 target = &dump;
78 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H