FBReader 0.8.15
[lbook_fbreader.git] / fbreader / src / formats / html / HtmlReader.cpp
blob1b608491fddc7150af120b9e33eff7ee004f4816
/*
 * Copyright (C) 2004-2008 Geometer Plus <contact@geometerplus.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <string>

#include <ZLInputStream.h>
#include <ZLXMLReader.h>
#include <ZLFile.h>
#include <ZLStringUtil.h>
#include <ZLUnicodeUtil.h>

#include "HtmlReader.h"
#include "HtmlEntityCollection.h"

32 HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) {
35 HtmlReader::~HtmlReader() {
38 void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
39 tag.Attributes.clear();
41 if (name.length() == 0) {
42 tag.Name = name;
43 return;
46 tag.Start = name[0] != '/';
47 if (tag.Start) {
48 tag.Name = name;
49 } else {
50 tag.Name = name.substr(1);
53 const size_t len = tag.Name.length();
54 for (size_t i = 0; i < len; ++i) {
55 tag.Name[i] = toupper(tag.Name[i]);
// States of the character-level HTML scanner in HtmlReader::readDocument().
enum ParseState {
	PS_TEXT,                      // plain character data outside of any tag
	PS_TAGSTART,                  // the character right after '<'
	PS_TAGNAME,                   // collecting the tag name
	PS_ATTRIBUTENAME,             // collecting an attribute name
	PS_ATTRIBUTEVALUE,            // collecting an attribute value (after '=')
	PS_SKIPTAG,                   // tag with an empty name: ignore everything up to '>'
	PS_COMMENT,                   // inside a "<!" construct, looking for "-->"
	PS_SPECIAL,                   // inside an '&' reference met in text
	PS_SPECIAL_IN_ATTRIBUTEVALUE, // inside an '&' reference met in an attribute value
};

// Kind of '&' reference currently being scanned.
enum SpecialType {
	ST_UNKNOWN, // only '&' has been seen so far
	ST_NUM,     // "&#" seen; decimal or hexadecimal not decided yet
	ST_NAME,    // named reference, e.g. "&amp;"
	ST_DEC,     // decimal numeric reference, e.g. "&#169;"
	ST_HEX      // hexadecimal numeric reference, e.g. "&#xA9;"
};

// Tells whether `ch` may continue a reference of the given kind.
//
//   ST_NAME - letters and digits (the first character of a name is checked
//             by the caller, so digits are accepted here to allow names
//             such as "sup2" or "frac12");
//   ST_DEC  - decimal digits;
//   ST_HEX  - hexadecimal digits;
//   any other type - nothing is allowed.
static bool allowSymbol(SpecialType type, char ch) {
	// <cctype> predicates are undefined for negative arguments, so the byte
	// is widened through unsigned char first.
	const unsigned char symbol = (unsigned char)ch;
	switch (type) {
		case ST_NAME:
			return isalnum(symbol) != 0;
		case ST_DEC:
			return isdigit(symbol) != 0;
		case ST_HEX:
			return isxdigit(symbol) != 0;
		default:
			return false;
	}
}

86 static int specialSymbolNumber(SpecialType type, const std::string &txt) {
87 char *end = 0;
88 switch (type) {
89 case ST_NAME:
90 return HtmlEntityCollection::symbolNumber(txt);
91 case ST_DEC:
92 return strtol(txt.c_str() + 1, &end, 10);
93 case ST_HEX:
94 return strtol(txt.c_str() + 2, &end, 16);
95 default:
96 return 0;
100 void HtmlReader::appendString(std::string &to, std::string &from) {
101 if (myConverter.isNull()) {
102 to += from;
103 } else {
104 myConverter->convert(to, from);
105 myConverter->reset();
107 from.erase();
110 void HtmlReader::readDocument(ZLInputStream &stream) {
111 if (!stream.open()) {
112 return;
115 startDocumentHandler();
117 ParseState state = PS_TEXT;
118 SpecialType state_special = ST_UNKNOWN;
119 std::string currentString;
120 std::string attributeValueString;
121 std::string specialString;
122 int quotationCounter = 0;
123 HtmlTag currentTag;
124 char endOfComment[2] = "\0";
126 const size_t BUFSIZE = 2048;
127 char *buffer = new char[BUFSIZE];
128 size_t length;
129 size_t offset = 0;
130 do {
131 length = stream.read(buffer, BUFSIZE);
132 char *start = buffer;
133 char *endOfBuffer = buffer + length;
134 for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
135 switch (state) {
136 case PS_TEXT:
137 if (*ptr == '<') {
138 if (!characterDataHandler(start, ptr - start, true)) {
139 goto endOfProcessing;
141 start = ptr + 1;
142 state = PS_TAGSTART;
143 currentTag.Offset = offset + (ptr - buffer);
145 if (*ptr == '&') {
146 if (!characterDataHandler(start, ptr - start, true)) {
147 goto endOfProcessing;
149 start = ptr + 1;
150 state = PS_SPECIAL;
151 state_special = ST_UNKNOWN;
153 break;
154 case PS_SPECIAL:
155 case PS_SPECIAL_IN_ATTRIBUTEVALUE:
156 if (state_special == ST_UNKNOWN) {
157 if (*ptr == '#') {
158 state_special = ST_NUM;
159 } else if (isalpha(*ptr)) {
160 state_special = ST_NAME;
161 } else {
162 start = ptr;
163 state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
165 } else if (state_special == ST_NUM) {
166 if (*ptr == 'x') {
167 state_special = ST_HEX;
168 } else if (isdigit(*ptr)) {
169 state_special = ST_DEC;
170 } else {
171 start = ptr;
172 state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
174 } else {
175 if (*ptr == ';') {
176 specialString.append(start, ptr - start);
177 int number = specialSymbolNumber(state_special, specialString);
178 if ((128 <= number) && (number <= 159)) {
179 char ch = number;
180 if (state == PS_SPECIAL) {
181 characterDataHandler(&ch, 1, true);
182 } else {
183 myConverter->convert(attributeValueString, &ch, &ch + 1);
185 } else if (number != 0) {
186 char buffer[4];
187 int len = ZLUnicodeUtil::ucs2ToUtf8(buffer, number);
188 if (state == PS_SPECIAL) {
189 characterDataHandler(buffer, len, false);
190 } else {
191 attributeValueString.append(buffer, len);
193 } else {
194 specialString = "&" + specialString + ";";
195 if (state == PS_SPECIAL) {
196 characterDataHandler(specialString.c_str(), specialString.length(), false);
197 } else {
198 attributeValueString += specialString;
201 specialString.erase();
202 start = ptr + 1;
203 state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
204 } else if (!allowSymbol(state_special, *ptr)) {
205 start = ptr;
206 state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
209 break;
210 case PS_TAGSTART:
211 state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
212 break;
213 case PS_COMMENT:
214 if ((endOfComment[0] == '\0') && (*ptr != '-')) {
215 state = PS_TAGNAME;
216 } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
217 start = ptr + 1;
218 state = PS_TEXT;
219 endOfComment[0] = '\0';
220 endOfComment[1] = '\0';
221 } else {
222 endOfComment[0] = endOfComment[1];
223 endOfComment[1] = *ptr;
225 break;
226 case PS_TAGNAME:
227 if ((*ptr == '>') || isspace((unsigned char)*ptr)) {
228 currentString.append(start, ptr - start);
229 start = ptr + 1;
230 setTag(currentTag, currentString);
231 currentString.erase();
232 if (currentTag.Name == "") {
233 state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
234 } else {
235 if (*ptr == '>') {
236 if (!tagHandler(currentTag)) {
237 goto endOfProcessing;
239 state = PS_TEXT;
240 } else {
241 state = PS_ATTRIBUTENAME;
245 break;
246 case PS_ATTRIBUTENAME:
247 if ((*ptr == '>') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
248 if ((ptr != start) || !currentString.empty()) {
249 currentString.append(start, ptr - start);
250 for (unsigned int i = 0; i < currentString.length(); ++i) {
251 currentString[i] = toupper(currentString[i]);
253 currentTag.addAttribute(currentString);
254 currentString.erase();
256 start = ptr + 1;
257 if (*ptr == '>') {
258 if (!tagHandler(currentTag)) {
259 goto endOfProcessing;
261 state = PS_TEXT;
262 } else {
263 state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
266 break;
267 case PS_ATTRIBUTEVALUE:
268 if (*ptr == '"') {
269 if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
270 ++quotationCounter;
272 } else if (*ptr == '&') {
273 currentString.append(start, ptr - start);
274 start = ptr + 1;
275 appendString(attributeValueString, currentString);
276 state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
277 state_special = ST_UNKNOWN;
278 } else if ((quotationCounter != 1) && ((*ptr == '>') || isspace((unsigned char)*ptr))) {
279 if ((ptr != start) || !currentString.empty()) {
280 currentString.append(start, ptr - start);
281 if (currentString[0] == '"') {
282 currentString = currentString.substr(1, currentString.length() - 2);
284 appendString(attributeValueString, currentString);
285 currentTag.setLastAttributeValue(attributeValueString);
286 attributeValueString.erase();
287 quotationCounter = 0;
289 start = ptr + 1;
290 if (*ptr == '>') {
291 if (!tagHandler(currentTag)) {
292 goto endOfProcessing;
294 state = PS_TEXT;
295 } else {
296 state = PS_ATTRIBUTENAME;
299 break;
300 case PS_SKIPTAG:
301 if (*ptr == '>') {
302 start = ptr + 1;
303 state = PS_TEXT;
305 break;
308 if (start != endOfBuffer) {
309 switch (state) {
310 case PS_TEXT:
311 if (!characterDataHandler(start, endOfBuffer - start, true)) {
312 goto endOfProcessing;
314 break;
315 case PS_TAGNAME:
316 case PS_ATTRIBUTENAME:
317 case PS_ATTRIBUTEVALUE:
318 currentString.append(start, endOfBuffer - start);
319 break;
320 case PS_SPECIAL:
321 case PS_SPECIAL_IN_ATTRIBUTEVALUE:
322 specialString.append(start, endOfBuffer - start);
323 break;
324 case PS_TAGSTART:
325 case PS_SKIPTAG:
326 case PS_COMMENT:
327 break;
330 offset += length;
331 } while (length == BUFSIZE);
332 endOfProcessing:
333 delete[] buffer;
335 endDocumentHandler();
337 stream.close();