Fix out of bound access to the scanned string
[elinks.git] / src / dom / sgml / scanner.c
blob b0208fb0299a000c0d46b2e2e3cf4280614ace19
1 /* SGML token scanner utilities */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdio.h>
8 #include <string.h>
10 #include "elinks.h"
12 #include "dom/scanner.h"
13 #include "dom/sgml/scanner.h"
14 #include "dom/string.h"
15 #include "util/error.h"
/* Character class flags for the SGML scanner table. Every class occupies its
 * own bit so that one table entry can carry several classes at once. Bit 0 is
 * not used by any of the classes below. */
enum sgml_char_group {
	SGML_CHAR_ENTITY	= 0x02,	/* Bit 1 */
	SGML_CHAR_IDENT		= 0x04,	/* Bit 2 */
	SGML_CHAR_NEWLINE	= 0x08,	/* Bit 3 */
	SGML_CHAR_WHITESPACE	= 0x10,	/* Bit 4 */
	SGML_CHAR_NOT_TEXT	= 0x20,	/* Bit 5 */
	SGML_CHAR_NOT_ATTRIBUTE	= 0x40,	/* Bit 6 */
};
29 static struct dom_scan_table_info sgml_scan_table_info[] = {
30 DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
31 DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
32 DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
33 /* For the octal number impared (me including) \241 is 161 --jonas */
34 DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
36 DOM_SCAN_TABLE_STRING("-_:.", SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
37 DOM_SCAN_TABLE_STRING("#", SGML_CHAR_ENTITY),
38 DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE),
39 DOM_SCAN_TABLE_STRING("\f\n", SGML_CHAR_NEWLINE),
40 DOM_SCAN_TABLE_STRING("<&", SGML_CHAR_NOT_TEXT),
41 DOM_SCAN_TABLE_STRING("<=>", SGML_CHAR_NOT_ATTRIBUTE),
43 DOM_SCAN_TABLE_END,
46 #define SGML_STRING_MAP(str, type, family) \
47 { INIT_DOM_STRING(str, -1), SGML_TOKEN_##type, SGML_TOKEN_##family }
49 static struct dom_scanner_string_mapping sgml_string_mappings[] = {
50 SGML_STRING_MAP("--", NOTATION_COMMENT, NOTATION),
51 SGML_STRING_MAP("ATTLIST", NOTATION_ATTLIST, NOTATION),
52 SGML_STRING_MAP("DOCTYPE", NOTATION_DOCTYPE, NOTATION),
53 SGML_STRING_MAP("ELEMENT", NOTATION_ELEMENT, NOTATION),
54 SGML_STRING_MAP("ENTITY", NOTATION_ENTITY, NOTATION),
56 SGML_STRING_MAP("xml", PROCESS_XML, PROCESS),
57 SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET, PROCESS),
59 DOM_STRING_MAP_END,
62 static struct dom_scanner_token *scan_sgml_tokens(struct dom_scanner *scanner);
64 struct dom_scanner_info sgml_scanner_info = {
65 sgml_string_mappings,
66 sgml_scan_table_info,
67 scan_sgml_tokens,
/* Non-zero when character @c has at least one of the class bits in @bit set
 * in the SGML scanner table. */
#define check_sgml_table(c, bit) (sgml_scanner_info.scan_table[(c)] & (bit))

/* Advance @s for as long as it stays before the end of the scanned string and
 * points at a character in class @bit. Wrapped in do/while so that it acts as
 * a single statement even in an unbraced if/else. */
#define scan_sgml(scanner, s, bit) \
	do { \
		while ((s) < (scanner)->end && check_sgml_table(*(s), bit)) \
			(s)++; \
	} while (0)

/* Character class predicates. The negated ones are fully parenthesized so
 * that they can safely be combined with other operators. */
#define is_sgml_ident(c)	check_sgml_table(c, SGML_CHAR_IDENT)
#define is_sgml_entity(c)	check_sgml_table(c, SGML_CHAR_ENTITY)
#define is_sgml_space(c)	check_sgml_table(c, SGML_CHAR_WHITESPACE)
#define is_sgml_newline(c)	check_sgml_table(c, SGML_CHAR_NEWLINE)
#define is_sgml_text(c)		(!check_sgml_table(c, SGML_CHAR_NOT_TEXT))
/* NOTE(review): SGML_CHAR_TOKEN_START is not declared in enum sgml_char_group
 * in this file, so this macro cannot be expanded as is -- confirm whether it
 * should be removed or the enum entry added. */
#define is_sgml_token_start(c)	check_sgml_table(c, SGML_CHAR_TOKEN_START)
#define is_sgml_attribute(c)	(!check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE))
83 static inline void
84 skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
86 unsigned char *pos = *string;
88 if (!scanner->count_lines) {
89 scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE);
90 } else {
91 while (pos < scanner->end && is_sgml_space(*pos)) {
92 if (is_sgml_newline(*pos))
93 scanner->lineno++;
94 pos++;
98 *string = pos;
102 /* Text token scanning */
/* Step @str forward until it reaches the end of the scanned string or one of
 * the two characters that end a text run ('<' and '&'). The characters are
 * compared directly; I think it is faster to not check the table
 * here --jonas */
#define foreach_sgml_cdata(scanner, str) \
	for (; (str) < (scanner)->end && !(*(str) == '<' || *(str) == '&'); (str)++)
108 static inline void
109 scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
111 unsigned char *string = scanner->position;
112 unsigned char first_char = *string;
113 enum sgml_token_type type = SGML_TOKEN_GARBAGE;
114 int real_length = -1;
116 /* In scan_sgml_tokens() we check that first_char != '<' */
117 assert(first_char != '<' && scanner->state == SGML_STATE_TEXT);
119 token->string.string = string++;
121 if (first_char == '&') {
122 if (is_sgml_entity(*string)) {
123 scan_sgml(scanner, string, SGML_CHAR_ENTITY);
124 type = SGML_TOKEN_ENTITY;
125 token->string.string++;
126 real_length = string - token->string.string;
129 foreach_sgml_cdata (scanner, string) {
130 if (*string == ';') {
131 string++;
132 break;
136 } else {
137 if (is_sgml_space(first_char)) {
138 skip_sgml_space(scanner, &string);
139 type = string < scanner->end && is_sgml_text(*string)
140 ? SGML_TOKEN_TEXT : SGML_TOKEN_SPACE;
141 } else {
142 type = SGML_TOKEN_TEXT;
145 foreach_sgml_cdata (scanner, string) {
146 /* m33p */;
150 token->type = type;
151 token->string.length = real_length >= 0 ? real_length : string - token->string.string;
152 token->precedence = get_sgml_precedence(type);
153 scanner->position = string;
157 /* Element scanning */
/* Returns non-zero when a token of @type may be skipped while searching for
 * @skipto, that is when the precedence of @type does not exceed the
 * precedence of @skipto. */
static inline int
check_sgml_precedence(int type, int skipto)
{
	return !(get_sgml_precedence(type) > get_sgml_precedence(skipto));
}
166 /* Skip until @skipto is found, without taking precedence into account. */
167 static inline unsigned char *
168 skip_sgml_chars(struct dom_scanner *scanner, unsigned char *string,
169 unsigned char skipto)
171 int newlines;
173 assert(string >= scanner->position && string <= scanner->end);
175 if (!scanner->count_lines) {
176 size_t length = scanner->end - string;
178 return memchr(string, skipto, length);
181 for (newlines = 0; string < scanner->end; string++) {
182 if (is_sgml_newline(*string))
183 newlines++;
184 if (*string == skipto) {
185 /* Only count newlines if we actually find the
186 * requested char. Else callers are assumed to discard
187 * the scanning. */
188 scanner->lineno += newlines;
189 return string;
193 return NULL;
196 /* XXX: Only element or ``in tag'' precedence is handled correctly however
197 * using this function for CDATA or text would be overkill. */
198 static inline unsigned char *
199 skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char skipto,
200 int check_quoting)
202 unsigned char *pos = *string;
204 for (; pos < scanner->end; pos++) {
205 if (*pos == skipto) {
206 *string = pos + 1;
207 return pos;
210 if (!check_sgml_precedence(*pos, skipto))
211 break;
213 if (check_quoting && isquote(*pos)) {
214 unsigned char *end;
216 end = skip_sgml_chars(scanner, pos + 1, *pos);
217 if (end) pos = end;
219 } else if (scanner->count_lines && is_sgml_newline(*pos)) {
220 scanner->lineno++;
224 *string = pos;
225 return NULL;
228 static inline int
229 skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
231 unsigned char *pos = *string;
232 int length = 0;
234 for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
235 /* It is always safe to access index -2 and -1 here since we
236 * are supposed to have '<!--' before this is called. We do
237 * however need to check that the '-->' are not overlapping any
238 * preceeding '-'. */
239 if (pos[-2] == '-' && pos[-1] == '-' && &pos[-2] >= *string) {
240 length = pos - *string - 2;
241 pos++;
242 break;
246 if (!pos) {
247 pos = scanner->end;
248 length = pos - *string;
251 *string = pos;
252 return length;
255 static inline int
256 skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
258 unsigned char *pos = *string;
259 int length = 0;
261 for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
262 /* It is always safe to access index -2 and -1 here since we
263 * are supposed to have '<![CDATA[' before this is called. */
264 if (pos[-2] == ']' && pos[-1] == ']') {
265 length = pos - *string - 2;
266 pos++;
267 break;
271 if (!pos) {
272 pos = scanner->end;
273 length = pos - *string;
276 *string = pos;
277 return length;
/* Advance @str for as long as it stays before the end of the scanned string
 * and points at an attribute character. Wrapped in do/while so that it acts
 * as a single statement even in an unbraced if/else. */
#define scan_sgml_attribute(scanner, str) \
	do { \
		while ((str) < (scanner)->end && is_sgml_attribute(*(str))) \
			(str)++; \
	} while (0)
284 static inline void
285 scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
287 unsigned char *string = scanner->position;
288 unsigned char first_char = *string;
289 enum sgml_token_type type = SGML_TOKEN_GARBAGE;
290 int real_length = -1;
292 token->string.string = string++;
294 if (first_char == '<') {
295 skip_sgml_space(scanner, &string);
297 if (string == scanner->end) {
298 /* Prevent out of bound access. */
300 } else if (scanner->state == SGML_STATE_ELEMENT) {
301 /* Already inside an element so insert a tag end token
302 * and continue scanning in next iteration. */
303 string--;
304 real_length = 0;
305 type = SGML_TOKEN_TAG_END;
306 scanner->state = SGML_STATE_TEXT;
308 } else if (is_sgml_ident(*string)) {
309 token->string.string = string;
310 scan_sgml(scanner, string, SGML_CHAR_IDENT);
312 real_length = string - token->string.string;
314 skip_sgml_space(scanner, &string);
315 if (string < scanner->end && *string == '>') {
316 type = SGML_TOKEN_ELEMENT;
317 string++;
318 } else {
319 scanner->state = SGML_STATE_ELEMENT;
320 type = SGML_TOKEN_ELEMENT_BEGIN;
323 } else if (*string == '!') {
324 unsigned char *ident;
325 enum sgml_token_type base = SGML_TOKEN_NOTATION;
327 string++;
328 skip_sgml_space(scanner, &string);
329 token->string.string = ident = string;
331 if (string + 1 < scanner->end
332 && string[0] == '-' && string[1] == '-') {
333 string += 2;
334 type = SGML_TOKEN_NOTATION_COMMENT;
335 token->string.string = string;
336 real_length = skip_sgml_comment(scanner, &string);
337 assert(real_length >= 0);
339 } else if (string + 6 < scanner->end
340 && !memcmp(string, "[CDATA[", 7)) {
342 string += 7;
343 type = SGML_TOKEN_CDATA_SECTION;
344 token->string.string = string;
345 real_length = skip_sgml_cdata_section(scanner, &string);
346 assert(real_length >= 0);
348 } else {
349 skip_sgml_space(scanner, &string);
350 type = map_dom_scanner_string(scanner, ident, string, base);
351 skip_sgml(scanner, &string, '>', 0);
354 } else if (*string == '?') {
355 unsigned char *pos;
356 enum sgml_token_type base = SGML_TOKEN_PROCESS;
358 string++;
359 skip_sgml_space(scanner, &string);
360 token->string.string = pos = string;
361 scan_sgml(scanner, string, SGML_CHAR_IDENT);
363 type = map_dom_scanner_string(scanner, pos, string, base);
365 scanner->state = SGML_STATE_PROC_INST;
367 } else if (*string == '/') {
368 string++;
369 skip_sgml_space(scanner, &string);
371 if (string == scanner->end) {
372 /* Prevent out of bound access. */
374 } else if (is_sgml_ident(*string)) {
375 token->string.string = string;
376 scan_sgml(scanner, string, SGML_CHAR_IDENT);
377 real_length = string - token->string.string;
379 type = SGML_TOKEN_ELEMENT_END;
380 skip_sgml(scanner, &string, '>', 1);
382 } else if (*string == '>') {
383 string++;
384 real_length = 0;
385 type = SGML_TOKEN_ELEMENT_END;
388 if (type != SGML_TOKEN_GARBAGE)
389 scanner->state = SGML_STATE_TEXT;
391 } else {
392 /* Alien < > stuff so ignore it */
393 skip_sgml(scanner, &string, '>', 0);
396 } else if (first_char == '=') {
397 type = '=';
399 } else if (first_char == '?' || first_char == '>') {
400 if (first_char == '?') {
401 skip_sgml(scanner, &string, '>', 0);
404 type = SGML_TOKEN_TAG_END;
405 assert(scanner->state == SGML_STATE_ELEMENT);
406 scanner->state = SGML_STATE_TEXT;
408 } else if (first_char == '/') {
409 if (string == scanner->end) {
410 /* Prevent out of bound access. */
412 } else if (*string == '>') {
413 string++;
414 real_length = 0;
415 type = SGML_TOKEN_ELEMENT_EMPTY_END;
416 assert(scanner->state == SGML_STATE_ELEMENT);
417 scanner->state = SGML_STATE_TEXT;
418 } else if (is_sgml_attribute(*string)) {
419 scan_sgml_attribute(scanner, string);
420 type = SGML_TOKEN_ATTRIBUTE;
421 if (string[-1] == '/' && string[0] == '>')
422 string--;
425 } else if (isquote(first_char)) {
426 unsigned char *string_end = skip_sgml_chars(scanner, string, first_char);
428 if (string_end) {
429 /* We don't want the delimiters in the token */
430 token->string.string++;
431 real_length = string_end - token->string.string;
432 string = string_end + 1;
433 type = SGML_TOKEN_STRING;
435 } else if (string < scanner->end
436 && is_sgml_attribute(*string)) {
438 token->string.string++;
439 scan_sgml_attribute(scanner, string);
440 type = SGML_TOKEN_ATTRIBUTE;
443 } else if (is_sgml_attribute(first_char)) {
444 if (is_sgml_ident(first_char)) {
445 scan_sgml(scanner, string, SGML_CHAR_IDENT);
446 type = SGML_TOKEN_IDENT;
449 if (string < scanner->end
450 && is_sgml_attribute(*string)) {
451 scan_sgml_attribute(scanner, string);
452 type = SGML_TOKEN_ATTRIBUTE;
453 if (string[-1] == '/' && string[0] == '>')
454 string--;
458 token->type = type;
459 token->string.length = real_length >= 0 ? real_length : string - token->string.string;
460 token->precedence = get_sgml_precedence(type);
461 scanner->position = string;
465 /* Processing instruction data scanning */
467 static inline void
468 scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
470 unsigned char *string = scanner->position;
472 token->string.string = string;
474 /* Figure out where the processing instruction ends. This doesn't use
475 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
476 * the data part to be skipped correctly. */
477 for ( ; (string = skip_sgml_chars(scanner, string, '>')); string++) {
478 if (string[-1] == '?') {
479 string++;
480 break;
484 if (!string) string = scanner->end;
486 token->type = SGML_TOKEN_PROCESS_DATA;
487 token->string.length = string - token->string.string - 2;
488 token->precedence = get_sgml_precedence(token->type);
489 scanner->position = string;
490 scanner->state = SGML_STATE_TEXT;
494 /* Scanner multiplexor */
496 static struct dom_scanner_token *
497 scan_sgml_tokens(struct dom_scanner *scanner)
499 struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS;
500 struct dom_scanner_token *current;
502 if (!begin_dom_token_scanning(scanner))
503 return get_dom_scanner_token(scanner);
505 /* Scan tokens until we fill the table */
506 for (current = scanner->table + scanner->tokens;
507 current < table_end && scanner->position < scanner->end;
508 current++) {
509 if (scanner->state == SGML_STATE_ELEMENT
510 || (*scanner->position == '<'
511 && scanner->state != SGML_STATE_PROC_INST)) {
512 skip_sgml_space(scanner, &scanner->position);
513 if (scanner->position >= scanner->end) break;
515 scan_sgml_element_token(scanner, current);
517 /* Shall we scratch this token? */
518 if (current->type == SGML_TOKEN_SKIP) {
519 current--;
522 } else if (scanner->state == SGML_STATE_TEXT) {
523 scan_sgml_text_token(scanner, current);
525 } else {
526 skip_sgml_space(scanner, &scanner->position);
527 scan_sgml_proc_inst_token(scanner, current);
531 return end_dom_token_scanning(scanner, current);