get_attr_value: do not do trim_chars
[elinks.git] / src / document / html / parser / parse.c
blob135f64599b71800de89e500b55aa3242d8e00099
1 /* HTML core parser routines */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <errno.h>
8 #include <stdarg.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
13 #include "elinks.h"
15 #include "document/css/apply.h"
16 #include "document/css/parser.h"
17 #include "document/html/parser/forms.h"
18 #include "document/html/parser/general.h"
19 #include "document/html/parser/link.h"
20 #include "document/html/parser/parse.h"
21 #include "document/html/parser/stack.h"
22 #include "document/html/parser.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memdebug.h"
29 #include "util/memory.h"
30 #include "util/string.h"
32 /* Unsafe macros */
33 #include "document/html/internal.h"
/* True if @c terminates (or starts) a tag.  Note: evaluates @c twice. */
#define end_of_tag(c) ((c) == '>' || (c) == '<')

/* Tell whether @c may be part of an attribute name: a 7-bit character
 * that is above ' ' and is none of '=', '<' or '>'.  DEL (127) and all
 * bytes >= 128 are rejected. */
static inline int
atchr(unsigned char c)
{
	if (c >= 127) return 0;

	/* Everything above '>' is past the three excluded punctuators. */
	if (c > '>') return 1;

	return c > ' ' && c != '=' && !end_of_tag(c);
}
/* This function eats one html element. */
/* - e is pointer to the beginning of the element (*e must be '<')
 * - eof is pointer to the end of scanned area
 * - parsed element name is stored in name, its length is namelen
 *   (namelen is only written when name is given too)
 * - first attribute is stored in attr
 * - end points to first character behind the html element
 * Any of name, namelen, attr and end may be NULL. */
/* It returns -1 when it failed (returned values in pointers are invalid) and
 * 0 for success.  Running into @eof before the element is complete counts
 * as failure. */
int
parse_element(register unsigned char *e, unsigned char *eof,
	      unsigned char **name, int *namelen,
	      unsigned char **attr, unsigned char **end)
{
/* Advance @e by one; bail out of parse_element() when @eof is reached. */
#define next_char() do { if (++e == eof) return -1; } while (0)

	assert(e && eof);
	if (e >= eof || *e != '<') return -1;

	next_char();
	if (name) *name = e;

	/* The leading '/' of a closing tag is reported as part of the name. */
	if (*e == '/') next_char();
	if (!isident(*e)) return -1;

	while (isident(*e)) next_char();

	if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')
		return -1;

	if (name && namelen) *namelen = e - *name;

	while (isspace(*e) || *e == '/' || *e == ':') next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (attr) *attr = e;

next_attr:
	while (isspace(*e)) next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (end_of_tag(*e)) goto end;

	/* Attribute name, then optional whitespace before '='. */
	while (atchr(*e)) next_char();
	while (isspace(*e)) next_char();

	if (*e != '=') {
		/* Valueless attribute. */
		if (end_of_tag(*e)) goto end;
		goto next_attr;
	}
	next_char();

	while (isspace(*e)) next_char();

	if (isquote(*e)) {
		unsigned char quote = *e;

/* quoted_value: */
		next_char();
		while (*e != quote) next_char();
		next_char();
		/* The following apparently handles the case of <foo
		 * id="a""b">, however that is very rare and probably not
		 * conforming. More frequent (and mishandling it more fatal) is
		 * probably the typo of <foo id="a""> - we can handle it as
		 * long as this is commented out. --pasky */
		/* if (*e == quote) goto quoted_value; */
	} else {
		while (!isspace(*e) && !end_of_tag(*e)) next_char();
	}

	while (isspace(*e)) next_char();

	if (!end_of_tag(*e)) goto next_attr;

end:
	/* A '>' belongs to this element; a '<' starts the next one. */
	if (end) *end = e + (*e == '>');

	return 0;

#undef next_char
}
129 #define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, 0xFF)
131 #define add_chr(s, l, c) \
132 do { \
133 if (!realloc_chrs(&(s), l)) return NULL; \
134 (s)[(l)++] = (c); \
135 } while (0)
137 unsigned char *
138 get_attr_value(register unsigned char *e, unsigned char *name,
139 int cp, enum html_attr_flags flags)
141 unsigned char *n;
142 unsigned char *name_start;
143 unsigned char *attr = NULL;
144 int attrlen = 0;
145 int found;
147 next_attr:
148 skip_space(e);
149 if (end_of_tag(*e) || !atchr(*e)) goto parse_error;
150 n = name;
151 name_start = e;
153 while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;
154 found = !*n && !atchr(*e);
156 if (found && (flags & HTML_ATTR_TEST)) return name_start;
158 while (atchr(*e)) e++;
159 skip_space(e);
160 if (*e != '=') {
161 if (found) goto found_endattr;
162 goto next_attr;
164 e++;
165 skip_space(e);
167 if (found) {
168 if (!isquote(*e)) {
169 while (!isspace(*e) && !end_of_tag(*e)) {
170 if (!*e) goto parse_error;
171 add_chr(attr, attrlen, *e);
172 e++;
174 } else {
175 unsigned char quote = *e;
177 /* parse_quoted_value: */
178 while (*(++e) != quote) {
179 if (*e == ASCII_CR) continue;
180 if (!*e) goto parse_error;
181 if (*e != ASCII_TAB && *e != ASCII_LF)
182 add_chr(attr, attrlen, *e);
183 else if (!(flags & HTML_ATTR_EAT_NL))
184 add_chr(attr, attrlen, ' ');
186 e++;
187 /* The following apparently handles the case of <foo
188 * id="a""b">, however that is very rare and probably
189 * not conforming. More frequent (and mishandling it
190 * more fatal) is probably the typo of <foo id="a""> -
191 * we can handle it as long as this is commented out.
192 * --pasky */
193 #if 0
194 if (*e == quote) {
195 add_chr(attr, attrlen, *e);
196 goto parse_quoted_value;
198 #endif
201 found_endattr:
202 add_chr(attr, attrlen, '\0');
203 attrlen--;
205 if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
206 memchr(attr, '&', attrlen)) {
207 unsigned char *saved_attr = attr;
209 attr = convert_string(NULL, saved_attr, attrlen, cp,
210 CSM_QUERY, NULL, NULL, NULL);
211 mem_free(saved_attr);
214 set_mem_comment(attr, name, strlen(name));
215 return attr;
217 } else {
218 if (!isquote(*e)) {
219 while (!isspace(*e) && !end_of_tag(*e)) {
220 if (!*e) goto parse_error;
221 e++;
223 } else {
224 unsigned char quote = *e;
226 do {
227 while (*(++e) != quote)
228 if (!*e) goto parse_error;
229 e++;
230 } while (/* See above. *e == quote */ 0);
234 goto next_attr;
236 parse_error:
237 mem_free_if(attr);
238 return NULL;
241 #undef add_chr
/* Extract numerical value of attribute @name.
 * It will return a non-negative integer value on success,
 * or -1 on error (attribute missing, empty, not entirely a decimal
 * number, negative, or larger than INT_MAX). */
int
get_num(unsigned char *a, unsigned char *name, int cp)
{
	unsigned char *al = get_attr_val(a, name, cp);
	int result = -1;

	if (al) {
		char *end;
		long num;

		errno = 0;
		num = strtol((char *) al, &end, 10);
		/* Require a non-empty value that was consumed completely. */
		if (!errno && *al && !*end && num >= 0 && num <= INT_MAX)
			result = (int) num;

		mem_free(al);
	}

	return result;
}
268 /* Parse 'width[%],....'-like attribute @name of element @a. If @limited is
269 * set, it will limit the width value to the current usable width. Note that
270 * @limited must be set to be able to parse percentage widths. */
271 /* The function returns width in characters or -1 in case of error. */
273 get_width(unsigned char *a, unsigned char *name, int limited,
274 struct html_context *html_context)
276 unsigned char *value = get_attr_val(a, name, html_context->options->cp);
277 unsigned char *str = value;
278 unsigned char *end;
279 int percentage = 0;
280 int len;
281 long width;
283 if (!value) return -1;
285 /* Skip spaces at start of string if any. */
286 skip_space(str);
288 /* Search for end of string or ',' character (ie. in "100,200") */
289 for (len = 0; str[len] && str[len] != ','; len++);
291 /* Go back, and skip spaces after width if any. */
292 while (len && isspace(str[len - 1])) len--;
293 if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
295 /* Is this a percentage ? */
296 if (str[len - 1] == '%') len--, percentage = 1;
298 /* Skip spaces between width number and percentage if any. */
299 while (len && isspace(str[len - 1])) len--;
300 if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
302 /* Shorten the string a bit, so strtoul() will work on useful
303 * part of it. */
304 str[len] = '\0';
306 /* Convert to number if possible. */
307 errno = 0;
308 width = strtoul((char *) str, (char **) &end, 10);
310 /* @end points into the @value string so check @end position
311 * before freeing @value. */
312 /* We will accept floats but ceil() them. */
313 if (errno || (*end && *end != '.') || width >= INT_MAX) {
314 /* Not a valid number. */
315 mem_free(value);
316 return -1;
319 mem_free(value);
321 #define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;
323 if (limited) {
324 int maxwidth = get_html_max_width();
326 if (percentage) {
327 /* Value is a percentage. */
328 width = width * maxwidth / 100;
329 } else {
330 /* Value is a number of pixels, makes an approximation. */
331 width = WIDTH_PIXELS2CHARS(width);
334 if (width > maxwidth)
335 width = maxwidth;
337 } else {
338 if (percentage) {
339 /* No sense, we need @limited and @maxwidth for percentage. */
340 return -1;
341 } else {
342 /* Value is a number of pixels, makes an approximation,
343 * no limit here */
344 width = WIDTH_PIXELS2CHARS(width);
348 #undef WIDTH_PIXELS2CHARS
350 if (width < 0)
351 width = 0;
353 return width;
/* Skip an SGML/HTML comment or a "<!...>" / "<?...>" declaration.
 *
 * @html points to the opening '<' (the caller has already checked the
 * following '!' or '?'), @eof to the end of the scanned area.
 *
 * - "<!--" comments end at "--" followed by any number of extra '-' and
 *   whitespace and then '>'.
 * - Anything else ends at the first '>'.
 *
 * Returns a pointer just past the closing '>', or @eof when the construct
 * is not terminated inside the buffer. */
unsigned char *
skip_comment(unsigned char *html, unsigned char *eof)
{
	/* Compare remaining lengths rather than forming html + N, which
	 * could point beyond the end of the buffer. */
	if (eof - html < 2) return eof;

	if (eof - html >= 4 && html[2] == '-' && html[3] == '-') {
		html += 4;
		while (html < eof) {
			if (eof - html >= 2 && html[0] == '-' && html[1] == '-') {
				html += 2;
				while (html < eof && *html == '-') html++;
				while (html < eof && isspace(*html)) html++;
				if (html >= eof) return eof;
				if (*html == '>') return html + 1;
				/* "--" not followed by '>': keep looking,
				 * re-examining the current character. */
				continue;
			}
			html++;
		}

	} else {
		html += 2;
		while (html < eof) {
			if (html[0] == '>') return html + 1;
			html++;
		}
	}

	return eof;
}
/* How an element interacts with the element stack. */
enum element_type {
	/* Ordinary paired element; may contain itself. */
	ET_NESTABLE,
	/* Paired element; opening it makes start_element() look for an
	 * open element of the same name on the stack and kill it. */
	ET_NON_NESTABLE,
	/* Element without a closing tag; never pushed on the stack. */
	ET_NON_PAIRABLE,
	/* <li>-like: gets its own stack-killing rule in start_element()
	 * and its closing tag is ignored by end_element(). */
	ET_LI,
};
395 struct element_info {
396 /* Element name, uppercase. */
397 unsigned char *name;
399 /* Element handler. This does the relevant arguments processing and
400 * formatting (by calling renderer hooks). Note that in a few cases,
401 * this is just a placeholder and the element is given special care
402 * in start_element() (which is also where we call these handlers). */
403 element_handler_T *open;
405 element_handler_T *close;
407 /* How many line-breaks to ensure we have before and after an element.
408 * Value of 1 means the element will be on a line on its own, value
409 * of 2 means that it will also have empty lines before and after.
410 * Note that this does not add up - it just ensures that there is
411 * at least so many linebreaks, but does not add more if that is the
412 * case. Therefore, something like e.g. </pre></p> will add only two
413 * linebreaks, not four. */
414 /* In some stack killing logic, we use some weird heuristic based on
415 * whether an element is block or inline. That is determined from
416 * whether this attribute is zero on non-zero. */
417 int linebreak;
419 enum element_type type;
422 static struct element_info elements[] = {
423 {"A", html_a, NULL, 0, ET_NON_NESTABLE},
424 {"ABBR", html_italic, NULL, 0, ET_NESTABLE },
425 {"ADDRESS", html_address, NULL, 2, ET_NESTABLE },
426 {"APPLET", html_applet, NULL, 1, ET_NON_PAIRABLE},
427 {"B", html_bold, NULL, 0, ET_NESTABLE },
428 {"BASE", html_base, NULL, 0, ET_NON_PAIRABLE},
429 {"BASEFONT", html_font, NULL, 0, ET_NON_PAIRABLE},
430 {"BLOCKQUOTE", html_blockquote, NULL, 2, ET_NESTABLE },
431 {"BODY", html_body, NULL, 0, ET_NESTABLE },
432 {"BR", html_br, NULL, 1, ET_NON_PAIRABLE},
433 {"BUTTON", html_button, NULL, 0, ET_NESTABLE },
434 {"CAPTION", html_center, NULL, 1, ET_NESTABLE },
435 {"CENTER", html_center, NULL, 1, ET_NESTABLE },
436 {"CODE", html_fixed, NULL, 0, ET_NESTABLE },
437 {"DD", html_dd, NULL, 1, ET_NON_PAIRABLE},
438 {"DFN", html_bold, NULL, 0, ET_NESTABLE },
439 {"DIR", html_ul, NULL, 2, ET_NESTABLE },
440 {"DIV", html_linebrk, NULL, 1, ET_NESTABLE },
441 {"DL", html_dl, NULL, 2, ET_NESTABLE },
442 {"DT", html_dt, NULL, 1, ET_NON_PAIRABLE},
443 {"EM", html_italic, NULL, 0, ET_NESTABLE },
444 {"EMBED", html_embed, NULL, 0, ET_NON_PAIRABLE},
445 {"FIXED", html_fixed, NULL, 0, ET_NESTABLE },
446 {"FONT", html_font, NULL, 0, ET_NESTABLE },
447 {"FORM", html_form, NULL, 1, ET_NESTABLE },
448 {"FRAME", html_frame, NULL, 1, ET_NON_PAIRABLE},
449 {"FRAMESET", html_frameset, NULL, 1, ET_NESTABLE },
450 {"H1", html_h1, NULL, 2, ET_NON_NESTABLE},
451 {"H2", html_h2, NULL, 2, ET_NON_NESTABLE},
452 {"H3", html_h3, NULL, 2, ET_NON_NESTABLE},
453 {"H4", html_h4, NULL, 2, ET_NON_NESTABLE},
454 {"H5", html_h5, NULL, 2, ET_NON_NESTABLE},
455 {"H6", html_h6, NULL, 2, ET_NON_NESTABLE},
456 {"HEAD", html_head, NULL, 0, ET_NESTABLE },
457 {"HR", html_hr, NULL, 2, ET_NON_PAIRABLE},
458 {"HTML", html_html, html_html_close, 0, ET_NESTABLE },
459 {"I", html_italic, NULL, 0, ET_NESTABLE },
460 {"IFRAME", html_iframe, NULL, 1, ET_NON_PAIRABLE},
461 {"IMG", html_img, NULL, 0, ET_NON_PAIRABLE},
462 {"INPUT", html_input, NULL, 0, ET_NON_PAIRABLE},
463 {"LI", html_li, NULL, 1, ET_LI },
464 {"LINK", html_link, NULL, 1, ET_NON_PAIRABLE},
465 {"LISTING", html_pre, NULL, 2, ET_NESTABLE },
466 {"MENU", html_ul, NULL, 2, ET_NESTABLE },
467 {"META", html_meta, NULL, 0, ET_NON_PAIRABLE},
468 {"NOFRAMES", html_noframes, NULL, 0, ET_NESTABLE },
469 {"NOSCRIPT", html_noscript, NULL, 0, ET_NESTABLE },
470 {"OBJECT", html_object, NULL, 1, ET_NON_PAIRABLE},
471 {"OL", html_ol, NULL, 2, ET_NESTABLE },
472 {"OPTION", html_option, NULL, 1, ET_NON_PAIRABLE},
473 {"P", html_p, NULL, 2, ET_NON_NESTABLE},
474 {"PRE", html_pre, NULL, 2, ET_NESTABLE },
475 {"Q", html_quote, html_quote_close, 0, ET_NESTABLE },
476 {"S", html_underline, NULL, 0, ET_NESTABLE },
477 {"SCRIPT", html_script, NULL, 0, ET_NESTABLE },
478 {"SELECT", html_select, NULL, 0, ET_NESTABLE },
479 {"SPAN", html_span, NULL, 0, ET_NESTABLE },
480 {"STRIKE", html_underline, NULL, 0, ET_NESTABLE },
481 {"STRONG", html_bold, NULL, 0, ET_NESTABLE },
482 {"STYLE", html_style, html_style_close, 0, ET_NESTABLE },
483 {"SUB", html_subscript, html_subscript_close, 0, ET_NESTABLE },
484 {"SUP", html_superscript, NULL, 0, ET_NESTABLE },
485 {"TABLE", html_table, NULL, 2, ET_NESTABLE },
486 {"TD", html_td, NULL, 0, ET_NESTABLE },
487 {"TEXTAREA", html_textarea, NULL, 0, ET_NON_PAIRABLE},
488 {"TH", html_th, NULL, 0, ET_NESTABLE },
489 {"TITLE", html_title, NULL, 0, ET_NESTABLE },
490 {"TR", html_tr, NULL, 1, ET_NESTABLE },
491 {"TT", html_tt, NULL, 0, ET_NON_NESTABLE},
492 {"U", html_underline, NULL, 0, ET_NESTABLE },
493 {"UL", html_ul, NULL, 2, ET_NESTABLE },
494 {"XMP", html_xmp, html_xmp_close, 2, ET_NESTABLE },
495 {NULL, NULL, NULL, 0, ET_NESTABLE },
498 #define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
501 #ifndef USE_FASTFIND
503 static int
504 compar(const void *a, const void *b)
506 return strcasecmp(((struct element_info *) a)->name,
507 ((struct element_info *) b)->name);
510 #else
512 static struct element_info *internal_pointer;
514 /* Reset internal list pointer */
515 static void
516 tags_list_reset(void)
518 internal_pointer = elements;
521 /* Returns a pointer to a struct that contains
522 * current key and data pointers and increment
523 * internal pointer.
524 * It returns NULL when key is NULL. */
525 static struct fastfind_key_value *
526 tags_list_next(void)
528 static struct fastfind_key_value kv;
530 if (!internal_pointer->name) return NULL;
532 kv.key = internal_pointer->name;
533 kv.data = internal_pointer;
535 internal_pointer++;
537 return &kv;
540 static struct fastfind_index ff_tags_index
541 = INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);
543 #endif /* USE_FASTFIND */
/* Build the element-name lookup index.  Does nothing unless
 * USE_FASTFIND is defined. */
void
init_tags_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_tags_index, FF_COMPRESS);
#endif
}
/* Release the element-name lookup index.  Does nothing unless
 * USE_FASTFIND is defined. */
void
free_tags_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_tags_index);
#endif
}
/* Forward declaration; parse_html() calls it before its definition. */
static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,
				      unsigned char *html, unsigned char *prev_html,
				      unsigned char *eof, unsigned char *attr,
				      struct html_context *html_context);
/* Count the consecutive newline entity references (e.g. "&#13;") at
 * the beginning of the range from @html to @eof.  Store the number of
 * newlines to *@newlines_out and return the address where they end.
 * A CR immediately followed by an LF counts as a single newline.
 *
 * Decimal (&#10; &#13;) and hexadecimal (&#xA; &#xD;) forms are
 * recognized, with any number of leading zeros.
 *
 * This function currently requires a semicolon at the end of any
 * entity reference, and does not support U+2028 LINE SEPARATOR and
 * U+2029 PARAGRAPH SEPARATOR. */
static const unsigned char *
count_newline_entities(const unsigned char *html, const unsigned char *eof,
		       int *newlines_out)
{
	int newlines = 0;
	int prev_was_cr = 0; /* treat CRLF as one newline, not two */

	/* The shortest reference ("&#10;", "&#xa;") is 5 bytes long, so
	 * one that ends exactly at @eof must still be accepted. */
	while (eof - html >= 5 && html[0] == '&' && html[1] == '#') {
		const unsigned char *peek = html + 2;
		int this_is_cr;

		if (*peek == 'x' || *peek == 'X') {
			++peek;
			while (peek < eof && *peek == '0')
				++peek;
			if (peek == eof)
				break;
			else if (*peek == 'a' || *peek == 'A')
				this_is_cr = 0;
			else if (*peek == 'd' || *peek == 'D')
				this_is_cr = 1;
			else
				break;
			++peek;
		} else {
			while (peek < eof && *peek == '0')
				++peek;
			if (eof - peek < 2 || *peek != '1')
				break;
			else if (peek[1] == '0')
				this_is_cr = 0;
			else if (peek[1] == '3')
				this_is_cr = 1;
			else
				break;
			peek += 2;
		}
		/* @peek should now be pointing to the semicolon of
		 * e.g. "&#00013;" or "&#x00a;".  Or more digits might
		 * follow. */
		if (peek == eof || *peek != ';')
			break;
		++peek;

		/* An LF right after a CR completes a CRLF pair. */
		if (this_is_cr || !prev_was_cr)
			++newlines;
		prev_was_cr = this_is_cr;
		html = peek;
	}

	*newlines_out = newlines;
	return html;
}
629 void
630 parse_html(unsigned char *html, unsigned char *eof,
631 struct part *part, unsigned char *head,
632 struct html_context *html_context)
634 unsigned char *base_pos = html;
635 int noupdate = 0;
637 html_context->putsp = HTML_SPACE_SUPPRESS;
638 html_context->line_breax = html_context->table_level ? 2 : 1;
639 html_context->position = 0;
640 html_context->was_br = 0;
641 html_context->was_li = 0;
642 html_context->was_body = 0;
643 /* html_context->was_body_background = 0; */
644 html_context->part = part;
645 html_context->eoff = eof;
646 if (head) process_head(html_context, head);
648 main_loop:
649 while (html < eof) {
650 unsigned char *name, *attr, *end;
651 int namelen, endingtag;
652 int dotcounter = 0;
654 if (!noupdate) {
655 html_context->part = part;
656 html_context->eoff = eof;
657 base_pos = html;
658 } else {
659 noupdate = 0;
662 if (isspace(*html) && !html_is_preformatted()) {
663 unsigned char *h = html;
665 while (h < eof && isspace(*h))
666 h++;
667 if (h + 1 < eof && h[0] == '<' && h[1] == '/') {
668 if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {
669 put_chrs(html_context, base_pos, html - base_pos);
670 base_pos = html = h;
671 html_context->putsp = HTML_SPACE_ADD;
672 goto element;
675 html++;
676 if (!(html_context->position + (html - base_pos - 1)))
677 goto skip_w; /* ??? */
678 if (*(html - 1) == ' ') { /* Do not replace with isspace() ! --Zas */
679 /* BIG performance win; not sure if it doesn't cause any bug */
680 if (html < eof && !isspace(*html)) {
681 noupdate = 1;
682 continue;
684 put_chrs(html_context, base_pos, html - base_pos);
685 } else {
686 put_chrs(html_context, base_pos, html - base_pos - 1);
687 put_chrs(html_context, " ", 1);
690 skip_w:
691 while (html < eof && isspace(*html))
692 html++;
693 continue;
696 if (html_is_preformatted()) {
697 html_context->putsp = HTML_SPACE_NORMAL;
698 if (*html == ASCII_TAB) {
699 put_chrs(html_context, base_pos, html - base_pos);
700 put_chrs(html_context, " ",
701 8 - (html_context->position % 8));
702 html++;
703 continue;
705 } else if (*html == ASCII_CR || *html == ASCII_LF) {
706 put_chrs(html_context, base_pos, html - base_pos);
707 if (html - base_pos == 0 && html_context->line_breax > 0)
708 html_context->line_breax--;
709 next_break:
710 if (*html == ASCII_CR && html < eof - 1
711 && html[1] == ASCII_LF)
712 html++;
713 ln_break(html_context, 1);
714 html++;
715 if (*html == ASCII_CR || *html == ASCII_LF) {
716 html_context->line_breax = 0;
717 goto next_break;
719 continue;
721 } else if (html + 5 < eof && *html == '&') {
722 /* Really nasty hack to make &#13; handling in
723 * <pre>-tags lynx-compatible. It works around
724 * the entity handling done in the renderer,
725 * since checking #13 value there would require
726 * something along the lines of NBSP_CHAR or
727 * checking for '\n's in AT_PREFORMATTED text. */
728 /* See bug 52 and 387 for more info. */
729 int length = html - base_pos;
730 int newlines;
732 html = (unsigned char *) count_newline_entities(html, eof, &newlines);
733 if (newlines) {
734 put_chrs(html_context, base_pos, length);
735 ln_break(html_context, newlines);
736 continue;
741 while (*html < ' ') {
742 if (html - base_pos)
743 put_chrs(html_context, base_pos, html - base_pos);
745 dotcounter++;
746 base_pos = ++html;
747 if (*html >= ' ' || isspace(*html) || html >= eof) {
748 unsigned char *dots = fmem_alloc(dotcounter);
750 if (dots) {
751 memset(dots, '.', dotcounter);
752 put_chrs(html_context, dots, dotcounter);
753 fmem_free(dots);
755 goto main_loop;
759 if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')
760 && !(html_context->was_xmp || html_context->was_style)) {
761 put_chrs(html_context, base_pos, html - base_pos);
762 html = skip_comment(html, eof);
763 continue;
766 if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {
767 html++;
768 noupdate = 1;
769 continue;
772 element:
773 endingtag = *name == '/'; name += endingtag; namelen -= endingtag;
774 if (!endingtag && html_context->putsp == HTML_SPACE_ADD && !html_top->invisible)
775 put_chrs(html_context, " ", 1);
776 put_chrs(html_context, base_pos, html - base_pos);
777 if (!html_is_preformatted() && !endingtag && html_context->putsp == HTML_SPACE_NORMAL) {
778 unsigned char *ee = end;
779 unsigned char *nm;
781 while (!parse_element(ee, eof, &nm, NULL, NULL, &ee))
782 if (*nm == '/')
783 goto ng;
784 if (ee < eof && isspace(*ee)) {
785 put_chrs(html_context, " ", 1);
789 html = process_element(name, namelen, endingtag, end, html, eof, attr, html_context);
792 if (noupdate) put_chrs(html_context, base_pos, html - base_pos);
793 ln_break(html_context, 1);
794 /* Restore the part in case the html_context was trashed in the last
795 * iteration so that when destroying the stack in the caller we still
796 * get the right part pointer. */
797 html_context->part = part;
798 html_context->putsp = HTML_SPACE_SUPPRESS;
799 html_context->position = 0;
800 html_context->was_br = 0;
803 static unsigned char *
804 start_element(struct element_info *ei,
805 unsigned char *name, int namelen,
806 unsigned char *html,
807 unsigned char *eof, unsigned char *attr,
808 struct html_context *html_context)
810 #define ELEMENT_RENDER_PROLOGUE \
811 ln_break(html_context, ei->linebreak); \
812 a = get_attr_val(attr, "id", html_context->options->cp); \
813 if (a) { \
814 html_context->special_f(html_context, SP_TAG, a); \
815 mem_free(a); \
818 unsigned char *a;
819 struct par_attrib old_format;
820 int restore_format;
821 #ifdef CONFIG_CSS
822 struct css_selector *selector = NULL;
823 #endif
825 if (html_top->type == ELEMENT_WEAK) {
826 pop_html_element(html_context);
829 /* We try to process nested <script> if we didn't process the parent
830 * one. */
831 if (html_top->invisible
832 && (ei->open != html_script || html_top->invisible < 2)) {
833 ELEMENT_RENDER_PROLOGUE
834 return html;
837 restore_format = html_is_preformatted();
838 old_format = par_format;
840 /* Support for <meta refresh="..."> inside <body>. (bug 700) */
841 if (ei->open == html_meta && html_context->was_body) {
842 html_handle_body_meta(html_context, name - 1, eof);
843 html_context->was_body = 0;
846 #ifdef CONFIG_CSS
847 if (ei->open == html_style && html_context->options->css_enable) {
848 css_parse_stylesheet(&html_context->css_styles,
849 html_context->base_href, html, eof);
851 #endif
853 if (ei->type == ET_NON_NESTABLE || ei->type == ET_LI) {
854 struct html_element *e;
856 if (ei->type == ET_NON_NESTABLE) {
857 foreach (e, html_context->stack) {
858 if (e->type < ELEMENT_KILLABLE) break;
859 if (is_block_element(e) || is_inline_element(ei)) break;
861 } else {
862 foreach (e, html_context->stack) {
863 if (is_block_element(e) && is_inline_element(ei)) break;
864 if (e->type < ELEMENT_KILLABLE) break;
865 if (!strlcasecmp(e->name, e->namelen, name, namelen)) break;
869 if (!strlcasecmp(e->name, e->namelen, name, namelen)) {
870 while (e->prev != (void *) &html_context->stack)
871 kill_html_stack_item(html_context, e->prev);
873 if (e->type > ELEMENT_IMMORTAL)
874 kill_html_stack_item(html_context, e);
878 if (ei->type != ET_NON_PAIRABLE) {
879 html_stack_dup(html_context, ELEMENT_KILLABLE);
880 html_top->name = name;
881 html_top->namelen = namelen;
882 html_top->options = attr;
883 html_top->linebreak = ei->linebreak;
885 #ifdef CONFIG_ECMASCRIPT
886 if (has_attr(attr, "onClick", html_context->options->cp)) {
887 /* XXX: Put something better to format.link. --pasky */
888 mem_free_set(&format.link, stracpy("javascript:void(0);"));
889 mem_free_set(&format.target, stracpy(html_context->base_target));
890 format.style.fg = format.clink;
891 html_top->pseudo_class = ELEMENT_LINK;
892 mem_free_set(&format.title, stracpy("onClick placeholder"));
893 /* Er. I know. Well, double html_focusable()s shouldn't
894 * really hurt. */
895 html_focusable(html_context, attr);
897 #endif
900 #ifdef CONFIG_CSS
901 if (html_top->options && html_context->options->css_enable) {
902 /* XXX: We should apply CSS otherwise as well, but that'll need
903 * some deeper changes in order to have options filled etc.
904 * Probably just applying CSS from more places, since we
905 * usually have type != ET_NESTABLE when we either (1)
906 * rescan on your own from somewhere else (2) html_stack_dup()
907 * in our own way. --pasky */
908 /* Call it now to gain some of the stuff which might affect
909 * formatting of some elements. */
910 /* FIXME: The caching of the CSS selector is broken, since t can
911 * lead to wrong styles being applied to following elements, so
912 * disabled for now. */
913 selector = get_css_selector_for_element(html_context, html_top,
914 &html_context->css_styles,
915 &html_context->stack);
917 if (selector) {
918 apply_css_selector_style(html_context, html_top, selector);
919 done_css_selector(selector);
922 /* Now this was the reason for this whole funny ELEMENT_RENDER_PROLOGUE
923 * bussiness. Only now we have the definitive linebreak value, since
924 * that's what the display: property plays with. */
925 #endif
926 ELEMENT_RENDER_PROLOGUE
927 if (ei->open) ei->open(html_context, attr, html, eof, &html);
928 #ifdef CONFIG_CSS
929 if (selector && html_top->options) {
930 /* Call it now to override default colors of the elements. */
931 selector = get_css_selector_for_element(html_context, html_top,
932 &html_context->css_styles,
933 &html_context->stack);
935 if (selector) {
936 apply_css_selector_style(html_context, html_top, selector);
937 done_css_selector(selector);
940 #endif
942 if (ei->open != html_br) html_context->was_br = 0;
944 if (restore_format) par_format = old_format;
946 return html;
947 #undef ELEMENT_RENDER_PROLOGUE
950 static unsigned char *
951 end_element(struct element_info *ei,
952 unsigned char *name, int namelen,
953 unsigned char *html,
954 unsigned char *eof, unsigned char *attr,
955 struct html_context *html_context)
957 struct html_element *e, *elt;
958 int lnb = 0;
959 int kill = 0;
961 html_context->was_br = 0;
962 if (ei->type == ET_NON_PAIRABLE || ei->type == ET_LI)
963 return html;
965 if (ei->close) ei->close(html_context, attr, html, eof, &html);
967 /* dump_html_stack(html_context); */
968 foreach (e, html_context->stack) {
969 if (is_block_element(e) && is_inline_element(ei)) kill = 1;
970 if (strlcasecmp(e->name, e->namelen, name, namelen)) {
971 if (e->type < ELEMENT_KILLABLE)
972 break;
973 else
974 continue;
976 if (kill) {
977 kill_html_stack_item(html_context, e);
978 break;
980 for (elt = e;
981 elt != (void *) &html_context->stack;
982 elt = elt->prev)
983 if (elt->linebreak > lnb)
984 lnb = elt->linebreak;
986 /* This hack forces a line break after a list end. It is needed
987 * when ending a list with the last <li> having no text the
988 * line_breax is 2 so the ending list's linebreak will be
989 * ignored when calling ln_break(). */
990 if (html_context->was_li)
991 html_context->line_breax = 0;
993 ln_break(html_context, lnb);
994 while (e->prev != (void *) &html_context->stack)
995 kill_html_stack_item(html_context, e->prev);
996 kill_html_stack_item(html_context, e);
997 break;
999 /* dump_html_stack(html_context); */
1001 return html;
1004 static unsigned char *
1005 process_element(unsigned char *name, int namelen, int endingtag,
1006 unsigned char *html, unsigned char *prev_html,
1007 unsigned char *eof, unsigned char *attr,
1008 struct html_context *html_context)
1011 struct element_info *ei;
1013 #ifndef USE_FASTFIND
1015 struct element_info elem;
1016 unsigned char tmp;
1018 tmp = name[namelen];
1019 name[namelen] = '\0';
1021 elem.name = name;
1022 ei = bsearch(&elem, elements, NUMBER_OF_TAGS, sizeof(elem), compar);
1023 name[namelen] = tmp;
1025 #else
1026 ei = (struct element_info *) fastfind_search(&ff_tags_index, name, namelen);
1027 #endif
1028 if (html_context->was_xmp || html_context->was_style) {
1029 if (!ei || (ei->open != html_xmp && ei->open != html_style) || !endingtag) {
1030 put_chrs(html_context, "<", 1);
1031 return prev_html + 1;
1035 if (!ei) return html;
1037 if (!endingtag) {
1038 return start_element(ei, name, namelen, html, eof, attr, html_context);
1039 } else {
1040 return end_element(ei, name, namelen, html, eof, attr, html_context);
1044 void
1045 scan_http_equiv(unsigned char *s, unsigned char *eof, struct string *head,
1046 struct string *title, struct document_options *options)
1048 unsigned char *name, *attr, *he, *c;
1049 int namelen;
1051 if (title && !init_string(title)) return;
1053 add_char_to_string(head, '\n');
1056 while (s < eof && *s != '<') {
1058 s++;
1060 if (s >= eof) return;
1061 if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1062 s = skip_comment(s, eof);
1063 goto se;
1065 if (parse_element(s, eof, &name, &namelen, &attr, &s)) goto sp;
1068 if (!strlcasecmp(name, namelen, "HEAD", 4)) goto se;
1069 if (!strlcasecmp(name, namelen, "/HEAD", 5)) return;
1070 if (!strlcasecmp(name, namelen, "BODY", 4)) return;
1071 if (title && !title->length && !strlcasecmp(name, namelen, "TITLE", 5)) {
1072 unsigned char *s1;
1074 xse:
1075 s1 = s;
1076 while (s < eof && *s != '<') {
1077 xsp:
1078 s++;
1080 if (s - s1)
1081 add_bytes_to_string(title, s1, s - s1);
1082 if (s >= eof) goto se;
1083 if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1084 s = skip_comment(s, eof);
1085 goto xse;
1087 if (parse_element(s, eof, &name, &namelen, &attr, &s)) {
1088 s1 = s;
1089 goto xsp;
1091 clr_spaces(title->source);
1092 goto ps;
1094 if (strlcasecmp(name, namelen, "META", 4)) goto se;
1096 he = get_attr_val(attr, "charset", options->cp);
1097 if (he) {
1098 add_to_string(head, "Charset: ");
1099 add_to_string(head, he);
1100 mem_free(he);
1103 he = get_attr_val(attr, "http-equiv", options->cp);
1104 if (!he) goto se;
1106 add_to_string(head, he);
1107 mem_free(he);
1109 c = get_attr_val(attr, "content", options->cp);
1110 if (c) {
1111 add_to_string(head, ": ");
1112 add_to_string(head, c);
1113 mem_free(c);
1116 add_crlf_to_string(head);
1117 goto se;