1 /* HTML core parser routines */
15 #include "document/css/apply.h"
16 #include "document/css/parser.h"
17 #include "document/html/parser/forms.h"
18 #include "document/html/parser/general.h"
19 #include "document/html/parser/link.h"
20 #include "document/html/parser/parse.h"
21 #include "document/html/parser/stack.h"
22 #include "document/html/parser.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memdebug.h"
29 #include "util/memory.h"
30 #include "util/string.h"
33 #include "document/html/internal.h"
/* True for the two characters that stop a tag scan: the closing '>' or the
 * '<' of a following tag. Note that @c is evaluated twice. */
#define end_of_tag(c) ((c) == '>' || (c) == '<')

/* Character class used by the attribute scanners in this file.
 * Accepts a 7-bit character below DEL (127) that is either above '>' in
 * ASCII, or is above space and is none of '=', '<' and '>'.
 * Returns non-zero when @c belongs to the class, zero otherwise. */
static inline int
atchr(register unsigned char c)
{
	return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));
}
44 /* This function eats one html element. */
45 /* - e is pointer to the beginning of the element (*e must be '<')
46 * - eof is pointer to the end of scanned area
47 * - parsed element name is stored in name, its length is namelen
48 * - first attribute is stored in attr
49 * - end points to first character behind the html element */
50 /* It returns -1 when it failed (returned values in pointers are invalid) and
53 parse_element(register unsigned char *e
, unsigned char *eof
,
54 unsigned char **name
, int *namelen
,
55 unsigned char **attr
, unsigned char **end
)
57 #define next_char() if (++e == eof) return -1;
60 if (e
>= eof
|| *e
!= '<') return -1;
65 if (*e
== '/') next_char();
66 if (!isident(*e
)) return -1;
68 while (isident(*e
)) next_char();
70 if (!isspace(*e
) && !end_of_tag(*e
) && *e
!= '/' && *e
!= ':' && *e
!= '=')
73 if (name
&& namelen
) *namelen
= e
- *name
;
75 while (isspace(*e
) || *e
== '/' || *e
== ':') next_char();
77 /* Skip bad attribute */
78 while (!atchr(*e
) && !end_of_tag(*e
) && !isspace(*e
)) next_char();
83 while (isspace(*e
)) next_char();
85 /* Skip bad attribute */
86 while (!atchr(*e
) && !end_of_tag(*e
) && !isspace(*e
)) next_char();
88 if (end_of_tag(*e
)) goto end
;
90 while (atchr(*e
)) next_char();
91 while (isspace(*e
)) next_char();
94 if (end_of_tag(*e
)) goto end
;
99 while (isspace(*e
)) next_char();
102 unsigned char quote
= *e
;
106 while (*e
!= quote
) next_char();
108 /* The following apparently handles the case of <foo
109 * id="a""b">, however that is very rare and probably not
110 * conforming. More frequent (and mishandling it more fatal) is
111 * probably the typo of <foo id="a""> - we can handle it as
112 * long as this is commented out. --pasky */
113 /* if (*e == quote) goto quoted_value; */
115 while (!isspace(*e
) && !end_of_tag(*e
)) next_char();
118 while (isspace(*e
)) next_char();
120 if (!end_of_tag(*e
)) goto next_attr
;
123 if (end
) *end
= e
+ (*e
== '>');
129 #define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, 0xFF)
131 #define add_chr(s, l, c) \
133 if (!realloc_chrs(&(s), l)) return NULL; \
138 get_attr_value(register unsigned char *e
, unsigned char *name
,
139 int cp
, enum html_attr_flags flags
)
142 unsigned char *name_start
;
143 unsigned char *attr
= NULL
;
149 if (end_of_tag(*e
) || !atchr(*e
)) goto parse_error
;
153 while (atchr(*n
) && atchr(*e
) && toupper(*e
) == toupper(*n
)) e
++, n
++;
154 found
= !*n
&& !atchr(*e
);
156 if (found
&& (flags
& HTML_ATTR_TEST
)) return name_start
;
158 while (atchr(*e
)) e
++;
161 if (found
) goto found_endattr
;
169 while (!isspace(*e
) && !end_of_tag(*e
)) {
170 if (!*e
) goto parse_error
;
171 add_chr(attr
, attrlen
, *e
);
175 unsigned char quote
= *e
;
177 /* parse_quoted_value: */
178 while (*(++e
) != quote
) {
179 if (*e
== ASCII_CR
) continue;
180 if (!*e
) goto parse_error
;
181 if (*e
!= ASCII_TAB
&& *e
!= ASCII_LF
)
182 add_chr(attr
, attrlen
, *e
);
183 else if (!(flags
& HTML_ATTR_EAT_NL
))
184 add_chr(attr
, attrlen
, ' ');
187 /* The following apparently handles the case of <foo
188 * id="a""b">, however that is very rare and probably
189 * not conforming. More frequent (and mishandling it
190 * more fatal) is probably the typo of <foo id="a""> -
191 * we can handle it as long as this is commented out.
195 add_chr(attr
, attrlen
, *e
);
196 goto parse_quoted_value
;
202 add_chr(attr
, attrlen
, '\0');
205 if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
206 memchr(attr
, '&', attrlen
)) {
207 unsigned char *saved_attr
= attr
;
209 attr
= convert_string(NULL
, saved_attr
, attrlen
, cp
,
210 CSM_QUERY
, NULL
, NULL
, NULL
);
211 mem_free(saved_attr
);
214 set_mem_comment(attr
, name
, strlen(name
));
219 while (!isspace(*e
) && !end_of_tag(*e
)) {
220 if (!*e
) goto parse_error
;
224 unsigned char quote
= *e
;
227 while (*(++e
) != quote
)
228 if (!*e
) goto parse_error
;
230 } while (/* See above. *e == quote */ 0);
244 /* Extract numerical value of attribute @name.
245 * It will return a positive integer value on success,
248 get_num(unsigned char *a
, unsigned char *name
, int cp
)
250 unsigned char *al
= get_attr_val(a
, name
, cp
);
258 num
= strtol(al
, (char **) &end
, 10);
259 if (!errno
&& *al
&& !*end
&& num
>= 0 && num
<= INT_MAX
)
268 /* Parse 'width[%],....'-like attribute @name of element @a. If @limited is
269 * set, it will limit the width value to the current usable width. Note that
270 * @limited must be set to be able to parse percentage widths. */
271 /* The function returns width in characters or -1 in case of error. */
273 get_width(unsigned char *a
, unsigned char *name
, int limited
,
274 struct html_context
*html_context
)
276 unsigned char *value
= get_attr_val(a
, name
, html_context
->options
->cp
);
277 unsigned char *str
= value
;
283 if (!value
) return -1;
285 /* Skip spaces at start of string if any. */
288 /* Search for end of string or ',' character (ie. in "100,200") */
289 for (len
= 0; str
[len
] && str
[len
] != ','; len
++);
291 /* Go back, and skip spaces after width if any. */
292 while (len
&& isspace(str
[len
- 1])) len
--;
293 if (!len
) { mem_free(value
); return -1; } /* Nothing to parse. */
295 /* Is this a percentage ? */
296 if (str
[len
- 1] == '%') len
--, percentage
= 1;
298 /* Skip spaces between width number and percentage if any. */
299 while (len
&& isspace(str
[len
- 1])) len
--;
300 if (!len
) { mem_free(value
); return -1; } /* Nothing to parse. */
302 /* Shorten the string a bit, so strtoul() will work on useful
306 /* Convert to number if possible. */
308 width
= strtoul((char *) str
, (char **) &end
, 10);
310 /* @end points into the @value string so check @end position
311 * before freeing @value. */
312 /* We will accept floats but ceil() them. */
313 if (errno
|| (*end
&& *end
!= '.') || width
>= INT_MAX
) {
314 /* Not a valid number. */
/* Convert a width in pixels to a width in character cells. About half a cell
 * is added before the division so the result rounds to the nearest cell
 * instead of truncating. The expansion is fully parenthesized and carries no
 * trailing semicolon, so it is safe inside larger expressions. */
#define WIDTH_PIXELS2CHARS(width) (((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH)
324 int maxwidth
= get_html_max_width();
327 /* Value is a percentage. */
328 width
= width
* maxwidth
/ 100;
330 /* Value is a number of pixels, makes an approximation. */
331 width
= WIDTH_PIXELS2CHARS(width
);
334 if (width
> maxwidth
)
339 /* No sense, we need @limited and @maxwidth for percentage. */
342 /* Value is a number of pixels, makes an approximation,
344 width
= WIDTH_PIXELS2CHARS(width
);
348 #undef WIDTH_PIXELS2CHARS
358 skip_comment(unsigned char *html
, unsigned char *eof
)
360 if (html
+ 4 <= eof
&& html
[2] == '-' && html
[3] == '-') {
363 if (html
+ 2 <= eof
&& html
[0] == '-' && html
[1] == '-') {
365 while (html
< eof
&& *html
== '-') html
++;
366 while (html
< eof
&& isspace(*html
)) html
++;
367 if (html
>= eof
) return eof
;
368 if (*html
== '>') return html
+ 1;
377 if (html
[0] == '>') return html
+ 1;
395 struct element_info
{
396 /* Element name, uppercase. */
399 /* Element handler. This does the relevant arguments processing and
400 * formatting (by calling renderer hooks). Note that in a few cases,
401 * this is just a placeholder and the element is given special care
402 * in start_element() (which is also where we call these handlers). */
403 element_handler_T
*open
;
405 element_handler_T
*close
;
407 /* How many line-breaks to ensure we have before and after an element.
408 * Value of 1 means the element will be on a line on its own, value
409 * of 2 means that it will also have empty lines before and after.
410 * Note that this does not add up - it just ensures that there is
411 * at least so many linebreaks, but does not add more if that is the
412 * case. Therefore, something like e.g. </pre></p> will add only two
413 * linebreaks, not four. */
414 /* In some stack killing logic, we use some weird heuristic based on
415 * whether an element is block or inline. That is determined from
416 * whether this attribute is zero or non-zero. */
419 enum element_type type
;
422 static struct element_info elements
[] = {
423 {"A", html_a
, NULL
, 0, ET_NON_NESTABLE
},
424 {"ABBR", html_italic
, NULL
, 0, ET_NESTABLE
},
425 {"ADDRESS", html_address
, NULL
, 2, ET_NESTABLE
},
426 {"APPLET", html_applet
, NULL
, 1, ET_NON_PAIRABLE
},
427 {"B", html_bold
, NULL
, 0, ET_NESTABLE
},
428 {"BASE", html_base
, NULL
, 0, ET_NON_PAIRABLE
},
429 {"BASEFONT", html_font
, NULL
, 0, ET_NON_PAIRABLE
},
430 {"BLOCKQUOTE", html_blockquote
, NULL
, 2, ET_NESTABLE
},
431 {"BODY", html_body
, NULL
, 0, ET_NESTABLE
},
432 {"BR", html_br
, NULL
, 1, ET_NON_PAIRABLE
},
433 {"BUTTON", html_button
, NULL
, 0, ET_NESTABLE
},
434 {"CAPTION", html_center
, NULL
, 1, ET_NESTABLE
},
435 {"CENTER", html_center
, NULL
, 1, ET_NESTABLE
},
436 {"CODE", html_fixed
, NULL
, 0, ET_NESTABLE
},
437 {"DD", html_dd
, NULL
, 1, ET_NON_PAIRABLE
},
438 {"DFN", html_bold
, NULL
, 0, ET_NESTABLE
},
439 {"DIR", html_ul
, NULL
, 2, ET_NESTABLE
},
440 {"DIV", html_linebrk
, NULL
, 1, ET_NESTABLE
},
441 {"DL", html_dl
, NULL
, 2, ET_NESTABLE
},
442 {"DT", html_dt
, NULL
, 1, ET_NON_PAIRABLE
},
443 {"EM", html_italic
, NULL
, 0, ET_NESTABLE
},
444 {"EMBED", html_embed
, NULL
, 0, ET_NON_PAIRABLE
},
445 {"FIXED", html_fixed
, NULL
, 0, ET_NESTABLE
},
446 {"FONT", html_font
, NULL
, 0, ET_NESTABLE
},
447 {"FORM", html_form
, NULL
, 1, ET_NESTABLE
},
448 {"FRAME", html_frame
, NULL
, 1, ET_NON_PAIRABLE
},
449 {"FRAMESET", html_frameset
, NULL
, 1, ET_NESTABLE
},
450 {"H1", html_h1
, NULL
, 2, ET_NON_NESTABLE
},
451 {"H2", html_h2
, NULL
, 2, ET_NON_NESTABLE
},
452 {"H3", html_h3
, NULL
, 2, ET_NON_NESTABLE
},
453 {"H4", html_h4
, NULL
, 2, ET_NON_NESTABLE
},
454 {"H5", html_h5
, NULL
, 2, ET_NON_NESTABLE
},
455 {"H6", html_h6
, NULL
, 2, ET_NON_NESTABLE
},
456 {"HEAD", html_head
, NULL
, 0, ET_NESTABLE
},
457 {"HR", html_hr
, NULL
, 2, ET_NON_PAIRABLE
},
458 {"HTML", html_html
, html_html_close
, 0, ET_NESTABLE
},
459 {"I", html_italic
, NULL
, 0, ET_NESTABLE
},
460 {"IFRAME", html_iframe
, NULL
, 1, ET_NON_PAIRABLE
},
461 {"IMG", html_img
, NULL
, 0, ET_NON_PAIRABLE
},
462 {"INPUT", html_input
, NULL
, 0, ET_NON_PAIRABLE
},
463 {"LI", html_li
, NULL
, 1, ET_LI
},
464 {"LINK", html_link
, NULL
, 1, ET_NON_PAIRABLE
},
465 {"LISTING", html_pre
, NULL
, 2, ET_NESTABLE
},
466 {"MENU", html_ul
, NULL
, 2, ET_NESTABLE
},
467 {"META", html_meta
, NULL
, 0, ET_NON_PAIRABLE
},
468 {"NOFRAMES", html_noframes
, NULL
, 0, ET_NESTABLE
},
469 {"NOSCRIPT", html_noscript
, NULL
, 0, ET_NESTABLE
},
470 {"OBJECT", html_object
, NULL
, 1, ET_NON_PAIRABLE
},
471 {"OL", html_ol
, NULL
, 2, ET_NESTABLE
},
472 {"OPTION", html_option
, NULL
, 1, ET_NON_PAIRABLE
},
473 {"P", html_p
, NULL
, 2, ET_NON_NESTABLE
},
474 {"PRE", html_pre
, NULL
, 2, ET_NESTABLE
},
475 {"Q", html_quote
, html_quote_close
, 0, ET_NESTABLE
},
476 {"S", html_underline
, NULL
, 0, ET_NESTABLE
},
477 {"SCRIPT", html_script
, NULL
, 0, ET_NESTABLE
},
478 {"SELECT", html_select
, NULL
, 0, ET_NESTABLE
},
479 {"SPAN", html_span
, NULL
, 0, ET_NESTABLE
},
480 {"STRIKE", html_underline
, NULL
, 0, ET_NESTABLE
},
481 {"STRONG", html_bold
, NULL
, 0, ET_NESTABLE
},
482 {"STYLE", html_style
, html_style_close
, 0, ET_NESTABLE
},
483 {"SUB", html_subscript
, html_subscript_close
, 0, ET_NESTABLE
},
484 {"SUP", html_superscript
, NULL
, 0, ET_NESTABLE
},
485 {"TABLE", html_table
, NULL
, 2, ET_NESTABLE
},
486 {"TD", html_td
, NULL
, 0, ET_NESTABLE
},
487 {"TEXTAREA", html_textarea
, NULL
, 0, ET_NON_PAIRABLE
},
488 {"TH", html_th
, NULL
, 0, ET_NESTABLE
},
489 {"TITLE", html_title
, NULL
, 0, ET_NESTABLE
},
490 {"TR", html_tr
, NULL
, 1, ET_NESTABLE
},
491 {"TT", html_tt
, NULL
, 0, ET_NON_NESTABLE
},
492 {"U", html_underline
, NULL
, 0, ET_NESTABLE
},
493 {"UL", html_ul
, NULL
, 2, ET_NESTABLE
},
494 {"XMP", html_xmp
, html_xmp_close
, 2, ET_NESTABLE
},
495 {NULL
, NULL
, NULL
, 0, ET_NESTABLE
},
498 #define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
504 compar(const void *a
, const void *b
)
506 return strcasecmp(((struct element_info
*) a
)->name
,
507 ((struct element_info
*) b
)->name
);
512 static struct element_info
*internal_pointer
;
514 /* Reset internal list pointer */
516 tags_list_reset(void)
518 internal_pointer
= elements
;
521 /* Returns a pointer to a struct that contains
522 * current key and data pointers and increment
524 * It returns NULL when key is NULL. */
525 static struct fastfind_key_value
*
528 static struct fastfind_key_value kv
;
530 if (!internal_pointer
->name
) return NULL
;
532 kv
.key
= internal_pointer
->name
;
533 kv
.data
= internal_pointer
;
540 static struct fastfind_index ff_tags_index
541 = INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset
, tags_list_next
);
543 #endif /* USE_FASTFIND */
547 init_tags_lookup(void)
550 fastfind_index(&ff_tags_index
, FF_COMPRESS
);
555 free_tags_lookup(void)
558 fastfind_done(&ff_tags_index
);
563 static unsigned char *process_element(unsigned char *name
, int namelen
, int endingtag
,
564 unsigned char *html
, unsigned char *prev_html
,
565 unsigned char *eof
, unsigned char *attr
,
566 struct html_context
*html_context
);
568 /* Count the consecutive newline entity references (e.g. "&#13;") at
569 * the beginning of the range from @html to @eof. Store the number of
570 * newlines to *@newlines_out and return the address where they end.
572 * This function currently requires a semicolon at the end of any
573 * entity reference, and does not support U+2028 LINE SEPARATOR and
574 * U+2029 PARAGRAPH SEPARATOR. */
575 static const unsigned char *
576 count_newline_entities(const unsigned char *html
, const unsigned char *eof
,
580 int prev_was_cr
= 0; /* treat CRLF as one newline, not two */
582 while ((html
+ 5 < eof
&& html
[0] == '&' && html
[1] == '#')) {
583 const unsigned char *peek
= html
+ 2;
586 if (*peek
== 'x' || *peek
== 'X') {
588 while (peek
< eof
&& *peek
== '0')
592 else if (*peek
== 'a' || *peek
== 'A')
594 else if (*peek
== 'd' || *peek
== 'D')
600 while (peek
< eof
&& *peek
== '0')
602 if (eof
- peek
< 2 || *peek
!= '1')
604 else if (peek
[1] == '0')
606 else if (peek
[1] == '3')
612 /* @peek should now be pointing to the semicolon of
613 * e.g. "&#10;" or "&#xA;". Or more digits might
615 if (peek
== eof
|| *peek
!= ';')
619 if (this_is_cr
|| !prev_was_cr
)
621 prev_was_cr
= this_is_cr
;
625 *newlines_out
= newlines
;
630 parse_html(unsigned char *html
, unsigned char *eof
,
631 struct part
*part
, unsigned char *head
,
632 struct html_context
*html_context
)
634 unsigned char *base_pos
= html
;
637 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
638 html_context
->line_breax
= html_context
->table_level
? 2 : 1;
639 html_context
->position
= 0;
640 html_context
->was_br
= 0;
641 html_context
->was_li
= 0;
642 html_context
->was_body
= 0;
643 /* html_context->was_body_background = 0; */
644 html_context
->part
= part
;
645 html_context
->eoff
= eof
;
646 if (head
) process_head(html_context
, head
);
650 unsigned char *name
, *attr
, *end
;
651 int namelen
, endingtag
;
655 html_context
->part
= part
;
656 html_context
->eoff
= eof
;
662 if (isspace(*html
) && !html_is_preformatted()) {
663 unsigned char *h
= html
;
665 while (h
< eof
&& isspace(*h
))
667 if (h
+ 1 < eof
&& h
[0] == '<' && h
[1] == '/') {
668 if (!parse_element(h
, eof
, &name
, &namelen
, &attr
, &end
)) {
669 put_chrs(html_context
, base_pos
, html
- base_pos
);
671 html_context
->putsp
= HTML_SPACE_ADD
;
676 if (!(html_context
->position
+ (html
- base_pos
- 1)))
677 goto skip_w
; /* ??? */
678 if (*(html
- 1) == ' ') { /* Do not replace with isspace() ! --Zas */
679 /* BIG performance win; not sure if it doesn't cause any bug */
680 if (html
< eof
&& !isspace(*html
)) {
684 put_chrs(html_context
, base_pos
, html
- base_pos
);
686 put_chrs(html_context
, base_pos
, html
- base_pos
- 1);
687 put_chrs(html_context
, " ", 1);
691 while (html
< eof
&& isspace(*html
))
696 if (html_is_preformatted()) {
697 html_context
->putsp
= HTML_SPACE_NORMAL
;
698 if (*html
== ASCII_TAB
) {
699 put_chrs(html_context
, base_pos
, html
- base_pos
);
700 put_chrs(html_context
, " ",
701 8 - (html_context
->position
% 8));
705 } else if (*html
== ASCII_CR
|| *html
== ASCII_LF
) {
706 put_chrs(html_context
, base_pos
, html
- base_pos
);
707 if (html
- base_pos
== 0 && html_context
->line_breax
> 0)
708 html_context
->line_breax
--;
710 if (*html
== ASCII_CR
&& html
< eof
- 1
711 && html
[1] == ASCII_LF
)
713 ln_break(html_context
, 1);
715 if (*html
== ASCII_CR
|| *html
== ASCII_LF
) {
716 html_context
->line_breax
= 0;
721 } else if (html
+ 5 < eof
&& *html
== '&') {
722 /* Really nasty hack to make handling in
723 * <pre>-tags lynx-compatible. It works around
724 * the entity handling done in the renderer,
725 * since checking #13 value there would require
726 * something along the lines of NBSP_CHAR or
727 * checking for '\n's in AT_PREFORMATTED text. */
728 /* See bug 52 and 387 for more info. */
729 int length
= html
- base_pos
;
732 html
= (unsigned char *) count_newline_entities(html
, eof
, &newlines
);
734 put_chrs(html_context
, base_pos
, length
);
735 ln_break(html_context
, newlines
);
741 while (*html
< ' ') {
743 put_chrs(html_context
, base_pos
, html
- base_pos
);
747 if (*html
>= ' ' || isspace(*html
) || html
>= eof
) {
748 unsigned char *dots
= fmem_alloc(dotcounter
);
751 memset(dots
, '.', dotcounter
);
752 put_chrs(html_context
, dots
, dotcounter
);
759 if (html
+ 2 <= eof
&& html
[0] == '<' && (html
[1] == '!' || html
[1] == '?')
760 && !(html_context
->was_xmp
|| html_context
->was_style
)) {
761 put_chrs(html_context
, base_pos
, html
- base_pos
);
762 html
= skip_comment(html
, eof
);
766 if (*html
!= '<' || parse_element(html
, eof
, &name
, &namelen
, &attr
, &end
)) {
773 endingtag
= *name
== '/'; name
+= endingtag
; namelen
-= endingtag
;
774 if (!endingtag
&& html_context
->putsp
== HTML_SPACE_ADD
&& !html_top
->invisible
)
775 put_chrs(html_context
, " ", 1);
776 put_chrs(html_context
, base_pos
, html
- base_pos
);
777 if (!html_is_preformatted() && !endingtag
&& html_context
->putsp
== HTML_SPACE_NORMAL
) {
778 unsigned char *ee
= end
;
781 while (!parse_element(ee
, eof
, &nm
, NULL
, NULL
, &ee
))
784 if (ee
< eof
&& isspace(*ee
)) {
785 put_chrs(html_context
, " ", 1);
789 html
= process_element(name
, namelen
, endingtag
, end
, html
, eof
, attr
, html_context
);
792 if (noupdate
) put_chrs(html_context
, base_pos
, html
- base_pos
);
793 ln_break(html_context
, 1);
794 /* Restore the part in case the html_context was trashed in the last
795 * iteration so that when destroying the stack in the caller we still
796 * get the right part pointer. */
797 html_context
->part
= part
;
798 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
799 html_context
->position
= 0;
800 html_context
->was_br
= 0;
803 static unsigned char *
804 start_element(struct element_info
*ei
,
805 unsigned char *name
, int namelen
,
807 unsigned char *eof
, unsigned char *attr
,
808 struct html_context
*html_context
)
810 #define ELEMENT_RENDER_PROLOGUE \
811 ln_break(html_context, ei->linebreak); \
812 a = get_attr_val(attr, "id", html_context->options->cp); \
814 html_context->special_f(html_context, SP_TAG, a); \
819 struct par_attrib old_format
;
822 struct css_selector
*selector
= NULL
;
825 if (html_top
->type
== ELEMENT_WEAK
) {
826 pop_html_element(html_context
);
829 /* We try to process nested <script> if we didn't process the parent
831 if (html_top
->invisible
832 && (ei
->open
!= html_script
|| html_top
->invisible
< 2)) {
833 ELEMENT_RENDER_PROLOGUE
837 restore_format
= html_is_preformatted();
838 old_format
= par_format
;
840 /* Support for <meta refresh="..."> inside <body>. (bug 700) */
841 if (ei
->open
== html_meta
&& html_context
->was_body
) {
842 html_handle_body_meta(html_context
, name
- 1, eof
);
843 html_context
->was_body
= 0;
847 if (ei
->open
== html_style
&& html_context
->options
->css_enable
) {
848 css_parse_stylesheet(&html_context
->css_styles
,
849 html_context
->base_href
, html
, eof
);
853 if (ei
->type
== ET_NON_NESTABLE
|| ei
->type
== ET_LI
) {
854 struct html_element
*e
;
856 if (ei
->type
== ET_NON_NESTABLE
) {
857 foreach (e
, html_context
->stack
) {
858 if (e
->type
< ELEMENT_KILLABLE
) break;
859 if (is_block_element(e
) || is_inline_element(ei
)) break;
862 foreach (e
, html_context
->stack
) {
863 if (is_block_element(e
) && is_inline_element(ei
)) break;
864 if (e
->type
< ELEMENT_KILLABLE
) break;
865 if (!strlcasecmp(e
->name
, e
->namelen
, name
, namelen
)) break;
869 if (!strlcasecmp(e
->name
, e
->namelen
, name
, namelen
)) {
870 while (e
->prev
!= (void *) &html_context
->stack
)
871 kill_html_stack_item(html_context
, e
->prev
);
873 if (e
->type
> ELEMENT_IMMORTAL
)
874 kill_html_stack_item(html_context
, e
);
878 if (ei
->type
!= ET_NON_PAIRABLE
) {
879 html_stack_dup(html_context
, ELEMENT_KILLABLE
);
880 html_top
->name
= name
;
881 html_top
->namelen
= namelen
;
882 html_top
->options
= attr
;
883 html_top
->linebreak
= ei
->linebreak
;
885 #ifdef CONFIG_ECMASCRIPT
886 if (has_attr(attr
, "onClick", html_context
->options
->cp
)) {
887 /* XXX: Put something better to format.link. --pasky */
888 mem_free_set(&format
.link
, stracpy("javascript:void(0);"));
889 mem_free_set(&format
.target
, stracpy(html_context
->base_target
));
890 format
.style
.fg
= format
.clink
;
891 html_top
->pseudo_class
= ELEMENT_LINK
;
892 mem_free_set(&format
.title
, stracpy("onClick placeholder"));
893 /* Er. I know. Well, double html_focusable()s shouldn't
895 html_focusable(html_context
, attr
);
901 if (html_top
->options
&& html_context
->options
->css_enable
) {
902 /* XXX: We should apply CSS otherwise as well, but that'll need
903 * some deeper changes in order to have options filled etc.
904 * Probably just applying CSS from more places, since we
905 * usually have type != ET_NESTABLE when we either (1)
906 * rescan on your own from somewhere else (2) html_stack_dup()
907 * in our own way. --pasky */
908 /* Call it now to gain some of the stuff which might affect
909 * formatting of some elements. */
910 /* FIXME: The caching of the CSS selector is broken, since it can
911 * lead to wrong styles being applied to following elements, so
912 * disabled for now. */
913 selector
= get_css_selector_for_element(html_context
, html_top
,
914 &html_context
->css_styles
,
915 &html_context
->stack
);
918 apply_css_selector_style(html_context
, html_top
, selector
);
919 done_css_selector(selector
);
922 /* Now this was the reason for this whole funny ELEMENT_RENDER_PROLOGUE
923 * business. Only now we have the definitive linebreak value, since
924 * that's what the display: property plays with. */
926 ELEMENT_RENDER_PROLOGUE
927 if (ei
->open
) ei
->open(html_context
, attr
, html
, eof
, &html
);
929 if (selector
&& html_top
->options
) {
930 /* Call it now to override default colors of the elements. */
931 selector
= get_css_selector_for_element(html_context
, html_top
,
932 &html_context
->css_styles
,
933 &html_context
->stack
);
936 apply_css_selector_style(html_context
, html_top
, selector
);
937 done_css_selector(selector
);
942 if (ei
->open
!= html_br
) html_context
->was_br
= 0;
944 if (restore_format
) par_format
= old_format
;
947 #undef ELEMENT_RENDER_PROLOGUE
950 static unsigned char *
951 end_element(struct element_info
*ei
,
952 unsigned char *name
, int namelen
,
954 unsigned char *eof
, unsigned char *attr
,
955 struct html_context
*html_context
)
957 struct html_element
*e
, *elt
;
961 html_context
->was_br
= 0;
962 if (ei
->type
== ET_NON_PAIRABLE
|| ei
->type
== ET_LI
)
965 if (ei
->close
) ei
->close(html_context
, attr
, html
, eof
, &html
);
967 /* dump_html_stack(html_context); */
968 foreach (e
, html_context
->stack
) {
969 if (is_block_element(e
) && is_inline_element(ei
)) kill
= 1;
970 if (strlcasecmp(e
->name
, e
->namelen
, name
, namelen
)) {
971 if (e
->type
< ELEMENT_KILLABLE
)
977 kill_html_stack_item(html_context
, e
);
981 elt
!= (void *) &html_context
->stack
;
983 if (elt
->linebreak
> lnb
)
984 lnb
= elt
->linebreak
;
986 /* This hack forces a line break after a list end. It is needed
987 * when ending a list with the last <li> having no text the
988 * line_breax is 2 so the ending list's linebreak will be
989 * ignored when calling ln_break(). */
990 if (html_context
->was_li
)
991 html_context
->line_breax
= 0;
993 ln_break(html_context
, lnb
);
994 while (e
->prev
!= (void *) &html_context
->stack
)
995 kill_html_stack_item(html_context
, e
->prev
);
996 kill_html_stack_item(html_context
, e
);
999 /* dump_html_stack(html_context); */
1004 static unsigned char *
1005 process_element(unsigned char *name
, int namelen
, int endingtag
,
1006 unsigned char *html
, unsigned char *prev_html
,
1007 unsigned char *eof
, unsigned char *attr
,
1008 struct html_context
*html_context
)
1011 struct element_info
*ei
;
1013 #ifndef USE_FASTFIND
1015 struct element_info elem
;
1018 tmp
= name
[namelen
];
1019 name
[namelen
] = '\0';
1022 ei
= bsearch(&elem
, elements
, NUMBER_OF_TAGS
, sizeof(elem
), compar
);
1023 name
[namelen
] = tmp
;
1026 ei
= (struct element_info
*) fastfind_search(&ff_tags_index
, name
, namelen
);
1028 if (html_context
->was_xmp
|| html_context
->was_style
) {
1029 if (!ei
|| (ei
->open
!= html_xmp
&& ei
->open
!= html_style
) || !endingtag
) {
1030 put_chrs(html_context
, "<", 1);
1031 return prev_html
+ 1;
1035 if (!ei
) return html
;
1038 return start_element(ei
, name
, namelen
, html
, eof
, attr
, html_context
);
1040 return end_element(ei
, name
, namelen
, html
, eof
, attr
, html_context
);
1045 scan_http_equiv(unsigned char *s
, unsigned char *eof
, struct string
*head
,
1046 struct string
*title
, struct document_options
*options
)
1048 unsigned char *name
, *attr
, *he
, *c
;
1051 if (title
&& !init_string(title
)) return;
1053 add_char_to_string(head
, '\n');
1056 while (s
< eof
&& *s
!= '<') {
1060 if (s
>= eof
) return;
1061 if (s
+ 2 <= eof
&& (s
[1] == '!' || s
[1] == '?')) {
1062 s
= skip_comment(s
, eof
);
1065 if (parse_element(s
, eof
, &name
, &namelen
, &attr
, &s
)) goto sp
;
1068 if (!strlcasecmp(name
, namelen
, "HEAD", 4)) goto se
;
1069 if (!strlcasecmp(name
, namelen
, "/HEAD", 5)) return;
1070 if (!strlcasecmp(name
, namelen
, "BODY", 4)) return;
1071 if (title
&& !title
->length
&& !strlcasecmp(name
, namelen
, "TITLE", 5)) {
1076 while (s
< eof
&& *s
!= '<') {
1081 add_bytes_to_string(title
, s1
, s
- s1
);
1082 if (s
>= eof
) goto se
;
1083 if (s
+ 2 <= eof
&& (s
[1] == '!' || s
[1] == '?')) {
1084 s
= skip_comment(s
, eof
);
1087 if (parse_element(s
, eof
, &name
, &namelen
, &attr
, &s
)) {
1091 clr_spaces(title
->source
);
1094 if (strlcasecmp(name
, namelen
, "META", 4)) goto se
;
1096 he
= get_attr_val(attr
, "charset", options
->cp
);
1098 add_to_string(head
, "Charset: ");
1099 add_to_string(head
, he
);
1103 he
= get_attr_val(attr
, "http-equiv", options
->cp
);
1106 add_to_string(head
, he
);
1109 c
= get_attr_val(attr
, "content", options
->cp
);
1111 add_to_string(head
, ": ");
1112 add_to_string(head
, c
);
1116 add_crlf_to_string(head
);