4 #define _GNU_SOURCE /* XXX: we _WANT_ strcasestr() ! */
19 #include "bfu/listmenu.h"
21 #include "document/css/apply.h"
22 #include "document/css/css.h"
23 #include "document/css/stylesheet.h"
24 #include "document/html/frames.h"
25 #include "document/html/parser/link.h"
26 #include "document/html/parser/stack.h"
27 #include "document/html/parser/parse.h"
28 #include "document/html/parser.h"
29 #include "document/html/renderer.h"
30 #include "document/options.h"
31 #include "document/renderer.h"
32 #include "intl/charsets.h"
33 #include "protocol/date.h"
34 #include "protocol/header.h"
35 #include "protocol/uri.h"
36 #include "session/task.h"
37 #include "terminal/draw.h"
38 #include "util/align.h"
40 #include "util/color.h"
41 #include "util/conv.h"
42 #include "util/error.h"
43 #include "util/memdebug.h"
44 #include "util/memlist.h"
45 #include "util/memory.h"
46 #include "util/string.h"
49 #include "document/html/internal.h"
51 /* TODO: This needs rewrite. Yes, no kidding. */
/* Look up the color attribute named @c in the tag attributes @a and decode
 * it into *@rgb.
 *
 * Visible steps: the function first tests
 * use_document_fg_colors(html_context->options), then fetches the attribute
 * text with get_attr_val() using the document codepage
 * (html_context->doc_cp) and hands it, together with its strlen(), to
 * decode_color(), storing the result in r.
 *
 * NOTE(review): the embedded line numbering skips here, so the return type,
 * the local declarations, what is returned when document colors are
 * disabled, any NULL check on the attribute value and its release are not
 * part of this listing -- confirm against the complete file. */
55 get_color(struct html_context
*html_context
, unsigned char *a
,
56 unsigned char *c
, color_T
*rgb
)
61 if (!use_document_fg_colors(html_context
->options
))
64 at
= get_attr_val(a
, c
, html_context
->doc_cp
);
67 r
= decode_color(at
, strlen(at
), rgb
);
/* Read the "bgcolor" attribute of the tag attributes @a into *@rgb.
 *
 * This is a thin wrapper: once use_document_bg_colors(html_context->options)
 * allows it, the work is delegated to get_color() with the attribute name
 * "bgcolor" and the result of that call is returned.
 *
 * NOTE(review): the statement executed when background colors are disabled
 * is on a line missing from this listing. */
74 get_bgcolor(struct html_context
*html_context
, unsigned char *a
, color_T
*rgb
)
76 if (!use_document_bg_colors(html_context
->options
))
79 return get_color(html_context
, a
, "bgcolor", rgb
);
/* Fetch the "target" attribute from the tag attributes @a.
 *
 * The value is read with get_attr_val() using @options->cp. When the value
 * is an empty string or equals "_self" (compared case-insensitively with
 * strcasecmp()), it is replaced through mem_free_set() by a copy of
 * @options->framename.
 *
 * NOTE(review): the return statement and any NULL check on v before it is
 * dereferenced are on lines not present in this listing. look_for_link()
 * tests the result of this function against NULL, so a NULL return appears
 * to be possible -- confirm. */
83 get_target(struct document_options
*options
, unsigned char *a
)
85 /* FIXME (bug 784): options->cp is the terminal charset;
86 * should use the document charset instead. */
87 unsigned char *v
= get_attr_val(a
, "target", options
->cp
);
91 if (!*v
|| !strcasecmp(v
, "_self")) {
92 mem_free_set(&v
, stracpy(options
->framename
));
/* Make sure at least @n line breaks have been emitted.
 *
 * Nothing happens when @n is zero or the top element is invisible.
 * Otherwise html_context->line_breax is incremented and
 * html_context->line_break_f() is called until the counter reaches @n.
 * The horizontal position is reset to 0 and space handling is switched to
 * HTML_SPACE_SUPPRESS.
 *
 * NOTE(review): the braces are incomplete in this listing, so it cannot be
 * told from here whether the position/putsp reset sits inside or after the
 * loop. */
100 ln_break(struct html_context
*html_context
, int n
)
102 if (!n
|| html_top
->invisible
) return;
103 while (n
> html_context
->line_breax
) {
104 html_context
->line_breax
++;
105 html_context
->line_break_f(html_context
);
107 html_context
->position
= 0;
108 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
/* Send @len bytes starting at @start to the renderer through
 * html_context->put_chars_f(), maintaining the inter-word space state kept
 * in html_context->putsp.
 *
 * Visible behaviour:
 * - in preformatted mode putsp is forced to HTML_SPACE_NORMAL;
 * - @len == 0 or an invisible top element is tested for (the statement
 *   taken in that case is not in this listing);
 * - a switch on putsp may emit a single " " and advance the position, and
 *   flips between HTML_SPACE_NORMAL and HTML_SPACE_SUPPRESS depending on
 *   whether the text starts with whitespace;
 * - trailing whitespace outside preformatted mode sets HTML_SPACE_SUPPRESS
 *   for the next call;
 * - finally the text is emitted, position grows by @len, line_breax and
 *   was_br are cleared and a positive was_li is decremented.
 *
 * NOTE(review): several lines of the switch (case labels, break statements,
 * the body of the leading-whitespace branch) are missing from this listing,
 * so which case the " " output belongs to cannot be determined here. */
112 put_chrs(struct html_context
*html_context
, unsigned char *start
, int len
)
114 if (html_is_preformatted())
115 html_context
->putsp
= HTML_SPACE_NORMAL
;
117 if (!len
|| html_top
->invisible
)
120 switch (html_context
->putsp
) {
121 case HTML_SPACE_NORMAL
:
125 html_context
->put_chars_f(html_context
, " ", 1);
126 html_context
->position
++;
127 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
131 case HTML_SPACE_SUPPRESS
:
132 html_context
->putsp
= HTML_SPACE_NORMAL
;
133 if (isspace(start
[0])) {
137 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
/* Text ending in whitespace: remember it so that the next chunk does not
 * start with another space (not applied to preformatted text). */
145 if (isspace(start
[len
- 1]) && !html_is_preformatted())
146 html_context
->putsp
= HTML_SPACE_SUPPRESS
;
147 html_context
->was_br
= 0;
149 html_context
->put_chars_f(html_context
, start
, len
);
151 html_context
->position
+= len
;
152 html_context
->line_breax
= 0;
153 if (html_context
->was_li
> 0)
154 html_context
->was_li
--;
/* Register a fragment identifier taken from an attribute.
 *
 * The value is obtained with get_attr_val(@attr_name, @attr,
 * html_context->doc_cp) and passed to the renderer as an SP_TAG special via
 * html_context->special_f().
 *
 * NOTE(review): any NULL/empty check on id_attr and its release are on
 * lines missing from this listing. */
158 set_fragment_identifier(struct html_context
*html_context
,
159 unsigned char *attr_name
, unsigned char *attr
)
161 unsigned char *id_attr
;
163 id_attr
= get_attr_val(attr_name
, attr
, html_context
->doc_cp
);
166 html_context
->special_f(html_context
, SP_TAG
, id_attr
);
/* Emit the SP_TAG special for @attr on behalf of @part.
 *
 * html_context->part is temporarily replaced by @part for the duration of
 * the html_context->special_f() call and restored to its previous value
 * afterwards. */
172 add_fragment_identifier(struct html_context
*html_context
,
173 struct part
*part
, unsigned char *attr
)
175 struct part
*saved_part
= html_context
->part
;
177 html_context
->part
= part
;
178 html_context
->special_f(html_context
, SP_TAG
, attr
);
179 html_context
->part
= saved_part
;
/* Import callback for a stylesheet; init_html_parser() installs it as
 * html_context->css_styles.import.
 *
 * The owning html_context is recovered from @css->import_data. The options
 * css_enable and css_import are both tested. The @len bytes at @url are
 * copied with memacpy(), joined with @base_uri by join_urls() and turned
 * into a struct uri with get_uri(..., URI_BASE). That uri is then announced
 * to the renderer as an SP_STYLESHEET special and handed to import_css().
 *
 * NOTE(review): the statement taken when CSS or CSS import is disabled, the
 * declaration of uri, the release of the copied url and of uri, and any
 * NULL checks between these steps are on lines missing from this
 * listing. */
184 import_css_stylesheet(struct css_stylesheet
*css
, struct uri
*base_uri
,
185 unsigned char *url
, int len
)
187 struct html_context
*html_context
= css
->import_data
;
188 unsigned char *import_url
;
191 assert(html_context
);
194 if (!html_context
->options
->css_enable
195 || !html_context
->options
->css_import
)
198 url
= memacpy(url
, len
);
201 /* HTML <head> urls should already be fine but we can't detect them. */
202 import_url
= join_urls(base_uri
, url
);
205 if (!import_url
) return;
207 uri
= get_uri(import_url
, URI_BASE
);
208 mem_free(import_url
);
212 /* Request the imported stylesheet as part of the document ... */
213 html_context
->special_f(html_context
, SP_STYLESHEET
, uri
);
215 /* ... and then attempt to import from the cache. */
216 import_css(css
, uri
);
222 /* Extract the extra information that is available for elements which can
223 * receive focus. Call this from each element which supports tabindex or
225 /* Note that in ELinks, we support those attributes (I mean, we call this
226 * function) while processing any focusable element (otherwise it'd have zero
227 * tabindex, thus messing up navigation between links), thus we support these
228 * attributes even near tags where we're not supposed to (like IFRAME, FRAME or
229 * LINK). I think this doesn't make any harm ;). --pasky */
/* Summary of the visible steps of html_focusable():
 * - format.accesskey is reset to 0 and format.tabindex to 0x80000000;
 * - the "accesskey" attribute of @a is read with the document codepage and
 *   converted by accesskey_string_to_unicode();
 * - "tabindex" is read with get_num(); a value strictly between 0 and 32767
 *   is stored as (tabindex & 0x7fff) << 16;
 * - the event handler attributes (onclick, ondblclick, onmouseover,
 *   onhover, onfocus, onmouseout, onblur) replace the corresponding format
 *   fields through mem_free_set().
 *
 * NOTE(review): process_head() calls this function with a NULL @a; the
 * handling of that case, the NULL check and release of accesskey, and the
 * declarations of cp and tabindex are on lines missing from this
 * listing. */
231 html_focusable(struct html_context
*html_context
, unsigned char *a
)
233 struct document_options
*options
;
234 unsigned char *accesskey
;
238 format
.accesskey
= 0;
239 format
.tabindex
= 0x80000000;
243 options
= html_context
->options
;
244 cp
= html_context
->doc_cp
;
246 accesskey
= get_attr_val(a
, "accesskey", cp
);
248 format
.accesskey
= accesskey_string_to_unicode(accesskey
);
252 tabindex
= get_num(a
, "tabindex", html_context
->doc_cp
);
253 if (0 < tabindex
&& tabindex
< 32767) {
254 format
.tabindex
= (tabindex
& 0x7fff) << 16;
257 mem_free_set(&format
.onclick
, get_attr_val(a
, "onclick", cp
));
258 mem_free_set(&format
.ondblclick
, get_attr_val(a
, "ondblclick", cp
));
259 mem_free_set(&format
.onmouseover
, get_attr_val(a
, "onmouseover", cp
));
260 mem_free_set(&format
.onhover
, get_attr_val(a
, "onhover", cp
));
261 mem_free_set(&format
.onfocus
, get_attr_val(a
, "onfocus", cp
));
262 mem_free_set(&format
.onmouseout
, get_attr_val(a
, "onmouseout", cp
));
263 mem_free_set(&format
.onblur
, get_attr_val(a
, "onblur", cp
));
/* Element handler that hides the current element: the top of the HTML stack
 * is flagged invisible and its type is set to ELEMENT_DONT_KILL. The
 * attribute string @a is not used by the visible statements. */
267 html_skip(struct html_context
*html_context
, unsigned char *a
)
269 html_top
->invisible
= 1;
270 html_top
->type
= ELEMENT_DONT_KILL
;
/* LWS(c) is true for a space or ASCII_TAB character.
 *
 * parse_old_meta_refresh(), defined right below, handles a refresh value
 * that has no URL= marker. The visible statements skip leading LWS, skip a
 * run of decimal digits, skip LWS again, require a ';' or ',' separator
 * (returning otherwise), skip LWS once more, trim trailing LWS by
 * decreasing len and, when len is non-zero, store a memacpy() copy of the
 * remaining text in *@ret.
 *
 * NOTE(review): the closing line of the example comment below, the assert
 * preceding if_assert_failed and the declaration and initial value of len
 * are on lines missing from this listing. */
273 #define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
275 /* Parse meta refresh without URL= in it:
276 * <meta http-equiv="refresh" content="3,http://elinks.or.cz/">
277 * <meta http-equiv="refresh" content="3; http://elinks.or.cz/">
278 * <meta http-equiv="refresh" content=" 3 ; http://elinks.or.cz/ ">
281 parse_old_meta_refresh(unsigned char *str
, unsigned char **ret
)
283 unsigned char *p
= str
;
287 if_assert_failed
return;
290 while (*p
&& LWS(*p
)) p
++;
292 while (*p
&& *p
>= '0' && *p
<= '9') p
++;
294 while (*p
&& LWS(*p
)) p
++;
296 if (*p
== ';' || *p
== ',') p
++; else return;
297 while (*p
&& LWS(*p
)) p
++;
301 while (len
&& LWS(p
[len
])) len
--;
302 if (len
) *ret
= memacpy(p
, len
);
305 /* Search for the url part in the content attribute and returns
307 * It searches the first occurrence of 'url' marker somewhere ignoring
308 * anything before it.
309 * It should cope with most situations including:
310 * content="0; URL='http://www.site.com/path/xxx.htm'"
311 * content="0 url=http://www.site.com/path/xxx.htm"
312 * content="anything ; some url === ''''http://www.site.com/path/xxx.htm''''
314 * The return value is one of:
316 * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
317 * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL.
318 * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
320 * If @ret is NULL, then this function doesn't actually access *@ret,
321 * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may
 *
 * NOTE(review): this header comment is truncated in this listing (the end
 * of the sentence above and the closing line are missing), as are parts of
 * the function body. Visible steps of the body: *@ret is preset to NULL,
 * an empty @str yields HEADER_PARAM_NOT_FOUND, the marker is located with
 * strcasestr(str, "url"), characters <= ' ' and '=' are skipped, the value
 * is measured up to a ';' or a control character, trailing LWS is trimmed,
 * pairs of enclosing single quotes are dropped and the result is copied
 * with memacpy().
323 static enum parse_header_param
324 search_for_url_param(unsigned char *str
, unsigned char **ret
)
329 if (ret
) *ret
= NULL
; /* default in case of early return */
332 if_assert_failed
return HEADER_PARAM_NOT_FOUND
;
334 /* Returns now if string @str is empty. */
335 if (!*str
) return HEADER_PARAM_NOT_FOUND
;
337 p
= strcasestr(str
, "url");
338 if (!p
) return HEADER_PARAM_NOT_FOUND
;
341 while (*p
&& (*p
<= ' ' || *p
== '=')) p
++;
346 return HEADER_PARAM_OUT_OF_MEMORY
;
348 return HEADER_PARAM_FOUND
;
351 while ((p
[plen
] > ' ' || LWS(p
[plen
])) && p
[plen
] != ';') plen
++;
353 /* Trim ending spaces */
354 while (plen
> 0 && LWS(p
[plen
- 1])) plen
--;
356 /* XXX: Drop enclosing single quotes if there's some.
358 * Some websites like newsnow.co.uk are using single quotes around url
359 * in URL field in meta tag content attribute like this:
360 * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
362 * This is an attempt to handle that, but it may break something else.
363 * We drop all pair of enclosing quotes found (eg. '''url''' => url).
364 * Please report any issue related to this. --Zas */
365 while (plen
> 1 && *p
== '\'' && p
[plen
- 1] == '\'') {
371 *ret
= memacpy(p
, plen
);
373 return HEADER_PARAM_OUT_OF_MEMORY
;
375 return HEADER_PARAM_FOUND
;
/* Act on the HTTP-like headers collected in @head.
 *
 * Refresh handling (visible steps): the "Refresh" header is fetched with
 * parse_header(); the target URL is looked for with search_for_url_param(),
 * then with the more tolerant parse_old_meta_refresh(), and finally falls
 * back to the document's own base_href. The delay is read with strtoul()
 * and limited to HTTP_REFRESH_MAX_DELAY. The URL is joined with base_href,
 * shown through put_link_line() with the "Refresh: " prefix and reported to
 * the renderer as an SP_REFRESH special.
 *
 * Cache handling (visible steps): unless the option
 * "document.cache.ignore_cache_control" is set, the Pragma, Cache-Control
 * (no-cache, must-revalidate, max-age=) and Expires headers are examined
 * and the outcome is reported through the SP_CACHE_CONTROL and
 * SP_CACHE_EXPIRES specials.
 *
 * NOTE(review): many lines are missing from this listing, including the
 * conditions linking the three URL lookups, a reset of errno before
 * strtoul() (if there is one), the release of refresh, url and the header
 * values in d, and the closing lines of two block comments inside the
 * body. Confirm all of these against the complete file. */
381 process_head(struct html_context
*html_context
, unsigned char *head
)
383 unsigned char *refresh
, *url
;
385 refresh
= parse_header(head
, "Refresh", NULL
);
386 if (!refresh
) return;
388 search_for_url_param(refresh
, &url
);
390 /* Let's try a more tolerant parsing. */
391 parse_old_meta_refresh(refresh
, &url
);
393 /* If the URL parameter is missing assume that the
394 * document being processed should be refreshed. */
395 url
= get_uri_string(html_context
->base_href
, URI_ORIGINAL
);
400 /* Extraction of refresh time. */
401 unsigned long seconds
= 0;
404 /* We try to extract the refresh time, and to handle weird things
405 * in an elegant way. Among things we can have negative values,
406 * too big ones, just ';' (we assume 0 seconds in that case) and
408 if (*refresh
!= ';') {
409 if (isdigit(*refresh
)) {
410 unsigned long max_seconds
= HTTP_REFRESH_MAX_DELAY
;
413 seconds
= strtoul(refresh
, NULL
, 10);
414 if (errno
== ERANGE
|| seconds
> max_seconds
) {
415 /* Too big refresh value, limit it. */
416 seconds
= max_seconds
;
422 /* May be a negative number, or some bad syntax. */
428 unsigned char *joined_url
= join_urls(html_context
->base_href
, url
);
430 html_focusable(html_context
, NULL
);
432 put_link_line("Refresh: ", url
, joined_url
,
433 html_context
->options
->framename
, html_context
);
434 html_context
->special_f(html_context
, SP_REFRESH
, seconds
, joined_url
);
436 mem_free(joined_url
);
444 if (!get_opt_bool("document.cache.ignore_cache_control")) {
449 /* XXX: Code duplication with HTTP protocol backend. */
450 /* I am not entirely sure in what order we should process these
451 * headers and if we should still process Cache-Control max-age
452 * if we already set max age to date mentioned in Expires.
454 if ((d
= parse_header(head
, "Pragma", NULL
))) {
455 if (strstr(d
, "no-cache")) {
461 if (!no_cache
&& (d
= parse_header(head
, "Cache-Control", NULL
))) {
462 if (strstr(d
, "no-cache") || strstr(d
, "must-revalidate")) {
466 unsigned char *pos
= strstr(d
, "max-age=");
471 /* Grab the number of seconds. */
472 timeval_T max_age
, seconds
;
474 timeval_from_seconds(&seconds
, atol(pos
+ 8));
475 timeval_now(&max_age
);
476 timeval_add_interval(&max_age
, &seconds
);
478 expires
= timeval_to_seconds(&max_age
);
485 if (!no_cache
&& (d
= parse_header(head
, "Expires", NULL
))) {
486 /* Convert date to seconds. */
487 if (strstr(d
, "now")) {
491 expires
= timeval_to_seconds(&now
);
493 expires
= parse_date(&d
, NULL
, 0, 1);
500 html_context
->special_f(html_context
, SP_CACHE_CONTROL
);
502 html_context
->special_f(html_context
,
503 SP_CACHE_EXPIRES
, expires
);
/* Advance *@pos through the HTML source looking for a MAP element.
 *
 * Visible steps: text up to the next '<' is skipped; 0 is returned once
 * *@pos reaches @eof; "<!" and "<?" constructs are skipped with
 * skip_comment(); other markup is parsed with parse_element(). Any element
 * whose name is not MAP makes the function return 1. For a MAP element,
 * when @uri carries a fragment, the "name" attribute is compared with that
 * fragment via strlcasecmp().
 *
 * get_image_map() calls this in a loop for as long as it returns non-zero.
 *
 * NOTE(review): the loop bodies, the branches taken after skip_comment(),
 * parse_element() and the fragment comparison, the release of al and the
 * remaining return statements are on lines missing from this listing. */
511 look_for_map(unsigned char **pos
, unsigned char *eof
, struct uri
*uri
,
512 struct document_options
*options
)
514 unsigned char *al
, *attr
, *name
;
517 while (*pos
< eof
&& **pos
!= '<') {
521 if (*pos
>= eof
) return 0;
523 if (*pos
+ 2 <= eof
&& ((*pos
)[1] == '!' || (*pos
)[1] == '?')) {
524 *pos
= skip_comment(*pos
, eof
);
528 if (parse_element(*pos
, eof
, &name
, &namelen
, &attr
, pos
)) {
533 if (strlcasecmp(name
, namelen
, "MAP", 3)) return 1;
535 if (uri
&& uri
->fragment
) {
536 /* FIXME (bug 784): options->cp is the terminal charset;
537 * should use the document charset instead. */
538 al
= get_attr_val(attr
, "name", options
->cp
);
541 if (strlcasecmp(al
, -1, uri
->fragment
, uri
->fragmentlen
)) {
/* Helper of look_for_link() that collects label text while walking forward
 * from *@pos.
 *
 * Visible steps: a string buffer str is initialised; pos2 scans ahead to
 * the next '<' and the bytes between *@pos and pos2 are appended to str;
 * "<!" and "<?" constructs are skipped with skip_comment(); the following
 * element is parsed with parse_element() (only its end position is
 * requested) and a non-zero result there returns 1. @name / @namelen are
 * then tested against A, /A, MAP, /MAP, AREA and /AREA; the condition holds
 * only when the name matches none of them.
 *
 * look_for_link() calls this in a loop for as long as it returns non-zero.
 *
 * NOTE(review): how *@label is filled from str, how *@pos is advanced and
 * the remaining return values are on lines missing from this listing. */
553 look_for_tag(unsigned char **pos
, unsigned char *eof
,
554 unsigned char *name
, int namelen
, unsigned char **label
)
559 if (!init_string(&str
)) {
560 /* Is this the right way to bail out? --jonas */
566 while (pos2
< eof
&& *pos2
!= '<') {
576 add_bytes_to_string(&str
, *pos
, pos2
- *pos
);
581 if (*pos
+ 2 <= eof
&& ((*pos
)[1] == '!' || (*pos
)[1] == '?')) {
582 *pos
= skip_comment(*pos
, eof
);
586 if (parse_element(*pos
, eof
, NULL
, NULL
, NULL
, &pos2
)) return 1;
588 if (strlcasecmp(name
, namelen
, "A", 1)
589 && strlcasecmp(name
, namelen
, "/A", 2)
590 && strlcasecmp(name
, namelen
, "MAP", 3)
591 && strlcasecmp(name
, namelen
, "/MAP", 4)
592 && strlcasecmp(name
, namelen
, "AREA", 4)
593 && strlcasecmp(name
, namelen
, "/AREA", 5)) {
/* Find the next link inside an image map and append it to the menu *@menu.
 *
 * Visible steps:
 * - skip text up to the next '<' (returning 0 at @eof), skip "<!" / "<?"
 *   constructs and parse the element;
 * - for an A element the label is gathered by repeated look_for_tag()
 *   calls; for an AREA element it comes from the "alt" attribute run
 *   through convert_string(); a /MAP element registers *@menu in @ml, which
 *   the inline comment calls the only successful return;
 * - the target comes from get_target(), falling back to a copy of
 *   @target_base; a link_def ld is allocated, the "href" attribute is read
 *   with get_url_val() and joined with @href_base;
 * - the existing menu entries are scanned for one with the same link and
 *   target;
 * - the label falls back to a copy of ld->link; the menu array is grown by
 *   mem_realloc() to nmenu + 2 entries, the two trailing entries are
 *   zeroed, and the new entry gets the label, map_selected as handler and
 *   the NO_INTL flag;
 * - ld, its link and target, and the label are registered in @ml.
 *
 * get_image_map() calls this in a loop for as long as it returns non-zero.
 *
 * NOTE(review): allocation-failure handling, the clean-up in the duplicate
 * branch, the assignments of ld->target and the entry's data field, the
 * conditions guarding the label fallback and all return values other than
 * the two visible "return 0" are on lines missing from this listing. */
602 look_for_link(unsigned char **pos
, unsigned char *eof
, struct menu_item
**menu
,
603 struct memory_list
**ml
, struct uri
*href_base
,
604 unsigned char *target_base
, struct conv_table
*ct
,
605 struct document_options
*options
)
607 unsigned char *attr
, *href
, *name
, *target
;
608 unsigned char *label
= NULL
; /* shut up warning */
610 struct menu_item
*nm
;
614 while (*pos
< eof
&& **pos
!= '<') {
618 if (*pos
>= eof
) return 0;
620 if (*pos
+ 2 <= eof
&& ((*pos
)[1] == '!' || (*pos
)[1] == '?')) {
621 *pos
= skip_comment(*pos
, eof
);
625 if (parse_element(*pos
, eof
, &name
, &namelen
, &attr
, pos
)) {
630 if (!strlcasecmp(name
, namelen
, "A", 1)) {
631 while (look_for_tag(pos
, eof
, name
, namelen
, &label
));
633 if (*pos
>= eof
) return 0;
635 } else if (!strlcasecmp(name
, namelen
, "AREA", 4)) {
636 /* FIXME (bug 784): options->cp is the terminal charset;
637 * should use the document charset instead. */
638 unsigned char *alt
= get_attr_val(attr
, "alt", options
->cp
);
641 label
= convert_string(ct
, alt
, strlen(alt
),
642 options
->cp
, CSM_DEFAULT
,
649 } else if (!strlcasecmp(name
, namelen
, "/MAP", 4)) {
650 /* This is the only successful return from here! */
651 add_to_ml(ml
, (void *) *menu
, (void *) NULL
);
658 target
= get_target(options
, attr
);
659 if (!target
) target
= stracpy(empty_string_or_(target_base
));
665 ld
= mem_alloc(sizeof(*ld
));
672 /* FIXME (bug 784): options->cp is the terminal charset;
673 * should use the document charset instead. */
674 href
= get_url_val(attr
, "href", options
->cp
);
683 ld
->link
= join_urls(href_base
, href
);
694 for (nmenu
= 0; !mi_is_end_of_menu(&(*menu
)[nmenu
]); nmenu
++) {
695 struct link_def
*ll
= (*menu
)[nmenu
].data
;
697 if (!strcmp(ll
->link
, ld
->link
) &&
698 !strcmp(ll
->target
, ld
->target
)) {
700 mem_free(ld
->target
);
717 label
= stracpy(ld
->link
);
726 nm
= mem_realloc(*menu
, (nmenu
+ 2) * sizeof(*nm
));
729 memset(&nm
[nmenu
], 0, 2 * sizeof(*nm
));
730 nm
[nmenu
].text
= label
;
731 nm
[nmenu
].func
= map_selected
;
733 nm
[nmenu
].flags
= NO_INTL
;
736 add_to_ml(ml
, (void *) ld
, (void *) ld
->link
, (void *) ld
->target
,
737 (void *) label
, (void *) NULL
);
/* Build a menu (*@menu, with allocations tracked in *@ml) from the links of
 * an image map found between @pos and @eof.
 *
 * Visible steps: a header string hd is initialised (returning -1 on
 * failure), seeded with @head when given and extended by scan_http_equiv();
 * a conversion table is obtained from get_convert_table() using hd.source
 * and the @to / @def / @hdef arguments; *@menu is allocated as a single
 * zeroed entry (returning -1 on failure); look_for_map() is looped until it
 * returns 0 and then look_for_link() is looped to collect the entries.
 *
 * NOTE(review): the release of hd, the check between the two loops, the
 * body of the look_for_link() loop and the final return value are on lines
 * missing from this listing. */
744 get_image_map(unsigned char *head
, unsigned char *pos
, unsigned char *eof
,
745 struct menu_item
**menu
, struct memory_list
**ml
, struct uri
*uri
,
746 struct document_options
*options
, unsigned char *target_base
,
747 int to
, int def
, int hdef
)
749 struct conv_table
*ct
;
752 if (!init_string(&hd
)) return -1;
754 if (head
) add_to_string(&hd
, head
);
755 scan_http_equiv(pos
, eof
, &hd
, NULL
, options
);
756 ct
= get_convert_table(hd
.source
, to
, def
, NULL
, NULL
, hdef
);
759 *menu
= mem_calloc(1, sizeof(**menu
));
760 if (!*menu
) return -1;
762 while (look_for_map(&pos
, eof
, uri
, options
));
771 while (look_for_link(&pos
, eof
, menu
, ml
, uri
, target_base
, ct
, options
))
/* Push a fresh parser state for a nested rendering pass.
 *
 * The current top element is duplicated with html_stack_dup() using @type,
 * and the paragraph alignment is set to @align. When @type is at most
 * ELEMENT_IMMORTAL, both margins are set to @margin, the width to @width,
 * the list level, list number and dd margin to 0, and the name length of
 * the new top element is cleared.
 *
 * NOTE(review): the return statement is on a line missing from this
 * listing. The struct html_element pointer returned here is presumably what
 * done_html_parser_state() expects as its element argument -- confirm. */
786 struct html_element
*
787 init_html_parser_state(struct html_context
*html_context
,
788 enum html_element_mortality_type type
,
789 int align
, int margin
, int width
)
791 html_stack_dup(html_context
, type
);
793 par_format
.align
= align
;
795 if (type
<= ELEMENT_IMMORTAL
) {
796 par_format
.leftmargin
= margin
;
797 par_format
.rightmargin
= margin
;
798 par_format
.width
= width
;
799 par_format
.list_level
= 0;
800 par_format
.list_number
= 0;
801 par_format
.dd_margin
= 0;
802 html_top
->namelen
= 0;
/* Unwind the HTML element stack back to, and including, @element.
 *
 * line_breax is set to 1; elements are popped with pop_html_element() until
 * html_top equals @element, asserting after each pop that the stack has not
 * been emptied or trashed. The remaining top element is then marked
 * ELEMENT_KILLABLE and popped as well. */
811 done_html_parser_state(struct html_context
*html_context
,
812 struct html_element
*element
)
814 html_context
->line_breax
= 1;
816 while (html_top
!= element
) {
817 pop_html_element(html_context
);
819 /* I've preserved this bit to show an example of the Old Code
820 * of the Mikulas days (I _HOPE_ it's by Mikulas, at least ;-).
821 * I think this assert() can never fail, for one. --pasky */
822 assertm(html_top
&& (void *) html_top
!= (void *) &html_stack
,
823 "html stack trashed");
824 if_assert_failed
break;
828 html_top
->type
= ELEMENT_KILLABLE
;
829 pop_html_element(html_context
);
833 /* This function does not set html_context.doc_cp = document.cp,
834 * because it does not know the document, and because the codepage has
835 * not even been decided when it is called. */
/* Allocate and initialise an html_context for parsing the document between
 * @start and @end.
 *
 * Visible steps:
 * - @uri and @options are asserted non-NULL (NULL is returned otherwise);
 *   the context is allocated zeroed, returning NULL on failure;
 * - the stylesheet import hook is set to import_css_stylesheet and the
 *   selector list and the element stack are initialised;
 * - the output callbacks @put_chars, @line_break and @special are stored;
 * - base_href references @uri, base_target copies @options->framename and
 *   the options pointer is kept;
 * - scan_http_equiv() fills @head and @title from the document;
 * - a zeroed root element e is added to the stack and the text and
 *   paragraph formats are seeded from @options;
 * - the root element is marked visible, nameless and ELEMENT_DONT_KILL;
 * - css_styles.import_data points back at the context and, when CSS is
 *   enabled, default_stylesheet is mirrored into it.
 *
 * NOTE(review): the failure handling for the allocation of e, the matching
 * #endif for CONFIG_BOOKMARKS and the final return statement are on lines
 * missing from this listing. */
836 struct html_context
*
837 init_html_parser(struct uri
*uri
, struct document_options
*options
,
838 unsigned char *start
, unsigned char *end
,
839 struct string
*head
, struct string
*title
,
840 void (*put_chars
)(struct html_context
*, unsigned char *, int),
841 void (*line_break
)(struct html_context
*),
842 void *(*special
)(struct html_context
*, enum html_special_type
, ...))
844 struct html_context
*html_context
;
845 struct html_element
*e
;
847 assert(uri
&& options
);
848 if_assert_failed
return NULL
;
850 html_context
= mem_calloc(1, sizeof(*html_context
));
851 if (!html_context
) return NULL
;
854 html_context
->css_styles
.import
= import_css_stylesheet
;
855 init_list(html_context
->css_styles
.selectors
);
858 init_list(html_context
->stack
);
860 html_context
->startf
= start
;
861 html_context
->put_chars_f
= put_chars
;
862 html_context
->line_break_f
= line_break
;
863 html_context
->special_f
= special
;
865 html_context
->base_href
= get_uri_reference(uri
);
866 html_context
->base_target
= null_or_stracpy(options
->framename
);
868 html_context
->options
= options
;
870 scan_http_equiv(start
, end
, head
, title
, options
);
872 e
= mem_calloc(1, sizeof(*e
));
874 add_to_list(html_context
->stack
, e
);
876 format
.style
.attr
= 0;
878 format
.link
= format
.target
= format
.image
= NULL
;
879 format
.onclick
= format
.ondblclick
= format
.onmouseover
= format
.onhover
880 = format
.onfocus
= format
.onmouseout
= format
.onblur
= NULL
;
881 format
.select
= NULL
;
885 format
.style
.fg
= options
->default_fg
;
886 format
.style
.bg
= options
->default_bg
;
887 format
.clink
= options
->default_link
;
888 format
.vlink
= options
->default_vlink
;
889 #ifdef CONFIG_BOOKMARKS
890 format
.bookmark_link
= options
->default_bookmark_link
;
892 format
.image_link
= options
->default_image_link
;
894 par_format
.align
= ALIGN_LEFT
;
895 par_format
.leftmargin
= options
->margin
;
896 par_format
.rightmargin
= options
->margin
;
898 par_format
.width
= options
->box
.width
;
899 par_format
.list_level
= par_format
.list_number
= 0;
900 par_format
.dd_margin
= options
->margin
;
901 par_format
.flags
= P_NONE
;
903 par_format
.bgcolor
= options
->default_bg
;
905 html_top
->invisible
= 0;
906 html_top
->name
= NULL
;
907 html_top
->namelen
= 0;
908 html_top
->options
= NULL
;
909 html_top
->linebreak
= 1;
910 html_top
->type
= ELEMENT_DONT_KILL
;
912 html_context
->has_link_lines
= 0;
913 html_context
->table_level
= 0;
916 html_context
->css_styles
.import_data
= html_context
;
918 if (options
->css_enable
)
919 mirror_css_stylesheet(&default_stylesheet
,
920 &html_context
->css_styles
);
/* Release everything owned by @html_context, including the context itself.
 *
 * The stylesheet is torn down when CSS is enabled, base_target is freed,
 * the reference to base_href is dropped, and the first element on the stack
 * is killed with kill_html_stack_item(). The stack is then asserted to be
 * empty (and re-initialised if the assertion fails) before the context is
 * freed with mem_free().
 *
 * NOTE(review): line 933 of the original is missing from this listing, so
 * whether base_target is checked for NULL before mem_free() cannot be seen
 * here; init_html_parser() fills it via null_or_stracpy(). */
927 done_html_parser(struct html_context
*html_context
)
930 if (html_context
->options
->css_enable
)
931 done_css_stylesheet(&html_context
->css_styles
);
934 mem_free(html_context
->base_target
);
935 done_uri(html_context
->base_href
);
937 kill_html_stack_item(html_context
, html_context
->stack
.next
);
939 assertm(list_empty(html_context
->stack
),
940 "html stack not empty after operation");
941 if_assert_failed
init_list(html_context
->stack
);
943 mem_free(html_context
);