1 /* DOM-based SGML (HTML) source view renderer (just syntax highlighting :-) */
7 #include <sys/types.h> /* FreeBSD needs this before regex.h */
14 #include "cache/cache.h"
15 #include "document/css/css.h"
16 #include "document/css/parser.h"
17 #include "document/css/property.h"
18 #include "document/css/stylesheet.h"
19 #include "document/document.h"
20 #include "document/dom/renderer.h"
21 #include "document/dom/util.h"
22 #include "document/dom/rss.h"
23 #include "document/renderer.h"
24 #include "dom/configuration.h"
25 #include "dom/scanner.h"
26 #include "dom/sgml/parser.h"
27 #include "dom/sgml/html/html.h"
28 #include "dom/sgml/rss/rss.h"
30 #include "dom/stack.h"
31 #include "intl/charsets.h"
32 #include "protocol/uri.h"
33 #include "terminal/draw.h"
34 #include "util/error.h"
35 #include "util/memory.h"
36 #include "util/string.h"
/*
 * Evaluates to true when the byte range starting at str and spanning len
 * bytes lies entirely inside the source buffer of the renderer, that is
 * between (renderer)->source and (renderer)->end inclusive of the end
 * pointer itself. Each argument is evaluated more than once.
 */
39 #define check_dom_node_source(renderer, str, len) \
40 ((renderer)->source <= (str) && (str) + (len) <= (renderer)->end)
/*
 * Asserts, through assertm, that the range str .. str + len passes
 * check_dom_node_source. On failure the message reports the bounds of the
 * renderer source buffer and the bounds of the offending range as pointers.
 */
42 #define assert_source(renderer, str, len) \
43 assertm(check_dom_node_source(renderer, str, len), "renderer[%p : %p] str[%p : %p]", \
44 (renderer)->source, (renderer)->end, (str), (str) + (len))
/*
 * Extended regular expression used to spot URLs inside text nodes.
 * It accepts either the bare file scheme prefix, or one of the schemes
 * ftp, http, nttp (each optionally followed by s) or smb, followed by a
 * host made of alphanumeric runs joined by single separator characters
 * (dash, at sign, colon or dot), a dot plus a 2 to 4 letter suffix and an
 * optional numeric port. After that come zero or more path segments made
 * of percent escapes and a limited set of characters.
 * NOTE(review): nttp is what the pattern spells; nntp may have been
 * intended -- confirm.
 */
47 #define URL_REGEX "(file://|((f|ht|nt)tp(s)?|smb)://[[:alnum:]]+([-@:.]?[[:alnum:]])*\\.[[:alpha:]]{2,4}(:[[:digit:]]+)?)(/(%[[:xdigit:]]{2}|[-_~&=;?.a-z0-9])*)*"
/* Flags for compiling URL_REGEX: case-insensitive, extended syntax. */
48 #define URL_REGFLAGS (REG_ICASE | REG_EXTENDED)
/*
 * Private state of the source renderer, stored in the data member of
 * struct dom_renderer.
 *
 * NOTE(review): other code in this file accesses a url_regex member of this
 * struct, but its declaration and the closing of the struct are not
 * present in this listing.
 */
51 struct source_renderer
{
/* Single-bit flag; presumably enables URL detection in text -- the places
 * where it is set or tested are not visible in this listing. TODO confirm. */
54 unsigned int find_url
:1;
57 /* One style per node type. */
58 struct screen_char styles
[DOM_NODES
];
/*
 * Render the pending stretch of source text that lies between
 * renderer->position and string, using the DOM_NODE_TEXT style, and then
 * advance renderer->position to string. Returns without rendering when
 * string is not beyond the current position (length <= 0).
 *
 * NOTE(review): the return type line and the braces of this function are
 * not present in this listing.
 */
63 render_dom_flush(struct dom_renderer
*renderer
, unsigned char *string
)
65 struct source_renderer
*data
= renderer
->data
;
66 struct screen_char
*template_
= &data
->styles
[DOM_NODE_TEXT
];
/* Number of not yet rendered bytes in front of the flush target. */
67 int length
= string
- renderer
->position
;
/* Both the current position and the flush target have to be inside the
 * source buffer. */
69 assert_source(renderer
, renderer
->position
, 0);
70 assert_source(renderer
, string
, 0);
/* Nothing is pending when the target is at or before the position. */
72 if (length
<= 0) return;
73 render_dom_text(renderer
, template_
, renderer
->position
, length
);
74 renderer
->position
= string
;
76 assert_source(renderer
, renderer
->position
, 0);
/*
 * Render the string of node with the style given by template_.
 *
 * When the node string lies inside the renderer source buffer, the source
 * in front of it is flushed first and renderer->position is moved to the
 * end of the node string.
 *
 * NOTE(review): the body of the DOM_NODE_ENTITY_REFERENCE branch, the
 * return type line and the closing braces are not present in this listing.
 */
80 render_dom_node_text(struct dom_renderer
*renderer
, struct screen_char
*template_
,
81 struct dom_node
*node
)
83 unsigned char *string
= node
->string
.string
;
84 int length
= node
->string
.length
;
/* Entity references get special treatment; the statements of this branch
 * are missing from the listing. */
86 if (node
->type
== DOM_NODE_ENTITY_REFERENCE
) {
/* Only touch the source position when the text really points into the
 * source buffer. */
91 if (check_dom_node_source(renderer
, string
, length
)) {
92 render_dom_flush(renderer
, string
);
93 renderer
->position
= string
+ length
;
94 assert_source(renderer
, renderer
->position
, 0);
97 render_dom_text(renderer
, template_
, string
, length
);
/*
 * Render the string of node with the style of its node type while turning
 * every match of the compiled URL regex (data->url_regex) into a link via
 * add_dom_link.
 *
 * The visible steps are: flush pending source and advance the position
 * when the text lies inside the source buffer; copy the text with memacpy;
 * repeatedly run regexec on the remaining text, render the part in front of
 * a match and add a link for the match; render what is left; release the
 * copy with mem_free_if.
 *
 * NOTE(review): several lines are absent from this listing, among them the
 * declaration of regmatch, the statement following the sanity check inside
 * the loop and the statements that advance string and shrink length.
 * NOTE(review): the fourth regexec argument appears as a registered-sign
 * character followed by match; this looks like a mis-encoded address-of
 * regmatch (ampersand r e g read as an HTML entity) -- confirm and repair.
 */
102 render_dom_node_enhanced_text(struct dom_renderer
*renderer
, struct dom_node
*node
)
104 struct source_renderer
*data
= renderer
->data
;
105 regex_t
*regex
= &data
->url_regex
;
107 unsigned char *string
= node
->string
.string
;
108 int length
= node
->string
.length
;
109 struct screen_char
*template_
= &data
->styles
[node
->type
];
110 unsigned char *alloc_string
;
112 if (check_dom_node_source(renderer
, string
, length
)) {
113 render_dom_flush(renderer
, string
);
114 renderer
->position
= string
+ length
;
115 assert_source(renderer
, renderer
->position
, 0);
/* Work on a private copy of the text; presumably so that regexec sees a
 * terminated string -- TODO confirm against memacpy. */
118 alloc_string
= memacpy(string
, length
);
120 string
= alloc_string
;
/* regexec returns zero on a match, so loop while matches are found. */
122 while (length
> 0 && !regexec(regex
, string
, 1, ®match
, 0)) {
123 int matchlen
= regmatch
.rm_eo
- regmatch
.rm_so
;
124 int offset
= regmatch
.rm_so
;
/* Reject empty matches and matches outside of the remaining text. */
126 if (!matchlen
|| offset
< 0 || regmatch
.rm_eo
> length
)
/* Text in front of the match is rendered as ordinary node text. */
130 render_dom_text(renderer
, template_
, string
, offset
);
135 add_dom_link(renderer
, string
, matchlen
, string
, matchlen
);
/* Whatever remains after the last match. */
142 render_dom_text(renderer
, template_
, string
, length
);
144 mem_free_if(alloc_string
);
/*
 * Stack callback that renders a node using the style registered for its
 * node type. Text, CDATA section and comment nodes can be routed through
 * render_dom_node_enhanced_text; other nodes go through
 * render_dom_node_text.
 *
 * The renderer is taken from stack->current->data. The xxx argument is not
 * used in the visible lines.
 *
 * NOTE(review): the start of the condition that selects the enhanced path,
 * whatever follows the enhanced call, the return type and the return
 * statement are not present in this listing.
 */
149 render_dom_node_source(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
151 struct dom_renderer
*renderer
= stack
->current
->data
;
152 struct source_renderer
*data
= renderer
->data
;
154 assert(node
&& renderer
&& renderer
->document
);
158 && (node
->type
== DOM_NODE_TEXT
159 || node
->type
== DOM_NODE_CDATA_SECTION
160 || node
->type
== DOM_NODE_COMMENT
)) {
161 render_dom_node_enhanced_text(renderer
, node
);
164 render_dom_node_text(renderer
, &data
->styles
[node
->type
], node
);
/*
 * Stack callback that renders the string of an element node with the
 * style registered for the node type. The renderer is taken from
 * stack->current->data; the xxx argument is not used in the visible lines.
 *
 * NOTE(review): the return type and the return statement are not present
 * in this listing.
 */
169 /* This callback is also used for rendering processing instruction nodes. */
171 render_dom_element_source(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
173 struct dom_renderer
*renderer
= stack
->current
->data
;
174 struct source_renderer
*data
= renderer
->data
;
176 assert(node
&& renderer
&& renderer
->document
);
178 render_dom_node_text(renderer
, &data
->styles
[node
->type
], node
);
/*
 * Stack callback that renders the end token of an element.
 *
 * The end token is read from the SGML parser state attached to the top
 * stack state (looked up through stack->contexts[0]). When the token lies
 * inside the source buffer the pending source is flushed and the position
 * is moved past the token; the token text is rendered with the style of
 * the node type.
 *
 * NOTE(review): the statement guarded by the empty-token test, the return
 * type, the closing braces and the return statement are not present in
 * this listing.
 */
184 render_dom_element_end_source(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
186 struct dom_renderer
*renderer
= stack
->current
->data
;
187 struct source_renderer
*data
= renderer
->data
;
188 struct dom_stack_state
*state
= get_dom_stack_top(stack
);
189 struct sgml_parser_state
*pstate
= get_dom_stack_state_data(stack
->contexts
[0], state
);
190 struct dom_scanner_token
*token
= &pstate
->end_token
;
191 unsigned char *string
= token
->string
.string
;
192 int length
= token
->string
.length
;
194 assert(node
&& renderer
&& renderer
->document
);
/* An unset or empty end token; the guarded statement is missing from the
 * listing. */
196 if (!string
|| !length
)
199 if (check_dom_node_source(renderer
, string
, length
)) {
200 render_dom_flush(renderer
, string
);
201 renderer
->position
= string
+ length
;
202 assert_source(renderer
, renderer
->position
, 0);
205 render_dom_text(renderer
, &data
->styles
[node
->type
], string
, length
);
/*
 * Replace renderer->base_uri with the URI obtained by joining the current
 * base URI with the valuelen bytes starting at value.
 *
 * The value is copied with memacpy, joined with join_urls and converted
 * with get_uri; the previous base URI is released with done_uri before the
 * new one is stored. Returns early when join_urls yields no string.
 *
 * NOTE(review): the declaration of uri, any checks of href and uri, and any
 * release of href and uristring are not present in this listing -- confirm
 * that both allocations are freed on every path.
 */
211 set_base_uri(struct dom_renderer
*renderer
, unsigned char *value
, size_t valuelen
)
213 unsigned char *href
= memacpy(value
, valuelen
);
214 unsigned char *uristring
;
218 uristring
= join_urls(renderer
->base_uri
, href
);
221 if (!uristring
) return;
222 uri
= get_uri(uristring
, 0);
227 done_uri(renderer
->base_uri
);
228 renderer
->base_uri
= uri
;
/*
 * Stack callback that renders an attribute node: first the attribute name
 * through render_dom_node_text, then the value when one is set.
 *
 * For a quoted value the rendered range is widened by one byte on each
 * side so that it covers the quoting delimiters. When the attribute is
 * flagged as a reference and the unquoted value is not empty, the leading
 * delimiter, whitespace and control bytes are rendered as plain attribute
 * text, the trailing ones are counted, and the remaining middle part is
 * added as a link with add_dom_link. For an HTML document, a HREF attribute
 * whose parent element is BASE additionally updates the base URI through
 * set_base_uri. Another visible call renders the whole value as plain
 * attribute text; presumably this is the branch for non-reference values.
 *
 * NOTE(review): the declaration of skips, the loop exits, the pointer
 * adjustment after the leading skip, the else keyword, closing braces and
 * the return statement are not present in this listing.
 * NOTE(review): the comment that begins with Figure out what should be
 * skipped is never closed in this listing (its second line is missing).
 */
232 render_dom_attribute_source(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
234 struct dom_renderer
*renderer
= stack
->current
->data
;
235 struct source_renderer
*data
= renderer
->data
;
236 struct screen_char
*template_
= &data
->styles
[node
->type
];
238 assert(node
&& renderer
->document
);
/* The attribute name. */
240 render_dom_node_text(renderer
, template_
, node
);
242 if (is_dom_string_set(&node
->data
.attribute
.value
)) {
/* quoted is 1 for a quoted value, else 0; it is used below to widen the
 * value range over both delimiters. */
243 int quoted
= node
->data
.attribute
.quoted
== 1;
244 unsigned char *value
= node
->data
.attribute
.value
.string
- quoted
;
245 int valuelen
= node
->data
.attribute
.value
.length
+ quoted
* 2;
/* The range check uses length 0, so only the start of the value is
 * verified to be inside the source buffer here. */
247 if (check_dom_node_source(renderer
, value
, 0)) {
248 render_dom_flush(renderer
, value
);
249 renderer
->position
= value
+ valuelen
;
250 assert_source(renderer
, renderer
->position
, 0);
/* Only non-empty reference values become links. */
253 if (node
->data
.attribute
.reference
254 && valuelen
- quoted
* 2 > 0) {
257 /* Need to flush the first quoting delimiter and any
258 * leading whitespace so that the renderers x position
259 * is at the start of the value string. */
260 for (skips
= 0; skips
< valuelen
; skips
++) {
261 if ((quoted
&& skips
== 0)
262 || isspace(value
[skips
])
263 || value
[skips
] < ' ')
270 render_dom_text(renderer
, template_
, value
, skips
);
275 /* Figure out what should be skipped after the actual
277 for (skips
= 0; skips
< valuelen
; skips
++) {
278 if ((quoted
&& skips
== 0)
279 || isspace(value
[valuelen
- skips
- 1])
280 || value
[valuelen
- skips
- 1] < ' ')
286 if (renderer
->doctype
== SGML_DOCTYPE_HTML
287 && node
->data
.attribute
.type
== HTML_ATTRIBUTE_HREF
288 && node
->parent
->data
.element
.type
== HTML_ELEMENT_BASE
) {
289 set_base_uri(renderer
, value
, valuelen
- skips
);
292 add_dom_link(renderer
, value
, valuelen
- skips
,
293 value
, valuelen
- skips
);
296 value
+= valuelen
- skips
;
297 render_dom_text(renderer
, template_
, value
, skips
);
300 render_dom_text(renderer
, template_
, value
, valuelen
);
/*
 * Stack callback that renders a CDATA section node.
 *
 * When the six bytes in front of the node string are inside the source
 * buffer, the pending source up to that point is flushed, the first five
 * of those bytes are rendered with the DOM_NODE_ATTRIBUTE style and the
 * position is left one byte in front of the node string. The node string
 * itself is then rendered with the style of its node type.
 *
 * NOTE(review): string is read from node before the assert that checks
 * node. The return type, closing braces and the return statement are not
 * present in this listing.
 */
308 render_dom_cdata_source(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
310 struct dom_renderer
*renderer
= stack
->current
->data
;
311 struct source_renderer
*data
= renderer
->data
;
312 unsigned char *string
= node
->string
.string
;
314 assert(node
&& renderer
&& renderer
->document
);
316 /* Highlight the 'CDATA' part of <![CDATA[ if it is there. */
317 if (check_dom_node_source(renderer
, string
- 6, 6)) {
318 render_dom_flush(renderer
, string
- 6);
/* Five bytes are highlighted; the sixth byte in front of the string is
 * left to be flushed as ordinary text. */
319 render_dom_text(renderer
, &data
->styles
[DOM_NODE_ATTRIBUTE
], string
- 6, 5);
320 renderer
->position
= string
- 1;
321 assert_source(renderer
, renderer
->position
, 0);
324 render_dom_node_text(renderer
, &data
->styles
[node
->type
], node
);
/*
 * Stack callback run for the document node; sets up the source renderer.
 *
 * Visible steps:
 *  - on the first call only (guarded by a function-local static flag),
 *    parse a built-in set of colour rules into default_stylesheet;
 *  - allocate zeroed renderer state with mem_calloc and store it in
 *    renderer->data;
 *  - for every DOM node type, look up a CSS selector named after the node
 *    type and initialize the style template from its properties, or from
 *    NULL when there is no such selector;
 *  - when the document option plain_display_links is set, compile
 *    URL_REGEX into data->url_regex and call regfree when compilation
 *    fails.
 *
 * NOTE(review): no check of the mem_calloc result is present in the visible
 * lines although data is dereferenced afterwards -- confirm.
 * NOTE(review): what follows the regcomp failure branch, the closing braces
 * and the return statement are not present in this listing.
 */
331 render_dom_document_start(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
333 struct dom_renderer
*renderer
= stack
->current
->data
;
334 struct document
*document
= renderer
->document
;
335 struct source_renderer
*data
;
336 enum dom_node_type type
;
338 struct css_stylesheet
*css
= &default_stylesheet
;
/* Zero until the default colours have been parsed once. */
340 static int i_want_struct_module_for_dom
;
342 if (!i_want_struct_module_for_dom
) {
343 static const unsigned char default_colors
[] =
344 "document { color: yellow } "
345 "element { color: lightgreen } "
346 "entity-reference { color: red } "
347 "proc-instruction { color: red } "
348 "attribute { color: magenta } "
349 "comment { color: aqua } "
350 "cdata-section { color: orange2 } ";
352 i_want_struct_module_for_dom
= 1;
353 /* When someone will get here earlier than at 4am,
354 * this will be done in some init function, perhaps
355 * not overriding the user's default stylesheet. */
/* NOTE(review): sizeof counts the terminating NUL of the array, so the end
 * pointer passed here is one past the NUL -- confirm this is intended. */
356 css_parse_stylesheet(css
, NULL
, default_colors
,
357 default_colors
+ sizeof(default_colors
));
361 data
= renderer
->data
= mem_calloc(1, sizeof(*data
));
363 /* Initialize styles for all the DOM node types. */
365 for (type
= 0; type
< DOM_NODES
; type
++) {
366 struct screen_char
*template_
= &data
->styles
[type
];
367 struct dom_string
*name
= get_dom_node_type_name(type
);
368 struct css_selector
*selector
= NULL
;
/* Node types without a name keep a NULL selector and therefore get the
 * template initialized without selector properties. */
370 if (name
&& is_dom_string_set(name
))
371 selector
= find_css_selector(&css
->selectors
,
372 CST_ELEMENT
, CSR_ROOT
,
373 name
->string
, name
->length
);
374 init_template_by_style(template_
, &document
->options
,
375 selector
? &selector
->properties
: NULL
);
/* regcomp returns non-zero on failure. */
379 if (document
->options
.plain_display_links
) {
380 if (regcomp(&data
->url_regex
, URL_REGEX
, URL_REGFLAGS
)) {
381 regfree(&data
->url_regex
);
/*
 * Stack callback run when the document node is left; finishes rendering.
 *
 * Flushes any source that is still pending up to renderer->end, provided
 * the current position is inside the source buffer, and releases the
 * compiled URL regex with regfree.
 *
 * NOTE(review): any condition guarding the regfree call, any release of the
 * renderer data, the closing braces and the return statement are not
 * present in this listing.
 */
392 render_dom_document_end(struct dom_stack
*stack
, struct dom_node
*node
, void *xxx
)
394 struct dom_renderer
*renderer
= stack
->current
->data
;
395 struct source_renderer
*data
= renderer
->data
;
397 /* If there are no non-element nodes after the last element node make
398 * sure that we flush to the end of the cache entry source including
399 * the '>' of the last element tag if it has one. (bug 519) */
400 if (check_dom_node_source(renderer
, renderer
->position
, 0)) {
401 render_dom_flush(renderer
, renderer
->end
);
406 regfree(&data
->url_regex
);
411 /* It is not necessary to return DOM_CODE_FREE_NODE here.
412 * Because the parser was created with the SGML_PARSER_STREAM
413 * type, the stack has the DOM_STACK_FLAG_FREE_NODES flag and
414 * implicitly frees all nodes popped from it. */
/*
 * Stack context description of the source renderer: an object size of 0
 * followed by two lists of callbacks with one entry per DOM node type.
 *
 * NOTE(review): presumably the first list holds the callbacks run when a
 * node is pushed on the stack and the second those run when it is popped
 * (the second list only has entries for elements and the document) --
 * confirm against struct dom_stack_context_info. The braces around the
 * lists and the end of the initializer are not present in this listing.
 */
419 struct dom_stack_context_info dom_source_renderer_context_info
= {
420 /* Object size: */ 0,
424 /* DOM_NODE_ELEMENT */ render_dom_element_source
,
425 /* DOM_NODE_ATTRIBUTE */ render_dom_attribute_source
,
426 /* DOM_NODE_TEXT */ render_dom_node_source
,
427 /* DOM_NODE_CDATA_SECTION */ render_dom_cdata_source
,
428 /* DOM_NODE_ENTITY_REFERENCE */ render_dom_node_source
,
429 /* DOM_NODE_ENTITY */ render_dom_node_source
,
430 /* DOM_NODE_PROC_INSTRUCTION */ render_dom_element_source
,
431 /* DOM_NODE_COMMENT */ render_dom_node_source
,
432 /* DOM_NODE_DOCUMENT */ render_dom_document_start
,
433 /* DOM_NODE_DOCUMENT_TYPE */ render_dom_node_source
,
434 /* DOM_NODE_DOCUMENT_FRAGMENT */ render_dom_node_source
,
435 /* DOM_NODE_NOTATION */ render_dom_node_source
,
440 /* DOM_NODE_ELEMENT */ render_dom_element_end_source
,
441 /* DOM_NODE_ATTRIBUTE */ NULL
,
442 /* DOM_NODE_TEXT */ NULL
,
443 /* DOM_NODE_CDATA_SECTION */ NULL
,
444 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
445 /* DOM_NODE_ENTITY */ NULL
,
446 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
447 /* DOM_NODE_COMMENT */ NULL
,
448 /* DOM_NODE_DOCUMENT */ render_dom_document_end
,
449 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
450 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
451 /* DOM_NODE_NOTATION */ NULL
,