Use separate data variables for storing DOM stack data
[elinks.git] / src / document / sgml / parser.c
blob6770825d029a094188640463a1501f040615fe3d
1 /* SGML node handling */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdlib.h>
8 #include <string.h>
10 #include "elinks.h"
12 #include "cache/cache.h"
13 #include "document/document.h"
14 #include "document/dom/node.h"
15 #include "document/dom/stack.h"
16 #include "document/html/renderer.h" /* TODO: Move get_convert_table() */
17 #include "document/sgml/html/html.h"
18 #include "document/sgml/parser.h"
19 #include "document/sgml/scanner.h"
20 #include "document/sgml/sgml.h"
21 #include "intl/charsets.h"
22 #include "protocol/uri.h"
23 #include "util/error.h"
24 #include "util/lists.h"
25 #include "util/memory.h"
26 #include "util/string.h"
29 /* Functions for adding new nodes to the DOM tree */
31 static inline struct dom_node *
32 add_sgml_document(struct dom_stack *stack, struct uri *uri)
34 unsigned char *string = struri(uri);
35 int length = strlen(string);
36 struct dom_node *node = init_dom_node(DOM_NODE_DOCUMENT, string, length);
38 return node ? push_dom_node(stack, node) : node;
41 static inline struct dom_node *
42 add_sgml_element(struct dom_stack *stack, struct scanner_token *token)
44 struct sgml_parser *parser = stack->parser;
45 struct dom_node *parent = get_dom_stack_top(stack)->node;
46 struct dom_stack_state *state;
47 struct sgml_parser_state *pstate;
48 struct dom_node *node;
49 struct sgml_node_info *node_info;
51 node = add_dom_element(parent, token->string, token->length);
52 if (!node) return NULL;
54 node_info = get_sgml_node_info(parser->info->elements, node);
55 node->data.element.type = node_info->type;
57 if (!push_dom_node(stack, node))
58 return NULL;
60 state = get_dom_stack_top(stack);
61 assert(node == state->node);
63 pstate = get_dom_stack_state_data(stack, state);
64 pstate->info = node_info;
66 return node;
70 static inline void
71 add_sgml_attribute(struct dom_stack *stack,
72 struct scanner_token *token, struct scanner_token *valtoken)
74 struct sgml_parser *parser = stack->parser;
75 struct dom_node *parent = get_dom_stack_top(stack)->node;
76 unsigned char *value = valtoken ? valtoken->string : NULL;
77 uint16_t valuelen = valtoken ? valtoken->length : 0;
78 struct sgml_node_info *info;
79 struct dom_node *node;
81 node = add_dom_attribute(parent, token->string, token->length,
82 value, valuelen);
84 info = get_sgml_node_info(parser->info->attributes, node);
86 node->data.attribute.type = info->type;
87 node->data.attribute.id = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER);
88 node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE);
90 if (valtoken && valtoken->type == SGML_TOKEN_STRING)
91 node->data.attribute.quoted = 1;
93 if (!node || !push_dom_node(stack, node))
94 return;
96 pop_dom_node(stack);
99 static inline struct dom_node *
100 add_sgml_proc_instruction(struct dom_stack *stack, struct scanner_token *token)
102 struct dom_node *parent = get_dom_stack_top(stack)->node;
103 struct dom_node *node;
104 /* Split the token in two if we can find a first space separator. */
105 unsigned char *separator = memchr(token->string, ' ', token->length);
107 /* Anything before the separator becomes the target name ... */
108 unsigned char *name = token->string;
109 int namelen = separator ? separator - token->string : token->length;
111 /* ... and everything after the instruction value. */
112 unsigned char *value = separator ? separator + 1 : NULL;
113 int valuelen = value ? token->length - namelen - 1 : 0;
115 node = add_dom_proc_instruction(parent, name, namelen, value, valuelen);
116 if (!node) return NULL;
118 switch (token->type) {
119 case SGML_TOKEN_PROCESS_XML:
120 node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
121 break;
123 case SGML_TOKEN_PROCESS:
124 default:
125 node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
128 if (!push_dom_node(stack, node))
129 return NULL;
131 if (token->type != SGML_TOKEN_PROCESS_XML)
132 pop_dom_node(stack);
134 return node;
137 static inline void
138 add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct scanner_token *token)
140 struct dom_node *parent = get_dom_stack_top(stack)->node;
141 struct dom_node *node = add_dom_node(parent, type, token->string, token->length);
143 if (!node) return;
145 if (token->type == SGML_TOKEN_SPACE)
146 node->data.text.only_space = 1;
148 if (push_dom_node(stack, node))
149 pop_dom_node(stack);
/* Shorthands for add_sgml_node() with a fixed DOM node type. */
#define add_sgml_entityref(stack, token) \
	add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token)
#define add_sgml_text(stack, token) \
	add_sgml_node(stack, DOM_NODE_TEXT, token)
#define add_sgml_comment(stack, token) \
	add_sgml_node(stack, DOM_NODE_COMMENT, token)
156 static inline void
157 parse_sgml_attributes(struct dom_stack *stack, struct scanner *scanner)
159 struct scanner_token name;
161 assert(scanner_has_tokens(scanner)
162 && (get_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
163 || get_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML));
165 skip_scanner_token(scanner);
167 while (scanner_has_tokens(scanner)) {
168 struct scanner_token *token = get_scanner_token(scanner);
170 assert(token);
172 switch (token->type) {
173 case SGML_TOKEN_TAG_END:
174 skip_scanner_token(scanner);
175 /* and return */
176 case SGML_TOKEN_ELEMENT:
177 case SGML_TOKEN_ELEMENT_BEGIN:
178 case SGML_TOKEN_ELEMENT_END:
179 case SGML_TOKEN_ELEMENT_EMPTY_END:
180 return;
182 case SGML_TOKEN_IDENT:
183 copy_struct(&name, token);
185 /* Skip the attribute name token */
186 token = get_next_scanner_token(scanner);
187 if (token && token->type == '=') {
188 /* If the token is not a valid value token
189 * ignore it. */
190 token = get_next_scanner_token(scanner);
191 if (token
192 && token->type != SGML_TOKEN_IDENT
193 && token->type != SGML_TOKEN_ATTRIBUTE
194 && token->type != SGML_TOKEN_STRING)
195 token = NULL;
196 } else {
197 token = NULL;
200 add_sgml_attribute(stack, &name, token);
202 /* Skip the value token */
203 if (token)
204 skip_scanner_token(scanner);
205 break;
207 default:
208 skip_scanner_token(scanner);
214 void
215 parse_sgml_document(struct dom_stack *stack, struct scanner *scanner)
217 while (scanner_has_tokens(scanner)) {
218 struct scanner_token *token = get_scanner_token(scanner);
220 switch (token->type) {
221 case SGML_TOKEN_ELEMENT:
222 case SGML_TOKEN_ELEMENT_BEGIN:
223 if (!add_sgml_element(stack, token)) {
224 if (token->type == SGML_TOKEN_ELEMENT) {
225 skip_scanner_token(scanner);
226 break;
229 skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
230 break;
233 if (token->type == SGML_TOKEN_ELEMENT_BEGIN) {
234 parse_sgml_attributes(stack, scanner);
235 } else {
236 skip_scanner_token(scanner);
239 break;
241 case SGML_TOKEN_ELEMENT_EMPTY_END:
242 pop_dom_node(stack);
243 skip_scanner_token(scanner);
244 break;
246 case SGML_TOKEN_ELEMENT_END:
247 if (!token->length) {
248 pop_dom_node(stack);
249 } else {
250 pop_dom_nodes(stack, DOM_NODE_ELEMENT,
251 token->string, token->length);
253 skip_scanner_token(scanner);
254 break;
256 case SGML_TOKEN_NOTATION_COMMENT:
257 add_sgml_comment(stack, token);
258 skip_scanner_token(scanner);
259 break;
261 case SGML_TOKEN_NOTATION_ATTLIST:
262 case SGML_TOKEN_NOTATION_DOCTYPE:
263 case SGML_TOKEN_NOTATION_ELEMENT:
264 case SGML_TOKEN_NOTATION_ENTITY:
265 case SGML_TOKEN_NOTATION:
266 skip_scanner_token(scanner);
267 break;
269 case SGML_TOKEN_PROCESS_XML:
270 if (!add_sgml_proc_instruction(stack, token)) {
271 skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
272 break;
275 parse_sgml_attributes(stack, scanner);
276 pop_dom_node(stack);
277 break;
279 case SGML_TOKEN_PROCESS:
280 add_sgml_proc_instruction(stack, token);
281 skip_scanner_token(scanner);
282 break;
284 case SGML_TOKEN_ENTITY:
285 add_sgml_entityref(stack, token);
286 skip_scanner_token(scanner);
287 break;
289 case SGML_TOKEN_SPACE:
290 case SGML_TOKEN_TEXT:
291 default:
292 add_sgml_text(stack, token);
293 skip_scanner_token(scanner);
299 struct sgml_parser *
300 init_sgml_parser(struct cache_entry *cached, struct document *document)
302 size_t obj_size = sizeof(struct sgml_parser_state);
303 struct sgml_parser *parser;
305 parser = mem_calloc(1, sizeof(*parser));
306 if (!parser) return NULL;
308 parser->document = document;
309 parser->cache_entry = cached;
310 parser->info = &sgml_html_info;
312 init_dom_stack(&parser->stack, parser, NULL, parser->info->callbacks, obj_size);
314 if (document->options.plain)
315 parser->flags |= SGML_PARSER_ADD_ELEMENT_ENDS;
317 return parser;
320 void
321 done_sgml_parser(struct sgml_parser *parser)
323 done_dom_stack(&parser->stack);
324 mem_free(parser);
327 /* FIXME: Make it possible to push variable number of strings (even nested
328 * while parsing another string) so that we can feed back output of stuff
329 * like ECMAScripts document.write(). */
330 struct dom_node *
331 parse_sgml(struct sgml_parser *parser, struct string *buffer)
333 unsigned char *source = buffer->source;
334 unsigned char *end = source + buffer->length;
336 init_scanner(&parser->scanner, &sgml_scanner_info, source, end);
338 parser->root = add_sgml_document(&parser->stack, parser->document->uri);
339 if (parser->root) {
340 parse_sgml_document(&parser->stack, &parser->scanner);
343 return parser->root;