wined3d: Respect the BO memory offset in wined3d_context_gl_map_bo_address().
[wine.git] / libs / xml2 / HTMLparser.c
blob70c6bfa18f628f51e66b1376c919f963e31e7cdc
1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
6 * daniel@veillard.com
7 */
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
47 #include "buf.h"
48 #include "enc.h"
/*
 * Size limits for the HTML parser.
 * NOTE(review): the code using these constants is outside this excerpt;
 * confirm their exact role (name length cap, buffer sizes) against it.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Compile-time debug switches, left disabled. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* NOTE(review): initialised to 1; its readers/writers are not visible here. */
static int htmlOmittedDefaultValue = 1;

/*
 * Forward declarations. htmlParseComment is static, so its definition must
 * appear later in this translation unit; htmlDecodeEntities has external
 * linkage.
 */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
63 /************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
69 /**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
72 * @extra: extra information
74 * Handle a redefinition of attribute error
76 static void
77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
98 /**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
108 static void LIBXML_ATTR_FORMAT(3,0)
109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
135 static void LIBXML_ATTR_FORMAT(3,0)
136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
151 /************************************************************************
153 * Parser stacks related functions and macros *
155 ************************************************************************/
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
162 * Pushes a new element name on top of the name stack
164 * Returns 0 in case of error, the index in the stack otherwise
166 static int
167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
192 * Pops the top element name from the name stack
194 * Returns the name just removed
196 static const xmlChar *
197 htmlnamePop(htmlParserCtxtPtr ctxt)
199 const xmlChar *ret;
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
220 * Pushes a new element name on top of the node info stack
222 * Returns 0 in case of error, the index in the stack otherwise
224 static int
225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
249 * Pops the top element name from the node info stack
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
253 static htmlParserNodeInfo *
254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named `ctxt` (the parser context) to be in
 * scope at the point of use.
 *
 * Byte-level macros, only meaningful when comparing against ASCII values:
 *
 *   CUR_PTR   the current pointer into the input (an lvalue).
 *   BASE_PTR  the base pointer of the current input buffer.
 *   CUR       the current input byte, converted to int.
 *   CURRENT   same as CUR.
 *   RAW       -1 if ctxt->token is set, else the current input byte.
 *   NXT(n)    the input byte n positions after the current one.
 *   UPPER     the current input byte converted to uppercase.
 *   UPP(n)    the n'th next input byte converted to uppercase.
 *   SKIP(n)   advance the input pointer and the column counter by n; must
 *             only be used over text that contains no newline.
 *
 * Character-level macros:
 *
 *   NEXT          advance by one character using xmlNextChar().
 *   NEXTL(l)      advance by l bytes, updating line/col and clearing
 *                 ctxt->token.
 *   CUR_CHAR(l)   htmlCurrentChar() on the context, storing the length in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on string s, length stored in l.
 *   COPY_BUF(l,b,i,v)  append character v (l bytes long) to b at index i,
 *                 advancing i.
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW      when not in progressive mode and fewer than INPUT_CHUNK bytes
 *             remain, call xmlParserInputGrow().
 *
 * SHRINK, GROW and COPY_BUF expand to bare `if` statements: always use them
 * as a full statement inside braces, never as the body of an unbraced
 * if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ((ctxt->input->cur += (val)), (ctxt->input->col += (val)))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
350 * htmlFindEncoding:
351 * @the HTML parser context
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
363 static xmlChar *
364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
412 * Returns the current char value and its length
415 static int
416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
445 return((int) *ctxt->input->cur);
449 * Humm this is bad, do an automatic flow conversion
451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
461 * Don't use UTF-8 encoder which isn't required and
462 * can produce invalid UTF-8.
464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465 xmlSwitchToEncoding(ctxt, handler);
466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
475 * We are supposed to handle UTF8, check it's valid
476 * From rfc2044: encoding of the Unicode values on UTF-8:
478 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479 * 0000 0000-0000 007F 0xxxxxxx
480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
483 * Check for the 0x110000 limit too
485 cur = ctxt->input->cur;
486 c = *cur;
487 if (c & 0x80) {
488 if ((c & 0x40) == 0)
489 goto encoding_error;
490 if (cur[1] == 0) {
491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492 cur = ctxt->input->cur;
494 if ((cur[1] & 0xc0) != 0x80)
495 goto encoding_error;
496 if ((c & 0xe0) == 0xe0) {
498 if (cur[2] == 0) {
499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500 cur = ctxt->input->cur;
502 if ((cur[2] & 0xc0) != 0x80)
503 goto encoding_error;
504 if ((c & 0xf0) == 0xf0) {
505 if (cur[3] == 0) {
506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507 cur = ctxt->input->cur;
509 if (((c & 0xf8) != 0xf0) ||
510 ((cur[3] & 0xc0) != 0x80))
511 goto encoding_error;
512 /* 4-byte code */
513 *len = 4;
514 val = (cur[0] & 0x7) << 18;
515 val |= (cur[1] & 0x3f) << 12;
516 val |= (cur[2] & 0x3f) << 6;
517 val |= cur[3] & 0x3f;
518 if (val < 0x10000)
519 goto encoding_error;
520 } else {
521 /* 3-byte code */
522 *len = 3;
523 val = (cur[0] & 0xf) << 12;
524 val |= (cur[1] & 0x3f) << 6;
525 val |= cur[2] & 0x3f;
526 if (val < 0x800)
527 goto encoding_error;
529 } else {
530 /* 2-byte code */
531 *len = 2;
532 val = (cur[0] & 0x1f) << 6;
533 val |= cur[1] & 0x3f;
534 if (val < 0x80)
535 goto encoding_error;
537 if (!IS_CHAR(val)) {
538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539 "Char 0x%X out of allowed range\n", val);
541 return(val);
542 } else {
543 if ((*ctxt->input->cur == 0) &&
544 (ctxt->input->cur < ctxt->input->end)) {
545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546 "Char 0x%X out of allowed range\n", 0);
547 *len = 1;
548 return(' ');
550 /* 1-byte code */
551 *len = 1;
552 return((int) *ctxt->input->cur);
555 encoding_error:
557 * If we detect an UTF8 error that probably mean that the
558 * input encoding didn't get properly advertised in the
559 * declaration header. Report the error and switch the encoding
560 * to ISO-Latin-1 (if you don't like this policy, just declare the
561 * encoding !)
564 char buffer[150];
566 if (ctxt->input->end - ctxt->input->cur >= 4) {
567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 ctxt->input->cur[0], ctxt->input->cur[1],
569 ctxt->input->cur[2], ctxt->input->cur[3]);
570 } else {
571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 "Input is not proper UTF-8, indicate encoding !\n",
575 BAD_CAST buffer, NULL);
579 * Don't switch encodings twice. Note that if there's an encoder, we
580 * shouldn't receive invalid UTF-8 anyway.
582 * Note that if ctxt->input->buf == NULL, switching encodings is
583 * impossible, see Gitlab issue #34.
585 if ((ctxt->input->buf != NULL) &&
586 (ctxt->input->buf->encoder == NULL))
587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588 *len = 1;
589 return((int) *ctxt->input->cur);
593 * htmlSkipBlankChars:
594 * @ctxt: the HTML parser context
596 * skip all blanks character found at that point in the input streams.
598 * Returns the number of space chars skipped
601 static int
602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603 int res = 0;
605 while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 if ((*ctxt->input->cur == 0) &&
607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 xmlPopInput(ctxt);
609 } else {
610 if (*(ctxt->input->cur) == '\n') {
611 ctxt->input->line++; ctxt->input->col = 1;
612 } else ctxt->input->col++;
613 ctxt->input->cur++;
614 if (*ctxt->input->cur == 0)
615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
617 res++;
619 return(res);
624 /************************************************************************
626 * The list of HTML elements and their properties *
628 ************************************************************************/
631 * Start Tag: 1 means the start tag can be omitted
632 * End Tag: 1 means the end tag can be omitted
633 * 2 means it's forbidden (empty elements)
634 * 3 means the tag is stylistic and should be closed easily
635 * Depr: this element is deprecated
636 * DTD: 1 means that this element is valid only in the Loose DTD
637 * 2 means that this element is valid only in the Frameset DTD
639 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 , subElements , impliedsubelt , Attributes, userdata
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma separated list of element names and
 * has a matching NB_* macro giving the number of names. The NB_* sums are
 * parenthesised so they can be used safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* Every name list below is terminated by a NULL entry. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The terminating NULL was missing here, unlike every other list. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast used when storing the const tables above in element descriptors. */
#define DECL (const char**)
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
798 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
800 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
806 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
807 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
809 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
812 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
815 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
821 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
824 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
827 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
828 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
833 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
836 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
839 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
842 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
843 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
845 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
848 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
851 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
854 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
855 EMPTY , NULL , DECL col_attrs , NULL, NULL
857 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
860 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
861 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
863 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
866 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
867 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
869 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
870 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
872 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
875 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
876 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
878 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
879 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
881 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
884 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 EMPTY, NULL, DECL embed_attrs, NULL, NULL
887 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
890 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
893 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
896 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 EMPTY, NULL, NULL, DECL frame_attrs, NULL
899 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
902 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
905 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
908 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
911 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
914 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
917 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
920 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
923 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
926 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
929 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
935 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
938 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
939 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
941 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
942 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
944 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
947 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
950 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
953 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
956 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
957 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
959 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
962 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
965 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
968 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
971 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
974 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 DECL html_flow, "div", DECL html_attrs, NULL, NULL
977 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
980 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
983 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
986 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
989 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
992 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
993 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
995 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
998 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1001 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1004 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1010 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 DECL select_content, NULL, DECL select_attrs, NULL, NULL
1013 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1014 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1016 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1019 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1022 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1025 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1026 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1028 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1029 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1031 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1034 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1035 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1037 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1038 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1040 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1043 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1046 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1049 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1052 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1053 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1055 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1056 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1058 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1059 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1061 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1064 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1067 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1070 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
/*
 * One auto-close rule: a start tag named newTag implies the end of a
 * currently open element named oldTag.
 */
1075 typedef struct {
1076 const char *oldTag; /* name of the open element that gets closed */
1077 const char *newTag; /* name of the start tag that closes it */
1078 } htmlStartCloseEntry;
1081 * start tags that imply the end of current element
/*
 * Keep this table sorted by oldTag, then by newTag, in strcmp() order:
 * htmlCheckAutoClose() looks entries up with bsearch() using
 * htmlCompareStartClose(), which compares exactly those two fields.
 */
1083 static const htmlStartCloseEntry htmlStartClose[] = {
1084 { "a", "a" },
1085 { "a", "fieldset" },
1086 { "a", "table" },
1087 { "a", "td" },
1088 { "a", "th" },
1089 { "address", "dd" },
1090 { "address", "dl" },
1091 { "address", "dt" },
1092 { "address", "form" },
1093 { "address", "li" },
1094 { "address", "ul" },
1095 { "b", "center" },
1096 { "b", "p" },
1097 { "b", "td" },
1098 { "b", "th" },
1099 { "big", "p" },
1100 { "caption", "col" },
1101 { "caption", "colgroup" },
1102 { "caption", "tbody" },
1103 { "caption", "tfoot" },
1104 { "caption", "thead" },
1105 { "caption", "tr" },
1106 { "col", "col" },
1107 { "col", "colgroup" },
1108 { "col", "tbody" },
1109 { "col", "tfoot" },
1110 { "col", "thead" },
1111 { "col", "tr" },
1112 { "colgroup", "colgroup" },
1113 { "colgroup", "tbody" },
1114 { "colgroup", "tfoot" },
1115 { "colgroup", "thead" },
1116 { "colgroup", "tr" },
1117 { "dd", "dt" },
1118 { "dir", "dd" },
1119 { "dir", "dl" },
1120 { "dir", "dt" },
1121 { "dir", "form" },
1122 { "dir", "ul" },
1123 { "dl", "form" },
1124 { "dl", "li" },
1125 { "dt", "dd" },
1126 { "dt", "dl" },
1127 { "font", "center" },
1128 { "font", "td" },
1129 { "font", "th" },
1130 { "form", "form" },
1131 { "h1", "fieldset" },
1132 { "h1", "form" },
1133 { "h1", "li" },
1134 { "h1", "p" },
1135 { "h1", "table" },
1136 { "h2", "fieldset" },
1137 { "h2", "form" },
1138 { "h2", "li" },
1139 { "h2", "p" },
1140 { "h2", "table" },
1141 { "h3", "fieldset" },
1142 { "h3", "form" },
1143 { "h3", "li" },
1144 { "h3", "p" },
1145 { "h3", "table" },
1146 { "h4", "fieldset" },
1147 { "h4", "form" },
1148 { "h4", "li" },
1149 { "h4", "p" },
1150 { "h4", "table" },
1151 { "h5", "fieldset" },
1152 { "h5", "form" },
1153 { "h5", "li" },
1154 { "h5", "p" },
1155 { "h5", "table" },
1156 { "h6", "fieldset" },
1157 { "h6", "form" },
1158 { "h6", "li" },
1159 { "h6", "p" },
1160 { "h6", "table" },
1161 { "head", "a" },
1162 { "head", "abbr" },
1163 { "head", "acronym" },
1164 { "head", "address" },
1165 { "head", "b" },
1166 { "head", "bdo" },
1167 { "head", "big" },
1168 { "head", "blockquote" },
1169 { "head", "body" },
1170 { "head", "br" },
1171 { "head", "center" },
1172 { "head", "cite" },
1173 { "head", "code" },
1174 { "head", "dd" },
1175 { "head", "dfn" },
1176 { "head", "dir" },
1177 { "head", "div" },
1178 { "head", "dl" },
1179 { "head", "dt" },
1180 { "head", "em" },
1181 { "head", "fieldset" },
1182 { "head", "font" },
1183 { "head", "form" },
1184 { "head", "frameset" },
1185 { "head", "h1" },
1186 { "head", "h2" },
1187 { "head", "h3" },
1188 { "head", "h4" },
1189 { "head", "h5" },
1190 { "head", "h6" },
1191 { "head", "hr" },
1192 { "head", "i" },
1193 { "head", "iframe" },
1194 { "head", "img" },
1195 { "head", "kbd" },
1196 { "head", "li" },
1197 { "head", "listing" },
1198 { "head", "map" },
1199 { "head", "menu" },
1200 { "head", "ol" },
1201 { "head", "p" },
1202 { "head", "pre" },
1203 { "head", "q" },
1204 { "head", "s" },
1205 { "head", "samp" },
1206 { "head", "small" },
1207 { "head", "span" },
1208 { "head", "strike" },
1209 { "head", "strong" },
1210 { "head", "sub" },
1211 { "head", "sup" },
1212 { "head", "table" },
1213 { "head", "tt" },
1214 { "head", "u" },
1215 { "head", "ul" },
1216 { "head", "var" },
1217 { "head", "xmp" },
1218 { "hr", "form" },
1219 { "i", "center" },
1220 { "i", "p" },
1221 { "i", "td" },
1222 { "i", "th" },
1223 { "legend", "fieldset" },
1224 { "li", "li" },
1225 { "link", "body" },
1226 { "link", "frameset" },
1227 { "listing", "dd" },
1228 { "listing", "dl" },
1229 { "listing", "dt" },
1230 { "listing", "fieldset" },
1231 { "listing", "form" },
1232 { "listing", "li" },
1233 { "listing", "table" },
1234 { "listing", "ul" },
1235 { "menu", "dd" },
1236 { "menu", "dl" },
1237 { "menu", "dt" },
1238 { "menu", "form" },
1239 { "menu", "ul" },
1240 { "ol", "form" },
1241 { "ol", "ul" },
1242 { "option", "optgroup" },
1243 { "option", "option" },
1244 { "p", "address" },
1245 { "p", "blockquote" },
1246 { "p", "body" },
1247 { "p", "caption" },
1248 { "p", "center" },
1249 { "p", "col" },
1250 { "p", "colgroup" },
1251 { "p", "dd" },
1252 { "p", "dir" },
1253 { "p", "div" },
1254 { "p", "dl" },
1255 { "p", "dt" },
1256 { "p", "fieldset" },
1257 { "p", "form" },
1258 { "p", "frameset" },
1259 { "p", "h1" },
1260 { "p", "h2" },
1261 { "p", "h3" },
1262 { "p", "h4" },
1263 { "p", "h5" },
1264 { "p", "h6" },
1265 { "p", "head" },
1266 { "p", "hr" },
1267 { "p", "li" },
1268 { "p", "listing" },
1269 { "p", "menu" },
1270 { "p", "ol" },
1271 { "p", "p" },
1272 { "p", "pre" },
1273 { "p", "table" },
1274 { "p", "tbody" },
1275 { "p", "td" },
1276 { "p", "tfoot" },
1277 { "p", "th" },
1278 { "p", "title" },
1279 { "p", "tr" },
1280 { "p", "ul" },
1281 { "p", "xmp" },
1282 { "pre", "dd" },
1283 { "pre", "dl" },
1284 { "pre", "dt" },
1285 { "pre", "fieldset" },
1286 { "pre", "form" },
1287 { "pre", "li" },
1288 { "pre", "table" },
1289 { "pre", "ul" },
1290 { "s", "p" },
1291 { "script", "noscript" },
1292 { "small", "p" },
1293 { "span", "td" },
1294 { "span", "th" },
1295 { "strike", "p" },
1296 { "style", "body" },
1297 { "style", "frameset" },
1298 { "tbody", "tbody" },
1299 { "tbody", "tfoot" },
1300 { "td", "tbody" },
1301 { "td", "td" },
1302 { "td", "tfoot" },
1303 { "td", "th" },
1304 { "td", "tr" },
1305 { "tfoot", "tbody" },
1306 { "th", "tbody" },
1307 { "th", "td" },
1308 { "th", "tfoot" },
1309 { "th", "th" },
1310 { "th", "tr" },
1311 { "thead", "tbody" },
1312 { "thead", "tfoot" },
1313 { "title", "body" },
1314 { "title", "frameset" },
1315 { "tr", "tbody" },
1316 { "tr", "tfoot" },
1317 { "tr", "tr" },
1318 { "tt", "p" },
1319 { "u", "p" },
1320 { "u", "td" },
1321 { "u", "th" },
1322 { "ul", "address" },
1323 { "ul", "form" },
1324 { "ul", "menu" },
1325 { "ul", "ol" },
1326 { "ul", "pre" },
1327 { "xmp", "dd" },
1328 { "xmp", "dl" },
1329 { "xmp", "dt" },
1330 { "xmp", "fieldset" },
1331 { "xmp", "form" },
1332 { "xmp", "li" },
1333 { "xmp", "table" },
1334 { "xmp", "ul" }
1338 * The list of HTML elements which are supposed not to have
1339 * CDATA content and where a p element will be implied
1341 * TODO: extend that list by reading the HTML SGML DTD on
1342 * implied paragraph
/*
 * NULL-terminated list: htmlCheckParagraph() walks it until it reaches the
 * NULL entry and implies a <p> when the current element name matches.
 */
1344 static const char *const htmlNoContentElements[] = {
1345 "html",
1346 "head",
1347 NULL /* end-of-list marker */
1351 * The list of HTML attributes which are of content %Script;
1352 * NOTE: when adding ones, check htmlIsScriptAttribute() since
1353 * it assumes the name starts with 'on'
/*
 * Not NULL-terminated: htmlIsScriptAttribute() derives the entry count
 * from sizeof. That function rejects any name that does not begin with
 * "on" before it scans this list, so every entry must keep that prefix.
 */
1355 static const char *const htmlScriptAttributes[] = {
1356 "onclick",
1357 "ondblclick",
1358 "onmousedown",
1359 "onmouseup",
1360 "onmouseover",
1361 "onmousemove",
1362 "onmouseout",
1363 "onkeypress",
1364 "onkeydown",
1365 "onkeyup",
1366 "onload",
1367 "onunload",
1368 "onfocus",
1369 "onblur",
1370 "onsubmit",
1371 "onreset",
1372 "onchange",
1373 "onselect"
1377 * This table is used by the htmlparser to know what to do with
1378 * broken html pages. By assigning different priorities to different
1379 * elements the parser can decide how to handle extra endtags.
1380 * Endtags are only allowed to close elements with lower or equal
1381 * priority.
/* Associates an element name with its end-tag priority. */
1384 typedef struct {
1385 const char *name; /* element name; NULL marks the last entry of htmlEndPriority */
1386 int priority; /* value handed back by htmlGetEndPriority() */
1387 } elementPriority;
/*
 * Scanned linearly by htmlGetEndPriority(). The final entry has a NULL
 * name: it stops the scan and supplies the priority returned for every
 * element that is not listed explicitly.
 */
1389 static const elementPriority htmlEndPriority[] = {
1390 {"div", 150},
1391 {"td", 160},
1392 {"th", 160},
1393 {"tr", 170},
1394 {"thead", 180},
1395 {"tbody", 180},
1396 {"tfoot", 180},
1397 {"table", 190},
1398 {"head", 200},
1399 {"body", 200},
1400 {"html", 220},
1401 {NULL, 100} /* Default priority */
1404 /************************************************************************
1406 * functions to handle HTML specific data *
1408 ************************************************************************/
/**
 * htmlInitAutoClose:
 *
 * This is a no-op now: the body is intentionally empty.
 */
void
htmlInitAutoClose(void)
{
    /* nothing to initialize */
}
1419 static int __cdecl
1420 htmlCompareTags(const void *key, const void *member) {
1421 const xmlChar *tag = (const xmlChar *) key;
1422 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1424 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1428 * htmlTagLookup:
1429 * @tag: The tag name in lowercase
1431 * Lookup the HTML tag in the ElementTable
1433 * Returns the related htmlElemDescPtr or NULL if not found.
1435 const htmlElemDesc *
1436 htmlTagLookup(const xmlChar *tag) {
1437 if (tag == NULL)
1438 return(NULL);
1440 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442 sizeof(htmlElemDesc), htmlCompareTags));
1446 * htmlGetEndPriority:
1447 * @name: The name of the element to look up the priority for.
1449 * Return value: The "endtag" priority.
1451 static int
1452 htmlGetEndPriority (const xmlChar *name) {
1453 int i = 0;
1455 while ((htmlEndPriority[i].name != NULL) &&
1456 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 i++;
1459 return(htmlEndPriority[i].priority);
1463 static int __cdecl
1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467 int ret;
1469 ret = strcmp(key->oldTag, entry->oldTag);
1470 if (ret == 0)
1471 ret = strcmp(key->newTag, entry->newTag);
1473 return(ret);
1477 * htmlCheckAutoClose:
1478 * @newtag: The new tag name
1479 * @oldtag: The old tag name
1481 * Checks whether the new tag is one of the registered valid tags for
1482 * closing old.
1484 * Returns 0 if no, 1 if yes.
1486 static int
1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1489 htmlStartCloseEntry key;
1490 void *res;
1492 key.oldTag = (const char *) oldtag;
1493 key.newTag = (const char *) newtag;
1494 res = bsearch(&key, htmlStartClose,
1495 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497 return(res != NULL);
1501 * htmlAutoCloseOnClose:
1502 * @ctxt: an HTML parser context
1503 * @newtag: The new tag name
1504 * @force: force the tag closure
1506 * The HTML DTD allows an ending tag to implicitly close other tags.
1508 static void
1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1511 const htmlElemDesc *info;
1512 int i, priority;
1514 priority = htmlGetEndPriority(newtag);
1516 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1518 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519 break;
1521 * A misplaced endtag can only close elements with lower
1522 * or equal priority, so if we find an element with higher
1523 * priority before we find an element with
1524 * matching name, we just ignore this endtag
1526 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527 return;
1529 if (i < 0)
1530 return;
1532 while (!xmlStrEqual(newtag, ctxt->name)) {
1533 info = htmlTagLookup(ctxt->name);
1534 if ((info != NULL) && (info->endTag == 3)) {
1535 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 "Opening and ending tag mismatch: %s and %s\n",
1537 newtag, ctxt->name);
1539 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 htmlnamePop(ctxt);
1546 * htmlAutoCloseOnEnd:
1547 * @ctxt: an HTML parser context
1549 * Close all remaining tags at the end of the stream
1551 static void
1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1554 int i;
1556 if (ctxt->nameNr == 0)
1557 return;
1558 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 htmlnamePop(ctxt);
1566 * htmlAutoClose:
1567 * @ctxt: an HTML parser context
1568 * @newtag: The new tag name or NULL
1570 * The HTML DTD allows a tag to implicitly close other tags.
1571 * The list is kept in htmlStartClose array. This function is
1572 * called when a new tag has been detected and generates the
1573 * appropriates closes if possible/needed.
1574 * If newtag is NULL this mean we are at the end of the resource
1575 * and we should check
1577 static void
1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1580 while ((newtag != NULL) && (ctxt->name != NULL) &&
1581 (htmlCheckAutoClose(newtag, ctxt->name))) {
1582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 htmlnamePop(ctxt);
1586 if (newtag == NULL) {
1587 htmlAutoCloseOnEnd(ctxt);
1588 return;
1590 while ((newtag == NULL) && (ctxt->name != NULL) &&
1591 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 htmlnamePop(ctxt);
1601 * htmlAutoCloseTag:
1602 * @doc: the HTML document
1603 * @name: The tag name
1604 * @elem: the HTML element
1606 * The HTML DTD allows a tag to implicitly close other tags.
1607 * The list is kept in htmlStartClose array. This function checks
1608 * if the element or one of it's children would autoclose the
1609 * given tag.
1611 * Returns 1 if autoclose, 0 otherwise
1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615 htmlNodePtr child;
1617 if (elem == NULL) return(1);
1618 if (xmlStrEqual(name, elem->name)) return(0);
1619 if (htmlCheckAutoClose(elem->name, name)) return(1);
1620 child = elem->children;
1621 while (child != NULL) {
1622 if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 child = child->next;
1625 return(0);
1629 * htmlIsAutoClosed:
1630 * @doc: the HTML document
1631 * @elem: the HTML element
1633 * The HTML DTD allows a tag to implicitly close other tags.
1634 * The list is kept in htmlStartClose array. This function checks
1635 * if a tag is autoclosed by one of it's child
1637 * Returns 1 if autoclosed, 0 otherwise
1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641 htmlNodePtr child;
1643 if (elem == NULL) return(1);
1644 child = elem->children;
1645 while (child != NULL) {
1646 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 child = child->next;
1649 return(0);
1653 * htmlCheckImplied:
1654 * @ctxt: an HTML parser context
1655 * @newtag: The new tag name
1657 * The HTML DTD allows a tag to exists only implicitly
1658 * called when a new tag has been detected and generates the
1659 * appropriates implicit tags if missing
1661 static void
1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663 int i;
1665 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666 return;
1667 if (!htmlOmittedDefaultValue)
1668 return;
1669 if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 return;
1671 if (ctxt->nameNr <= 0) {
1672 htmlnamePush(ctxt, BAD_CAST"html");
1673 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1676 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677 return;
1678 if ((ctxt->nameNr <= 1) &&
1679 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685 if (ctxt->html >= 3) {
1686 /* we already saw or generated an <head> before */
1687 return;
1690 * dropped OBJECT ... i you put it first BODY will be
1691 * assumed !
1693 htmlnamePush(ctxt, BAD_CAST"head");
1694 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699 if (ctxt->html >= 10) {
1700 /* we already saw or generated a <body> before */
1701 return;
1703 for (i = 0;i < ctxt->nameNr;i++) {
1704 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 return;
1707 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 return;
1712 htmlnamePush(ctxt, BAD_CAST"body");
1713 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1719 * htmlCheckParagraph
1720 * @ctxt: an HTML parser context
1722 * Check whether a p element need to be implied before inserting
1723 * characters in the current element.
1725 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726 * in case of error.
1729 static int
1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731 const xmlChar *tag;
1732 int i;
1734 if (ctxt == NULL)
1735 return(-1);
1736 tag = ctxt->name;
1737 if (tag == NULL) {
1738 htmlAutoClose(ctxt, BAD_CAST"p");
1739 htmlCheckImplied(ctxt, BAD_CAST"p");
1740 htmlnamePush(ctxt, BAD_CAST"p");
1741 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 return(1);
1745 if (!htmlOmittedDefaultValue)
1746 return(0);
1747 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 htmlAutoClose(ctxt, BAD_CAST"p");
1750 htmlCheckImplied(ctxt, BAD_CAST"p");
1751 htmlnamePush(ctxt, BAD_CAST"p");
1752 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 return(1);
1757 return(0);
1761 * htmlIsScriptAttribute:
1762 * @name: an attribute name
1764 * Check if an attribute is of content type Script
1766 * Returns 1 is the attribute is a script 0 otherwise
1769 htmlIsScriptAttribute(const xmlChar *name) {
1770 unsigned int i;
1772 if (name == NULL)
1773 return(0);
1775 * all script attributes start with 'on'
1777 if ((name[0] != 'o') || (name[1] != 'n'))
1778 return(0);
1779 for (i = 0;
1780 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 i++) {
1782 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 return(1);
1785 return(0);
1788 /************************************************************************
1790 * The list of HTML predefined entities *
1792 ************************************************************************/
1795 static const htmlEntityDesc html40EntitiesTable[] = {
1797 * the 4 absolute ones, plus apostrophe.
1799 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1800 { 38, "amp", "ampersand, U+0026 ISOnum" },
1801 { 39, "apos", "single quote" },
1802 { 60, "lt", "less-than sign, U+003C ISOnum" },
1803 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1806 * A bunch still in the 128-255 range
1807 * Replacing them depend really on the charset used.
1809 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1810 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1812 { 163, "pound","pound sign, U+00A3 ISOnum" },
1813 { 164, "curren","currency sign, U+00A4 ISOnum" },
1814 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1815 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816 { 167, "sect", "section sign, U+00A7 ISOnum" },
1817 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1819 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1820 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821 { 172, "not", "not sign, U+00AC ISOnum" },
1822 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1824 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1826 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830 { 181, "micro","micro sign, U+00B5 ISOnum" },
1831 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1835 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1836 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1858 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1865 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1885 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1889 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1890 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896 { 247, "divide","division sign, U+00F7 ISOnum" },
1897 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1902 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1906 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1908 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1913 * Anything below should really be kept as entities references
1915 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1917 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1918 { 732, "tilde","small tilde, U+02DC ISOdia" },
1920 { 913, "Alpha","greek capital letter alpha, U+0391" },
1921 { 914, "Beta", "greek capital letter beta, U+0392" },
1922 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1925 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1926 { 919, "Eta", "greek capital letter eta, U+0397" },
1927 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928 { 921, "Iota", "greek capital letter iota, U+0399" },
1929 { 922, "Kappa","greek capital letter kappa, U+039A" },
1930 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931 { 924, "Mu", "greek capital letter mu, U+039C" },
1932 { 925, "Nu", "greek capital letter nu, U+039D" },
1933 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1934 { 927, "Omicron","greek capital letter omicron, U+039F" },
1935 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1936 { 929, "Rho", "greek capital letter rho, U+03A1" },
1937 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938 { 932, "Tau", "greek capital letter tau, U+03A4" },
1939 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1941 { 935, "Chi", "greek capital letter chi, U+03A7" },
1942 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1943 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1945 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1947 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1949 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1951 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1952 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1953 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1954 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1957 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1958 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1959 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1960 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1961 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1962 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1965 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1967 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1968 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1969 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1970 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1974 { 8194, "ensp", "en space, U+2002 ISOpub" },
1975 { 8195, "emsp", "em space, U+2003 ISOpub" },
1976 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1977 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1978 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1979 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1980 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1981 { 8211, "ndash","en dash, U+2013 ISOpub" },
1982 { 8212, "mdash","em dash, U+2014 ISOpub" },
1983 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1984 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1985 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1986 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1987 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1988 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1989 { 8224, "dagger","dagger, U+2020 ISOpub" },
1990 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1992 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1993 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1995 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1997 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1998 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
2000 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2003 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
2004 { 8260, "frasl","fraction slash, U+2044 NEW" },
2006 { 8364, "euro", "euro sign, U+20AC NEW" },
2008 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
2011 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
2012 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2014 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2015 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2016 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2017 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2018 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2020 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2021 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2022 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2023 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2025 { 8704, "forall","for all, U+2200 ISOtech" },
2026 { 8706, "part", "partial differential, U+2202 ISOtech" },
2027 { 8707, "exist","there exists, U+2203 ISOtech" },
2028 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2029 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2030 { 8712, "isin", "element of, U+2208 ISOtech" },
2031 { 8713, "notin","not an element of, U+2209 ISOtech" },
2032 { 8715, "ni", "contains as member, U+220B ISOtech" },
2033 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2034 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
2035 { 8722, "minus","minus sign, U+2212 ISOtech" },
2036 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2037 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2038 { 8733, "prop", "proportional to, U+221D ISOtech" },
2039 { 8734, "infin","infinity, U+221E ISOtech" },
2040 { 8736, "ang", "angle, U+2220 ISOamso" },
2041 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
2042 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
2043 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
2044 { 8746, "cup", "union = cup, U+222A ISOtech" },
2045 { 8747, "int", "integral, U+222B ISOtech" },
2046 { 8756, "there4","therefore, U+2234 ISOtech" },
2047 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
2048 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2049 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050 { 8800, "ne", "not equal to, U+2260 ISOtech" },
2051 { 8801, "equiv","identical to, U+2261 ISOtech" },
2052 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2053 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2054 { 8834, "sub", "subset of, U+2282 ISOtech" },
2055 { 8835, "sup", "superset of, U+2283 ISOtech" },
2056 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2057 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2058 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2059 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2061 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2063 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2065 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2067 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2068 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2069 { 9674, "loz", "lozenge, U+25CA ISOpub" },
2071 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2072 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2073 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2074 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2078 /************************************************************************
2080 * Commodity functions to handle entities *
2082 ************************************************************************/
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to grow; an integer variable
 *           called <buffer>_size must hold its current capacity
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to the new size.
 *
 * On allocation failure the error is reported with htmlErrMemory() (a
 * variable named ctxt must be in scope), the old buffer is freed and the
 * ENCLOSING function returns NULL.
 *
 * The body is wrapped in do { ... } while (0) so that "growBuffer(buf);"
 * expands to exactly one statement and can be used as the unbraced body
 * of an if/else.
 */
#define growBuffer(buffer) do {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
} while (0)
2100 * htmlEntityLookup:
2101 * @name: the entity name
2103 * Lookup the given entity in EntitiesTable
2105 * TODO: the linear scan is really ugly, an hash table is really needed.
2107 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2109 const htmlEntityDesc *
2110 htmlEntityLookup(const xmlChar *name) {
2111 unsigned int i;
2113 for (i = 0;i < (sizeof(html40EntitiesTable)/
2114 sizeof(html40EntitiesTable[0]));i++) {
2115 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2119 return(NULL);
2123 * htmlEntityValueLookup:
2124 * @value: the entity's unicode value
2126 * Lookup the given entity in EntitiesTable
2128 * TODO: the linear scan is really ugly, an hash table is really needed.
2130 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2132 const htmlEntityDesc *
2133 htmlEntityValueLookup(unsigned int value) {
2134 unsigned int i;
2136 for (i = 0;i < (sizeof(html40EntitiesTable)/
2137 sizeof(html40EntitiesTable[0]));i++) {
2138 if (html40EntitiesTable[i].value >= value) {
2139 if (html40EntitiesTable[i].value > value)
2140 break;
2141 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2144 return(NULL);
2148 * UTF8ToHtml:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2154 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155 * plus HTML entities block of chars out.
2157 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158 * The value of @inlen after return is the number of octets consumed
2159 * as the return value is positive, else unpredictable.
2160 * The value of @outlen after return is the number of octets consumed.
2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164 const unsigned char* in, int *inlen) {
2165 const unsigned char* processed = in;
2166 const unsigned char* outend;
2167 const unsigned char* outstart = out;
2168 const unsigned char* instart = in;
2169 const unsigned char* inend;
2170 unsigned int c, d;
2171 int trailing;
2173 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174 if (in == NULL) {
2176 * initialization nothing to do
2178 *outlen = 0;
2179 *inlen = 0;
2180 return(0);
2182 inend = in + (*inlen);
2183 outend = out + (*outlen);
2184 while (in < inend) {
2185 d = *in++;
2186 if (d < 0x80) { c= d; trailing= 0; }
2187 else if (d < 0xC0) {
2188 /* trailing byte in leading position */
2189 *outlen = out - outstart;
2190 *inlen = processed - instart;
2191 return(-2);
2192 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2193 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2194 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2195 else {
2196 /* no chance for this in Ascii */
2197 *outlen = out - outstart;
2198 *inlen = processed - instart;
2199 return(-2);
2202 if (inend - in < trailing) {
2203 break;
2206 for ( ; trailing; trailing--) {
2207 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 break;
2209 c <<= 6;
2210 c |= d & 0x3F;
2213 /* assertion: c is a single UTF-4 value */
2214 if (c < 0x80) {
2215 if (out + 1 >= outend)
2216 break;
2217 *out++ = c;
2218 } else {
2219 int len;
2220 const htmlEntityDesc * ent;
2221 const char *cp;
2222 char nbuf[16];
2225 * Try to lookup a predefined HTML entity for it
2228 ent = htmlEntityValueLookup(c);
2229 if (ent == NULL) {
2230 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 cp = nbuf;
2233 else
2234 cp = ent->name;
2235 len = strlen(cp);
2236 if (out + 2 + len >= outend)
2237 break;
2238 *out++ = '&';
2239 memcpy(out, cp, len);
2240 out += len;
2241 *out++ = ';';
2243 processed = in;
2245 *outlen = out - outstart;
2246 *inlen = processed - instart;
2247 return(0);
2251 * htmlEncodeEntities:
2252 * @out: a pointer to an array of bytes to store the result
2253 * @outlen: the length of @out
2254 * @in: a pointer to an array of UTF-8 chars
2255 * @inlen: the length of @in
2256 * @quoteChar: the quote character to escape (' or ") or zero.
2258 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259 * plus HTML entities block of chars out.
2261 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262 * The value of @inlen after return is the number of octets consumed
2263 * as the return value is positive, else unpredictable.
2264 * The value of @outlen after return is the number of octets consumed.
2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 const unsigned char* in, int *inlen, int quoteChar) {
2269 const unsigned char* processed = in;
2270 const unsigned char* outend;
2271 const unsigned char* outstart = out;
2272 const unsigned char* instart = in;
2273 const unsigned char* inend;
2274 unsigned int c, d;
2275 int trailing;
2277 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278 return(-1);
2279 outend = out + (*outlen);
2280 inend = in + (*inlen);
2281 while (in < inend) {
2282 d = *in++;
2283 if (d < 0x80) { c= d; trailing= 0; }
2284 else if (d < 0xC0) {
2285 /* trailing byte in leading position */
2286 *outlen = out - outstart;
2287 *inlen = processed - instart;
2288 return(-2);
2289 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2290 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2291 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2292 else {
2293 /* no chance for this in Ascii */
2294 *outlen = out - outstart;
2295 *inlen = processed - instart;
2296 return(-2);
2299 if (inend - in < trailing)
2300 break;
2302 while (trailing--) {
2303 if (((d= *in++) & 0xC0) != 0x80) {
2304 *outlen = out - outstart;
2305 *inlen = processed - instart;
2306 return(-2);
2308 c <<= 6;
2309 c |= d & 0x3F;
2312 /* assertion: c is a single UTF-4 value */
2313 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 (c != '&') && (c != '<') && (c != '>')) {
2315 if (out >= outend)
2316 break;
2317 *out++ = c;
2318 } else {
2319 const htmlEntityDesc * ent;
2320 const char *cp;
2321 char nbuf[16];
2322 int len;
2325 * Try to lookup a predefined HTML entity for it
2327 ent = htmlEntityValueLookup(c);
2328 if (ent == NULL) {
2329 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 cp = nbuf;
2332 else
2333 cp = ent->name;
2334 len = strlen(cp);
2335 if (out + 2 + len > outend)
2336 break;
2337 *out++ = '&';
2338 memcpy(out, cp, len);
2339 out += len;
2340 *out++ = ';';
2342 processed = in;
2344 *outlen = out - outstart;
2345 *inlen = processed - instart;
2346 return(0);
2349 /************************************************************************
2351 * Commodity functions to handle streams *
2353 ************************************************************************/
2355 #ifdef LIBXML_PUSH_ENABLED
2357 * htmlNewInputStream:
2358 * @ctxt: an HTML parser context
2360 * Create a new input stream structure
2361 * Returns the new input stream or NULL
2363 static htmlParserInputPtr
2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365 htmlParserInputPtr input;
2367 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368 if (input == NULL) {
2369 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370 return(NULL);
2372 memset(input, 0, sizeof(htmlParserInput));
2373 input->filename = NULL;
2374 input->directory = NULL;
2375 input->base = NULL;
2376 input->cur = NULL;
2377 input->buf = NULL;
2378 input->line = 1;
2379 input->col = 1;
2380 input->buf = NULL;
2381 input->free = NULL;
2382 input->version = NULL;
2383 input->consumed = 0;
2384 input->length = 0;
2385 return(input);
2387 #endif
2390 /************************************************************************
2392 * Commodity functions, cleanup needed ? *
2394 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The entries are listed in alphabetical order, one initial letter
 * per row.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2411 * areBlanks:
2412 * @ctxt: an HTML parser context
2413 * @str: a xmlChar *
2414 * @len: the size of @str
2416 * Is this a sequence of blank chars that one can ignore ?
2418 * Returns 1 if ignorable 0 otherwise.
2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422 unsigned int i;
2423 int j;
2424 xmlNodePtr lastChild;
2425 xmlDtdPtr dtd;
2427 for (j = 0;j < len;j++)
2428 if (!(IS_BLANK_CH(str[j]))) return(0);
2430 if (CUR == 0) return(1);
2431 if (CUR != '<') return(0);
2432 if (ctxt->name == NULL)
2433 return(1);
2434 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 return(1);
2436 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 return(1);
2439 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441 dtd = xmlGetIntSubset(ctxt->myDoc);
2442 if (dtd != NULL && dtd->ExternalID != NULL) {
2443 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445 return(1);
2449 if (ctxt->node == NULL) return(0);
2450 lastChild = xmlGetLastChild(ctxt->node);
2451 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 lastChild = lastChild->prev;
2453 if (lastChild == NULL) {
2454 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455 (ctxt->node->content != NULL)) return(0);
2456 /* keep ws in constructs like ...<b> </b>...
2457 for all tags "b" allowing PCDATA */
2458 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 return(0);
2463 } else if (xmlNodeIsText(lastChild)) {
2464 return(0);
2465 } else {
2466 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 for all tags "p" allowing PCDATA */
2468 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 return(0);
2474 return(1);
2478 * htmlNewDocNoDtD:
2479 * @URI: URI for the dtd, or NULL
2480 * @ExternalID: the external ID of the DTD, or NULL
2482 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483 * are NULL
2485 * Returns a new document, do not initialize the DTD if not provided
2487 htmlDocPtr
2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489 xmlDocPtr cur;
2492 * Allocate a new document and fill the fields.
2494 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495 if (cur == NULL) {
2496 htmlErrMemory(NULL, "HTML document creation failed\n");
2497 return(NULL);
2499 memset(cur, 0, sizeof(xmlDoc));
2501 cur->type = XML_HTML_DOCUMENT_NODE;
2502 cur->version = NULL;
2503 cur->intSubset = NULL;
2504 cur->doc = cur;
2505 cur->name = NULL;
2506 cur->children = NULL;
2507 cur->extSubset = NULL;
2508 cur->oldNs = NULL;
2509 cur->encoding = NULL;
2510 cur->standalone = 1;
2511 cur->compression = 0;
2512 cur->ids = NULL;
2513 cur->refs = NULL;
2514 cur->_private = NULL;
2515 cur->charset = XML_CHAR_ENCODING_UTF8;
2516 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517 if ((ExternalID != NULL) ||
2518 (URI != NULL))
2519 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520 return(cur);
2524 * htmlNewDoc:
2525 * @URI: URI for the dtd, or NULL
2526 * @ExternalID: the external ID of the DTD, or NULL
2528 * Creates a new HTML document
2530 * Returns a new document
2532 htmlDocPtr
2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2534 if ((URI == NULL) && (ExternalID == NULL))
2535 return(htmlNewDocNoDtD(
2536 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2537 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2539 return(htmlNewDocNoDtD(URI, ExternalID));
2543 /************************************************************************
2545 * The parser itself *
2546 * Relates to http://www.w3.org/TR/html40 *
2548 ************************************************************************/
2550 /************************************************************************
2552 * The parser itself *
2554 ************************************************************************/
2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559 * htmlParseHTMLName:
2560 * @ctxt: an HTML parser context
2562 * parse an HTML tag or attribute name, note that we convert it to lowercase
2563 * since HTML names are not case-sensitive.
2565 * Returns the Tag Name parsed or NULL
2568 static const xmlChar *
2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2570 int i = 0;
2571 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2573 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2574 (CUR != ':') && (CUR != '.')) return(NULL);
2576 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2577 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2578 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2579 (CUR == '.'))) {
2580 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2581 else loc[i] = CUR;
2582 i++;
2584 NEXT;
2587 return(xmlDictLookup(ctxt->dict, loc, i));
2592 * htmlParseHTMLName_nonInvasive:
2593 * @ctxt: an HTML parser context
2595 * parse an HTML tag or attribute name, note that we convert it to lowercase
2596 * since HTML names are not case-sensitive, this doesn't consume the data
2597 * from the stream, it's a look-ahead
2599 * Returns the Tag Name parsed or NULL
2602 static const xmlChar *
2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2604 int i = 0;
2605 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2607 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2608 (NXT(1) != ':')) return(NULL);
2610 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2611 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2612 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2613 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2614 else loc[i] = NXT(1+i);
2615 i++;
2618 return(xmlDictLookup(ctxt->dict, loc, i));
2623 * htmlParseName:
2624 * @ctxt: an HTML parser context
2626 * parse an HTML name, this routine is case sensitive.
2628 * Returns the Name parsed or NULL
2631 static const xmlChar *
2632 htmlParseName(htmlParserCtxtPtr ctxt) {
2633 const xmlChar *in;
2634 const xmlChar *ret;
2635 int count = 0;
2637 GROW;
2640 * Accelerator for simple ASCII names
2642 in = ctxt->input->cur;
2643 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2644 ((*in >= 0x41) && (*in <= 0x5A)) ||
2645 (*in == '_') || (*in == ':')) {
2646 in++;
2647 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2648 ((*in >= 0x41) && (*in <= 0x5A)) ||
2649 ((*in >= 0x30) && (*in <= 0x39)) ||
2650 (*in == '_') || (*in == '-') ||
2651 (*in == ':') || (*in == '.'))
2652 in++;
2654 if (in == ctxt->input->end)
2655 return(NULL);
2657 if ((*in > 0) && (*in < 0x80)) {
2658 count = in - ctxt->input->cur;
2659 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2660 ctxt->input->cur = in;
2661 ctxt->input->col += count;
2662 return(ret);
2665 return(htmlParseNameComplex(ctxt));
2668 static const xmlChar *
2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2670 int len = 0, l;
2671 int c;
2672 int count = 0;
2673 const xmlChar *base = ctxt->input->base;
2676 * Handler for more complex cases
2678 GROW;
2679 c = CUR_CHAR(l);
2680 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2681 (!IS_LETTER(c) && (c != '_') &&
2682 (c != ':'))) {
2683 return(NULL);
2686 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2687 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2688 (c == '.') || (c == '-') ||
2689 (c == '_') || (c == ':') ||
2690 (IS_COMBINING(c)) ||
2691 (IS_EXTENDER(c)))) {
2692 if (count++ > 100) {
2693 count = 0;
2694 GROW;
2696 len += l;
2697 NEXTL(l);
2698 c = CUR_CHAR(l);
2699 if (ctxt->input->base != base) {
2701 * We changed encoding from an unknown encoding
2702 * Input buffer changed location, so we better start again
2704 return(htmlParseNameComplex(ctxt));
2708 if (ctxt->input->cur - ctxt->input->base < len) {
2709 /* Sanity check */
2710 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2711 "unexpected change of input buffer", NULL, NULL);
2712 return (NULL);
2715 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2720 * htmlParseHTMLAttribute:
2721 * @ctxt: an HTML parser context
2722 * @stop: a char stop value
2724 * parse an HTML attribute value till the stop (quote), if
2725 * stop is 0 then it stops at the first space
2727 * Returns the attribute parsed or NULL
2730 static xmlChar *
2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2732 xmlChar *buffer = NULL;
2733 int buffer_size = 0;
2734 xmlChar *out = NULL;
2735 const xmlChar *name = NULL;
2736 const xmlChar *cur = NULL;
2737 const htmlEntityDesc * ent;
2740 * allocate a translation buffer.
2742 buffer_size = HTML_PARSER_BUFFER_SIZE;
2743 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2744 if (buffer == NULL) {
2745 htmlErrMemory(ctxt, "buffer allocation failed\n");
2746 return(NULL);
2748 out = buffer;
2751 * Ok loop until we reach one of the ending chars
2753 while ((CUR != 0) && (CUR != stop)) {
2754 if ((stop == 0) && (CUR == '>')) break;
2755 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2756 if (CUR == '&') {
2757 if (NXT(1) == '#') {
2758 unsigned int c;
2759 int bits;
2761 c = htmlParseCharRef(ctxt);
2762 if (c < 0x80)
2763 { *out++ = c; bits= -6; }
2764 else if (c < 0x800)
2765 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2766 else if (c < 0x10000)
2767 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2768 else
2769 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2771 for ( ; bits >= 0; bits-= 6) {
2772 *out++ = ((c >> bits) & 0x3F) | 0x80;
2775 if (out - buffer > buffer_size - 100) {
2776 int indx = out - buffer;
2778 growBuffer(buffer);
2779 out = &buffer[indx];
2781 } else {
2782 ent = htmlParseEntityRef(ctxt, &name);
2783 if (name == NULL) {
2784 *out++ = '&';
2785 if (out - buffer > buffer_size - 100) {
2786 int indx = out - buffer;
2788 growBuffer(buffer);
2789 out = &buffer[indx];
2791 } else if (ent == NULL) {
2792 *out++ = '&';
2793 cur = name;
2794 while (*cur != 0) {
2795 if (out - buffer > buffer_size - 100) {
2796 int indx = out - buffer;
2798 growBuffer(buffer);
2799 out = &buffer[indx];
2801 *out++ = *cur++;
2803 } else {
2804 unsigned int c;
2805 int bits;
2807 if (out - buffer > buffer_size - 100) {
2808 int indx = out - buffer;
2810 growBuffer(buffer);
2811 out = &buffer[indx];
2813 c = ent->value;
2814 if (c < 0x80)
2815 { *out++ = c; bits= -6; }
2816 else if (c < 0x800)
2817 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2818 else if (c < 0x10000)
2819 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2820 else
2821 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2823 for ( ; bits >= 0; bits-= 6) {
2824 *out++ = ((c >> bits) & 0x3F) | 0x80;
2828 } else {
2829 unsigned int c;
2830 int bits, l;
2832 if (out - buffer > buffer_size - 100) {
2833 int indx = out - buffer;
2835 growBuffer(buffer);
2836 out = &buffer[indx];
2838 c = CUR_CHAR(l);
2839 if (c < 0x80)
2840 { *out++ = c; bits= -6; }
2841 else if (c < 0x800)
2842 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2843 else if (c < 0x10000)
2844 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2845 else
2846 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2848 for ( ; bits >= 0; bits-= 6) {
2849 *out++ = ((c >> bits) & 0x3F) | 0x80;
2851 NEXT;
2854 *out = 0;
2855 return(buffer);
2859 * htmlParseEntityRef:
2860 * @ctxt: an HTML parser context
2861 * @str: location to store the entity name
2863 * parse an HTML ENTITY references
2865 * [68] EntityRef ::= '&' Name ';'
2867 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2868 * if non-NULL *str will have to be freed by the caller.
2870 const htmlEntityDesc *
2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2872 const xmlChar *name;
2873 const htmlEntityDesc * ent = NULL;
2875 if (str != NULL) *str = NULL;
2876 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2878 if (CUR == '&') {
2879 NEXT;
2880 name = htmlParseName(ctxt);
2881 if (name == NULL) {
2882 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2883 "htmlParseEntityRef: no name\n", NULL, NULL);
2884 } else {
2885 GROW;
2886 if (CUR == ';') {
2887 if (str != NULL)
2888 *str = name;
2891 * Lookup the entity in the table.
2893 ent = htmlEntityLookup(name);
2894 if (ent != NULL) /* OK that's ugly !!! */
2895 NEXT;
2896 } else {
2897 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2898 "htmlParseEntityRef: expecting ';'\n",
2899 NULL, NULL);
2900 if (str != NULL)
2901 *str = name;
2905 return(ent);
2909 * htmlParseAttValue:
2910 * @ctxt: an HTML parser context
2912 * parse a value for an attribute
2913 * Note: the parser won't do substitution of entities here, this
2914 * will be handled later in xmlStringGetNodeList, unless it was
2915 * asked for ctxt->replaceEntities != 0
2917 * Returns the AttValue parsed or NULL.
2920 static xmlChar *
2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2922 xmlChar *ret = NULL;
2924 if (CUR == '"') {
2925 NEXT;
2926 ret = htmlParseHTMLAttribute(ctxt, '"');
2927 if (CUR != '"') {
2928 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2929 "AttValue: \" expected\n", NULL, NULL);
2930 } else
2931 NEXT;
2932 } else if (CUR == '\'') {
2933 NEXT;
2934 ret = htmlParseHTMLAttribute(ctxt, '\'');
2935 if (CUR != '\'') {
2936 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2937 "AttValue: ' expected\n", NULL, NULL);
2938 } else
2939 NEXT;
2940 } else {
2942 * That's an HTMLism, the attribute value may not be quoted
2944 ret = htmlParseHTMLAttribute(ctxt, 0);
2945 if (ret == NULL) {
2946 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2947 "AttValue: no value found\n", NULL, NULL);
2950 return(ret);
2954 * htmlParseSystemLiteral:
2955 * @ctxt: an HTML parser context
2957 * parse an HTML Literal
2959 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2961 * Returns the SystemLiteral parsed or NULL
2964 static xmlChar *
2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2966 size_t len = 0, startPosition = 0;
2967 int err = 0;
2968 int quote;
2969 xmlChar *ret = NULL;
2971 if ((CUR != '"') && (CUR != '\'')) {
2972 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2973 "SystemLiteral \" or ' expected\n", NULL, NULL);
2974 return(NULL);
2976 quote = CUR;
2977 NEXT;
2979 if (CUR_PTR < BASE_PTR)
2980 return(ret);
2981 startPosition = CUR_PTR - BASE_PTR;
2983 while ((CUR != 0) && (CUR != quote)) {
2984 /* TODO: Handle UTF-8 */
2985 if (!IS_CHAR_CH(CUR)) {
2986 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2987 "Invalid char in SystemLiteral 0x%X\n", CUR);
2988 err = 1;
2990 NEXT;
2991 len++;
2993 if (CUR != quote) {
2994 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2995 "Unfinished SystemLiteral\n", NULL, NULL);
2996 } else {
2997 NEXT;
2998 if (err == 0)
2999 ret = xmlStrndup((BASE_PTR+startPosition), len);
3002 return(ret);
3006 * htmlParsePubidLiteral:
3007 * @ctxt: an HTML parser context
3009 * parse an HTML public literal
3011 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3013 * Returns the PubidLiteral parsed or NULL.
3016 static xmlChar *
3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3018 size_t len = 0, startPosition = 0;
3019 int err = 0;
3020 int quote;
3021 xmlChar *ret = NULL;
3023 if ((CUR != '"') && (CUR != '\'')) {
3024 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3025 "PubidLiteral \" or ' expected\n", NULL, NULL);
3026 return(NULL);
3028 quote = CUR;
3029 NEXT;
3032 * Name ::= (Letter | '_') (NameChar)*
3034 if (CUR_PTR < BASE_PTR)
3035 return(ret);
3036 startPosition = CUR_PTR - BASE_PTR;
3038 while ((CUR != 0) && (CUR != quote)) {
3039 if (!IS_PUBIDCHAR_CH(CUR)) {
3040 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3041 "Invalid char in PubidLiteral 0x%X\n", CUR);
3042 err = 1;
3044 len++;
3045 NEXT;
3048 if (CUR != '"') {
3049 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3050 "Unfinished PubidLiteral\n", NULL, NULL);
3051 } else {
3052 NEXT;
3053 if (err == 0)
3054 ret = xmlStrndup((BASE_PTR + startPosition), len);
3057 return(ret);
3061 * htmlParseScript:
3062 * @ctxt: an HTML parser context
3064 * parse the content of an HTML SCRIPT or STYLE element
3065 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3066 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3067 * http://www.w3.org/TR/html4/types.html#type-script
3068 * http://www.w3.org/TR/html4/types.html#h-6.15
3069 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3071 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3072 * element and the value of intrinsic event attributes. User agents must
3073 * not evaluate script data as HTML markup but instead must pass it on as
3074 * data to a script engine.
3075 * NOTES:
3076 * - The content is passed like CDATA
3077 * - the attributes for style and scripting "onXXX" are also described
3078 * as CDATA but SGML allows entities references in attributes so their
3079 * processing is identical as other attributes
3081 static void
3082 htmlParseScript(htmlParserCtxtPtr ctxt) {
3083 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3084 int nbchar = 0;
3085 int cur,l;
3087 SHRINK;
3088 cur = CUR_CHAR(l);
3089 while (cur != 0) {
3090 if ((cur == '<') && (NXT(1) == '/')) {
3092 * One should break here, the specification is clear:
3093 * Authors should therefore escape "</" within the content.
3094 * Escape mechanisms are specific to each scripting or
3095 * style sheet language.
3097 * In recovery mode, only break if end tag match the
3098 * current tag, effectively ignoring all tags inside the
3099 * script/style block and treating the entire block as
3100 * CDATA.
3102 if (ctxt->recovery) {
3103 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3104 xmlStrlen(ctxt->name)) == 0)
3106 break; /* while */
3107 } else {
3108 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3109 "Element %s embeds close tag\n",
3110 ctxt->name, NULL);
3112 } else {
3113 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3114 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3116 break; /* while */
3120 if (IS_CHAR(cur)) {
3121 COPY_BUF(l,buf,nbchar,cur);
3122 } else {
3123 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3124 "Invalid char in CDATA 0x%X\n", cur);
3126 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3127 buf[nbchar] = 0;
3128 if (ctxt->sax->cdataBlock!= NULL) {
3130 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3132 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3133 } else if (ctxt->sax->characters != NULL) {
3134 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3136 nbchar = 0;
3138 GROW;
3139 NEXTL(l);
3140 cur = CUR_CHAR(l);
3143 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144 buf[nbchar] = 0;
3145 if (ctxt->sax->cdataBlock!= NULL) {
3147 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3149 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3150 } else if (ctxt->sax->characters != NULL) {
3151 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3158 * htmlParseCharDataInternal:
3159 * @ctxt: an HTML parser context
3160 * @readahead: optional read ahead character in ascii range
3162 * parse a CharData section.
3163 * if we are within a CDATA section ']]>' marks an end of section.
3165 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168 static void
3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3170 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3171 int nbchar = 0;
3172 int cur, l;
3173 int chunk = 0;
3175 if (readahead)
3176 buf[nbchar++] = readahead;
3178 SHRINK;
3179 cur = CUR_CHAR(l);
3180 while (((cur != '<') || (ctxt->token == '<')) &&
3181 ((cur != '&') || (ctxt->token == '&')) &&
3182 (cur != 0)) {
3183 if (!(IS_CHAR(cur))) {
3184 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3185 "Invalid char in CDATA 0x%X\n", cur);
3186 } else {
3187 COPY_BUF(l,buf,nbchar,cur);
3189 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3190 buf[nbchar] = 0;
3193 * Ok the segment is to be consumed as chars.
3195 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3196 if (areBlanks(ctxt, buf, nbchar)) {
3197 if (ctxt->keepBlanks) {
3198 if (ctxt->sax->characters != NULL)
3199 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3200 } else {
3201 if (ctxt->sax->ignorableWhitespace != NULL)
3202 ctxt->sax->ignorableWhitespace(ctxt->userData,
3203 buf, nbchar);
3205 } else {
3206 htmlCheckParagraph(ctxt);
3207 if (ctxt->sax->characters != NULL)
3208 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211 nbchar = 0;
3213 NEXTL(l);
3214 chunk++;
3215 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3216 chunk = 0;
3217 SHRINK;
3218 GROW;
3220 cur = CUR_CHAR(l);
3221 if (cur == 0) {
3222 SHRINK;
3223 GROW;
3224 cur = CUR_CHAR(l);
3227 if (nbchar != 0) {
3228 buf[nbchar] = 0;
3231 * Ok the segment is to be consumed as chars.
3233 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234 if (areBlanks(ctxt, buf, nbchar)) {
3235 if (ctxt->keepBlanks) {
3236 if (ctxt->sax->characters != NULL)
3237 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3238 } else {
3239 if (ctxt->sax->ignorableWhitespace != NULL)
3240 ctxt->sax->ignorableWhitespace(ctxt->userData,
3241 buf, nbchar);
3243 } else {
3244 htmlCheckParagraph(ctxt);
3245 if (ctxt->sax->characters != NULL)
3246 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249 } else {
3251 * Loop detection
3253 if (cur == 0)
3254 ctxt->instate = XML_PARSER_EOF;
3259 * htmlParseCharData:
3260 * @ctxt: an HTML parser context
3262 * parse a CharData section.
3263 * if we are within a CDATA section ']]>' marks an end of section.
3265 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268 static void
3269 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3270 htmlParseCharDataInternal(ctxt, 0);
3274 * htmlParseExternalID:
3275 * @ctxt: an HTML parser context
3276 * @publicID: a xmlChar** receiving PubidLiteral
3278 * Parse an External ID or a Public ID
3280 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3281 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3283 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3285 * Returns the function returns SystemLiteral and in the second
3286 * case publicID receives PubidLiteral, is strict is off
3287 * it is possible to return NULL and have publicID set.
3290 static xmlChar *
3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3292 xmlChar *URI = NULL;
3294 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3295 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3296 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3297 SKIP(6);
3298 if (!IS_BLANK_CH(CUR)) {
3299 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3300 "Space required after 'SYSTEM'\n", NULL, NULL);
3302 SKIP_BLANKS;
3303 URI = htmlParseSystemLiteral(ctxt);
3304 if (URI == NULL) {
3305 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3306 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3308 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3309 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3310 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3311 SKIP(6);
3312 if (!IS_BLANK_CH(CUR)) {
3313 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3314 "Space required after 'PUBLIC'\n", NULL, NULL);
3316 SKIP_BLANKS;
3317 *publicID = htmlParsePubidLiteral(ctxt);
3318 if (*publicID == NULL) {
3319 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3320 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3321 NULL, NULL);
3323 SKIP_BLANKS;
3324 if ((CUR == '"') || (CUR == '\'')) {
3325 URI = htmlParseSystemLiteral(ctxt);
3328 return(URI);
3332 * xmlParsePI:
3333 * @ctxt: an XML parser context
3335 * parse an XML Processing Instruction.
3337 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3339 static void
3340 htmlParsePI(htmlParserCtxtPtr ctxt) {
3341 xmlChar *buf = NULL;
3342 int len = 0;
3343 int size = HTML_PARSER_BUFFER_SIZE;
3344 int cur, l;
3345 const xmlChar *target;
3346 xmlParserInputState state;
3347 int count = 0;
3349 if ((RAW == '<') && (NXT(1) == '?')) {
3350 state = ctxt->instate;
3351 ctxt->instate = XML_PARSER_PI;
3353 * this is a Processing Instruction.
3355 SKIP(2);
3356 SHRINK;
3359 * Parse the target name and check for special support like
3360 * namespace.
3362 target = htmlParseName(ctxt);
3363 if (target != NULL) {
3364 if (RAW == '>') {
3365 SKIP(1);
3368 * SAX: PI detected.
3370 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3371 (ctxt->sax->processingInstruction != NULL))
3372 ctxt->sax->processingInstruction(ctxt->userData,
3373 target, NULL);
3374 ctxt->instate = state;
3375 return;
3377 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3378 if (buf == NULL) {
3379 htmlErrMemory(ctxt, NULL);
3380 ctxt->instate = state;
3381 return;
3383 cur = CUR;
3384 if (!IS_BLANK(cur)) {
3385 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3386 "ParsePI: PI %s space expected\n", target, NULL);
3388 SKIP_BLANKS;
3389 cur = CUR_CHAR(l);
3390 while ((cur != 0) && (cur != '>')) {
3391 if (len + 5 >= size) {
3392 xmlChar *tmp;
3394 size *= 2;
3395 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3396 if (tmp == NULL) {
3397 htmlErrMemory(ctxt, NULL);
3398 xmlFree(buf);
3399 ctxt->instate = state;
3400 return;
3402 buf = tmp;
3404 count++;
3405 if (count > 50) {
3406 GROW;
3407 count = 0;
3409 if (IS_CHAR(cur)) {
3410 COPY_BUF(l,buf,len,cur);
3411 } else {
3412 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3413 "Invalid char in processing instruction "
3414 "0x%X\n", cur);
3416 NEXTL(l);
3417 cur = CUR_CHAR(l);
3418 if (cur == 0) {
3419 SHRINK;
3420 GROW;
3421 cur = CUR_CHAR(l);
3424 buf[len] = 0;
3425 if (cur != '>') {
3426 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3427 "ParsePI: PI %s never end ...\n", target, NULL);
3428 } else {
3429 SKIP(1);
3432 * SAX: PI detected.
3434 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3435 (ctxt->sax->processingInstruction != NULL))
3436 ctxt->sax->processingInstruction(ctxt->userData,
3437 target, buf);
3439 xmlFree(buf);
3440 } else {
3441 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3442 "PI is not started correctly", NULL, NULL);
3444 ctxt->instate = state;
3449 * htmlParseComment:
3450 * @ctxt: an HTML parser context
3452 * Parse an XML (SGML) comment <!-- .... -->
3454 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3456 static void
3457 htmlParseComment(htmlParserCtxtPtr ctxt) {
3458 xmlChar *buf = NULL;
3459 int len;
3460 int size = HTML_PARSER_BUFFER_SIZE;
3461 int q, ql;
3462 int r, rl;
3463 int cur, l;
3464 int next, nl;
3465 xmlParserInputState state;
3468 * Check that there is a comment right here.
3470 if ((RAW != '<') || (NXT(1) != '!') ||
3471 (NXT(2) != '-') || (NXT(3) != '-')) return;
3473 state = ctxt->instate;
3474 ctxt->instate = XML_PARSER_COMMENT;
3475 SHRINK;
3476 SKIP(4);
3477 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3478 if (buf == NULL) {
3479 htmlErrMemory(ctxt, "buffer allocation failed\n");
3480 ctxt->instate = state;
3481 return;
3483 len = 0;
3484 buf[len] = 0;
3485 q = CUR_CHAR(ql);
3486 if (q == 0)
3487 goto unfinished;
3488 NEXTL(ql);
3489 r = CUR_CHAR(rl);
3490 if (r == 0)
3491 goto unfinished;
3492 NEXTL(rl);
3493 cur = CUR_CHAR(l);
3494 while ((cur != 0) &&
3495 ((cur != '>') ||
3496 (r != '-') || (q != '-'))) {
3497 NEXTL(l);
3498 next = CUR_CHAR(nl);
3499 if (next == 0) {
3500 SHRINK;
3501 GROW;
3502 next = CUR_CHAR(nl);
3505 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3506 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3507 "Comment incorrectly closed by '--!>'", NULL, NULL);
3508 cur = '>';
3509 break;
3512 if (len + 5 >= size) {
3513 xmlChar *tmp;
3515 size *= 2;
3516 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3517 if (tmp == NULL) {
3518 xmlFree(buf);
3519 htmlErrMemory(ctxt, "growing buffer failed\n");
3520 ctxt->instate = state;
3521 return;
3523 buf = tmp;
3525 if (IS_CHAR(q)) {
3526 COPY_BUF(ql,buf,len,q);
3527 } else {
3528 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3529 "Invalid char in comment 0x%X\n", q);
3532 q = r;
3533 ql = rl;
3534 r = cur;
3535 rl = l;
3536 cur = next;
3537 l = nl;
3539 buf[len] = 0;
3540 if (cur == '>') {
3541 NEXT;
3542 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3543 (!ctxt->disableSAX))
3544 ctxt->sax->comment(ctxt->userData, buf);
3545 xmlFree(buf);
3546 ctxt->instate = state;
3547 return;
3550 unfinished:
3551 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3552 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3553 xmlFree(buf);
3557 * htmlParseCharRef:
3558 * @ctxt: an HTML parser context
3560 * parse Reference declarations
3562 * [66] CharRef ::= '&#' [0-9]+ ';' |
3563 * '&#x' [0-9a-fA-F]+ ';'
3565 * Returns the value parsed (as an int)
3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3569 int val = 0;
3571 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3572 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3573 "htmlParseCharRef: context error\n",
3574 NULL, NULL);
3575 return(0);
3577 if ((CUR == '&') && (NXT(1) == '#') &&
3578 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3579 SKIP(3);
3580 while (CUR != ';') {
3581 if ((CUR >= '0') && (CUR <= '9')) {
3582 if (val < 0x110000)
3583 val = val * 16 + (CUR - '0');
3584 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3585 if (val < 0x110000)
3586 val = val * 16 + (CUR - 'a') + 10;
3587 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3588 if (val < 0x110000)
3589 val = val * 16 + (CUR - 'A') + 10;
3590 } else {
3591 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3592 "htmlParseCharRef: missing semicolon\n",
3593 NULL, NULL);
3594 break;
3596 NEXT;
3598 if (CUR == ';')
3599 NEXT;
3600 } else if ((CUR == '&') && (NXT(1) == '#')) {
3601 SKIP(2);
3602 while (CUR != ';') {
3603 if ((CUR >= '0') && (CUR <= '9')) {
3604 if (val < 0x110000)
3605 val = val * 10 + (CUR - '0');
3606 } else {
3607 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3608 "htmlParseCharRef: missing semicolon\n",
3609 NULL, NULL);
3610 break;
3612 NEXT;
3614 if (CUR == ';')
3615 NEXT;
3616 } else {
3617 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3618 "htmlParseCharRef: invalid value\n", NULL, NULL);
3621 * Check the value IS_CHAR ...
3623 if (IS_CHAR(val)) {
3624 return(val);
3625 } else if (val >= 0x110000) {
3626 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3627 "htmlParseCharRef: value too large\n", NULL, NULL);
3628 } else {
3629 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3630 "htmlParseCharRef: invalid xmlChar value %d\n",
3631 val);
3633 return(0);
3638 * htmlParseDocTypeDecl:
3639 * @ctxt: an HTML parser context
3641 * parse a DOCTYPE declaration
3643 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3644 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647 static void
3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3649 const xmlChar *name;
3650 xmlChar *ExternalID = NULL;
3651 xmlChar *URI = NULL;
3654 * We know that '<!DOCTYPE' has been detected.
3656 SKIP(9);
3658 SKIP_BLANKS;
3661 * Parse the DOCTYPE name.
3663 name = htmlParseName(ctxt);
3664 if (name == NULL) {
3665 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3666 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3667 NULL, NULL);
3670 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673 SKIP_BLANKS;
3676 * Check for SystemID and ExternalID
3678 URI = htmlParseExternalID(ctxt, &ExternalID);
3679 SKIP_BLANKS;
3682 * We should be at the end of the DOCTYPE declaration.
3684 if (CUR != '>') {
3685 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3686 "DOCTYPE improperly terminated\n", NULL, NULL);
3687 /* Ignore bogus content */
3688 while ((CUR != 0) && (CUR != '>'))
3689 NEXT;
3691 if (CUR == '>')
3692 NEXT;
3695 * Create or update the document accordingly to the DOCTYPE
3697 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3698 (!ctxt->disableSAX))
3699 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702 * Cleanup, since we don't use all those identifiers
3704 if (URI != NULL) xmlFree(URI);
3705 if (ExternalID != NULL) xmlFree(ExternalID);
3709 * htmlParseAttribute:
3710 * @ctxt: an HTML parser context
3711 * @value: a xmlChar ** used to store the value of the attribute
3713 * parse an attribute
3715 * [41] Attribute ::= Name Eq AttValue
3717 * [25] Eq ::= S? '=' S?
3719 * With namespace:
3721 * [NS 11] Attribute ::= QName Eq AttValue
3723 * Also the case QName == xmlns:??? is handled independently as a namespace
3724 * definition.
3726 * Returns the attribute name, and the value in *value.
3729 static const xmlChar *
3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3731 const xmlChar *name;
3732 xmlChar *val = NULL;
3734 *value = NULL;
3735 name = htmlParseHTMLName(ctxt);
3736 if (name == NULL) {
3737 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3738 "error parsing attribute name\n", NULL, NULL);
3739 return(NULL);
3743 * read the value
3745 SKIP_BLANKS;
3746 if (CUR == '=') {
3747 NEXT;
3748 SKIP_BLANKS;
3749 val = htmlParseAttValue(ctxt);
3752 *value = val;
3753 return(name);
3757 * htmlCheckEncodingDirect:
3758 * @ctxt: an HTML parser context
3759 * @attvalue: the attribute value
3761 * Checks an attribute value to detect
3762 * the encoding
3763 * If a new encoding is detected the parser is switched to decode
3764 * it and pass UTF8
3766 static void
3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3769 if ((ctxt == NULL) || (encoding == NULL) ||
3770 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3771 return;
3773 /* do not change encoding */
3774 if (ctxt->input->encoding != NULL)
3775 return;
3777 if (encoding != NULL) {
3778 xmlCharEncoding enc;
3779 xmlCharEncodingHandlerPtr handler;
3781 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3783 if (ctxt->input->encoding != NULL)
3784 xmlFree((xmlChar *) ctxt->input->encoding);
3785 ctxt->input->encoding = xmlStrdup(encoding);
3787 enc = xmlParseCharEncoding((const char *) encoding);
3789 * registered set of known encodings
3791 if (enc != XML_CHAR_ENCODING_ERROR) {
3792 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3793 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3794 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3795 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3796 (ctxt->input->buf != NULL) &&
3797 (ctxt->input->buf->encoder == NULL)) {
3798 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3799 "htmlCheckEncoding: wrong encoding meta\n",
3800 NULL, NULL);
3801 } else {
3802 xmlSwitchEncoding(ctxt, enc);
3804 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3805 } else {
3807 * fallback for unknown encodings
3809 handler = xmlFindCharEncodingHandler((const char *) encoding);
3810 if (handler != NULL) {
3811 xmlSwitchToEncoding(ctxt, handler);
3812 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3813 } else {
3814 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3815 "htmlCheckEncoding: unknown encoding %s\n",
3816 encoding, NULL);
3820 if ((ctxt->input->buf != NULL) &&
3821 (ctxt->input->buf->encoder != NULL) &&
3822 (ctxt->input->buf->raw != NULL) &&
3823 (ctxt->input->buf->buffer != NULL)) {
3824 int nbchars;
3825 int processed;
3828 * convert as much as possible to the parser reading buffer.
3830 processed = ctxt->input->cur - ctxt->input->base;
3831 xmlBufShrink(ctxt->input->buf->buffer, processed);
3832 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3833 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3834 if (nbchars < 0) {
3835 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3836 "htmlCheckEncoding: encoder error\n",
3837 NULL, NULL);
3844 * htmlCheckEncoding:
3845 * @ctxt: an HTML parser context
3846 * @attvalue: the attribute value
3848 * Checks an http-equiv attribute from a Meta tag to detect
3849 * the encoding
3850 * If a new encoding is detected the parser is switched to decode
3851 * it and pass UTF8
3853 static void
3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3855 const xmlChar *encoding;
3857 if (!attvalue)
3858 return;
3860 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3861 if (encoding != NULL) {
3862 encoding += 7;
3865 * skip blank
3867 if (encoding && IS_BLANK_CH(*encoding))
3868 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3869 if (encoding && *encoding == '=') {
3870 encoding ++;
3871 htmlCheckEncodingDirect(ctxt, encoding);
3876 * htmlCheckMeta:
3877 * @ctxt: an HTML parser context
3878 * @atts: the attributes values
3880 * Checks an attributes from a Meta tag
3882 static void
3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3884 int i;
3885 const xmlChar *att, *value;
3886 int http = 0;
3887 const xmlChar *content = NULL;
3889 if ((ctxt == NULL) || (atts == NULL))
3890 return;
3892 i = 0;
3893 att = atts[i++];
3894 while (att != NULL) {
3895 value = atts[i++];
3896 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3897 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3898 http = 1;
3899 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3900 htmlCheckEncodingDirect(ctxt, value);
3901 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3902 content = value;
3903 att = atts[i++];
3905 if ((http) && (content != NULL))
3906 htmlCheckEncoding(ctxt, content);
3911 * htmlParseStartTag:
3912 * @ctxt: an HTML parser context
3914 * parse a start of tag either for rule element or
3915 * EmptyElement. In both case we don't parse the tag closing chars.
3917 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3919 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3921 * With namespace:
3923 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3925 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3927 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930 static int
3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3932 const xmlChar *name;
3933 const xmlChar *attname;
3934 xmlChar *attvalue;
3935 const xmlChar **atts;
3936 int nbatts = 0;
3937 int maxatts;
3938 int meta = 0;
3939 int i;
3940 int discardtag = 0;
3942 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3943 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3944 "htmlParseStartTag: context error\n", NULL, NULL);
3945 return -1;
3947 if (ctxt->instate == XML_PARSER_EOF)
3948 return(-1);
3949 if (CUR != '<') return -1;
3950 NEXT;
3952 atts = ctxt->atts;
3953 maxatts = ctxt->maxatts;
3955 GROW;
3956 name = htmlParseHTMLName(ctxt);
3957 if (name == NULL) {
3958 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959 "htmlParseStartTag: invalid element name\n",
3960 NULL, NULL);
3961 /* if recover preserve text on classic misconstructs */
3962 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3963 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3964 htmlParseCharDataInternal(ctxt, '<');
3965 return(-1);
3969 /* Dump the bogus tag like browsers do */
3970 while ((CUR != 0) && (CUR != '>') &&
3971 (ctxt->instate != XML_PARSER_EOF))
3972 NEXT;
3973 return -1;
3975 if (xmlStrEqual(name, BAD_CAST"meta"))
3976 meta = 1;
3979 * Check for auto-closure of HTML elements.
3981 htmlAutoClose(ctxt, name);
3984 * Check for implied HTML elements.
3986 htmlCheckImplied(ctxt, name);
3989 * Avoid html at any level > 0, head at any level != 1
3990 * or any attempt to recurse body
3992 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3993 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3994 "htmlParseStartTag: misplaced <html> tag\n",
3995 name, NULL);
3996 discardtag = 1;
3997 ctxt->depth++;
3999 if ((ctxt->nameNr != 1) &&
4000 (xmlStrEqual(name, BAD_CAST"head"))) {
4001 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002 "htmlParseStartTag: misplaced <head> tag\n",
4003 name, NULL);
4004 discardtag = 1;
4005 ctxt->depth++;
4007 if (xmlStrEqual(name, BAD_CAST"body")) {
4008 int indx;
4009 for (indx = 0;indx < ctxt->nameNr;indx++) {
4010 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4011 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012 "htmlParseStartTag: misplaced <body> tag\n",
4013 name, NULL);
4014 discardtag = 1;
4015 ctxt->depth++;
4021 * Now parse the attributes, it ends up with the ending
4023 * (S Attribute)* S?
4025 SKIP_BLANKS;
4026 while ((CUR != 0) &&
4027 (CUR != '>') &&
4028 ((CUR != '/') || (NXT(1) != '>'))) {
4029 GROW;
4030 attname = htmlParseAttribute(ctxt, &attvalue);
4031 if (attname != NULL) {
4034 * Well formedness requires at most one declaration of an attribute
4036 for (i = 0; i < nbatts;i += 2) {
4037 if (xmlStrEqual(atts[i], attname)) {
4038 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4039 "Attribute %s redefined\n", attname, NULL);
4040 if (attvalue != NULL)
4041 xmlFree(attvalue);
4042 goto failed;
4047 * Add the pair to atts
4049 if (atts == NULL) {
4050 maxatts = 22; /* allow for 10 attrs by default */
4051 atts = (const xmlChar **)
4052 xmlMalloc(maxatts * sizeof(xmlChar *));
4053 if (atts == NULL) {
4054 htmlErrMemory(ctxt, NULL);
4055 if (attvalue != NULL)
4056 xmlFree(attvalue);
4057 goto failed;
4059 ctxt->atts = atts;
4060 ctxt->maxatts = maxatts;
4061 } else if (nbatts + 4 > maxatts) {
4062 const xmlChar **n;
4064 maxatts *= 2;
4065 n = (const xmlChar **) xmlRealloc((void *) atts,
4066 maxatts * sizeof(const xmlChar *));
4067 if (n == NULL) {
4068 htmlErrMemory(ctxt, NULL);
4069 if (attvalue != NULL)
4070 xmlFree(attvalue);
4071 goto failed;
4073 atts = n;
4074 ctxt->atts = atts;
4075 ctxt->maxatts = maxatts;
4077 atts[nbatts++] = attname;
4078 atts[nbatts++] = attvalue;
4079 atts[nbatts] = NULL;
4080 atts[nbatts + 1] = NULL;
4082 else {
4083 if (attvalue != NULL)
4084 xmlFree(attvalue);
4085 /* Dump the bogus attribute string up to the next blank or
4086 * the end of the tag. */
4087 while ((CUR != 0) &&
4088 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4089 ((CUR != '/') || (NXT(1) != '>')))
4090 NEXT;
4093 failed:
4094 SKIP_BLANKS;
4098 * Handle specific association to the META tag
4100 if (meta && (nbatts != 0))
4101 htmlCheckMeta(ctxt, atts);
4104 * SAX: Start of Element !
4106 if (!discardtag) {
4107 htmlnamePush(ctxt, name);
4108 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4109 if (nbatts != 0)
4110 ctxt->sax->startElement(ctxt->userData, name, atts);
4111 else
4112 ctxt->sax->startElement(ctxt->userData, name, NULL);
4116 if (atts != NULL) {
4117 for (i = 1;i < nbatts;i += 2) {
4118 if (atts[i] != NULL)
4119 xmlFree((xmlChar *) atts[i]);
4123 return(discardtag);
4127 * htmlParseEndTag:
4128 * @ctxt: an HTML parser context
4130 * parse an end of tag
4132 * [42] ETag ::= '</' Name S? '>'
4134 * With namespace
4136 * [NS 9] ETag ::= '</' QName S? '>'
4138 * Returns 1 if the current level should be closed.
4141 static int
4142 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4144 const xmlChar *name;
4145 const xmlChar *oldname;
4146 int i, ret;
4148 if ((CUR != '<') || (NXT(1) != '/')) {
4149 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4150 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4151 return (0);
4153 SKIP(2);
4155 name = htmlParseHTMLName(ctxt);
4156 if (name == NULL)
4157 return (0);
4159 * We should definitely be at the ending "S? '>'" part
4161 SKIP_BLANKS;
4162 if (CUR != '>') {
4163 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4164 "End tag : expected '>'\n", NULL, NULL);
4165 /* Skip to next '>' */
4166 while ((CUR != 0) && (CUR != '>'))
4167 NEXT;
4169 if (CUR == '>')
4170 NEXT;
4173 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4174 * out now.
4176 if ((ctxt->depth > 0) &&
4177 (xmlStrEqual(name, BAD_CAST "html") ||
4178 xmlStrEqual(name, BAD_CAST "body") ||
4179 xmlStrEqual(name, BAD_CAST "head"))) {
4180 ctxt->depth--;
4181 return (0);
4185 * If the name read is not one of the element in the parsing stack
4186 * then return, it's just an error.
4188 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4189 if (xmlStrEqual(name, ctxt->nameTab[i]))
4190 break;
4192 if (i < 0) {
4193 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4194 "Unexpected end tag : %s\n", name, NULL);
4195 return (0);
4200 * Check for auto-closure of HTML elements.
4203 htmlAutoCloseOnClose(ctxt, name);
4206 * Well formedness constraints, opening and closing must match.
4207 * With the exception that the autoclose may have popped stuff out
4208 * of the stack.
4210 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4211 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4212 "Opening and ending tag mismatch: %s and %s\n",
4213 name, ctxt->name);
4217 * SAX: End of Tag
4219 oldname = ctxt->name;
4220 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4221 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4222 ctxt->sax->endElement(ctxt->userData, name);
4223 htmlNodeInfoPop(ctxt);
4224 htmlnamePop(ctxt);
4225 ret = 1;
4226 } else {
4227 ret = 0;
4230 return (ret);
4235 * htmlParseReference:
4236 * @ctxt: an HTML parser context
4238 * parse and handle entity references in content,
4239 * this will end-up in a call to character() since this is either a
4240 * CharRef, or a predefined entity.
4242 static void
4243 htmlParseReference(htmlParserCtxtPtr ctxt) {
4244 const htmlEntityDesc * ent;
4245 xmlChar out[6];
4246 const xmlChar *name;
4247 if (CUR != '&') return;
4249 if (NXT(1) == '#') {
4250 unsigned int c;
4251 int bits, i = 0;
4253 c = htmlParseCharRef(ctxt);
4254 if (c == 0)
4255 return;
4257 if (c < 0x80) { out[i++]= c; bits= -6; }
4258 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4259 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4260 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4262 for ( ; bits >= 0; bits-= 6) {
4263 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4265 out[i] = 0;
4267 htmlCheckParagraph(ctxt);
4268 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269 ctxt->sax->characters(ctxt->userData, out, i);
4270 } else {
4271 ent = htmlParseEntityRef(ctxt, &name);
4272 if (name == NULL) {
4273 htmlCheckParagraph(ctxt);
4274 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276 return;
4278 if ((ent == NULL) || !(ent->value > 0)) {
4279 htmlCheckParagraph(ctxt);
4280 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4281 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4283 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4285 } else {
4286 unsigned int c;
4287 int bits, i = 0;
4289 c = ent->value;
4290 if (c < 0x80)
4291 { out[i++]= c; bits= -6; }
4292 else if (c < 0x800)
4293 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4294 else if (c < 0x10000)
4295 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4296 else
4297 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4299 for ( ; bits >= 0; bits-= 6) {
4300 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4302 out[i] = 0;
4304 htmlCheckParagraph(ctxt);
4305 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4306 ctxt->sax->characters(ctxt->userData, out, i);
4312 * htmlParseContent:
4313 * @ctxt: an HTML parser context
4315 * Parse a content: comment, sub-element, reference or text.
4316 * Kept for compatibility with old code
4319 static void
4320 htmlParseContent(htmlParserCtxtPtr ctxt) {
4321 xmlChar *currentNode;
4322 int depth;
4323 const xmlChar *name;
4325 currentNode = xmlStrdup(ctxt->name);
4326 depth = ctxt->nameNr;
4327 while (1) {
4328 GROW;
4330 if (ctxt->instate == XML_PARSER_EOF)
4331 break;
4334 * Our tag or one of it's parent or children is ending.
4336 if ((CUR == '<') && (NXT(1) == '/')) {
4337 if (htmlParseEndTag(ctxt) &&
4338 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4339 if (currentNode != NULL)
4340 xmlFree(currentNode);
4341 return;
4343 continue; /* while */
4346 else if ((CUR == '<') &&
4347 ((IS_ASCII_LETTER(NXT(1))) ||
4348 (NXT(1) == '_') || (NXT(1) == ':'))) {
4349 name = htmlParseHTMLName_nonInvasive(ctxt);
4350 if (name == NULL) {
4351 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4352 "htmlParseStartTag: invalid element name\n",
4353 NULL, NULL);
4354 /* Dump the bogus tag like browsers do */
4355 while ((CUR != 0) && (CUR != '>'))
4356 NEXT;
4358 if (currentNode != NULL)
4359 xmlFree(currentNode);
4360 return;
4363 if (ctxt->name != NULL) {
4364 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4365 htmlAutoClose(ctxt, name);
4366 continue;
4372 * Has this node been popped out during parsing of
4373 * the next element
4375 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4376 (!xmlStrEqual(currentNode, ctxt->name)))
4378 if (currentNode != NULL) xmlFree(currentNode);
4379 return;
4382 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4383 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4385 * Handle SCRIPT/STYLE separately
4387 htmlParseScript(ctxt);
4388 } else {
4390 * Sometimes DOCTYPE arrives in the middle of the document
4392 if ((CUR == '<') && (NXT(1) == '!') &&
4393 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4394 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4395 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4396 (UPP(8) == 'E')) {
4397 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4398 "Misplaced DOCTYPE declaration\n",
4399 BAD_CAST "DOCTYPE" , NULL);
4400 htmlParseDocTypeDecl(ctxt);
4404 * First case : a comment
4406 if ((CUR == '<') && (NXT(1) == '!') &&
4407 (NXT(2) == '-') && (NXT(3) == '-')) {
4408 htmlParseComment(ctxt);
4412 * Second case : a Processing Instruction.
4414 else if ((CUR == '<') && (NXT(1) == '?')) {
4415 htmlParsePI(ctxt);
4419 * Third case : a sub-element.
4421 else if (CUR == '<') {
4422 htmlParseElement(ctxt);
4426 * Fourth case : a reference. If if has not been resolved,
4427 * parsing returns it's Name, create the node
4429 else if (CUR == '&') {
4430 htmlParseReference(ctxt);
4434 * Fifth case : end of the resource
4436 else if (CUR == 0) {
4437 htmlAutoCloseOnEnd(ctxt);
4438 break;
4442 * Last case, text. Note that References are handled directly.
4444 else {
4445 htmlParseCharData(ctxt);
4448 GROW;
4450 if (currentNode != NULL) xmlFree(currentNode);
4454 * htmlParseElement:
4455 * @ctxt: an HTML parser context
4457 * parse an HTML element, this is highly recursive
4458 * this is kept for compatibility with previous code versions
4460 * [39] element ::= EmptyElemTag | STag content ETag
4462 * [41] Attribute ::= Name Eq AttValue
4465 void
4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467 const xmlChar *name;
4468 xmlChar *currentNode = NULL;
4469 const htmlElemDesc * info;
4470 htmlParserNodeInfo node_info;
4471 int failed;
4472 int depth;
4473 const xmlChar *oldptr;
4475 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477 "htmlParseElement: context error\n", NULL, NULL);
4478 return;
4481 if (ctxt->instate == XML_PARSER_EOF)
4482 return;
4484 /* Capture start position */
4485 if (ctxt->record_info) {
4486 node_info.begin_pos = ctxt->input->consumed +
4487 (CUR_PTR - ctxt->input->base);
4488 node_info.begin_line = ctxt->input->line;
4491 failed = htmlParseStartTag(ctxt);
4492 name = ctxt->name;
4493 if ((failed == -1) || (name == NULL)) {
4494 if (CUR == '>')
4495 NEXT;
4496 return;
4500 * Lookup the info for that element.
4502 info = htmlTagLookup(name);
4503 if (info == NULL) {
4504 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505 "Tag %s invalid\n", name, NULL);
4509 * Check for an Empty Element labeled the XML/SGML way
4511 if ((CUR == '/') && (NXT(1) == '>')) {
4512 SKIP(2);
4513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514 ctxt->sax->endElement(ctxt->userData, name);
4515 htmlnamePop(ctxt);
4516 return;
4519 if (CUR == '>') {
4520 NEXT;
4521 } else {
4522 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523 "Couldn't find end of Start Tag %s\n", name, NULL);
4526 * end of parsing of this node.
4528 if (xmlStrEqual(name, ctxt->name)) {
4529 nodePop(ctxt);
4530 htmlnamePop(ctxt);
4534 * Capture end position and add node
4536 if (ctxt->record_info) {
4537 node_info.end_pos = ctxt->input->consumed +
4538 (CUR_PTR - ctxt->input->base);
4539 node_info.end_line = ctxt->input->line;
4540 node_info.node = ctxt->node;
4541 xmlParserAddNodeInfo(ctxt, &node_info);
4543 return;
4547 * Check for an Empty Element from DTD definition
4549 if ((info != NULL) && (info->empty)) {
4550 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551 ctxt->sax->endElement(ctxt->userData, name);
4552 htmlnamePop(ctxt);
4553 return;
4557 * Parse the content of the element:
4559 currentNode = xmlStrdup(ctxt->name);
4560 depth = ctxt->nameNr;
4561 while (CUR != 0) {
4562 oldptr = ctxt->input->cur;
4563 htmlParseContent(ctxt);
4564 if (oldptr==ctxt->input->cur) break;
4565 if (ctxt->nameNr < depth) break;
4569 * Capture end position and add node
4571 if ( currentNode != NULL && ctxt->record_info ) {
4572 node_info.end_pos = ctxt->input->consumed +
4573 (CUR_PTR - ctxt->input->base);
4574 node_info.end_line = ctxt->input->line;
4575 node_info.node = ctxt->node;
4576 xmlParserAddNodeInfo(ctxt, &node_info);
4578 if (CUR == 0) {
4579 htmlAutoCloseOnEnd(ctxt);
4582 if (currentNode != NULL)
4583 xmlFree(currentNode);
4586 static void
4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4589 * Capture end position and add node
4591 if ( ctxt->node != NULL && ctxt->record_info ) {
4592 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593 (CUR_PTR - ctxt->input->base);
4594 ctxt->nodeInfo->end_line = ctxt->input->line;
4595 ctxt->nodeInfo->node = ctxt->node;
4596 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597 htmlNodeInfoPop(ctxt);
4599 if (CUR == 0) {
4600 htmlAutoCloseOnEnd(ctxt);
4605 * htmlParseElementInternal:
4606 * @ctxt: an HTML parser context
4608 * parse an HTML element, new version, non recursive
4610 * [39] element ::= EmptyElemTag | STag content ETag
4612 * [41] Attribute ::= Name Eq AttValue
4615 static void
4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617 const xmlChar *name;
4618 const htmlElemDesc * info;
4619 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620 int failed;
4622 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624 "htmlParseElementInternal: context error\n", NULL, NULL);
4625 return;
4628 if (ctxt->instate == XML_PARSER_EOF)
4629 return;
4631 /* Capture start position */
4632 if (ctxt->record_info) {
4633 node_info.begin_pos = ctxt->input->consumed +
4634 (CUR_PTR - ctxt->input->base);
4635 node_info.begin_line = ctxt->input->line;
4638 failed = htmlParseStartTag(ctxt);
4639 name = ctxt->name;
4640 if ((failed == -1) || (name == NULL)) {
4641 if (CUR == '>')
4642 NEXT;
4643 return;
4647 * Lookup the info for that element.
4649 info = htmlTagLookup(name);
4650 if (info == NULL) {
4651 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652 "Tag %s invalid\n", name, NULL);
4656 * Check for an Empty Element labeled the XML/SGML way
4658 if ((CUR == '/') && (NXT(1) == '>')) {
4659 SKIP(2);
4660 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661 ctxt->sax->endElement(ctxt->userData, name);
4662 htmlnamePop(ctxt);
4663 return;
4666 if (CUR == '>') {
4667 NEXT;
4668 } else {
4669 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670 "Couldn't find end of Start Tag %s\n", name, NULL);
4673 * end of parsing of this node.
4675 if (xmlStrEqual(name, ctxt->name)) {
4676 nodePop(ctxt);
4677 htmlnamePop(ctxt);
4680 if (ctxt->record_info)
4681 htmlNodeInfoPush(ctxt, &node_info);
4682 htmlParserFinishElementParsing(ctxt);
4683 return;
4687 * Check for an Empty Element from DTD definition
4689 if ((info != NULL) && (info->empty)) {
4690 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691 ctxt->sax->endElement(ctxt->userData, name);
4692 htmlnamePop(ctxt);
4693 return;
4696 if (ctxt->record_info)
4697 htmlNodeInfoPush(ctxt, &node_info);
4701 * htmlParseContentInternal:
4702 * @ctxt: an HTML parser context
4704 * Parse a content: comment, sub-element, reference or text.
4705 * New version for non recursive htmlParseElementInternal
4708 static void
4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710 xmlChar *currentNode;
4711 int depth;
4712 const xmlChar *name;
4714 currentNode = xmlStrdup(ctxt->name);
4715 depth = ctxt->nameNr;
4716 while (1) {
4717 GROW;
4719 if (ctxt->instate == XML_PARSER_EOF)
4720 break;
4723 * Our tag or one of it's parent or children is ending.
4725 if ((CUR == '<') && (NXT(1) == '/')) {
4726 if (htmlParseEndTag(ctxt) &&
4727 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728 if (currentNode != NULL)
4729 xmlFree(currentNode);
4731 currentNode = xmlStrdup(ctxt->name);
4732 depth = ctxt->nameNr;
4734 continue; /* while */
4737 else if ((CUR == '<') &&
4738 ((IS_ASCII_LETTER(NXT(1))) ||
4739 (NXT(1) == '_') || (NXT(1) == ':'))) {
4740 name = htmlParseHTMLName_nonInvasive(ctxt);
4741 if (name == NULL) {
4742 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743 "htmlParseStartTag: invalid element name\n",
4744 NULL, NULL);
4745 /* Dump the bogus tag like browsers do */
4746 while ((CUR == 0) && (CUR != '>'))
4747 NEXT;
4749 htmlParserFinishElementParsing(ctxt);
4750 if (currentNode != NULL)
4751 xmlFree(currentNode);
4753 currentNode = xmlStrdup(ctxt->name);
4754 depth = ctxt->nameNr;
4755 continue;
4758 if (ctxt->name != NULL) {
4759 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760 htmlAutoClose(ctxt, name);
4761 continue;
4767 * Has this node been popped out during parsing of
4768 * the next element
4770 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771 (!xmlStrEqual(currentNode, ctxt->name)))
4773 htmlParserFinishElementParsing(ctxt);
4774 if (currentNode != NULL) xmlFree(currentNode);
4776 currentNode = xmlStrdup(ctxt->name);
4777 depth = ctxt->nameNr;
4778 continue;
4781 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4784 * Handle SCRIPT/STYLE separately
4786 htmlParseScript(ctxt);
4787 } else {
4789 * Sometimes DOCTYPE arrives in the middle of the document
4791 if ((CUR == '<') && (NXT(1) == '!') &&
4792 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4793 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4794 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 (UPP(8) == 'E')) {
4796 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797 "Misplaced DOCTYPE declaration\n",
4798 BAD_CAST "DOCTYPE" , NULL);
4799 htmlParseDocTypeDecl(ctxt);
4803 * First case : a comment
4805 if ((CUR == '<') && (NXT(1) == '!') &&
4806 (NXT(2) == '-') && (NXT(3) == '-')) {
4807 htmlParseComment(ctxt);
4811 * Second case : a Processing Instruction.
4813 else if ((CUR == '<') && (NXT(1) == '?')) {
4814 htmlParsePI(ctxt);
4818 * Third case : a sub-element.
4820 else if (CUR == '<') {
4821 htmlParseElementInternal(ctxt);
4822 if (currentNode != NULL) xmlFree(currentNode);
4824 currentNode = xmlStrdup(ctxt->name);
4825 depth = ctxt->nameNr;
4829 * Fourth case : a reference. If if has not been resolved,
4830 * parsing returns it's Name, create the node
4832 else if (CUR == '&') {
4833 htmlParseReference(ctxt);
4837 * Fifth case : end of the resource
4839 else if (CUR == 0) {
4840 htmlAutoCloseOnEnd(ctxt);
4841 break;
4845 * Last case, text. Note that References are handled directly.
4847 else {
4848 htmlParseCharData(ctxt);
4851 GROW;
4853 if (currentNode != NULL) xmlFree(currentNode);
4857 * htmlParseContent:
4858 * @ctxt: an HTML parser context
4860 * Parse a content: comment, sub-element, reference or text.
4861 * This is the entry point when called from parser.c
4864 void
4865 __htmlParseContent(void *ctxt) {
4866 if (ctxt != NULL)
4867 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4871 * htmlParseDocument:
4872 * @ctxt: an HTML parser context
4874 * parse an HTML document (and build a tree if using the standard SAX
4875 * interface).
4877 * Returns 0, -1 in case of error. the parser context is augmented
4878 * as a result of the parsing.
4882 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4883 xmlChar start[4];
4884 xmlCharEncoding enc;
4885 xmlDtdPtr dtd;
4887 xmlInitParser();
4889 htmlDefaultSAXHandlerInit();
4891 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4892 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893 "htmlParseDocument: context error\n", NULL, NULL);
4894 return(XML_ERR_INTERNAL_ERROR);
4896 ctxt->html = 1;
4897 ctxt->linenumbers = 1;
4898 GROW;
4900 * SAX: beginning of the document processing.
4902 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4903 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4905 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4906 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4908 * Get the 4 first bytes and decode the charset
4909 * if enc != XML_CHAR_ENCODING_NONE
4910 * plug some encoding conversion routines.
4912 start[0] = RAW;
4913 start[1] = NXT(1);
4914 start[2] = NXT(2);
4915 start[3] = NXT(3);
4916 enc = xmlDetectCharEncoding(&start[0], 4);
4917 if (enc != XML_CHAR_ENCODING_NONE) {
4918 xmlSwitchEncoding(ctxt, enc);
4923 * Wipe out everything which is before the first '<'
4925 SKIP_BLANKS;
4926 if (CUR == 0) {
4927 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4928 "Document is empty\n", NULL, NULL);
4931 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4932 ctxt->sax->startDocument(ctxt->userData);
4936 * Parse possible comments and PIs before any content
4938 while (((CUR == '<') && (NXT(1) == '!') &&
4939 (NXT(2) == '-') && (NXT(3) == '-')) ||
4940 ((CUR == '<') && (NXT(1) == '?'))) {
4941 htmlParseComment(ctxt);
4942 htmlParsePI(ctxt);
4943 SKIP_BLANKS;
4948 * Then possibly doc type declaration(s) and more Misc
4949 * (doctypedecl Misc*)?
4951 if ((CUR == '<') && (NXT(1) == '!') &&
4952 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4953 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4954 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4955 (UPP(8) == 'E')) {
4956 htmlParseDocTypeDecl(ctxt);
4958 SKIP_BLANKS;
4961 * Parse possible comments and PIs before any content
4963 while (((CUR == '<') && (NXT(1) == '!') &&
4964 (NXT(2) == '-') && (NXT(3) == '-')) ||
4965 ((CUR == '<') && (NXT(1) == '?'))) {
4966 htmlParseComment(ctxt);
4967 htmlParsePI(ctxt);
4968 SKIP_BLANKS;
4972 * Time to start parsing the tree itself
4974 htmlParseContentInternal(ctxt);
4977 * autoclose
4979 if (CUR == 0)
4980 htmlAutoCloseOnEnd(ctxt);
4984 * SAX: end of the document processing.
4986 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4987 ctxt->sax->endDocument(ctxt->userData);
4989 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4990 dtd = xmlGetIntSubset(ctxt->myDoc);
4991 if (dtd == NULL)
4992 ctxt->myDoc->intSubset =
4993 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4994 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4995 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4997 if (! ctxt->wellFormed) return(-1);
4998 return(0);
5002 /************************************************************************
5004 * Parser contexts handling *
5006 ************************************************************************/
5009 * htmlInitParserCtxt:
5010 * @ctxt: an HTML parser context
5012 * Initialize a parser context
5014 * Returns 0 in case of success and -1 in case of error
5017 static int
5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5020 htmlSAXHandler *sax;
5022 if (ctxt == NULL) return(-1);
5023 memset(ctxt, 0, sizeof(htmlParserCtxt));
5025 ctxt->dict = xmlDictCreate();
5026 if (ctxt->dict == NULL) {
5027 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5028 return(-1);
5030 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5031 if (sax == NULL) {
5032 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5033 return(-1);
5035 else
5036 memset(sax, 0, sizeof(htmlSAXHandler));
5038 /* Allocate the Input stack */
5039 ctxt->inputTab = (htmlParserInputPtr *)
5040 xmlMalloc(5 * sizeof(htmlParserInputPtr));
5041 if (ctxt->inputTab == NULL) {
5042 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5043 ctxt->inputNr = 0;
5044 ctxt->inputMax = 0;
5045 ctxt->input = NULL;
5046 return(-1);
5048 ctxt->inputNr = 0;
5049 ctxt->inputMax = 5;
5050 ctxt->input = NULL;
5051 ctxt->version = NULL;
5052 ctxt->encoding = NULL;
5053 ctxt->standalone = -1;
5054 ctxt->instate = XML_PARSER_START;
5056 /* Allocate the Node stack */
5057 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5058 if (ctxt->nodeTab == NULL) {
5059 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5060 ctxt->nodeNr = 0;
5061 ctxt->nodeMax = 0;
5062 ctxt->node = NULL;
5063 ctxt->inputNr = 0;
5064 ctxt->inputMax = 0;
5065 ctxt->input = NULL;
5066 return(-1);
5068 ctxt->nodeNr = 0;
5069 ctxt->nodeMax = 10;
5070 ctxt->node = NULL;
5072 /* Allocate the Name stack */
5073 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5074 if (ctxt->nameTab == NULL) {
5075 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5076 ctxt->nameNr = 0;
5077 ctxt->nameMax = 0;
5078 ctxt->name = NULL;
5079 ctxt->nodeNr = 0;
5080 ctxt->nodeMax = 0;
5081 ctxt->node = NULL;
5082 ctxt->inputNr = 0;
5083 ctxt->inputMax = 0;
5084 ctxt->input = NULL;
5085 return(-1);
5087 ctxt->nameNr = 0;
5088 ctxt->nameMax = 10;
5089 ctxt->name = NULL;
5091 ctxt->nodeInfoTab = NULL;
5092 ctxt->nodeInfoNr = 0;
5093 ctxt->nodeInfoMax = 0;
5095 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5096 else {
5097 ctxt->sax = sax;
5098 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5100 ctxt->userData = ctxt;
5101 ctxt->myDoc = NULL;
5102 ctxt->wellFormed = 1;
5103 ctxt->replaceEntities = 0;
5104 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5105 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5106 ctxt->html = 1;
5107 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5108 ctxt->vctxt.userData = ctxt;
5109 ctxt->vctxt.error = xmlParserValidityError;
5110 ctxt->vctxt.warning = xmlParserValidityWarning;
5111 ctxt->record_info = 0;
5112 ctxt->validate = 0;
5113 ctxt->checkIndex = 0;
5114 ctxt->catalogs = NULL;
5115 xmlInitNodeInfoSeq(&ctxt->node_seq);
5116 return(0);
5120 * htmlFreeParserCtxt:
5121 * @ctxt: an HTML parser context
5123 * Free all the memory used by a parser context. However the parsed
5124 * document in ctxt->myDoc is not freed.
5127 void
5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5130 xmlFreeParserCtxt(ctxt);
5134 * htmlNewParserCtxt:
5136 * Allocate and initialize a new parser context.
5138 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5141 htmlParserCtxtPtr
5142 htmlNewParserCtxt(void)
5144 xmlParserCtxtPtr ctxt;
5146 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5147 if (ctxt == NULL) {
5148 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5149 return(NULL);
5151 memset(ctxt, 0, sizeof(xmlParserCtxt));
5152 if (htmlInitParserCtxt(ctxt) < 0) {
5153 htmlFreeParserCtxt(ctxt);
5154 return(NULL);
5156 return(ctxt);
5160 * htmlCreateMemoryParserCtxt:
5161 * @buffer: a pointer to a char array
5162 * @size: the size of the array
5164 * Create a parser context for an HTML in-memory document.
5166 * Returns the new parser context or NULL
5168 htmlParserCtxtPtr
5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5170 xmlParserCtxtPtr ctxt;
5171 xmlParserInputPtr input;
5172 xmlParserInputBufferPtr buf;
5174 if (buffer == NULL)
5175 return(NULL);
5176 if (size <= 0)
5177 return(NULL);
5179 ctxt = htmlNewParserCtxt();
5180 if (ctxt == NULL)
5181 return(NULL);
5183 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5184 if (buf == NULL) return(NULL);
5186 input = xmlNewInputStream(ctxt);
5187 if (input == NULL) {
5188 xmlFreeParserCtxt(ctxt);
5189 return(NULL);
5192 input->filename = NULL;
5193 input->buf = buf;
5194 xmlBufResetInput(buf->buffer, input);
5196 inputPush(ctxt, input);
5197 return(ctxt);
5201 * htmlCreateDocParserCtxt:
5202 * @cur: a pointer to an array of xmlChar
5203 * @encoding: a free form C string describing the HTML document encoding, or NULL
5205 * Create a parser context for an HTML document.
5207 * TODO: check the need to add encoding handling there
5209 * Returns the new parser context or NULL
5211 static htmlParserCtxtPtr
5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5213 int len;
5214 htmlParserCtxtPtr ctxt;
5216 if (cur == NULL)
5217 return(NULL);
5218 len = xmlStrlen(cur);
5219 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5220 if (ctxt == NULL)
5221 return(NULL);
5223 if (encoding != NULL) {
5224 xmlCharEncoding enc;
5225 xmlCharEncodingHandlerPtr handler;
5227 if (ctxt->input->encoding != NULL)
5228 xmlFree((xmlChar *) ctxt->input->encoding);
5229 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5231 enc = xmlParseCharEncoding(encoding);
5233 * registered set of known encodings
5235 if (enc != XML_CHAR_ENCODING_ERROR) {
5236 xmlSwitchEncoding(ctxt, enc);
5237 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5238 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5239 "Unsupported encoding %s\n",
5240 (const xmlChar *) encoding, NULL);
5242 } else {
5244 * fallback for unknown encodings
5246 handler = xmlFindCharEncodingHandler((const char *) encoding);
5247 if (handler != NULL) {
5248 xmlSwitchToEncoding(ctxt, handler);
5249 } else {
5250 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5251 "Unsupported encoding %s\n",
5252 (const xmlChar *) encoding, NULL);
5256 return(ctxt);
5259 #ifdef LIBXML_PUSH_ENABLED
5260 /************************************************************************
5262 * Progressive parsing interfaces *
5264 ************************************************************************/
5267 * htmlParseLookupSequence:
5268 * @ctxt: an HTML parser context
5269 * @first: the first char to lookup
5270 * @next: the next char to lookup or zero
5271 * @third: the next char to lookup or zero
5272 * @ignoreattrval: skip over attribute values
5274 * Try to find if a sequence (first, next, third) or just (first next) or
5275 * (first) is available in the input stream.
5276 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5277 * to avoid rescanning sequences of bytes, it DOES change the state of the
5278 * parser, do not use liberally.
5279 * This is basically similar to xmlParseLookupSequence()
5281 * Returns the index to the current parsing point if the full sequence
5282 * is available, -1 otherwise.
5284 static int
5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5286 xmlChar next, xmlChar third, int ignoreattrval)
5288 int base, len;
5289 htmlParserInputPtr in;
5290 const xmlChar *buf;
5291 int invalue = 0;
5292 char valdellim = 0x0;
5294 in = ctxt->input;
5295 if (in == NULL)
5296 return (-1);
5298 base = in->cur - in->base;
5299 if (base < 0)
5300 return (-1);
5302 if (ctxt->checkIndex > base) {
5303 base = ctxt->checkIndex;
5304 /* Abuse hasPErefs member to restore current state. */
5305 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5308 if (in->buf == NULL) {
5309 buf = in->base;
5310 len = in->length;
5311 } else {
5312 buf = xmlBufContent(in->buf->buffer);
5313 len = xmlBufUse(in->buf->buffer);
5316 /* take into account the sequence length */
5317 if (third)
5318 len -= 2;
5319 else if (next)
5320 len--;
5321 for (; base < len; base++) {
5322 if (ignoreattrval) {
5323 if (buf[base] == '"' || buf[base] == '\'') {
5324 if (invalue) {
5325 if (buf[base] == valdellim) {
5326 invalue = 0;
5327 continue;
5329 } else {
5330 valdellim = buf[base];
5331 invalue = 1;
5332 continue;
5334 } else if (invalue) {
5335 continue;
5338 if (buf[base] == first) {
5339 if (third != 0) {
5340 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5341 continue;
5342 } else if (next != 0) {
5343 if (buf[base + 1] != next)
5344 continue;
5346 ctxt->checkIndex = 0;
5347 #ifdef DEBUG_PUSH
5348 if (next == 0)
5349 xmlGenericError(xmlGenericErrorContext,
5350 "HPP: lookup '%c' found at %d\n",
5351 first, base);
5352 else if (third == 0)
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: lookup '%c%c' found at %d\n",
5355 first, next, base);
5356 else
5357 xmlGenericError(xmlGenericErrorContext,
5358 "HPP: lookup '%c%c%c' found at %d\n",
5359 first, next, third, base);
5360 #endif
5361 return (base - (in->cur - in->base));
5364 ctxt->checkIndex = base;
5365 /* Abuse hasPErefs member to track current state. */
5366 if (invalue)
5367 ctxt->hasPErefs |= 1;
5368 else
5369 ctxt->hasPErefs &= ~1;
5370 #ifdef DEBUG_PUSH
5371 if (next == 0)
5372 xmlGenericError(xmlGenericErrorContext,
5373 "HPP: lookup '%c' failed\n", first);
5374 else if (third == 0)
5375 xmlGenericError(xmlGenericErrorContext,
5376 "HPP: lookup '%c%c' failed\n", first, next);
5377 else
5378 xmlGenericError(xmlGenericErrorContext,
5379 "HPP: lookup '%c%c%c' failed\n", first, next,
5380 third);
5381 #endif
5382 return (-1);
5386 * htmlParseLookupCommentEnd:
5387 * @ctxt: an HTML parser context
5389 * Try to find a comment end tag in the input stream
5390 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5391 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5392 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5393 * to avoid rescanning sequences of bytes, it DOES change the state of the
5394 * parser, do not use liberally.
5395 * This wraps to htmlParseLookupSequence()
5397 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5399 static int
5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5402 int mark = 0;
5403 int cur = CUR_PTR - BASE_PTR;
5405 while (mark >= 0) {
5406 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5407 if ((mark < 0) ||
5408 (NXT(mark+2) == '>') ||
5409 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5410 return mark;
5412 ctxt->checkIndex = cur + mark + 1;
5414 return mark;
5419 * htmlParseTryOrFinish:
5420 * @ctxt: an HTML parser context
5421 * @terminate: last chunk indicator
5423 * Try to progress on parsing
5425 * Returns zero if no parsing was possible
5427 static int
5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5429 int ret = 0;
5430 htmlParserInputPtr in;
5431 ptrdiff_t avail = 0;
5432 xmlChar cur, next;
5434 htmlParserNodeInfo node_info;
5436 #ifdef DEBUG_PUSH
5437 switch (ctxt->instate) {
5438 case XML_PARSER_EOF:
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: try EOF\n"); break;
5441 case XML_PARSER_START:
5442 xmlGenericError(xmlGenericErrorContext,
5443 "HPP: try START\n"); break;
5444 case XML_PARSER_MISC:
5445 xmlGenericError(xmlGenericErrorContext,
5446 "HPP: try MISC\n");break;
5447 case XML_PARSER_COMMENT:
5448 xmlGenericError(xmlGenericErrorContext,
5449 "HPP: try COMMENT\n");break;
5450 case XML_PARSER_PROLOG:
5451 xmlGenericError(xmlGenericErrorContext,
5452 "HPP: try PROLOG\n");break;
5453 case XML_PARSER_START_TAG:
5454 xmlGenericError(xmlGenericErrorContext,
5455 "HPP: try START_TAG\n");break;
5456 case XML_PARSER_CONTENT:
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: try CONTENT\n");break;
5459 case XML_PARSER_CDATA_SECTION:
5460 xmlGenericError(xmlGenericErrorContext,
5461 "HPP: try CDATA_SECTION\n");break;
5462 case XML_PARSER_END_TAG:
5463 xmlGenericError(xmlGenericErrorContext,
5464 "HPP: try END_TAG\n");break;
5465 case XML_PARSER_ENTITY_DECL:
5466 xmlGenericError(xmlGenericErrorContext,
5467 "HPP: try ENTITY_DECL\n");break;
5468 case XML_PARSER_ENTITY_VALUE:
5469 xmlGenericError(xmlGenericErrorContext,
5470 "HPP: try ENTITY_VALUE\n");break;
5471 case XML_PARSER_ATTRIBUTE_VALUE:
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: try ATTRIBUTE_VALUE\n");break;
5474 case XML_PARSER_DTD:
5475 xmlGenericError(xmlGenericErrorContext,
5476 "HPP: try DTD\n");break;
5477 case XML_PARSER_EPILOG:
5478 xmlGenericError(xmlGenericErrorContext,
5479 "HPP: try EPILOG\n");break;
5480 case XML_PARSER_PI:
5481 xmlGenericError(xmlGenericErrorContext,
5482 "HPP: try PI\n");break;
5483 case XML_PARSER_SYSTEM_LITERAL:
5484 xmlGenericError(xmlGenericErrorContext,
5485 "HPP: try SYSTEM_LITERAL\n");break;
5487 #endif
5489 while (1) {
5491 in = ctxt->input;
5492 if (in == NULL) break;
5493 if (in->buf == NULL)
5494 avail = in->length - (in->cur - in->base);
5495 else
5496 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5497 (in->cur - in->base);
5498 if ((avail == 0) && (terminate)) {
5499 htmlAutoCloseOnEnd(ctxt);
5500 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5502 * SAX: end of the document processing.
5504 ctxt->instate = XML_PARSER_EOF;
5505 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5506 ctxt->sax->endDocument(ctxt->userData);
5509 if (avail < 1)
5510 goto done;
5512 * This is done to make progress and avoid an infinite loop
5513 * if a parsing attempt was aborted by hitting a NUL byte. After
5514 * changing htmlCurrentChar, this probably isn't necessary anymore.
5515 * We should consider removing this check.
5517 cur = in->cur[0];
5518 if (cur == 0) {
5519 SKIP(1);
5520 continue;
5523 switch (ctxt->instate) {
5524 case XML_PARSER_EOF:
5526 * Document parsing is done !
5528 goto done;
5529 case XML_PARSER_START:
5531 * Very first chars read from the document flow.
5533 cur = in->cur[0];
5534 if (IS_BLANK_CH(cur)) {
5535 SKIP_BLANKS;
5536 if (in->buf == NULL)
5537 avail = in->length - (in->cur - in->base);
5538 else
5539 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5540 (in->cur - in->base);
5542 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5543 ctxt->sax->setDocumentLocator(ctxt->userData,
5544 &xmlDefaultSAXLocator);
5545 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5546 (!ctxt->disableSAX))
5547 ctxt->sax->startDocument(ctxt->userData);
5549 cur = in->cur[0];
5550 next = in->cur[1];
5551 if ((cur == '<') && (next == '!') &&
5552 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5553 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5554 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5555 (UPP(8) == 'E')) {
5556 if ((!terminate) &&
5557 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5558 goto done;
5559 #ifdef DEBUG_PUSH
5560 xmlGenericError(xmlGenericErrorContext,
5561 "HPP: Parsing internal subset\n");
5562 #endif
5563 htmlParseDocTypeDecl(ctxt);
5564 ctxt->instate = XML_PARSER_PROLOG;
5565 #ifdef DEBUG_PUSH
5566 xmlGenericError(xmlGenericErrorContext,
5567 "HPP: entering PROLOG\n");
5568 #endif
5569 } else {
5570 ctxt->instate = XML_PARSER_MISC;
5571 #ifdef DEBUG_PUSH
5572 xmlGenericError(xmlGenericErrorContext,
5573 "HPP: entering MISC\n");
5574 #endif
5576 break;
5577 case XML_PARSER_MISC:
5578 SKIP_BLANKS;
5579 if (in->buf == NULL)
5580 avail = in->length - (in->cur - in->base);
5581 else
5582 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5583 (in->cur - in->base);
5585 * no chars in buffer
5587 if (avail < 1)
5588 goto done;
5590 * not enough chars in buffer
5592 if (avail < 2) {
5593 if (!terminate)
5594 goto done;
5595 else
5596 next = ' ';
5597 } else {
5598 next = in->cur[1];
5600 cur = in->cur[0];
5601 if ((cur == '<') && (next == '!') &&
5602 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5603 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5604 goto done;
5605 #ifdef DEBUG_PUSH
5606 xmlGenericError(xmlGenericErrorContext,
5607 "HPP: Parsing Comment\n");
5608 #endif
5609 htmlParseComment(ctxt);
5610 ctxt->instate = XML_PARSER_MISC;
5611 } else if ((cur == '<') && (next == '?')) {
5612 if ((!terminate) &&
5613 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5614 goto done;
5615 #ifdef DEBUG_PUSH
5616 xmlGenericError(xmlGenericErrorContext,
5617 "HPP: Parsing PI\n");
5618 #endif
5619 htmlParsePI(ctxt);
5620 ctxt->instate = XML_PARSER_MISC;
5621 } else if ((cur == '<') && (next == '!') &&
5622 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5623 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5624 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5625 (UPP(8) == 'E')) {
5626 if ((!terminate) &&
5627 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5628 goto done;
5629 #ifdef DEBUG_PUSH
5630 xmlGenericError(xmlGenericErrorContext,
5631 "HPP: Parsing internal subset\n");
5632 #endif
5633 htmlParseDocTypeDecl(ctxt);
5634 ctxt->instate = XML_PARSER_PROLOG;
5635 #ifdef DEBUG_PUSH
5636 xmlGenericError(xmlGenericErrorContext,
5637 "HPP: entering PROLOG\n");
5638 #endif
5639 } else if ((cur == '<') && (next == '!') &&
5640 (avail < 9)) {
5641 goto done;
5642 } else {
5643 ctxt->instate = XML_PARSER_CONTENT;
5644 #ifdef DEBUG_PUSH
5645 xmlGenericError(xmlGenericErrorContext,
5646 "HPP: entering START_TAG\n");
5647 #endif
5649 break;
5650 case XML_PARSER_PROLOG:
5651 SKIP_BLANKS;
5652 if (in->buf == NULL)
5653 avail = in->length - (in->cur - in->base);
5654 else
5655 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5656 (in->cur - in->base);
5657 if (avail < 2)
5658 goto done;
5659 cur = in->cur[0];
5660 next = in->cur[1];
5661 if ((cur == '<') && (next == '!') &&
5662 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5663 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5664 goto done;
5665 #ifdef DEBUG_PUSH
5666 xmlGenericError(xmlGenericErrorContext,
5667 "HPP: Parsing Comment\n");
5668 #endif
5669 htmlParseComment(ctxt);
5670 ctxt->instate = XML_PARSER_PROLOG;
5671 } else if ((cur == '<') && (next == '?')) {
5672 if ((!terminate) &&
5673 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5674 goto done;
5675 #ifdef DEBUG_PUSH
5676 xmlGenericError(xmlGenericErrorContext,
5677 "HPP: Parsing PI\n");
5678 #endif
5679 htmlParsePI(ctxt);
5680 ctxt->instate = XML_PARSER_PROLOG;
5681 } else if ((cur == '<') && (next == '!') &&
5682 (avail < 4)) {
5683 goto done;
5684 } else {
5685 ctxt->instate = XML_PARSER_CONTENT;
5686 #ifdef DEBUG_PUSH
5687 xmlGenericError(xmlGenericErrorContext,
5688 "HPP: entering START_TAG\n");
5689 #endif
5691 break;
5692 case XML_PARSER_EPILOG:
5693 if (in->buf == NULL)
5694 avail = in->length - (in->cur - in->base);
5695 else
5696 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5697 (in->cur - in->base);
5698 if (avail < 1)
5699 goto done;
5700 cur = in->cur[0];
5701 if (IS_BLANK_CH(cur)) {
5702 htmlParseCharData(ctxt);
5703 goto done;
5705 if (avail < 2)
5706 goto done;
5707 next = in->cur[1];
5708 if ((cur == '<') && (next == '!') &&
5709 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5710 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5711 goto done;
5712 #ifdef DEBUG_PUSH
5713 xmlGenericError(xmlGenericErrorContext,
5714 "HPP: Parsing Comment\n");
5715 #endif
5716 htmlParseComment(ctxt);
5717 ctxt->instate = XML_PARSER_EPILOG;
5718 } else if ((cur == '<') && (next == '?')) {
5719 if ((!terminate) &&
5720 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5721 goto done;
5722 #ifdef DEBUG_PUSH
5723 xmlGenericError(xmlGenericErrorContext,
5724 "HPP: Parsing PI\n");
5725 #endif
5726 htmlParsePI(ctxt);
5727 ctxt->instate = XML_PARSER_EPILOG;
5728 } else if ((cur == '<') && (next == '!') &&
5729 (avail < 4)) {
5730 goto done;
5731 } else {
5732 ctxt->errNo = XML_ERR_DOCUMENT_END;
5733 ctxt->wellFormed = 0;
5734 ctxt->instate = XML_PARSER_EOF;
5735 #ifdef DEBUG_PUSH
5736 xmlGenericError(xmlGenericErrorContext,
5737 "HPP: entering EOF\n");
5738 #endif
5739 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5740 ctxt->sax->endDocument(ctxt->userData);
5741 goto done;
5743 break;
5744 case XML_PARSER_START_TAG: {
5745 const xmlChar *name;
5746 int failed;
5747 const htmlElemDesc * info;
5750 * no chars in buffer
5752 if (avail < 1)
5753 goto done;
5755 * not enough chars in buffer
5757 if (avail < 2) {
5758 if (!terminate)
5759 goto done;
5760 else
5761 next = ' ';
5762 } else {
5763 next = in->cur[1];
5765 cur = in->cur[0];
5766 if (cur != '<') {
5767 ctxt->instate = XML_PARSER_CONTENT;
5768 #ifdef DEBUG_PUSH
5769 xmlGenericError(xmlGenericErrorContext,
5770 "HPP: entering CONTENT\n");
5771 #endif
5772 break;
5774 if (next == '/') {
5775 ctxt->instate = XML_PARSER_END_TAG;
5776 ctxt->checkIndex = 0;
5777 #ifdef DEBUG_PUSH
5778 xmlGenericError(xmlGenericErrorContext,
5779 "HPP: entering END_TAG\n");
5780 #endif
5781 break;
5783 if ((!terminate) &&
5784 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5785 goto done;
5787 /* Capture start position */
5788 if (ctxt->record_info) {
5789 node_info.begin_pos = ctxt->input->consumed +
5790 (CUR_PTR - ctxt->input->base);
5791 node_info.begin_line = ctxt->input->line;
5795 failed = htmlParseStartTag(ctxt);
5796 name = ctxt->name;
5797 if ((failed == -1) ||
5798 (name == NULL)) {
5799 if (CUR == '>')
5800 NEXT;
5801 break;
5805 * Lookup the info for that element.
5807 info = htmlTagLookup(name);
5808 if (info == NULL) {
5809 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5810 "Tag %s invalid\n", name, NULL);
5814 * Check for an Empty Element labeled the XML/SGML way
5816 if ((CUR == '/') && (NXT(1) == '>')) {
5817 SKIP(2);
5818 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5819 ctxt->sax->endElement(ctxt->userData, name);
5820 htmlnamePop(ctxt);
5821 ctxt->instate = XML_PARSER_CONTENT;
5822 #ifdef DEBUG_PUSH
5823 xmlGenericError(xmlGenericErrorContext,
5824 "HPP: entering CONTENT\n");
5825 #endif
5826 break;
5829 if (CUR == '>') {
5830 NEXT;
5831 } else {
5832 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5833 "Couldn't find end of Start Tag %s\n",
5834 name, NULL);
5837 * end of parsing of this node.
5839 if (xmlStrEqual(name, ctxt->name)) {
5840 nodePop(ctxt);
5841 htmlnamePop(ctxt);
5844 if (ctxt->record_info)
5845 htmlNodeInfoPush(ctxt, &node_info);
5847 ctxt->instate = XML_PARSER_CONTENT;
5848 #ifdef DEBUG_PUSH
5849 xmlGenericError(xmlGenericErrorContext,
5850 "HPP: entering CONTENT\n");
5851 #endif
5852 break;
5856 * Check for an Empty Element from DTD definition
5858 if ((info != NULL) && (info->empty)) {
5859 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5860 ctxt->sax->endElement(ctxt->userData, name);
5861 htmlnamePop(ctxt);
5864 if (ctxt->record_info)
5865 htmlNodeInfoPush(ctxt, &node_info);
5867 ctxt->instate = XML_PARSER_CONTENT;
5868 #ifdef DEBUG_PUSH
5869 xmlGenericError(xmlGenericErrorContext,
5870 "HPP: entering CONTENT\n");
5871 #endif
5872 break;
5874 case XML_PARSER_CONTENT: {
5875 xmlChar chr[2] = { 0, 0 };
5878 * Handle preparsed entities and charRef
5880 if (ctxt->token != 0) {
5881 chr[0] = (xmlChar) ctxt->token;
5882 htmlCheckParagraph(ctxt);
5883 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5884 ctxt->sax->characters(ctxt->userData, chr, 1);
5885 ctxt->token = 0;
5886 ctxt->checkIndex = 0;
5888 if ((avail == 1) && (terminate)) {
5889 cur = in->cur[0];
5890 if ((cur != '<') && (cur != '&')) {
5891 if (ctxt->sax != NULL) {
5892 chr[0] = cur;
5893 if (IS_BLANK_CH(cur)) {
5894 if (ctxt->keepBlanks) {
5895 if (ctxt->sax->characters != NULL)
5896 ctxt->sax->characters(
5897 ctxt->userData, chr, 1);
5898 } else {
5899 if (ctxt->sax->ignorableWhitespace != NULL)
5900 ctxt->sax->ignorableWhitespace(
5901 ctxt->userData, chr, 1);
5903 } else {
5904 htmlCheckParagraph(ctxt);
5905 if (ctxt->sax->characters != NULL)
5906 ctxt->sax->characters(
5907 ctxt->userData, chr, 1);
5910 ctxt->token = 0;
5911 ctxt->checkIndex = 0;
5912 in->cur++;
5913 break;
5916 if (avail < 2)
5917 goto done;
5918 cur = in->cur[0];
5919 next = in->cur[1];
5920 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5921 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5923 * Handle SCRIPT/STYLE separately
5925 if (!terminate) {
5926 int idx;
5927 xmlChar val;
5929 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5930 if (idx < 0)
5931 goto done;
5932 val = in->cur[idx + 2];
5933 if (val == 0) /* bad cut of input */
5934 goto done;
5936 htmlParseScript(ctxt);
5937 if ((cur == '<') && (next == '/')) {
5938 ctxt->instate = XML_PARSER_END_TAG;
5939 ctxt->checkIndex = 0;
5940 #ifdef DEBUG_PUSH
5941 xmlGenericError(xmlGenericErrorContext,
5942 "HPP: entering END_TAG\n");
5943 #endif
5944 break;
5946 } else {
5948 * Sometimes DOCTYPE arrives in the middle of the document
5950 if ((cur == '<') && (next == '!') &&
5951 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5952 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5953 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5954 (UPP(8) == 'E')) {
5955 if ((!terminate) &&
5956 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5957 goto done;
5958 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5959 "Misplaced DOCTYPE declaration\n",
5960 BAD_CAST "DOCTYPE" , NULL);
5961 htmlParseDocTypeDecl(ctxt);
5962 } else if ((cur == '<') && (next == '!') &&
5963 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5964 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5965 goto done;
5966 #ifdef DEBUG_PUSH
5967 xmlGenericError(xmlGenericErrorContext,
5968 "HPP: Parsing Comment\n");
5969 #endif
5970 htmlParseComment(ctxt);
5971 ctxt->instate = XML_PARSER_CONTENT;
5972 } else if ((cur == '<') && (next == '?')) {
5973 if ((!terminate) &&
5974 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5975 goto done;
5976 #ifdef DEBUG_PUSH
5977 xmlGenericError(xmlGenericErrorContext,
5978 "HPP: Parsing PI\n");
5979 #endif
5980 htmlParsePI(ctxt);
5981 ctxt->instate = XML_PARSER_CONTENT;
5982 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5983 goto done;
5984 } else if ((cur == '<') && (next == '/')) {
5985 ctxt->instate = XML_PARSER_END_TAG;
5986 ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 xmlGenericError(xmlGenericErrorContext,
5989 "HPP: entering END_TAG\n");
5990 #endif
5991 break;
5992 } else if (cur == '<') {
5993 if ((!terminate) && (next == 0))
5994 goto done;
5996 * Only switch to START_TAG if the next character
5997 * starts a valid name. Otherwise, htmlParseStartTag
5998 * might return without consuming all characters
5999 * up to the final '>'.
6001 if ((IS_ASCII_LETTER(next)) ||
6002 (next == '_') || (next == ':') || (next == '.')) {
6003 ctxt->instate = XML_PARSER_START_TAG;
6004 ctxt->checkIndex = 0;
6005 #ifdef DEBUG_PUSH
6006 xmlGenericError(xmlGenericErrorContext,
6007 "HPP: entering START_TAG\n");
6008 #endif
6009 } else {
6010 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
6011 "htmlParseTryOrFinish: "
6012 "invalid element name\n",
6013 NULL, NULL);
6014 htmlCheckParagraph(ctxt);
6015 if ((ctxt->sax != NULL) &&
6016 (ctxt->sax->characters != NULL))
6017 ctxt->sax->characters(ctxt->userData,
6018 in->cur, 1);
6019 NEXT;
6021 break;
6022 } else {
6024 * check that the text sequence is complete
6025 * before handing out the data to the parser
6026 * to avoid problems with erroneous end of
6027 * data detection.
6029 if ((!terminate) &&
6030 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6031 goto done;
6032 ctxt->checkIndex = 0;
6033 #ifdef DEBUG_PUSH
6034 xmlGenericError(xmlGenericErrorContext,
6035 "HPP: Parsing char data\n");
6036 #endif
6037 while ((ctxt->instate != XML_PARSER_EOF) &&
6038 (cur != '<') && (in->cur < in->end)) {
6039 if (cur == '&') {
6040 htmlParseReference(ctxt);
6041 } else {
6042 htmlParseCharData(ctxt);
6044 cur = in->cur[0];
6049 break;
6051 case XML_PARSER_END_TAG:
6052 if (avail < 2)
6053 goto done;
6054 if ((!terminate) &&
6055 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6056 goto done;
6057 htmlParseEndTag(ctxt);
6058 if (ctxt->nameNr == 0) {
6059 ctxt->instate = XML_PARSER_EPILOG;
6060 } else {
6061 ctxt->instate = XML_PARSER_CONTENT;
6063 ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 xmlGenericError(xmlGenericErrorContext,
6066 "HPP: entering CONTENT\n");
6067 #endif
6068 break;
6069 case XML_PARSER_CDATA_SECTION:
6070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071 "HPP: internal error, state == CDATA\n",
6072 NULL, NULL);
6073 ctxt->instate = XML_PARSER_CONTENT;
6074 ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076 xmlGenericError(xmlGenericErrorContext,
6077 "HPP: entering CONTENT\n");
6078 #endif
6079 break;
6080 case XML_PARSER_DTD:
6081 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082 "HPP: internal error, state == DTD\n",
6083 NULL, NULL);
6084 ctxt->instate = XML_PARSER_CONTENT;
6085 ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087 xmlGenericError(xmlGenericErrorContext,
6088 "HPP: entering CONTENT\n");
6089 #endif
6090 break;
6091 case XML_PARSER_COMMENT:
6092 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6093 "HPP: internal error, state == COMMENT\n",
6094 NULL, NULL);
6095 ctxt->instate = XML_PARSER_CONTENT;
6096 ctxt->checkIndex = 0;
6097 #ifdef DEBUG_PUSH
6098 xmlGenericError(xmlGenericErrorContext,
6099 "HPP: entering CONTENT\n");
6100 #endif
6101 break;
6102 case XML_PARSER_PI:
6103 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6104 "HPP: internal error, state == PI\n",
6105 NULL, NULL);
6106 ctxt->instate = XML_PARSER_CONTENT;
6107 ctxt->checkIndex = 0;
6108 #ifdef DEBUG_PUSH
6109 xmlGenericError(xmlGenericErrorContext,
6110 "HPP: entering CONTENT\n");
6111 #endif
6112 break;
6113 case XML_PARSER_ENTITY_DECL:
6114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6115 "HPP: internal error, state == ENTITY_DECL\n",
6116 NULL, NULL);
6117 ctxt->instate = XML_PARSER_CONTENT;
6118 ctxt->checkIndex = 0;
6119 #ifdef DEBUG_PUSH
6120 xmlGenericError(xmlGenericErrorContext,
6121 "HPP: entering CONTENT\n");
6122 #endif
6123 break;
6124 case XML_PARSER_ENTITY_VALUE:
6125 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6126 "HPP: internal error, state == ENTITY_VALUE\n",
6127 NULL, NULL);
6128 ctxt->instate = XML_PARSER_CONTENT;
6129 ctxt->checkIndex = 0;
6130 #ifdef DEBUG_PUSH
6131 xmlGenericError(xmlGenericErrorContext,
6132 "HPP: entering DTD\n");
6133 #endif
6134 break;
6135 case XML_PARSER_ATTRIBUTE_VALUE:
6136 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6137 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6138 NULL, NULL);
6139 ctxt->instate = XML_PARSER_START_TAG;
6140 ctxt->checkIndex = 0;
6141 #ifdef DEBUG_PUSH
6142 xmlGenericError(xmlGenericErrorContext,
6143 "HPP: entering START_TAG\n");
6144 #endif
6145 break;
6146 case XML_PARSER_SYSTEM_LITERAL:
6147 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6148 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6149 NULL, NULL);
6150 ctxt->instate = XML_PARSER_CONTENT;
6151 ctxt->checkIndex = 0;
6152 #ifdef DEBUG_PUSH
6153 xmlGenericError(xmlGenericErrorContext,
6154 "HPP: entering CONTENT\n");
6155 #endif
6156 break;
6157 case XML_PARSER_IGNORE:
6158 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6159 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6160 NULL, NULL);
6161 ctxt->instate = XML_PARSER_CONTENT;
6162 ctxt->checkIndex = 0;
6163 #ifdef DEBUG_PUSH
6164 xmlGenericError(xmlGenericErrorContext,
6165 "HPP: entering CONTENT\n");
6166 #endif
6167 break;
6168 case XML_PARSER_PUBLIC_LITERAL:
6169 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6170 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6171 NULL, NULL);
6172 ctxt->instate = XML_PARSER_CONTENT;
6173 ctxt->checkIndex = 0;
6174 #ifdef DEBUG_PUSH
6175 xmlGenericError(xmlGenericErrorContext,
6176 "HPP: entering CONTENT\n");
6177 #endif
6178 break;
6182 done:
6183 if ((avail == 0) && (terminate)) {
6184 htmlAutoCloseOnEnd(ctxt);
6185 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6187 * SAX: end of the document processing.
6189 ctxt->instate = XML_PARSER_EOF;
6190 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6191 ctxt->sax->endDocument(ctxt->userData);
6194 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6195 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6196 (ctxt->instate == XML_PARSER_EPILOG))) {
6197 xmlDtdPtr dtd;
6198 dtd = xmlGetIntSubset(ctxt->myDoc);
6199 if (dtd == NULL)
6200 ctxt->myDoc->intSubset =
6201 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6202 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6203 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6205 #ifdef DEBUG_PUSH
6206 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6207 #endif
6208 return(ret);
6212 * htmlParseChunk:
6213 * @ctxt: an HTML parser context
6214 * @chunk: an char array
6215 * @size: the size in byte of the chunk
6216 * @terminate: last chunk indicator
6218 * Parse a Chunk of memory
6220 * Returns zero if no error, the xmlParserErrors otherwise.
6223 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6224 int terminate) {
6225 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6226 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6227 "htmlParseChunk: context error\n", NULL, NULL);
6228 return(XML_ERR_INTERNAL_ERROR);
6230 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6231 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6232 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6233 size_t cur = ctxt->input->cur - ctxt->input->base;
6234 int res;
6236 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6237 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6238 if (res < 0) {
6239 ctxt->errNo = XML_PARSER_EOF;
6240 ctxt->disableSAX = 1;
6241 return (XML_PARSER_EOF);
6243 #ifdef DEBUG_PUSH
6244 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6245 #endif
6247 #if 0
6248 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6249 htmlParseTryOrFinish(ctxt, terminate);
6250 #endif
6251 } else if (ctxt->instate != XML_PARSER_EOF) {
6252 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6253 xmlParserInputBufferPtr in = ctxt->input->buf;
6254 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6255 (in->raw != NULL)) {
6256 int nbchars;
6257 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6258 size_t current = ctxt->input->cur - ctxt->input->base;
6260 nbchars = xmlCharEncInput(in, terminate);
6261 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6262 if (nbchars < 0) {
6263 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6264 "encoder error\n", NULL, NULL);
6265 return(XML_ERR_INVALID_ENCODING);
6270 htmlParseTryOrFinish(ctxt, terminate);
6271 if (terminate) {
6272 if ((ctxt->instate != XML_PARSER_EOF) &&
6273 (ctxt->instate != XML_PARSER_EPILOG) &&
6274 (ctxt->instate != XML_PARSER_MISC)) {
6275 ctxt->errNo = XML_ERR_DOCUMENT_END;
6276 ctxt->wellFormed = 0;
6278 if (ctxt->instate != XML_PARSER_EOF) {
6279 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6280 ctxt->sax->endDocument(ctxt->userData);
6282 ctxt->instate = XML_PARSER_EOF;
6284 return((xmlParserErrors) ctxt->errNo);
6287 /************************************************************************
6289 * User entry points *
6291 ************************************************************************/
6294 * htmlCreatePushParserCtxt:
6295 * @sax: a SAX handler
6296 * @user_data: The user data returned on SAX callbacks
6297 * @chunk: a pointer to an array of chars
6298 * @size: number of chars in the array
6299 * @filename: an optional file name or URI
6300 * @enc: an optional encoding
6302 * Create a parser context for using the HTML parser in push mode
6303 * The value of @filename is used for fetching external entities
6304 * and error/warning reports.
6306 * Returns the new parser context or NULL
6308 htmlParserCtxtPtr
6309 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6310 const char *chunk, int size, const char *filename,
6311 xmlCharEncoding enc) {
6312 htmlParserCtxtPtr ctxt;
6313 htmlParserInputPtr inputStream;
6314 xmlParserInputBufferPtr buf;
6316 xmlInitParser();
6318 buf = xmlAllocParserInputBuffer(enc);
6319 if (buf == NULL) return(NULL);
6321 ctxt = htmlNewParserCtxt();
6322 if (ctxt == NULL) {
6323 xmlFreeParserInputBuffer(buf);
6324 return(NULL);
6326 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6327 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6328 if (sax != NULL) {
6329 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6330 xmlFree(ctxt->sax);
6331 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6332 if (ctxt->sax == NULL) {
6333 xmlFree(buf);
6334 xmlFree(ctxt);
6335 return(NULL);
6337 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6338 if (user_data != NULL)
6339 ctxt->userData = user_data;
6341 if (filename == NULL) {
6342 ctxt->directory = NULL;
6343 } else {
6344 ctxt->directory = xmlParserGetDirectory(filename);
6347 inputStream = htmlNewInputStream(ctxt);
6348 if (inputStream == NULL) {
6349 xmlFreeParserCtxt(ctxt);
6350 xmlFree(buf);
6351 return(NULL);
6354 if (filename == NULL)
6355 inputStream->filename = NULL;
6356 else
6357 inputStream->filename = (char *)
6358 xmlCanonicPath((const xmlChar *) filename);
6359 inputStream->buf = buf;
6360 xmlBufResetInput(buf->buffer, inputStream);
6362 inputPush(ctxt, inputStream);
6364 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6365 (ctxt->input->buf != NULL)) {
6366 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6367 size_t cur = ctxt->input->cur - ctxt->input->base;
6369 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6371 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6372 #ifdef DEBUG_PUSH
6373 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6374 #endif
6376 ctxt->progressive = 1;
6378 return(ctxt);
6380 #endif /* LIBXML_PUSH_ENABLED */
6383 * htmlSAXParseDoc:
6384 * @cur: a pointer to an array of xmlChar
6385 * @encoding: a free form C string describing the HTML document encoding, or NULL
6386 * @sax: the SAX handler block
6387 * @userData: if using SAX, this pointer will be provided on callbacks.
6389 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6390 * to handle parse events. If sax is NULL, fallback to the default DOM
6391 * behavior and return a tree.
6393 * Returns the resulting document tree unless SAX is NULL or the document is
6394 * not well formed.
6397 htmlDocPtr
6398 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6399 htmlSAXHandlerPtr sax, void *userData) {
6400 htmlDocPtr ret;
6401 htmlParserCtxtPtr ctxt;
6403 xmlInitParser();
6405 if (cur == NULL) return(NULL);
6408 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6409 if (ctxt == NULL) return(NULL);
6410 if (sax != NULL) {
6411 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6412 ctxt->sax = sax;
6413 ctxt->userData = userData;
6416 htmlParseDocument(ctxt);
6417 ret = ctxt->myDoc;
6418 if (sax != NULL) {
6419 ctxt->sax = NULL;
6420 ctxt->userData = NULL;
6422 htmlFreeParserCtxt(ctxt);
6424 return(ret);
6428 * htmlParseDoc:
6429 * @cur: a pointer to an array of xmlChar
6430 * @encoding: a free form C string describing the HTML document encoding, or NULL
6432 * parse an HTML in-memory document and build a tree.
6434 * Returns the resulting document tree
6437 htmlDocPtr
6438 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6439 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6444 * htmlCreateFileParserCtxt:
6445 * @filename: the filename
6446 * @encoding: a free form C string describing the HTML document encoding, or NULL
6448 * Create a parser context for a file content.
6449 * Automatic support for ZLIB/Compress compressed document is provided
6450 * by default if found at compile-time.
6452 * Returns the new parser context or NULL
6454 htmlParserCtxtPtr
6455 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6457 htmlParserCtxtPtr ctxt;
6458 htmlParserInputPtr inputStream;
6459 char *canonicFilename;
6460 /* htmlCharEncoding enc; */
6461 xmlChar *content, *content_line = (xmlChar *) "charset=";
6463 if (filename == NULL)
6464 return(NULL);
6466 ctxt = htmlNewParserCtxt();
6467 if (ctxt == NULL) {
6468 return(NULL);
6470 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6471 if (canonicFilename == NULL) {
6472 #ifdef LIBXML_SAX1_ENABLED
6473 if (xmlDefaultSAXHandler.error != NULL) {
6474 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6476 #endif
6477 xmlFreeParserCtxt(ctxt);
6478 return(NULL);
6481 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6482 xmlFree(canonicFilename);
6483 if (inputStream == NULL) {
6484 xmlFreeParserCtxt(ctxt);
6485 return(NULL);
6488 inputPush(ctxt, inputStream);
6490 /* set encoding */
6491 if (encoding) {
6492 size_t l = strlen(encoding);
6494 if (l < 1000) {
6495 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6496 if (content) {
6497 strcpy ((char *)content, (char *)content_line);
6498 strcat ((char *)content, (char *)encoding);
6499 htmlCheckEncoding (ctxt, content);
6500 xmlFree (content);
6505 return(ctxt);
6509 * htmlSAXParseFile:
6510 * @filename: the filename
6511 * @encoding: a free form C string describing the HTML document encoding, or NULL
6512 * @sax: the SAX handler block
6513 * @userData: if using SAX, this pointer will be provided on callbacks.
6515 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6516 * compressed document is provided by default if found at compile-time.
6517 * It use the given SAX function block to handle the parsing callback.
6518 * If sax is NULL, fallback to the default DOM tree building routines.
6520 * Returns the resulting document tree unless SAX is NULL or the document is
6521 * not well formed.
6524 htmlDocPtr
6525 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6526 void *userData) {
6527 htmlDocPtr ret;
6528 htmlParserCtxtPtr ctxt;
6529 htmlSAXHandlerPtr oldsax = NULL;
6531 xmlInitParser();
6533 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6534 if (ctxt == NULL) return(NULL);
6535 if (sax != NULL) {
6536 oldsax = ctxt->sax;
6537 ctxt->sax = sax;
6538 ctxt->userData = userData;
6541 htmlParseDocument(ctxt);
6543 ret = ctxt->myDoc;
6544 if (sax != NULL) {
6545 ctxt->sax = oldsax;
6546 ctxt->userData = NULL;
6548 htmlFreeParserCtxt(ctxt);
6550 return(ret);
6554 * htmlParseFile:
6555 * @filename: the filename
6556 * @encoding: a free form C string describing the HTML document encoding, or NULL
6558 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6559 * compressed document is provided by default if found at compile-time.
6561 * Returns the resulting document tree
6564 htmlDocPtr
6565 htmlParseFile(const char *filename, const char *encoding) {
6566 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6570 * htmlHandleOmittedElem:
6571 * @val: int 0 or 1
6573 * Set and return the previous value for handling HTML omitted tags.
6575 * Returns the last value for 0 for no handling, 1 for auto insertion.
6579 htmlHandleOmittedElem(int val) {
6580 int old = htmlOmittedDefaultValue;
6582 htmlOmittedDefaultValue = val;
6583 return(old);
6587 * htmlElementAllowedHere:
6588 * @parent: HTML parent element
6589 * @elt: HTML element
6591 * Checks whether an HTML element may be a direct child of a parent element.
6592 * Note - doesn't check for deprecated elements
6594 * Returns 1 if allowed; 0 otherwise.
6597 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6598 const char** p ;
6600 if ( ! elt || ! parent || ! parent->subelts )
6601 return 0 ;
6603 for ( p = parent->subelts; *p; ++p )
6604 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6605 return 1 ;
6607 return 0 ;
6610 * htmlElementStatusHere:
6611 * @parent: HTML parent element
6612 * @elt: HTML element
6614 * Checks whether an HTML element may be a direct child of a parent element.
6615 * and if so whether it is valid or deprecated.
6617 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6619 htmlStatus
6620 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6621 if ( ! parent || ! elt )
6622 return HTML_INVALID ;
6623 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6624 return HTML_INVALID ;
6626 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6629 * htmlAttrAllowed:
6630 * @elt: HTML element
6631 * @attr: HTML attribute
6632 * @legacy: whether to allow deprecated attributes
6634 * Checks whether an attribute is valid for an element
6635 * Has full knowledge of Required and Deprecated attributes
6637 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6639 htmlStatus
6640 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6641 const char** p ;
6643 if ( !elt || ! attr )
6644 return HTML_INVALID ;
6646 if ( elt->attrs_req )
6647 for ( p = elt->attrs_req; *p; ++p)
6648 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649 return HTML_REQUIRED ;
6651 if ( elt->attrs_opt )
6652 for ( p = elt->attrs_opt; *p; ++p)
6653 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654 return HTML_VALID ;
6656 if ( legacy && elt->attrs_depr )
6657 for ( p = elt->attrs_depr; *p; ++p)
6658 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6659 return HTML_DEPRECATED ;
6661 return HTML_INVALID ;
6664 * htmlNodeStatus:
6665 * @node: an htmlNodePtr in a tree
6666 * @legacy: whether to allow deprecated elements (YES is faster here
6667 * for Element nodes)
6669 * Checks whether the tree node is valid. Experimental (the author
6670 * only uses the HTML enhancements in a SAX parser)
6672 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6673 * legacy allowed) or htmlElementStatusHere (otherwise).
6674 * for Attribute nodes, a return from htmlAttrAllowed
6675 * for other nodes, HTML_NA (no checks performed)
6677 htmlStatus
6678 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6679 if ( ! node )
6680 return HTML_INVALID ;
6682 switch ( node->type ) {
6683 case XML_ELEMENT_NODE:
6684 return legacy
6685 ? ( htmlElementAllowedHere (
6686 htmlTagLookup(node->parent->name) , node->name
6687 ) ? HTML_VALID : HTML_INVALID )
6688 : htmlElementStatusHere(
6689 htmlTagLookup(node->parent->name) ,
6690 htmlTagLookup(node->name) )
6692 case XML_ATTRIBUTE_NODE:
6693 return htmlAttrAllowed(
6694 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6695 default: return HTML_NA ;
6698 /************************************************************************
6700 * New set (2.6.0) of simpler and more flexible APIs *
6702 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the macro
 * is expanded; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely inside an unbraced if/else.
 * The caller supplies the terminating semicolon.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
6716 * htmlCtxtReset:
6717 * @ctxt: an HTML parser context
6719 * Reset a parser context
6721 void
6722 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6724 xmlParserInputPtr input;
6725 xmlDictPtr dict;
6727 if (ctxt == NULL)
6728 return;
6730 xmlInitParser();
6731 dict = ctxt->dict;
6733 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6734 xmlFreeInputStream(input);
6736 ctxt->inputNr = 0;
6737 ctxt->input = NULL;
6739 ctxt->spaceNr = 0;
6740 if (ctxt->spaceTab != NULL) {
6741 ctxt->spaceTab[0] = -1;
6742 ctxt->space = &ctxt->spaceTab[0];
6743 } else {
6744 ctxt->space = NULL;
6748 ctxt->nodeNr = 0;
6749 ctxt->node = NULL;
6751 ctxt->nameNr = 0;
6752 ctxt->name = NULL;
6754 DICT_FREE(ctxt->version);
6755 ctxt->version = NULL;
6756 DICT_FREE(ctxt->encoding);
6757 ctxt->encoding = NULL;
6758 DICT_FREE(ctxt->directory);
6759 ctxt->directory = NULL;
6760 DICT_FREE(ctxt->extSubURI);
6761 ctxt->extSubURI = NULL;
6762 DICT_FREE(ctxt->extSubSystem);
6763 ctxt->extSubSystem = NULL;
6764 if (ctxt->myDoc != NULL)
6765 xmlFreeDoc(ctxt->myDoc);
6766 ctxt->myDoc = NULL;
6768 ctxt->standalone = -1;
6769 ctxt->hasExternalSubset = 0;
6770 ctxt->hasPErefs = 0;
6771 ctxt->html = 1;
6772 ctxt->external = 0;
6773 ctxt->instate = XML_PARSER_START;
6774 ctxt->token = 0;
6776 ctxt->wellFormed = 1;
6777 ctxt->nsWellFormed = 1;
6778 ctxt->disableSAX = 0;
6779 ctxt->valid = 1;
6780 ctxt->vctxt.userData = ctxt;
6781 ctxt->vctxt.error = xmlParserValidityError;
6782 ctxt->vctxt.warning = xmlParserValidityWarning;
6783 ctxt->record_info = 0;
6784 ctxt->checkIndex = 0;
6785 ctxt->inSubset = 0;
6786 ctxt->errNo = XML_ERR_OK;
6787 ctxt->depth = 0;
6788 ctxt->charset = XML_CHAR_ENCODING_NONE;
6789 ctxt->catalogs = NULL;
6790 xmlInitNodeInfoSeq(&ctxt->node_seq);
6792 if (ctxt->attsDefault != NULL) {
6793 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6794 ctxt->attsDefault = NULL;
6796 if (ctxt->attsSpecial != NULL) {
6797 xmlHashFree(ctxt->attsSpecial, NULL);
6798 ctxt->attsSpecial = NULL;
6803 * htmlCtxtUseOptions:
6804 * @ctxt: an HTML parser context
6805 * @options: a combination of htmlParserOption(s)
6807 * Applies the options to the parser context
6809 * Returns 0 in case of success, the set of unknown or unimplemented options
6810 * in case of error.
6813 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6815 if (ctxt == NULL)
6816 return(-1);
6818 if (options & HTML_PARSE_NOWARNING) {
6819 ctxt->sax->warning = NULL;
6820 ctxt->vctxt.warning = NULL;
6821 options -= XML_PARSE_NOWARNING;
6822 ctxt->options |= XML_PARSE_NOWARNING;
6824 if (options & HTML_PARSE_NOERROR) {
6825 ctxt->sax->error = NULL;
6826 ctxt->vctxt.error = NULL;
6827 ctxt->sax->fatalError = NULL;
6828 options -= XML_PARSE_NOERROR;
6829 ctxt->options |= XML_PARSE_NOERROR;
6831 if (options & HTML_PARSE_PEDANTIC) {
6832 ctxt->pedantic = 1;
6833 options -= XML_PARSE_PEDANTIC;
6834 ctxt->options |= XML_PARSE_PEDANTIC;
6835 } else
6836 ctxt->pedantic = 0;
6837 if (options & XML_PARSE_NOBLANKS) {
6838 ctxt->keepBlanks = 0;
6839 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6840 options -= XML_PARSE_NOBLANKS;
6841 ctxt->options |= XML_PARSE_NOBLANKS;
6842 } else
6843 ctxt->keepBlanks = 1;
6844 if (options & HTML_PARSE_RECOVER) {
6845 ctxt->recovery = 1;
6846 options -= HTML_PARSE_RECOVER;
6847 } else
6848 ctxt->recovery = 0;
6849 if (options & HTML_PARSE_COMPACT) {
6850 ctxt->options |= HTML_PARSE_COMPACT;
6851 options -= HTML_PARSE_COMPACT;
6853 if (options & XML_PARSE_HUGE) {
6854 ctxt->options |= XML_PARSE_HUGE;
6855 options -= XML_PARSE_HUGE;
6857 if (options & HTML_PARSE_NODEFDTD) {
6858 ctxt->options |= HTML_PARSE_NODEFDTD;
6859 options -= HTML_PARSE_NODEFDTD;
6861 if (options & HTML_PARSE_IGNORE_ENC) {
6862 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6863 options -= HTML_PARSE_IGNORE_ENC;
6865 if (options & HTML_PARSE_NOIMPLIED) {
6866 ctxt->options |= HTML_PARSE_NOIMPLIED;
6867 options -= HTML_PARSE_NOIMPLIED;
6869 ctxt->dictNames = 0;
6870 return (options);
6874 * htmlDoRead:
6875 * @ctxt: an HTML parser context
6876 * @URL: the base URL to use for the document
6877 * @encoding: the document encoding, or NULL
6878 * @options: a combination of htmlParserOption(s)
6879 * @reuse: keep the context for reuse
6881 * Common front-end for the htmlRead functions
6883 * Returns the resulting document tree or NULL
6885 static htmlDocPtr
6886 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6887 int options, int reuse)
6889 htmlDocPtr ret;
6891 htmlCtxtUseOptions(ctxt, options);
6892 ctxt->html = 1;
6893 if (encoding != NULL) {
6894 xmlCharEncodingHandlerPtr hdlr;
6896 hdlr = xmlFindCharEncodingHandler(encoding);
6897 if (hdlr != NULL) {
6898 xmlSwitchToEncoding(ctxt, hdlr);
6899 if (ctxt->input->encoding != NULL)
6900 xmlFree((xmlChar *) ctxt->input->encoding);
6901 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6904 if ((URL != NULL) && (ctxt->input != NULL) &&
6905 (ctxt->input->filename == NULL))
6906 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6907 htmlParseDocument(ctxt);
6908 ret = ctxt->myDoc;
6909 ctxt->myDoc = NULL;
6910 if (!reuse) {
6911 if ((ctxt->dictNames) &&
6912 (ret != NULL) &&
6913 (ret->dict == ctxt->dict))
6914 ctxt->dict = NULL;
6915 xmlFreeParserCtxt(ctxt);
6917 return (ret);
6921 * htmlReadDoc:
6922 * @cur: a pointer to a zero terminated string
6923 * @URL: the base URL to use for the document
6924 * @encoding: the document encoding, or NULL
6925 * @options: a combination of htmlParserOption(s)
6927 * parse an XML in-memory document and build a tree.
6929 * Returns the resulting document tree
6931 htmlDocPtr
6932 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6934 htmlParserCtxtPtr ctxt;
6936 if (cur == NULL)
6937 return (NULL);
6939 xmlInitParser();
6940 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6941 if (ctxt == NULL)
6942 return (NULL);
6943 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6947 * htmlReadFile:
6948 * @filename: a file or URL
6949 * @encoding: the document encoding, or NULL
6950 * @options: a combination of htmlParserOption(s)
6952 * parse an XML file from the filesystem or the network.
6954 * Returns the resulting document tree
6956 htmlDocPtr
6957 htmlReadFile(const char *filename, const char *encoding, int options)
6959 htmlParserCtxtPtr ctxt;
6961 xmlInitParser();
6962 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6963 if (ctxt == NULL)
6964 return (NULL);
6965 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6969 * htmlReadMemory:
6970 * @buffer: a pointer to a char array
6971 * @size: the size of the array
6972 * @URL: the base URL to use for the document
6973 * @encoding: the document encoding, or NULL
6974 * @options: a combination of htmlParserOption(s)
6976 * parse an XML in-memory document and build a tree.
6978 * Returns the resulting document tree
6980 htmlDocPtr
6981 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6983 htmlParserCtxtPtr ctxt;
6985 xmlInitParser();
6986 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6987 if (ctxt == NULL)
6988 return (NULL);
6989 htmlDefaultSAXHandlerInit();
6990 if (ctxt->sax != NULL)
6991 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6992 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6996 * htmlReadFd:
6997 * @fd: an open file descriptor
6998 * @URL: the base URL to use for the document
6999 * @encoding: the document encoding, or NULL
7000 * @options: a combination of htmlParserOption(s)
7002 * parse an XML from a file descriptor and build a tree.
7004 * Returns the resulting document tree
7006 htmlDocPtr
7007 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7009 htmlParserCtxtPtr ctxt;
7010 xmlParserInputBufferPtr input;
7011 xmlParserInputPtr stream;
7013 if (fd < 0)
7014 return (NULL);
7015 xmlInitParser();
7017 xmlInitParser();
7018 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7019 if (input == NULL)
7020 return (NULL);
7021 ctxt = xmlNewParserCtxt();
7022 if (ctxt == NULL) {
7023 xmlFreeParserInputBuffer(input);
7024 return (NULL);
7026 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7027 if (stream == NULL) {
7028 xmlFreeParserInputBuffer(input);
7029 xmlFreeParserCtxt(ctxt);
7030 return (NULL);
7032 inputPush(ctxt, stream);
7033 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7037 * htmlReadIO:
7038 * @ioread: an I/O read function
7039 * @ioclose: an I/O close function
7040 * @ioctx: an I/O handler
7041 * @URL: the base URL to use for the document
7042 * @encoding: the document encoding, or NULL
7043 * @options: a combination of htmlParserOption(s)
7045 * parse an HTML document from I/O functions and source and build a tree.
7047 * Returns the resulting document tree
7049 htmlDocPtr
7050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7051 void *ioctx, const char *URL, const char *encoding, int options)
7053 htmlParserCtxtPtr ctxt;
7054 xmlParserInputBufferPtr input;
7055 xmlParserInputPtr stream;
7057 if (ioread == NULL)
7058 return (NULL);
7059 xmlInitParser();
7061 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7062 XML_CHAR_ENCODING_NONE);
7063 if (input == NULL) {
7064 if (ioclose != NULL)
7065 ioclose(ioctx);
7066 return (NULL);
7068 ctxt = htmlNewParserCtxt();
7069 if (ctxt == NULL) {
7070 xmlFreeParserInputBuffer(input);
7071 return (NULL);
7073 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7074 if (stream == NULL) {
7075 xmlFreeParserInputBuffer(input);
7076 xmlFreeParserCtxt(ctxt);
7077 return (NULL);
7079 inputPush(ctxt, stream);
7080 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7084 * htmlCtxtReadDoc:
7085 * @ctxt: an HTML parser context
7086 * @cur: a pointer to a zero terminated string
7087 * @URL: the base URL to use for the document
7088 * @encoding: the document encoding, or NULL
7089 * @options: a combination of htmlParserOption(s)
7091 * parse an XML in-memory document and build a tree.
7092 * This reuses the existing @ctxt parser context
7094 * Returns the resulting document tree
7096 htmlDocPtr
7097 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7098 const char *URL, const char *encoding, int options)
7100 xmlParserInputPtr stream;
7102 if (cur == NULL)
7103 return (NULL);
7104 if (ctxt == NULL)
7105 return (NULL);
7106 xmlInitParser();
7108 htmlCtxtReset(ctxt);
7110 stream = xmlNewStringInputStream(ctxt, cur);
7111 if (stream == NULL) {
7112 return (NULL);
7114 inputPush(ctxt, stream);
7115 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7119 * htmlCtxtReadFile:
7120 * @ctxt: an HTML parser context
7121 * @filename: a file or URL
7122 * @encoding: the document encoding, or NULL
7123 * @options: a combination of htmlParserOption(s)
7125 * parse an XML file from the filesystem or the network.
7126 * This reuses the existing @ctxt parser context
7128 * Returns the resulting document tree
7130 htmlDocPtr
7131 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7132 const char *encoding, int options)
7134 xmlParserInputPtr stream;
7136 if (filename == NULL)
7137 return (NULL);
7138 if (ctxt == NULL)
7139 return (NULL);
7140 xmlInitParser();
7142 htmlCtxtReset(ctxt);
7144 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7145 if (stream == NULL) {
7146 return (NULL);
7148 inputPush(ctxt, stream);
7149 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7153 * htmlCtxtReadMemory:
7154 * @ctxt: an HTML parser context
7155 * @buffer: a pointer to a char array
7156 * @size: the size of the array
7157 * @URL: the base URL to use for the document
7158 * @encoding: the document encoding, or NULL
7159 * @options: a combination of htmlParserOption(s)
7161 * parse an XML in-memory document and build a tree.
7162 * This reuses the existing @ctxt parser context
7164 * Returns the resulting document tree
7166 htmlDocPtr
7167 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7168 const char *URL, const char *encoding, int options)
7170 xmlParserInputBufferPtr input;
7171 xmlParserInputPtr stream;
7173 if (ctxt == NULL)
7174 return (NULL);
7175 if (buffer == NULL)
7176 return (NULL);
7177 xmlInitParser();
7179 htmlCtxtReset(ctxt);
7181 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7182 if (input == NULL) {
7183 return(NULL);
7186 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7187 if (stream == NULL) {
7188 xmlFreeParserInputBuffer(input);
7189 return(NULL);
7192 inputPush(ctxt, stream);
7193 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7197 * htmlCtxtReadFd:
7198 * @ctxt: an HTML parser context
7199 * @fd: an open file descriptor
7200 * @URL: the base URL to use for the document
7201 * @encoding: the document encoding, or NULL
7202 * @options: a combination of htmlParserOption(s)
7204 * parse an XML from a file descriptor and build a tree.
7205 * This reuses the existing @ctxt parser context
7207 * Returns the resulting document tree
7209 htmlDocPtr
7210 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7211 const char *URL, const char *encoding, int options)
7213 xmlParserInputBufferPtr input;
7214 xmlParserInputPtr stream;
7216 if (fd < 0)
7217 return (NULL);
7218 if (ctxt == NULL)
7219 return (NULL);
7220 xmlInitParser();
7222 htmlCtxtReset(ctxt);
7225 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7226 if (input == NULL)
7227 return (NULL);
7228 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7229 if (stream == NULL) {
7230 xmlFreeParserInputBuffer(input);
7231 return (NULL);
7233 inputPush(ctxt, stream);
7234 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7238 * htmlCtxtReadIO:
7239 * @ctxt: an HTML parser context
7240 * @ioread: an I/O read function
7241 * @ioclose: an I/O close function
7242 * @ioctx: an I/O handler
7243 * @URL: the base URL to use for the document
7244 * @encoding: the document encoding, or NULL
7245 * @options: a combination of htmlParserOption(s)
7247 * parse an HTML document from I/O functions and source and build a tree.
7248 * This reuses the existing @ctxt parser context
7250 * Returns the resulting document tree
7252 htmlDocPtr
7253 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7254 xmlInputCloseCallback ioclose, void *ioctx,
7255 const char *URL,
7256 const char *encoding, int options)
7258 xmlParserInputBufferPtr input;
7259 xmlParserInputPtr stream;
7261 if (ioread == NULL)
7262 return (NULL);
7263 if (ctxt == NULL)
7264 return (NULL);
7265 xmlInitParser();
7267 htmlCtxtReset(ctxt);
7269 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7270 XML_CHAR_ENCODING_NONE);
7271 if (input == NULL) {
7272 if (ioclose != NULL)
7273 ioclose(ioctx);
7274 return (NULL);
7276 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7277 if (stream == NULL) {
7278 xmlFreeParserInputBuffer(input);
7279 return (NULL);
7281 inputPush(ctxt, stream);
7282 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7285 #define bottom_HTMLparser
7286 #include "elfgcchack.h"
7287 #endif /* LIBXML_HTML_ENABLED */