mscms: Fix double free on error path in EnumColorProfilesA (scan-build).
[wine.git] / libs / xml2 / HTMLparser.c
blobabcdfe2460d50a45ec58398df0c3a6e82ae33511
1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
6 * daniel@veillard.com
7 */
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
17 #include <libxml/xmlmemory.h>
18 #include <libxml/tree.h>
19 #include <libxml/parser.h>
20 #include <libxml/parserInternals.h>
21 #include <libxml/xmlerror.h>
22 #include <libxml/HTMLparser.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/valid.h>
27 #include <libxml/xmlIO.h>
28 #include <libxml/globals.h>
29 #include <libxml/uri.h>
31 #include "private/buf.h"
32 #include "private/enc.h"
33 #include "private/error.h"
34 #include "private/html.h"
35 #include "private/parser.h"
36 #include "private/tree.h"
38 #define HTML_MAX_NAMELEN 1000
39 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
40 #define HTML_PARSER_BUFFER_SIZE 100
42 /* #define DEBUG */
43 /* #define DEBUG_PUSH */
45 static int htmlOmittedDefaultValue = 1;
47 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
48 xmlChar end, xmlChar end2, xmlChar end3);
49 static void htmlParseComment(htmlParserCtxtPtr ctxt);
51 /************************************************************************
52 * *
53 * Some factorized error routines *
54 * *
55 ************************************************************************/
57 /**
58 * htmlErrMemory:
59 * @ctxt: an HTML parser context
60 * @extra: extra information
62 * Handle a redefinition of attribute error
64 static void
65 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
67 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
68 (ctxt->instate == XML_PARSER_EOF))
69 return;
70 if (ctxt != NULL) {
71 ctxt->errNo = XML_ERR_NO_MEMORY;
72 ctxt->instate = XML_PARSER_EOF;
73 ctxt->disableSAX = 1;
75 if (extra)
76 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
77 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
78 NULL, NULL, 0, 0,
79 "Memory allocation failed : %s\n", extra);
80 else
81 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
82 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
83 NULL, NULL, 0, 0, "Memory allocation failed\n");
86 /**
87 * htmlParseErr:
88 * @ctxt: an HTML parser context
89 * @error: the error number
90 * @msg: the error message
91 * @str1: string infor
92 * @str2: string infor
94 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
96 static void LIBXML_ATTR_FORMAT(3,0)
97 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
98 const char *msg, const xmlChar *str1, const xmlChar *str2)
100 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
101 (ctxt->instate == XML_PARSER_EOF))
102 return;
103 if (ctxt != NULL)
104 ctxt->errNo = error;
105 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
106 XML_ERR_ERROR, NULL, 0,
107 (const char *) str1, (const char *) str2,
108 NULL, 0, 0,
109 msg, str1, str2);
110 if (ctxt != NULL)
111 ctxt->wellFormed = 0;
115 * htmlParseErrInt:
116 * @ctxt: an HTML parser context
117 * @error: the error number
118 * @msg: the error message
119 * @val: integer info
121 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
123 static void LIBXML_ATTR_FORMAT(3,0)
124 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
125 const char *msg, int val)
127 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
128 (ctxt->instate == XML_PARSER_EOF))
129 return;
130 if (ctxt != NULL)
131 ctxt->errNo = error;
132 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
133 XML_ERR_ERROR, NULL, 0, NULL, NULL,
134 NULL, val, 0, msg, val);
135 if (ctxt != NULL)
136 ctxt->wellFormed = 0;
139 /************************************************************************
141 * Parser stacks related functions and macros *
143 ************************************************************************/
146 * htmlnamePush:
147 * @ctxt: an HTML parser context
148 * @value: the element name
150 * Pushes a new element name on top of the name stack
152 * Returns -1 in case of error, the index in the stack otherwise
154 static int
155 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
157 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
158 ctxt->html = 3;
159 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
160 ctxt->html = 10;
161 if (ctxt->nameNr >= ctxt->nameMax) {
162 size_t newSize = ctxt->nameMax * 2;
163 const xmlChar **tmp;
165 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
166 newSize * sizeof(ctxt->nameTab[0]));
167 if (tmp == NULL) {
168 htmlErrMemory(ctxt, NULL);
169 return (-1);
171 ctxt->nameTab = tmp;
172 ctxt->nameMax = newSize;
174 ctxt->nameTab[ctxt->nameNr] = value;
175 ctxt->name = value;
176 return (ctxt->nameNr++);
179 * htmlnamePop:
180 * @ctxt: an HTML parser context
182 * Pops the top element name from the name stack
184 * Returns the name just removed
186 static const xmlChar *
187 htmlnamePop(htmlParserCtxtPtr ctxt)
189 const xmlChar *ret;
191 if (ctxt->nameNr <= 0)
192 return (NULL);
193 ctxt->nameNr--;
194 if (ctxt->nameNr < 0)
195 return (NULL);
196 if (ctxt->nameNr > 0)
197 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
198 else
199 ctxt->name = NULL;
200 ret = ctxt->nameTab[ctxt->nameNr];
201 ctxt->nameTab[ctxt->nameNr] = NULL;
202 return (ret);
206 * htmlNodeInfoPush:
207 * @ctxt: an HTML parser context
208 * @value: the node info
210 * Pushes a new element name on top of the node info stack
212 * Returns 0 in case of error, the index in the stack otherwise
214 static int
215 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
217 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
218 if (ctxt->nodeInfoMax == 0)
219 ctxt->nodeInfoMax = 5;
220 ctxt->nodeInfoMax *= 2;
221 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
222 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
223 ctxt->nodeInfoMax *
224 sizeof(ctxt->nodeInfoTab[0]));
225 if (ctxt->nodeInfoTab == NULL) {
226 htmlErrMemory(ctxt, NULL);
227 return (0);
230 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
231 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
232 return (ctxt->nodeInfoNr++);
236 * htmlNodeInfoPop:
237 * @ctxt: an HTML parser context
239 * Pops the top element name from the node info stack
241 * Returns 0 in case of error, the pointer to NodeInfo otherwise
243 static htmlParserNodeInfo *
244 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
246 if (ctxt->nodeInfoNr <= 0)
247 return (NULL);
248 ctxt->nodeInfoNr--;
249 if (ctxt->nodeInfoNr < 0)
250 return (NULL);
251 if (ctxt->nodeInfoNr > 0)
252 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
253 else
254 ctxt->nodeInfo = NULL;
255 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt in scope.
 *
 * Dirty macros, i.e. one need to make assumption on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Caution: SKIP expands to a bare comma expression, and SHRINK, GROW and
 * COPY_BUF expand to bare if statements; none of them is safe as the
 * unbraced body of an if that has an else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, else the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/* Advance by l bytes, keeping line/col in step and clearing ctxt->token. */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l;                             \
  } while (0)

/************
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/* Append character v, l bytes long once encoded, to b at index i. */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = v;                                             \
    else i += xmlCopyChar(l,&b[i],v)
335 * htmlFindEncoding:
336 * @the HTML parser context
338 * Ty to find and encoding in the current data available in the input
339 * buffer this is needed to try to switch to the proper encoding when
340 * one face a character error.
341 * That's an heuristic, since it's operating outside of parsing it could
342 * try to use a meta which had been commented out, that's the reason it
343 * should only be used in case of error, not as a default.
345 * Returns an encoding string or NULL if not found, the string need to
346 * be freed
348 static xmlChar *
349 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
350 const xmlChar *start, *cur, *end;
352 if ((ctxt == NULL) || (ctxt->input == NULL) ||
353 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
354 (ctxt->input->buf->encoder != NULL))
355 return(NULL);
356 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
357 return(NULL);
359 start = ctxt->input->cur;
360 end = ctxt->input->end;
361 /* we also expect the input buffer to be zero terminated */
362 if (*end != 0)
363 return(NULL);
365 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
366 if (cur == NULL)
367 return(NULL);
368 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
369 if (cur == NULL)
370 return(NULL);
371 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
372 if (cur == NULL)
373 return(NULL);
374 cur += 8;
375 start = cur;
376 while (((*cur >= 'A') && (*cur <= 'Z')) ||
377 ((*cur >= 'a') && (*cur <= 'z')) ||
378 ((*cur >= '0') && (*cur <= '9')) ||
379 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
380 cur++;
381 if (cur == start)
382 return(NULL);
383 return(xmlStrndup(start, cur - start));
387 * htmlCurrentChar:
388 * @ctxt: the HTML parser context
389 * @len: pointer to the length of the char read
391 * The current char value, if using UTF-8 this may actually span multiple
392 * bytes in the input buffer. Implement the end of line normalization:
393 * 2.11 End-of-Line Handling
394 * If the encoding is unspecified, in the case we find an ISO-Latin-1
395 * char, then the encoding converter is plugged in automatically.
397 * Returns the current char value and its length
400 static int
401 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
402 const unsigned char *cur;
403 unsigned char c;
404 unsigned int val;
406 if (ctxt->instate == XML_PARSER_EOF)
407 return(0);
409 if (ctxt->token != 0) {
410 *len = 0;
411 return(ctxt->token);
414 if ((ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) &&
415 (xmlParserGrow(ctxt) < 0))
416 return(0);
418 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
419 xmlChar * guess;
420 xmlCharEncodingHandlerPtr handler;
423 * Assume it's a fixed length encoding (1) with
424 * a compatible encoding for the ASCII set, since
425 * HTML constructs only use < 128 chars
427 if (*ctxt->input->cur < 0x80) {
428 *len = 1;
429 if ((*ctxt->input->cur == 0) &&
430 (ctxt->input->cur < ctxt->input->end)) {
431 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
432 "Char 0x%X out of allowed range\n", 0);
433 return(' ');
435 return(*ctxt->input->cur);
439 * Humm this is bad, do an automatic flow conversion
441 guess = htmlFindEncoding(ctxt);
442 if (guess == NULL) {
443 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
444 } else {
445 if (ctxt->input->encoding != NULL)
446 xmlFree((xmlChar *) ctxt->input->encoding);
447 ctxt->input->encoding = guess;
448 handler = xmlFindCharEncodingHandler((const char *) guess);
449 if (handler != NULL) {
451 * Don't use UTF-8 encoder which isn't required and
452 * can produce invalid UTF-8.
454 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
455 xmlSwitchToEncoding(ctxt, handler);
456 } else {
457 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
458 "Unsupported encoding %s", guess, NULL);
461 ctxt->charset = XML_CHAR_ENCODING_UTF8;
465 * We are supposed to handle UTF8, check it's valid
466 * From rfc2044: encoding of the Unicode values on UTF-8:
468 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
469 * 0000 0000-0000 007F 0xxxxxxx
470 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
471 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
473 * Check for the 0x110000 limit too
475 cur = ctxt->input->cur;
476 c = *cur;
477 if (c & 0x80) {
478 size_t avail;
480 if ((c & 0x40) == 0)
481 goto encoding_error;
483 avail = ctxt->input->end - ctxt->input->cur;
485 if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
486 goto encoding_error;
487 if ((c & 0xe0) == 0xe0) {
488 if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
489 goto encoding_error;
490 if ((c & 0xf0) == 0xf0) {
491 if (((c & 0xf8) != 0xf0) ||
492 (avail < 4) || ((cur[3] & 0xc0) != 0x80))
493 goto encoding_error;
494 /* 4-byte code */
495 *len = 4;
496 val = (cur[0] & 0x7) << 18;
497 val |= (cur[1] & 0x3f) << 12;
498 val |= (cur[2] & 0x3f) << 6;
499 val |= cur[3] & 0x3f;
500 if (val < 0x10000)
501 goto encoding_error;
502 } else {
503 /* 3-byte code */
504 *len = 3;
505 val = (cur[0] & 0xf) << 12;
506 val |= (cur[1] & 0x3f) << 6;
507 val |= cur[2] & 0x3f;
508 if (val < 0x800)
509 goto encoding_error;
511 } else {
512 /* 2-byte code */
513 *len = 2;
514 val = (cur[0] & 0x1f) << 6;
515 val |= cur[1] & 0x3f;
516 if (val < 0x80)
517 goto encoding_error;
519 if (!IS_CHAR(val)) {
520 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
521 "Char 0x%X out of allowed range\n", val);
523 return(val);
524 } else {
525 if ((*ctxt->input->cur == 0) &&
526 (ctxt->input->cur < ctxt->input->end)) {
527 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
528 "Char 0x%X out of allowed range\n", 0);
529 *len = 1;
530 return(' ');
532 /* 1-byte code */
533 *len = 1;
534 return(*ctxt->input->cur);
537 encoding_error:
539 * If we detect an UTF8 error that probably mean that the
540 * input encoding didn't get properly advertised in the
541 * declaration header. Report the error and switch the encoding
542 * to ISO-Latin-1 (if you don't like this policy, just declare the
543 * encoding !)
546 char buffer[150];
548 if (ctxt->input->end - ctxt->input->cur >= 4) {
549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 ctxt->input->cur[0], ctxt->input->cur[1],
551 ctxt->input->cur[2], ctxt->input->cur[3]);
552 } else {
553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 "Input is not proper UTF-8, indicate encoding !\n",
557 BAD_CAST buffer, NULL);
561 * Don't switch encodings twice. Note that if there's an encoder, we
562 * shouldn't receive invalid UTF-8 anyway.
564 * Note that if ctxt->input->buf == NULL, switching encodings is
565 * impossible, see Gitlab issue #34.
567 if ((ctxt->input->buf != NULL) &&
568 (ctxt->input->buf->encoder == NULL))
569 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
570 *len = 1;
571 return(*ctxt->input->cur);
575 * htmlSkipBlankChars:
576 * @ctxt: the HTML parser context
578 * skip all blanks character found at that point in the input streams.
580 * Returns the number of space chars skipped
583 static int
584 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
585 int res = 0;
587 while (IS_BLANK_CH(*(ctxt->input->cur))) {
588 if (*(ctxt->input->cur) == '\n') {
589 ctxt->input->line++; ctxt->input->col = 1;
590 } else ctxt->input->col++;
591 ctxt->input->cur++;
592 if (*ctxt->input->cur == 0)
593 xmlParserGrow(ctxt);
594 if (res < INT_MAX)
595 res++;
597 return(res);
602 /************************************************************************
604 * The list of HTML elements and their properties *
606 ************************************************************************/
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each list macro expands to a comma separated run of string literals;
 * the matching NB_ macro is the number of entries. The NB_ sums are
 * parenthesized so they can be used inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
/*
 * ... and for HTML Attributes
 *
 * Same convention as for elements: a list macro expands to string
 * literals and the NB_ macro gives the number of entries.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
677 /* Other declarations that should go inline ... */
678 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
679 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
680 "tabindex", "onfocus", "onblur", NULL } ;
681 static const char* const target_attr[] = { "target", NULL } ;
682 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
683 static const char* const alt_attr[] = { "alt", NULL } ;
684 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
685 static const char* const href_attrs[] = { "href", NULL } ;
686 static const char* const clear_attrs[] = { "clear", NULL } ;
687 static const char* const inline_p[] = { INLINE, "p", NULL } ;
689 static const char* const flow_param[] = { FLOW, "param", NULL } ;
690 static const char* const applet_attrs[] = { COREATTRS , "codebase",
691 "archive", "alt", "name", "height", "width", "align",
692 "hspace", "vspace", NULL } ;
693 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
694 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
695 static const char* const basefont_attrs[] =
696 { "id", "size", "color", "face", NULL } ;
697 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
698 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
699 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
700 static const char* const body_depr[] = { "background", "bgcolor", "text",
701 "link", "vlink", "alink", NULL } ;
702 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
703 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
706 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
707 static const char* const col_elt[] = { "col", NULL } ;
708 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
709 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
710 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
711 static const char* const compact_attr[] = { "compact", NULL } ;
712 static const char* const label_attr[] = { "label", NULL } ;
713 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
714 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
715 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
716 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
717 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
718 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
719 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
720 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
721 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
722 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
723 static const char* const version_attr[] = { "version", NULL } ;
724 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
725 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
726 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
727 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
728 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
729 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
730 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
731 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
732 static const char* const align_attr[] = { "align", NULL } ;
733 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
734 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
735 static const char* const name_attr[] = { "name", NULL } ;
736 static const char* const action_attr[] = { "action", NULL } ;
737 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
738 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
739 static const char* const content_attr[] = { "content", NULL } ;
740 static const char* const type_attr[] = { "type", NULL } ;
741 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
742 static const char* const object_contents[] = { FLOW, "param", NULL } ;
743 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
744 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
745 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
746 static const char* const option_elt[] = { "option", NULL } ;
747 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
748 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
749 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
750 static const char* const width_attr[] = { "width", NULL } ;
751 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
752 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
753 static const char* const language_attr[] = { "language", NULL } ;
754 static const char* const select_content[] = { "optgroup", "option", NULL } ;
755 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
756 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
757 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
758 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
759 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
760 static const char* const tr_elt[] = { "tr", NULL } ;
761 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
762 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
763 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
764 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
765 static const char* const tr_contents[] = { "th", "td", NULL } ;
766 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
767 static const char* const li_elt[] = { "li", NULL } ;
768 static const char* const ul_depr[] = { "type", "compact", NULL} ;
769 static const char* const dir_attr[] = { "dir", NULL} ;
771 #define DECL (const char**)
773 static const htmlElemDesc
774 html40ElementTable[] = {
775 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
776 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
778 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
779 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
781 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
782 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
784 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
785 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
787 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
788 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
790 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
791 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
793 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
794 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
796 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
797 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
799 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
800 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
802 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
803 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
805 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
806 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
808 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
809 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
811 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
812 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
814 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
815 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
817 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
818 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
820 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
821 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
824 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
826 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
827 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
830 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
832 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
833 EMPTY , NULL , DECL col_attrs , NULL, NULL
835 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
836 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
838 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
839 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
841 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
842 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
844 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
845 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
847 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
848 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
850 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
851 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
853 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
854 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
856 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
857 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
859 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
860 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
862 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
863 EMPTY, NULL, DECL embed_attrs, NULL, NULL
865 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
866 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
868 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
869 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
871 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
872 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
874 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
875 EMPTY, NULL, NULL, DECL frame_attrs, NULL
877 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
878 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
880 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
881 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
884 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
887 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
890 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
892 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
893 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
895 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
896 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
898 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
899 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
901 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
902 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
904 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
905 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
907 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
908 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
910 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
911 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
913 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
914 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
916 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
917 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
919 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
920 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
922 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
923 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
925 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
926 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
928 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
929 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
931 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
932 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
934 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
935 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
937 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
938 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
940 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
941 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
943 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
944 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
946 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
947 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
949 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
950 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
952 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
953 DECL html_flow, "div", DECL html_attrs, NULL, NULL
955 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
956 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
958 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
959 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
961 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
962 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
964 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
965 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
967 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
968 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
970 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
971 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
973 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
974 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
976 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
977 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
979 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
980 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
982 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
983 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
986 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
988 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
989 DECL select_content, NULL, DECL select_attrs, NULL, NULL
991 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
992 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
994 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
995 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
997 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
998 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1000 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1001 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1004 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1006 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1007 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1009 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1010 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1012 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1013 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1015 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1016 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1018 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1019 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1021 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1022 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1024 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1025 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1027 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1028 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1030 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1031 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1033 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1034 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1036 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1037 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1039 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1040 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1042 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1043 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1045 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1046 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1048 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1049 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
/*
 * One auto-close rule: when the element named oldTag is currently open,
 * a start tag named newTag implies that it must be closed first.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * The entries are looked up with bsearch() using htmlCompareStartClose(),
 * so they must stay sorted by oldTag first and newTag second, in strcmp()
 * order. Entries are grouped below by oldTag.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no terminator; its length is taken with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* Pairs an element name with its end-tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry whose name is NULL terminates the table; its priority is
 * what htmlGetEndPriority() returns for any element not listed here.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1380 /************************************************************************
1382 * functions to handle HTML specific data *
1384 ************************************************************************/
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1395 static int
1396 htmlCompareTags(const void *key, const void *member) {
1397 const xmlChar *tag = (const xmlChar *) key;
1398 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1400 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1404 * htmlTagLookup:
1405 * @tag: The tag name in lowercase
1407 * Lookup the HTML tag in the ElementTable
1409 * Returns the related htmlElemDescPtr or NULL if not found.
1411 const htmlElemDesc *
1412 htmlTagLookup(const xmlChar *tag) {
1413 if (tag == NULL)
1414 return(NULL);
1416 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1417 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1418 sizeof(htmlElemDesc), htmlCompareTags));
1422 * htmlGetEndPriority:
1423 * @name: The name of the element to look up the priority for.
1425 * Return value: The "endtag" priority.
1427 static int
1428 htmlGetEndPriority (const xmlChar *name) {
1429 int i = 0;
1431 while ((htmlEndPriority[i].name != NULL) &&
1432 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1433 i++;
1435 return(htmlEndPriority[i].priority);
1439 static int
1440 htmlCompareStartClose(const void *vkey, const void *member) {
1441 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1442 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1443 int ret;
1445 ret = strcmp(key->oldTag, entry->oldTag);
1446 if (ret == 0)
1447 ret = strcmp(key->newTag, entry->newTag);
1449 return(ret);
1453 * htmlCheckAutoClose:
1454 * @newtag: The new tag name
1455 * @oldtag: The old tag name
1457 * Checks whether the new tag is one of the registered valid tags for
1458 * closing old.
1460 * Returns 0 if no, 1 if yes.
1462 static int
1463 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1465 htmlStartCloseEntry key;
1466 void *res;
1468 key.oldTag = (const char *) oldtag;
1469 key.newTag = (const char *) newtag;
1470 res = bsearch(&key, htmlStartClose,
1471 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1472 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1473 return(res != NULL);
1477 * htmlAutoCloseOnClose:
1478 * @ctxt: an HTML parser context
1479 * @newtag: The new tag name
1480 * @force: force the tag closure
1482 * The HTML DTD allows an ending tag to implicitly close other tags.
1484 static void
1485 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1487 const htmlElemDesc *info;
1488 int i, priority;
1490 priority = htmlGetEndPriority(newtag);
1492 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1494 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1495 break;
1497 * A misplaced endtag can only close elements with lower
1498 * or equal priority, so if we find an element with higher
1499 * priority before we find an element with
1500 * matching name, we just ignore this endtag
1502 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1503 return;
1505 if (i < 0)
1506 return;
1508 while (!xmlStrEqual(newtag, ctxt->name)) {
1509 info = htmlTagLookup(ctxt->name);
1510 if ((info != NULL) && (info->endTag == 3)) {
1511 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1512 "Opening and ending tag mismatch: %s and %s\n",
1513 newtag, ctxt->name);
1515 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1516 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1517 htmlnamePop(ctxt);
1522 * htmlAutoCloseOnEnd:
1523 * @ctxt: an HTML parser context
1525 * Close all remaining tags at the end of the stream
1527 static void
1528 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1530 int i;
1532 if (ctxt->nameNr == 0)
1533 return;
1534 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1535 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1536 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1537 htmlnamePop(ctxt);
1542 * htmlAutoClose:
1543 * @ctxt: an HTML parser context
1544 * @newtag: The new tag name or NULL
1546 * The HTML DTD allows a tag to implicitly close other tags.
1547 * The list is kept in htmlStartClose array. This function is
1548 * called when a new tag has been detected and generates the
1549 * appropriates closes if possible/needed.
1550 * If newtag is NULL this mean we are at the end of the resource
1551 * and we should check
1553 static void
1554 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1556 while ((newtag != NULL) && (ctxt->name != NULL) &&
1557 (htmlCheckAutoClose(newtag, ctxt->name))) {
1558 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1559 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1560 htmlnamePop(ctxt);
1562 if (newtag == NULL) {
1563 htmlAutoCloseOnEnd(ctxt);
1564 return;
1566 while ((newtag == NULL) && (ctxt->name != NULL) &&
1567 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1568 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1569 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1570 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1571 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1572 htmlnamePop(ctxt);
1577 * htmlAutoCloseTag:
1578 * @doc: the HTML document
1579 * @name: The tag name
1580 * @elem: the HTML element
1582 * The HTML DTD allows a tag to implicitly close other tags.
1583 * The list is kept in htmlStartClose array. This function checks
1584 * if the element or one of it's children would autoclose the
1585 * given tag.
1587 * Returns 1 if autoclose, 0 otherwise
1590 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1591 htmlNodePtr child;
1593 if (elem == NULL) return(1);
1594 if (xmlStrEqual(name, elem->name)) return(0);
1595 if (htmlCheckAutoClose(elem->name, name)) return(1);
1596 child = elem->children;
1597 while (child != NULL) {
1598 if (htmlAutoCloseTag(doc, name, child)) return(1);
1599 child = child->next;
1601 return(0);
1605 * htmlIsAutoClosed:
1606 * @doc: the HTML document
1607 * @elem: the HTML element
1609 * The HTML DTD allows a tag to implicitly close other tags.
1610 * The list is kept in htmlStartClose array. This function checks
1611 * if a tag is autoclosed by one of it's child
1613 * Returns 1 if autoclosed, 0 otherwise
1616 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1617 htmlNodePtr child;
1619 if (elem == NULL) return(1);
1620 child = elem->children;
1621 while (child != NULL) {
1622 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1623 child = child->next;
1625 return(0);
1629 * htmlCheckImplied:
1630 * @ctxt: an HTML parser context
1631 * @newtag: The new tag name
1633 * The HTML DTD allows a tag to exists only implicitly
1634 * called when a new tag has been detected and generates the
1635 * appropriates implicit tags if missing
1637 static void
1638 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1639 int i;
1641 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1642 return;
1643 if (!htmlOmittedDefaultValue)
1644 return;
1645 if (xmlStrEqual(newtag, BAD_CAST"html"))
1646 return;
1647 if (ctxt->nameNr <= 0) {
1648 htmlnamePush(ctxt, BAD_CAST"html");
1649 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1650 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1652 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1653 return;
1654 if ((ctxt->nameNr <= 1) &&
1655 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1656 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1657 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1658 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1659 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1660 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1661 if (ctxt->html >= 3) {
1662 /* we already saw or generated an <head> before */
1663 return;
1666 * dropped OBJECT ... i you put it first BODY will be
1667 * assumed !
1669 htmlnamePush(ctxt, BAD_CAST"head");
1670 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1671 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1672 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1673 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1674 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1675 if (ctxt->html >= 10) {
1676 /* we already saw or generated a <body> before */
1677 return;
1679 for (i = 0;i < ctxt->nameNr;i++) {
1680 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1681 return;
1683 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1684 return;
1688 htmlnamePush(ctxt, BAD_CAST"body");
1689 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1690 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1695 * htmlCheckParagraph
1696 * @ctxt: an HTML parser context
1698 * Check whether a p element need to be implied before inserting
1699 * characters in the current element.
1701 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1702 * in case of error.
1705 static int
1706 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1707 const xmlChar *tag;
1708 int i;
1710 if (ctxt == NULL)
1711 return(-1);
1712 tag = ctxt->name;
1713 if (tag == NULL) {
1714 htmlAutoClose(ctxt, BAD_CAST"p");
1715 htmlCheckImplied(ctxt, BAD_CAST"p");
1716 htmlnamePush(ctxt, BAD_CAST"p");
1717 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1718 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1719 return(1);
1721 if (!htmlOmittedDefaultValue)
1722 return(0);
1723 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1724 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1725 htmlAutoClose(ctxt, BAD_CAST"p");
1726 htmlCheckImplied(ctxt, BAD_CAST"p");
1727 htmlnamePush(ctxt, BAD_CAST"p");
1728 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1729 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1730 return(1);
1733 return(0);
1737 * htmlIsScriptAttribute:
1738 * @name: an attribute name
1740 * Check if an attribute is of content type Script
1742 * Returns 1 is the attribute is a script 0 otherwise
1745 htmlIsScriptAttribute(const xmlChar *name) {
1746 unsigned int i;
1748 if (name == NULL)
1749 return(0);
1751 * all script attributes start with 'on'
1753 if ((name[0] != 'o') || (name[1] != 'n'))
1754 return(0);
1755 for (i = 0;
1756 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1757 i++) {
1758 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1759 return(1);
1761 return(0);
1764 /************************************************************************
1766 * The list of HTML predefined entities *
1768 ************************************************************************/
1771 static const htmlEntityDesc html40EntitiesTable[] = {
1773 * the 4 absolute ones, plus apostrophe.
1775 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1776 { 38, "amp", "ampersand, U+0026 ISOnum" },
1777 { 39, "apos", "single quote" },
1778 { 60, "lt", "less-than sign, U+003C ISOnum" },
1779 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1782 * A bunch still in the 128-255 range
1783 * Replacing them depend really on the charset used.
1785 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1786 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1787 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1788 { 163, "pound","pound sign, U+00A3 ISOnum" },
1789 { 164, "curren","currency sign, U+00A4 ISOnum" },
1790 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1791 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1792 { 167, "sect", "section sign, U+00A7 ISOnum" },
1793 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1794 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1795 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1796 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1797 { 172, "not", "not sign, U+00AC ISOnum" },
1798 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1799 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1800 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1801 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1802 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1803 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1804 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1805 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1806 { 181, "micro","micro sign, U+00B5 ISOnum" },
1807 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1808 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1809 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1810 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1811 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1812 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1813 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1814 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1815 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1816 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1817 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1818 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1819 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1820 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1821 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1822 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1823 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1824 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1825 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1826 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1827 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1828 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1829 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1830 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1831 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1832 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1833 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1834 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1835 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1836 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1837 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1838 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1839 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1840 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1841 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1842 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1843 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1844 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1845 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1846 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1847 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1848 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1849 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1850 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1851 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1852 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1853 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1854 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1855 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1856 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1857 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1858 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1859 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1860 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1861 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1862 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1863 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1864 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1865 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1866 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1867 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1868 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1869 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1870 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1871 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1872 { 247, "divide","division sign, U+00F7 ISOnum" },
1873 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1874 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1875 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1876 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1877 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1878 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1879 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1880 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1882 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1883 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1884 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1885 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1886 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1889 * Anything below should really be kept as entities references
1891 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1893 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1894 { 732, "tilde","small tilde, U+02DC ISOdia" },
1896 { 913, "Alpha","greek capital letter alpha, U+0391" },
1897 { 914, "Beta", "greek capital letter beta, U+0392" },
1898 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1899 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1900 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1901 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1902 { 919, "Eta", "greek capital letter eta, U+0397" },
1903 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1904 { 921, "Iota", "greek capital letter iota, U+0399" },
1905 { 922, "Kappa","greek capital letter kappa, U+039A" },
1906 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1907 { 924, "Mu", "greek capital letter mu, U+039C" },
1908 { 925, "Nu", "greek capital letter nu, U+039D" },
1909 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1910 { 927, "Omicron","greek capital letter omicron, U+039F" },
1911 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1912 { 929, "Rho", "greek capital letter rho, U+03A1" },
1913 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1914 { 932, "Tau", "greek capital letter tau, U+03A4" },
1915 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1916 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1917 { 935, "Chi", "greek capital letter chi, U+03A7" },
1918 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1919 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1921 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1922 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1923 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1924 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1925 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1926 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1927 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1928 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1929 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1930 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1931 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1932 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1933 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1934 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1935 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1936 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1937 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1938 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1939 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1940 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1941 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1942 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1943 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1944 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1945 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1946 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1947 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1948 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1950 { 8194, "ensp", "en space, U+2002 ISOpub" },
1951 { 8195, "emsp", "em space, U+2003 ISOpub" },
1952 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1953 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1954 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1955 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1956 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1957 { 8211, "ndash","en dash, U+2013 ISOpub" },
1958 { 8212, "mdash","em dash, U+2014 ISOpub" },
1959 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1960 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1961 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1962 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1963 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1964 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1965 { 8224, "dagger","dagger, U+2020 ISOpub" },
1966 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1968 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1969 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1971 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1973 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1974 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1976 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1977 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1979 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1980 { 8260, "frasl","fraction slash, U+2044 NEW" },
1982 { 8364, "euro", "euro sign, U+20AC NEW" },
1984 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1985 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1986 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1987 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1988 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1989 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1990 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1991 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1992 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1993 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1994 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1995 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1996 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1997 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1998 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1999 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2001 { 8704, "forall","for all, U+2200 ISOtech" },
2002 { 8706, "part", "partial differential, U+2202 ISOtech" },
2003 { 8707, "exist","there exists, U+2203 ISOtech" },
2004 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2005 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2006 { 8712, "isin", "element of, U+2208 ISOtech" },
2007 { 8713, "notin","not an element of, U+2209 ISOtech" },
2008 { 8715, "ni", "contains as member, U+220B ISOtech" },
2009 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2010 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
2011 { 8722, "minus","minus sign, U+2212 ISOtech" },
2012 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2013 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2014 { 8733, "prop", "proportional to, U+221D ISOtech" },
2015 { 8734, "infin","infinity, U+221E ISOtech" },
2016 { 8736, "ang", "angle, U+2220 ISOamso" },
2017 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
2018 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
2019 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
2020 { 8746, "cup", "union = cup, U+222A ISOtech" },
2021 { 8747, "int", "integral, U+222B ISOtech" },
2022 { 8756, "there4","therefore, U+2234 ISOtech" },
2023 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
2024 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2025 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2026 { 8800, "ne", "not equal to, U+2260 ISOtech" },
2027 { 8801, "equiv","identical to, U+2261 ISOtech" },
2028 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2029 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2030 { 8834, "sub", "subset of, U+2282 ISOtech" },
2031 { 8835, "sup", "superset of, U+2283 ISOtech" },
2032 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2033 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2034 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2035 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2036 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2037 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2038 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2039 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2040 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2041 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2042 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2043 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2044 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2045 { 9674, "loz", "lozenge, U+25CA ISOpub" },
2047 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2048 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2049 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2050 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
/************************************************************************
 *									*
 *		Commodity functions to handle entities			*
 *									*
 ************************************************************************/
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to the new size. The
 * invoking scope must provide a variable named "ctxt" and a companion
 * "<buffer>_size" variable. If the reallocation fails, the error is reported
 * through htmlErrMemory(), the old buffer is freed and the *enclosing
 * function* returns NULL.
 *
 * The body is wrapped in do { } while (0) so that "growBuffer(x);" is exactly
 * one statement wherever it is used.
 */
#define growBuffer(buffer) do {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);		\
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
} while (0)
2076 * htmlEntityLookup:
2077 * @name: the entity name
2079 * Lookup the given entity in EntitiesTable
2081 * TODO: the linear scan is really ugly, an hash table is really needed.
2083 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2085 const htmlEntityDesc *
2086 htmlEntityLookup(const xmlChar *name) {
2087 unsigned int i;
2089 for (i = 0;i < (sizeof(html40EntitiesTable)/
2090 sizeof(html40EntitiesTable[0]));i++) {
2091 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2092 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2095 return(NULL);
2099 * htmlEntityValueLookup:
2100 * @value: the entity's unicode value
2102 * Lookup the given entity in EntitiesTable
2104 * TODO: the linear scan is really ugly, an hash table is really needed.
2106 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108 const htmlEntityDesc *
2109 htmlEntityValueLookup(unsigned int value) {
2110 unsigned int i;
2112 for (i = 0;i < (sizeof(html40EntitiesTable)/
2113 sizeof(html40EntitiesTable[0]));i++) {
2114 if (html40EntitiesTable[i].value >= value) {
2115 if (html40EntitiesTable[i].value > value)
2116 break;
2117 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2120 return(NULL);
2124 * UTF8ToHtml:
2125 * @out: a pointer to an array of bytes to store the result
2126 * @outlen: the length of @out
2127 * @in: a pointer to an array of UTF-8 chars
2128 * @inlen: the length of @in
2130 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2131 * plus HTML entities block of chars out.
2133 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2134 * The value of @inlen after return is the number of octets consumed
2135 * as the return value is positive, else unpredictable.
2136 * The value of @outlen after return is the number of octets consumed.
2139 UTF8ToHtml(unsigned char* out, int *outlen,
2140 const unsigned char* in, int *inlen) {
2141 const unsigned char* processed = in;
2142 const unsigned char* outend;
2143 const unsigned char* outstart = out;
2144 const unsigned char* instart = in;
2145 const unsigned char* inend;
2146 unsigned int c, d;
2147 int trailing;
2149 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2150 if (in == NULL) {
2152 * initialization nothing to do
2154 *outlen = 0;
2155 *inlen = 0;
2156 return(0);
2158 inend = in + (*inlen);
2159 outend = out + (*outlen);
2160 while (in < inend) {
2161 d = *in++;
2162 if (d < 0x80) { c= d; trailing= 0; }
2163 else if (d < 0xC0) {
2164 /* trailing byte in leading position */
2165 *outlen = out - outstart;
2166 *inlen = processed - instart;
2167 return(-2);
2168 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2169 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2170 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2171 else {
2172 /* no chance for this in Ascii */
2173 *outlen = out - outstart;
2174 *inlen = processed - instart;
2175 return(-2);
2178 if (inend - in < trailing) {
2179 break;
2182 for ( ; trailing; trailing--) {
2183 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2184 break;
2185 c <<= 6;
2186 c |= d & 0x3F;
2189 /* assertion: c is a single UTF-4 value */
2190 if (c < 0x80) {
2191 if (out + 1 >= outend)
2192 break;
2193 *out++ = c;
2194 } else {
2195 int len;
2196 const htmlEntityDesc * ent;
2197 const char *cp;
2198 char nbuf[16];
2201 * Try to lookup a predefined HTML entity for it
2204 ent = htmlEntityValueLookup(c);
2205 if (ent == NULL) {
2206 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2207 cp = nbuf;
2209 else
2210 cp = ent->name;
2211 len = strlen(cp);
2212 if (out + 2 + len >= outend)
2213 break;
2214 *out++ = '&';
2215 memcpy(out, cp, len);
2216 out += len;
2217 *out++ = ';';
2219 processed = in;
2221 *outlen = out - outstart;
2222 *inlen = processed - instart;
2223 return(0);
2227 * htmlEncodeEntities:
2228 * @out: a pointer to an array of bytes to store the result
2229 * @outlen: the length of @out
2230 * @in: a pointer to an array of UTF-8 chars
2231 * @inlen: the length of @in
2232 * @quoteChar: the quote character to escape (' or ") or zero.
2234 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2235 * plus HTML entities block of chars out.
2237 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2238 * The value of @inlen after return is the number of octets consumed
2239 * as the return value is positive, else unpredictable.
2240 * The value of @outlen after return is the number of octets consumed.
2243 htmlEncodeEntities(unsigned char* out, int *outlen,
2244 const unsigned char* in, int *inlen, int quoteChar) {
2245 const unsigned char* processed = in;
2246 const unsigned char* outend;
2247 const unsigned char* outstart = out;
2248 const unsigned char* instart = in;
2249 const unsigned char* inend;
2250 unsigned int c, d;
2251 int trailing;
2253 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2254 return(-1);
2255 outend = out + (*outlen);
2256 inend = in + (*inlen);
2257 while (in < inend) {
2258 d = *in++;
2259 if (d < 0x80) { c= d; trailing= 0; }
2260 else if (d < 0xC0) {
2261 /* trailing byte in leading position */
2262 *outlen = out - outstart;
2263 *inlen = processed - instart;
2264 return(-2);
2265 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2266 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2267 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2268 else {
2269 /* no chance for this in Ascii */
2270 *outlen = out - outstart;
2271 *inlen = processed - instart;
2272 return(-2);
2275 if (inend - in < trailing)
2276 break;
2278 while (trailing--) {
2279 if (((d= *in++) & 0xC0) != 0x80) {
2280 *outlen = out - outstart;
2281 *inlen = processed - instart;
2282 return(-2);
2284 c <<= 6;
2285 c |= d & 0x3F;
2288 /* assertion: c is a single UTF-4 value */
2289 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2290 (c != '&') && (c != '<') && (c != '>')) {
2291 if (out >= outend)
2292 break;
2293 *out++ = c;
2294 } else {
2295 const htmlEntityDesc * ent;
2296 const char *cp;
2297 char nbuf[16];
2298 int len;
2301 * Try to lookup a predefined HTML entity for it
2303 ent = htmlEntityValueLookup(c);
2304 if (ent == NULL) {
2305 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2306 cp = nbuf;
2308 else
2309 cp = ent->name;
2310 len = strlen(cp);
2311 if (outend - out < len + 2)
2312 break;
2313 *out++ = '&';
2314 memcpy(out, cp, len);
2315 out += len;
2316 *out++ = ';';
2318 processed = in;
2320 *outlen = out - outstart;
2321 *inlen = processed - instart;
2322 return(0);
/************************************************************************
 *									*
 *		Commodity functions to handle streams			*
 *									*
 ************************************************************************/
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Create a new input stream structure. The structure is zero-filled and
 * its position is set to line 1, column 1.
 * Returns the new input stream or NULL (after reporting the allocation
 * failure through htmlErrMemory()).
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* pointer members */
    stream->buf = NULL;
    stream->filename = NULL;
    stream->directory = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->free = NULL;
    stream->version = NULL;

    /* position and counters */
    stream->line = 1;
    stream->col = 1;
    stream->consumed = 0;
    stream->length = 0;

    return(stream);
}
#endif
/************************************************************************
 *									*
 *		Commodity functions, cleanup needed ?			*
 *									*
 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2387 * areBlanks:
2388 * @ctxt: an HTML parser context
2389 * @str: a xmlChar *
2390 * @len: the size of @str
2392 * Is this a sequence of blank chars that one can ignore ?
2394 * Returns 1 if ignorable 0 otherwise.
2397 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2398 unsigned int i;
2399 int j;
2400 xmlNodePtr lastChild;
2401 xmlDtdPtr dtd;
2403 for (j = 0;j < len;j++)
2404 if (!(IS_BLANK_CH(str[j]))) return(0);
2406 if (CUR == 0) return(1);
2407 if (CUR != '<') return(0);
2408 if (ctxt->name == NULL)
2409 return(1);
2410 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2411 return(1);
2412 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2413 return(1);
2415 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2416 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2417 dtd = xmlGetIntSubset(ctxt->myDoc);
2418 if (dtd != NULL && dtd->ExternalID != NULL) {
2419 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2420 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2421 return(1);
2425 if (ctxt->node == NULL) return(0);
2426 lastChild = xmlGetLastChild(ctxt->node);
2427 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2428 lastChild = lastChild->prev;
2429 if (lastChild == NULL) {
2430 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2431 (ctxt->node->content != NULL)) return(0);
2432 /* keep ws in constructs like ...<b> </b>...
2433 for all tags "b" allowing PCDATA */
2434 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2435 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2436 return(0);
2439 } else if (xmlNodeIsText(lastChild)) {
2440 return(0);
2441 } else {
2442 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2443 for all tags "p" allowing PCDATA */
2444 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2445 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2446 return(0);
2450 return(1);
2454 * htmlNewDocNoDtD:
2455 * @URI: URI for the dtd, or NULL
2456 * @ExternalID: the external ID of the DTD, or NULL
2458 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2459 * are NULL
2461 * Returns a new document, do not initialize the DTD if not provided
2463 htmlDocPtr
2464 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2465 xmlDocPtr cur;
2468 * Allocate a new document and fill the fields.
2470 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2471 if (cur == NULL) {
2472 htmlErrMemory(NULL, "HTML document creation failed\n");
2473 return(NULL);
2475 memset(cur, 0, sizeof(xmlDoc));
2477 cur->type = XML_HTML_DOCUMENT_NODE;
2478 cur->version = NULL;
2479 cur->intSubset = NULL;
2480 cur->doc = cur;
2481 cur->name = NULL;
2482 cur->children = NULL;
2483 cur->extSubset = NULL;
2484 cur->oldNs = NULL;
2485 cur->encoding = NULL;
2486 cur->standalone = 1;
2487 cur->compression = 0;
2488 cur->ids = NULL;
2489 cur->refs = NULL;
2490 cur->_private = NULL;
2491 cur->charset = XML_CHAR_ENCODING_UTF8;
2492 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2493 if ((ExternalID != NULL) ||
2494 (URI != NULL))
2495 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2496 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2497 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2498 return(cur);
2502 * htmlNewDoc:
2503 * @URI: URI for the dtd, or NULL
2504 * @ExternalID: the external ID of the DTD, or NULL
2506 * Creates a new HTML document
2508 * Returns a new document
2510 htmlDocPtr
2511 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2512 if ((URI == NULL) && (ExternalID == NULL))
2513 return(htmlNewDocNoDtD(
2514 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2515 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2517 return(htmlNewDocNoDtD(URI, ExternalID));
/************************************************************************
 *									*
 *			The parser itself				*
 *	Relates to http://www.w3.org/TR/html40				*
 *									*
 ************************************************************************/

/************************************************************************
 *									*
 *			The parser itself				*
 *									*
 ************************************************************************/
2534 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2536 static void
2537 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2538 int c;
2540 htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2541 "Incorrectly opened comment\n", NULL, NULL);
2543 do {
2544 c = CUR;
2545 if (c == 0)
2546 break;
2547 NEXT;
2548 } while (c != '>');
2552 * htmlParseHTMLName:
2553 * @ctxt: an HTML parser context
2555 * parse an HTML tag or attribute name, note that we convert it to lowercase
2556 * since HTML names are not case-sensitive.
2558 * Returns the Tag Name parsed or NULL
2561 static const xmlChar *
2562 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2563 const xmlChar *ret;
2564 int i = 0;
2565 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2567 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2568 (CUR != ':') && (CUR != '.')) return(NULL);
2570 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2571 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2572 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2573 (CUR == '.'))) {
2574 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2575 else loc[i] = CUR;
2576 i++;
2578 NEXT;
2581 ret = xmlDictLookup(ctxt->dict, loc, i);
2582 if (ret == NULL)
2583 htmlErrMemory(ctxt, NULL);
2585 return(ret);
2590 * htmlParseHTMLName_nonInvasive:
2591 * @ctxt: an HTML parser context
2593 * parse an HTML tag or attribute name, note that we convert it to lowercase
2594 * since HTML names are not case-sensitive, this doesn't consume the data
2595 * from the stream, it's a look-ahead
2597 * Returns the Tag Name parsed or NULL
2600 static const xmlChar *
2601 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2602 int i = 0;
2603 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2605 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2606 (NXT(1) != ':')) return(NULL);
2608 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2609 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2610 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2611 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2612 else loc[i] = NXT(1+i);
2613 i++;
2616 return(xmlDictLookup(ctxt->dict, loc, i));
2621 * htmlParseName:
2622 * @ctxt: an HTML parser context
2624 * parse an HTML name, this routine is case sensitive.
2626 * Returns the Name parsed or NULL
2629 static const xmlChar *
2630 htmlParseName(htmlParserCtxtPtr ctxt) {
2631 const xmlChar *in;
2632 const xmlChar *ret;
2633 int count = 0;
2635 GROW;
2638 * Accelerator for simple ASCII names
2640 in = ctxt->input->cur;
2641 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2642 ((*in >= 0x41) && (*in <= 0x5A)) ||
2643 (*in == '_') || (*in == ':')) {
2644 in++;
2645 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2646 ((*in >= 0x41) && (*in <= 0x5A)) ||
2647 ((*in >= 0x30) && (*in <= 0x39)) ||
2648 (*in == '_') || (*in == '-') ||
2649 (*in == ':') || (*in == '.'))
2650 in++;
2652 if (in == ctxt->input->end)
2653 return(NULL);
2655 if ((*in > 0) && (*in < 0x80)) {
2656 count = in - ctxt->input->cur;
2657 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2658 ctxt->input->cur = in;
2659 ctxt->input->col += count;
2660 return(ret);
2663 return(htmlParseNameComplex(ctxt));
2666 static const xmlChar *
2667 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2668 int len = 0, l;
2669 int c;
2670 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2671 XML_MAX_TEXT_LENGTH :
2672 XML_MAX_NAME_LENGTH;
2673 const xmlChar *base = ctxt->input->base;
2676 * Handler for more complex cases
2678 c = CUR_CHAR(l);
2679 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2680 (!IS_LETTER(c) && (c != '_') &&
2681 (c != ':'))) {
2682 return(NULL);
2685 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2686 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2687 (c == '.') || (c == '-') ||
2688 (c == '_') || (c == ':') ||
2689 (IS_COMBINING(c)) ||
2690 (IS_EXTENDER(c)))) {
2691 len += l;
2692 if (len > maxLength) {
2693 htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2694 return(NULL);
2696 NEXTL(l);
2697 c = CUR_CHAR(l);
2698 if (ctxt->input->base != base) {
2700 * We changed encoding from an unknown encoding
2701 * Input buffer changed location, so we better start again
2703 return(htmlParseNameComplex(ctxt));
2706 if (ctxt->instate == XML_PARSER_EOF)
2707 return(NULL);
2709 if (ctxt->input->cur - ctxt->input->base < len) {
2710 /* Sanity check */
2711 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2712 "unexpected change of input buffer", NULL, NULL);
2713 return (NULL);
2716 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2721 * htmlParseHTMLAttribute:
2722 * @ctxt: an HTML parser context
2723 * @stop: a char stop value
2725 * parse an HTML attribute value till the stop (quote), if
2726 * stop is 0 then it stops at the first space
2728 * Returns the attribute parsed or NULL
2731 static xmlChar *
2732 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2733 xmlChar *buffer = NULL;
2734 int buffer_size = 0;
2735 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2736 XML_MAX_HUGE_LENGTH :
2737 XML_MAX_TEXT_LENGTH;
2738 xmlChar *out = NULL;
2739 const xmlChar *name = NULL;
2740 const xmlChar *cur = NULL;
2741 const htmlEntityDesc * ent;
2744 * allocate a translation buffer.
2746 buffer_size = HTML_PARSER_BUFFER_SIZE;
2747 buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2748 if (buffer == NULL) {
2749 htmlErrMemory(ctxt, "buffer allocation failed\n");
2750 return(NULL);
2752 out = buffer;
2755 * Ok loop until we reach one of the ending chars
2757 while ((CUR != 0) && (CUR != stop)) {
2758 if ((stop == 0) && (CUR == '>')) break;
2759 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2760 if (CUR == '&') {
2761 if (NXT(1) == '#') {
2762 unsigned int c;
2763 int bits;
2765 c = htmlParseCharRef(ctxt);
2766 if (c < 0x80)
2767 { *out++ = c; bits= -6; }
2768 else if (c < 0x800)
2769 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2770 else if (c < 0x10000)
2771 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2772 else
2773 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2775 for ( ; bits >= 0; bits-= 6) {
2776 *out++ = ((c >> bits) & 0x3F) | 0x80;
2779 if (out - buffer > buffer_size - 100) {
2780 int indx = out - buffer;
2782 growBuffer(buffer);
2783 out = &buffer[indx];
2785 } else {
2786 ent = htmlParseEntityRef(ctxt, &name);
2787 if (name == NULL) {
2788 *out++ = '&';
2789 if (out - buffer > buffer_size - 100) {
2790 int indx = out - buffer;
2792 growBuffer(buffer);
2793 out = &buffer[indx];
2795 } else if (ent == NULL) {
2796 *out++ = '&';
2797 cur = name;
2798 while (*cur != 0) {
2799 if (out - buffer > buffer_size - 100) {
2800 int indx = out - buffer;
2802 growBuffer(buffer);
2803 out = &buffer[indx];
2805 *out++ = *cur++;
2807 } else {
2808 unsigned int c;
2809 int bits;
2811 if (out - buffer > buffer_size - 100) {
2812 int indx = out - buffer;
2814 growBuffer(buffer);
2815 out = &buffer[indx];
2817 c = ent->value;
2818 if (c < 0x80)
2819 { *out++ = c; bits= -6; }
2820 else if (c < 0x800)
2821 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2822 else if (c < 0x10000)
2823 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2824 else
2825 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2827 for ( ; bits >= 0; bits-= 6) {
2828 *out++ = ((c >> bits) & 0x3F) | 0x80;
2832 } else {
2833 unsigned int c;
2834 int bits, l;
2836 if (out - buffer > buffer_size - 100) {
2837 int indx = out - buffer;
2839 growBuffer(buffer);
2840 out = &buffer[indx];
2842 c = CUR_CHAR(l);
2843 if (ctxt->instate == XML_PARSER_EOF) {
2844 xmlFree(buffer);
2845 return(NULL);
2847 if (c < 0x80)
2848 { *out++ = c; bits= -6; }
2849 else if (c < 0x800)
2850 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2851 else if (c < 0x10000)
2852 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2853 else
2854 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2856 for ( ; bits >= 0; bits-= 6) {
2857 *out++ = ((c >> bits) & 0x3F) | 0x80;
2859 NEXTL(l);
2861 if (out - buffer > maxLength) {
2862 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2863 "attribute value too long\n", NULL, NULL);
2864 xmlFree(buffer);
2865 return(NULL);
2868 *out = 0;
2869 return(buffer);
2873 * htmlParseEntityRef:
2874 * @ctxt: an HTML parser context
2875 * @str: location to store the entity name
2877 * DEPRECATED: Internal function, don't use.
2879 * parse an HTML ENTITY references
2881 * [68] EntityRef ::= '&' Name ';'
2883 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2884 * if non-NULL *str will have to be freed by the caller.
2886 const htmlEntityDesc *
2887 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2888 const xmlChar *name;
2889 const htmlEntityDesc * ent = NULL;
2891 if (str != NULL) *str = NULL;
2892 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2894 if (CUR == '&') {
2895 NEXT;
2896 name = htmlParseName(ctxt);
2897 if (name == NULL) {
2898 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2899 "htmlParseEntityRef: no name\n", NULL, NULL);
2900 } else {
2901 GROW;
2902 if (CUR == ';') {
2903 if (str != NULL)
2904 *str = name;
2907 * Lookup the entity in the table.
2909 ent = htmlEntityLookup(name);
2910 if (ent != NULL) /* OK that's ugly !!! */
2911 NEXT;
2912 } else {
2913 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2914 "htmlParseEntityRef: expecting ';'\n",
2915 NULL, NULL);
2916 if (str != NULL)
2917 *str = name;
2921 return(ent);
2925 * htmlParseAttValue:
2926 * @ctxt: an HTML parser context
2928 * parse a value for an attribute
2929 * Note: the parser won't do substitution of entities here, this
2930 * will be handled later in xmlStringGetNodeList, unless it was
2931 * asked for ctxt->replaceEntities != 0
2933 * Returns the AttValue parsed or NULL.
2936 static xmlChar *
2937 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2938 xmlChar *ret = NULL;
2940 if (CUR == '"') {
2941 NEXT;
2942 ret = htmlParseHTMLAttribute(ctxt, '"');
2943 if (CUR != '"') {
2944 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2945 "AttValue: \" expected\n", NULL, NULL);
2946 } else
2947 NEXT;
2948 } else if (CUR == '\'') {
2949 NEXT;
2950 ret = htmlParseHTMLAttribute(ctxt, '\'');
2951 if (CUR != '\'') {
2952 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2953 "AttValue: ' expected\n", NULL, NULL);
2954 } else
2955 NEXT;
2956 } else {
2958 * That's an HTMLism, the attribute value may not be quoted
2960 ret = htmlParseHTMLAttribute(ctxt, 0);
2961 if (ret == NULL) {
2962 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2963 "AttValue: no value found\n", NULL, NULL);
2966 return(ret);
2970 * htmlParseSystemLiteral:
2971 * @ctxt: an HTML parser context
2973 * parse an HTML Literal
2975 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2977 * Returns the SystemLiteral parsed or NULL
2980 static xmlChar *
2981 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2982 size_t len = 0, startPosition = 0;
2983 int err = 0;
2984 int quote;
2985 xmlChar *ret = NULL;
2987 if ((CUR != '"') && (CUR != '\'')) {
2988 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2989 "SystemLiteral \" or ' expected\n", NULL, NULL);
2990 return(NULL);
2992 quote = CUR;
2993 NEXT;
2995 if (CUR_PTR < BASE_PTR)
2996 return(ret);
2997 startPosition = CUR_PTR - BASE_PTR;
2999 while ((CUR != 0) && (CUR != quote)) {
3000 /* TODO: Handle UTF-8 */
3001 if (!IS_CHAR_CH(CUR)) {
3002 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3003 "Invalid char in SystemLiteral 0x%X\n", CUR);
3004 err = 1;
3006 NEXT;
3007 len++;
3009 if (CUR != quote) {
3010 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3011 "Unfinished SystemLiteral\n", NULL, NULL);
3012 } else {
3013 if (err == 0)
3014 ret = xmlStrndup((BASE_PTR+startPosition), len);
3015 NEXT;
3018 return(ret);
3022 * htmlParsePubidLiteral:
3023 * @ctxt: an HTML parser context
3025 * parse an HTML public literal
3027 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3029 * Returns the PubidLiteral parsed or NULL.
3032 static xmlChar *
3033 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3034 size_t len = 0, startPosition = 0;
3035 int err = 0;
3036 int quote;
3037 xmlChar *ret = NULL;
3039 if ((CUR != '"') && (CUR != '\'')) {
3040 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3041 "PubidLiteral \" or ' expected\n", NULL, NULL);
3042 return(NULL);
3044 quote = CUR;
3045 NEXT;
3048 * Name ::= (Letter | '_') (NameChar)*
3050 if (CUR_PTR < BASE_PTR)
3051 return(ret);
3052 startPosition = CUR_PTR - BASE_PTR;
3054 while ((CUR != 0) && (CUR != quote)) {
3055 if (!IS_PUBIDCHAR_CH(CUR)) {
3056 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3057 "Invalid char in PubidLiteral 0x%X\n", CUR);
3058 err = 1;
3060 len++;
3061 NEXT;
3064 if (CUR != quote) {
3065 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3066 "Unfinished PubidLiteral\n", NULL, NULL);
3067 } else {
3068 if (err == 0)
3069 ret = xmlStrndup((BASE_PTR + startPosition), len);
3070 NEXT;
3073 return(ret);
3077 * htmlParseScript:
3078 * @ctxt: an HTML parser context
3080 * parse the content of an HTML SCRIPT or STYLE element
3081 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3082 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3083 * http://www.w3.org/TR/html4/types.html#type-script
3084 * http://www.w3.org/TR/html4/types.html#h-6.15
3085 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3087 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3088 * element and the value of intrinsic event attributes. User agents must
3089 * not evaluate script data as HTML markup but instead must pass it on as
3090 * data to a script engine.
3091 * NOTES:
3092 * - The content is passed like CDATA
3093 * - the attributes for style and scripting "onXXX" are also described
3094 * as CDATA but SGML allows entities references in attributes so their
3095 * processing is identical as other attributes
3097 static void
3098 htmlParseScript(htmlParserCtxtPtr ctxt) {
3099 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3100 int nbchar = 0;
3101 int cur,l;
3103 cur = CUR_CHAR(l);
3104 while (cur != 0) {
3105 if ((cur == '<') && (NXT(1) == '/')) {
3107 * One should break here, the specification is clear:
3108 * Authors should therefore escape "</" within the content.
3109 * Escape mechanisms are specific to each scripting or
3110 * style sheet language.
3112 * In recovery mode, only break if end tag match the
3113 * current tag, effectively ignoring all tags inside the
3114 * script/style block and treating the entire block as
3115 * CDATA.
3117 if (ctxt->recovery) {
3118 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3119 xmlStrlen(ctxt->name)) == 0)
3121 break; /* while */
3122 } else {
3123 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3124 "Element %s embeds close tag\n",
3125 ctxt->name, NULL);
3127 } else {
3128 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3129 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3131 break; /* while */
3135 if (IS_CHAR(cur)) {
3136 COPY_BUF(l,buf,nbchar,cur);
3137 } else {
3138 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3139 "Invalid char in CDATA 0x%X\n", cur);
3141 NEXTL(l);
3142 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3143 buf[nbchar] = 0;
3144 if (ctxt->sax->cdataBlock!= NULL) {
3146 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3148 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3149 } else if (ctxt->sax->characters != NULL) {
3150 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152 nbchar = 0;
3153 SHRINK;
3155 cur = CUR_CHAR(l);
3158 if (ctxt->instate == XML_PARSER_EOF)
3159 return;
3161 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3162 buf[nbchar] = 0;
3163 if (ctxt->sax->cdataBlock!= NULL) {
3165 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3167 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3168 } else if (ctxt->sax->characters != NULL) {
3169 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3176 * htmlParseCharDataInternal:
3177 * @ctxt: an HTML parser context
3178 * @readahead: optional read ahead character in ascii range
3180 * parse a CharData section.
3181 * if we are within a CDATA section ']]>' marks an end of section.
3183 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3186 static void
3187 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3188 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3189 int nbchar = 0;
3190 int cur, l;
3192 if (readahead)
3193 buf[nbchar++] = readahead;
3195 cur = CUR_CHAR(l);
3196 while (((cur != '<') || (ctxt->token == '<')) &&
3197 ((cur != '&') || (ctxt->token == '&')) &&
3198 (cur != 0)) {
3199 if (!(IS_CHAR(cur))) {
3200 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3201 "Invalid char in CDATA 0x%X\n", cur);
3202 } else {
3203 COPY_BUF(l,buf,nbchar,cur);
3205 NEXTL(l);
3206 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3207 buf[nbchar] = 0;
3210 * Ok the segment is to be consumed as chars.
3212 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3213 if (areBlanks(ctxt, buf, nbchar)) {
3214 if (ctxt->keepBlanks) {
3215 if (ctxt->sax->characters != NULL)
3216 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3217 } else {
3218 if (ctxt->sax->ignorableWhitespace != NULL)
3219 ctxt->sax->ignorableWhitespace(ctxt->userData,
3220 buf, nbchar);
3222 } else {
3223 htmlCheckParagraph(ctxt);
3224 if (ctxt->sax->characters != NULL)
3225 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3228 nbchar = 0;
3229 SHRINK;
3231 cur = CUR_CHAR(l);
3233 if (ctxt->instate == XML_PARSER_EOF)
3234 return;
3235 if (nbchar != 0) {
3236 buf[nbchar] = 0;
3239 * Ok the segment is to be consumed as chars.
3241 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3242 if (areBlanks(ctxt, buf, nbchar)) {
3243 if (ctxt->keepBlanks) {
3244 if (ctxt->sax->characters != NULL)
3245 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3246 } else {
3247 if (ctxt->sax->ignorableWhitespace != NULL)
3248 ctxt->sax->ignorableWhitespace(ctxt->userData,
3249 buf, nbchar);
3251 } else {
3252 htmlCheckParagraph(ctxt);
3253 if (ctxt->sax->characters != NULL)
3254 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3261 * htmlParseCharData:
3262 * @ctxt: an HTML parser context
3264 * parse a CharData section.
3265 * if we are within a CDATA section ']]>' marks an end of section.
3267 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3270 static void
3271 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272 htmlParseCharDataInternal(ctxt, 0);
3276 * htmlParseExternalID:
3277 * @ctxt: an HTML parser context
3278 * @publicID: a xmlChar** receiving PubidLiteral
3280 * Parse an External ID or a Public ID
3282 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3285 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3287 * Returns the function returns SystemLiteral and in the second
3288 * case publicID receives PubidLiteral, is strict is off
3289 * it is possible to return NULL and have publicID set.
3292 static xmlChar *
3293 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294 xmlChar *URI = NULL;
3296 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299 SKIP(6);
3300 if (!IS_BLANK_CH(CUR)) {
3301 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302 "Space required after 'SYSTEM'\n", NULL, NULL);
3304 SKIP_BLANKS;
3305 URI = htmlParseSystemLiteral(ctxt);
3306 if (URI == NULL) {
3307 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3310 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313 SKIP(6);
3314 if (!IS_BLANK_CH(CUR)) {
3315 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316 "Space required after 'PUBLIC'\n", NULL, NULL);
3318 SKIP_BLANKS;
3319 *publicID = htmlParsePubidLiteral(ctxt);
3320 if (*publicID == NULL) {
3321 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323 NULL, NULL);
3325 SKIP_BLANKS;
3326 if ((CUR == '"') || (CUR == '\'')) {
3327 URI = htmlParseSystemLiteral(ctxt);
3330 return(URI);
3334 * xmlParsePI:
3335 * @ctxt: an XML parser context
3337 * parse an XML Processing Instruction.
3339 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3341 static void
3342 htmlParsePI(htmlParserCtxtPtr ctxt) {
3343 xmlChar *buf = NULL;
3344 int len = 0;
3345 int size = HTML_PARSER_BUFFER_SIZE;
3346 int cur, l;
3347 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3348 XML_MAX_HUGE_LENGTH :
3349 XML_MAX_TEXT_LENGTH;
3350 const xmlChar *target;
3351 xmlParserInputState state;
3353 if ((RAW == '<') && (NXT(1) == '?')) {
3354 state = ctxt->instate;
3355 ctxt->instate = XML_PARSER_PI;
3357 * this is a Processing Instruction.
3359 SKIP(2);
3362 * Parse the target name and check for special support like
3363 * namespace.
3365 target = htmlParseName(ctxt);
3366 if (target != NULL) {
3367 if (RAW == '>') {
3368 SKIP(1);
3371 * SAX: PI detected.
3373 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3374 (ctxt->sax->processingInstruction != NULL))
3375 ctxt->sax->processingInstruction(ctxt->userData,
3376 target, NULL);
3377 ctxt->instate = state;
3378 return;
3380 buf = (xmlChar *) xmlMallocAtomic(size);
3381 if (buf == NULL) {
3382 htmlErrMemory(ctxt, NULL);
3383 ctxt->instate = state;
3384 return;
3386 cur = CUR;
3387 if (!IS_BLANK(cur)) {
3388 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3389 "ParsePI: PI %s space expected\n", target, NULL);
3391 SKIP_BLANKS;
3392 cur = CUR_CHAR(l);
3393 while ((cur != 0) && (cur != '>')) {
3394 if (len + 5 >= size) {
3395 xmlChar *tmp;
3397 size *= 2;
3398 tmp = (xmlChar *) xmlRealloc(buf, size);
3399 if (tmp == NULL) {
3400 htmlErrMemory(ctxt, NULL);
3401 xmlFree(buf);
3402 ctxt->instate = state;
3403 return;
3405 buf = tmp;
3407 if (IS_CHAR(cur)) {
3408 COPY_BUF(l,buf,len,cur);
3409 } else {
3410 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3411 "Invalid char in processing instruction "
3412 "0x%X\n", cur);
3414 if (len > maxLength) {
3415 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3416 "PI %s too long", target, NULL);
3417 xmlFree(buf);
3418 ctxt->instate = state;
3419 return;
3421 NEXTL(l);
3422 cur = CUR_CHAR(l);
3424 buf[len] = 0;
3425 if (ctxt->instate == XML_PARSER_EOF) {
3426 xmlFree(buf);
3427 return;
3429 if (cur != '>') {
3430 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3431 "ParsePI: PI %s never end ...\n", target, NULL);
3432 } else {
3433 SKIP(1);
3436 * SAX: PI detected.
3438 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3439 (ctxt->sax->processingInstruction != NULL))
3440 ctxt->sax->processingInstruction(ctxt->userData,
3441 target, buf);
3443 xmlFree(buf);
3444 } else {
3445 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3446 "PI is not started correctly", NULL, NULL);
3448 ctxt->instate = state;
3453 * htmlParseComment:
3454 * @ctxt: an HTML parser context
3456 * Parse an XML (SGML) comment <!-- .... -->
3458 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3460 static void
3461 htmlParseComment(htmlParserCtxtPtr ctxt) {
3462 xmlChar *buf = NULL;
3463 int len;
3464 int size = HTML_PARSER_BUFFER_SIZE;
3465 int q, ql;
3466 int r, rl;
3467 int cur, l;
3468 int next, nl;
3469 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3470 XML_MAX_HUGE_LENGTH :
3471 XML_MAX_TEXT_LENGTH;
3472 xmlParserInputState state;
3475 * Check that there is a comment right here.
3477 if ((RAW != '<') || (NXT(1) != '!') ||
3478 (NXT(2) != '-') || (NXT(3) != '-')) return;
3480 state = ctxt->instate;
3481 ctxt->instate = XML_PARSER_COMMENT;
3482 SKIP(4);
3483 buf = (xmlChar *) xmlMallocAtomic(size);
3484 if (buf == NULL) {
3485 htmlErrMemory(ctxt, "buffer allocation failed\n");
3486 ctxt->instate = state;
3487 return;
3489 len = 0;
3490 buf[len] = 0;
3491 q = CUR_CHAR(ql);
3492 if (q == 0)
3493 goto unfinished;
3494 if (q == '>') {
3495 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3496 cur = '>';
3497 goto finished;
3499 NEXTL(ql);
3500 r = CUR_CHAR(rl);
3501 if (r == 0)
3502 goto unfinished;
3503 if (q == '-' && r == '>') {
3504 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3505 cur = '>';
3506 goto finished;
3508 NEXTL(rl);
3509 cur = CUR_CHAR(l);
3510 while ((cur != 0) &&
3511 ((cur != '>') ||
3512 (r != '-') || (q != '-'))) {
3513 NEXTL(l);
3514 next = CUR_CHAR(nl);
3516 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3517 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3518 "Comment incorrectly closed by '--!>'", NULL, NULL);
3519 cur = '>';
3520 break;
3523 if (len + 5 >= size) {
3524 xmlChar *tmp;
3526 size *= 2;
3527 tmp = (xmlChar *) xmlRealloc(buf, size);
3528 if (tmp == NULL) {
3529 xmlFree(buf);
3530 htmlErrMemory(ctxt, "growing buffer failed\n");
3531 ctxt->instate = state;
3532 return;
3534 buf = tmp;
3536 if (IS_CHAR(q)) {
3537 COPY_BUF(ql,buf,len,q);
3538 } else {
3539 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3540 "Invalid char in comment 0x%X\n", q);
3542 if (len > maxLength) {
3543 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3544 "comment too long", NULL, NULL);
3545 xmlFree(buf);
3546 ctxt->instate = state;
3547 return;
3550 q = r;
3551 ql = rl;
3552 r = cur;
3553 rl = l;
3554 cur = next;
3555 l = nl;
3557 finished:
3558 buf[len] = 0;
3559 if (ctxt->instate == XML_PARSER_EOF) {
3560 xmlFree(buf);
3561 return;
3563 if (cur == '>') {
3564 NEXT;
3565 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3566 (!ctxt->disableSAX))
3567 ctxt->sax->comment(ctxt->userData, buf);
3568 xmlFree(buf);
3569 ctxt->instate = state;
3570 return;
3573 unfinished:
3574 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3575 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3576 xmlFree(buf);
3580 * htmlParseCharRef:
3581 * @ctxt: an HTML parser context
3583 * DEPRECATED: Internal function, don't use.
3585 * parse Reference declarations
3587 * [66] CharRef ::= '&#' [0-9]+ ';' |
3588 * '&#x' [0-9a-fA-F]+ ';'
3590 * Returns the value parsed (as an int)
3593 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3594 int val = 0;
3596 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3597 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3598 "htmlParseCharRef: context error\n",
3599 NULL, NULL);
3600 return(0);
3602 if ((CUR == '&') && (NXT(1) == '#') &&
3603 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3604 SKIP(3);
3605 while (CUR != ';') {
3606 if ((CUR >= '0') && (CUR <= '9')) {
3607 if (val < 0x110000)
3608 val = val * 16 + (CUR - '0');
3609 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3610 if (val < 0x110000)
3611 val = val * 16 + (CUR - 'a') + 10;
3612 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3613 if (val < 0x110000)
3614 val = val * 16 + (CUR - 'A') + 10;
3615 } else {
3616 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3617 "htmlParseCharRef: missing semicolon\n",
3618 NULL, NULL);
3619 break;
3621 NEXT;
3623 if (CUR == ';')
3624 NEXT;
3625 } else if ((CUR == '&') && (NXT(1) == '#')) {
3626 SKIP(2);
3627 while (CUR != ';') {
3628 if ((CUR >= '0') && (CUR <= '9')) {
3629 if (val < 0x110000)
3630 val = val * 10 + (CUR - '0');
3631 } else {
3632 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3633 "htmlParseCharRef: missing semicolon\n",
3634 NULL, NULL);
3635 break;
3637 NEXT;
3639 if (CUR == ';')
3640 NEXT;
3641 } else {
3642 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3643 "htmlParseCharRef: invalid value\n", NULL, NULL);
3646 * Check the value IS_CHAR ...
3648 if (IS_CHAR(val)) {
3649 return(val);
3650 } else if (val >= 0x110000) {
3651 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3652 "htmlParseCharRef: value too large\n", NULL, NULL);
3653 } else {
3654 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3655 "htmlParseCharRef: invalid xmlChar value %d\n",
3656 val);
3658 return(0);
3663 * htmlParseDocTypeDecl:
3664 * @ctxt: an HTML parser context
3666 * parse a DOCTYPE declaration
3668 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3669 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3672 static void
3673 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3674 const xmlChar *name;
3675 xmlChar *ExternalID = NULL;
3676 xmlChar *URI = NULL;
3679 * We know that '<!DOCTYPE' has been detected.
3681 SKIP(9);
3683 SKIP_BLANKS;
3686 * Parse the DOCTYPE name.
3688 name = htmlParseName(ctxt);
3689 if (name == NULL) {
3690 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3691 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3692 NULL, NULL);
3695 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3698 SKIP_BLANKS;
3701 * Check for SystemID and ExternalID
3703 URI = htmlParseExternalID(ctxt, &ExternalID);
3704 SKIP_BLANKS;
3707 * We should be at the end of the DOCTYPE declaration.
3709 if (CUR != '>') {
3710 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3711 "DOCTYPE improperly terminated\n", NULL, NULL);
3712 /* Ignore bogus content */
3713 while ((CUR != 0) && (CUR != '>') &&
3714 (ctxt->instate != XML_PARSER_EOF))
3715 NEXT;
3717 if (CUR == '>')
3718 NEXT;
3721 * Create or update the document accordingly to the DOCTYPE
3723 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3724 (!ctxt->disableSAX))
3725 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3728 * Cleanup, since we don't use all those identifiers
3730 if (URI != NULL) xmlFree(URI);
3731 if (ExternalID != NULL) xmlFree(ExternalID);
3735 * htmlParseAttribute:
3736 * @ctxt: an HTML parser context
3737 * @value: a xmlChar ** used to store the value of the attribute
3739 * parse an attribute
3741 * [41] Attribute ::= Name Eq AttValue
3743 * [25] Eq ::= S? '=' S?
3745 * With namespace:
3747 * [NS 11] Attribute ::= QName Eq AttValue
3749 * Also the case QName == xmlns:??? is handled independently as a namespace
3750 * definition.
3752 * Returns the attribute name, and the value in *value.
3755 static const xmlChar *
3756 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3757 const xmlChar *name;
3758 xmlChar *val = NULL;
3760 *value = NULL;
3761 name = htmlParseHTMLName(ctxt);
3762 if (name == NULL) {
3763 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3764 "error parsing attribute name\n", NULL, NULL);
3765 return(NULL);
3769 * read the value
3771 SKIP_BLANKS;
3772 if (CUR == '=') {
3773 NEXT;
3774 SKIP_BLANKS;
3775 val = htmlParseAttValue(ctxt);
3778 *value = val;
3779 return(name);
3783 * htmlCheckEncodingDirect:
3784 * @ctxt: an HTML parser context
3785 * @attvalue: the attribute value
3787 * Checks an attribute value to detect
3788 * the encoding
3789 * If a new encoding is detected the parser is switched to decode
3790 * it and pass UTF8
3792 static void
3793 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3795 if ((ctxt == NULL) || (encoding == NULL) ||
3796 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3797 return;
3799 /* do not change encoding */
3800 if (ctxt->input->encoding != NULL)
3801 return;
3803 if (encoding != NULL) {
3804 xmlCharEncoding enc;
3805 xmlCharEncodingHandlerPtr handler;
3807 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3809 if (ctxt->input->encoding != NULL)
3810 xmlFree((xmlChar *) ctxt->input->encoding);
3811 ctxt->input->encoding = xmlStrdup(encoding);
3813 enc = xmlParseCharEncoding((const char *) encoding);
3815 * registered set of known encodings
3817 if (enc != XML_CHAR_ENCODING_ERROR) {
3818 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3819 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3820 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3821 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3822 (ctxt->input->buf != NULL) &&
3823 (ctxt->input->buf->encoder == NULL)) {
3824 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3825 "htmlCheckEncoding: wrong encoding meta\n",
3826 NULL, NULL);
3827 } else {
3828 xmlSwitchEncoding(ctxt, enc);
3830 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3831 } else {
3833 * fallback for unknown encodings
3835 handler = xmlFindCharEncodingHandler((const char *) encoding);
3836 if (handler != NULL) {
3837 xmlSwitchToEncoding(ctxt, handler);
3838 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3839 } else {
3840 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3841 "htmlCheckEncoding: unknown encoding %s\n",
3842 encoding, NULL);
3846 if ((ctxt->input->buf != NULL) &&
3847 (ctxt->input->buf->encoder != NULL) &&
3848 (ctxt->input->buf->raw != NULL) &&
3849 (ctxt->input->buf->buffer != NULL)) {
3850 int nbchars;
3851 size_t processed;
3854 * convert as much as possible to the parser reading buffer.
3856 processed = ctxt->input->cur - ctxt->input->base;
3857 xmlBufShrink(ctxt->input->buf->buffer, processed);
3858 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3859 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3860 if (nbchars < 0) {
3861 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3862 "htmlCheckEncoding: encoder error\n",
3863 NULL, NULL);
3870 * htmlCheckEncoding:
3871 * @ctxt: an HTML parser context
3872 * @attvalue: the attribute value
3874 * Checks an http-equiv attribute from a Meta tag to detect
3875 * the encoding
3876 * If a new encoding is detected the parser is switched to decode
3877 * it and pass UTF8
3879 static void
3880 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3881 const xmlChar *encoding;
3883 if (!attvalue)
3884 return;
3886 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3887 if (encoding != NULL) {
3888 encoding += 7;
3891 * skip blank
3893 if (encoding && IS_BLANK_CH(*encoding))
3894 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3895 if (encoding && *encoding == '=') {
3896 encoding ++;
3897 htmlCheckEncodingDirect(ctxt, encoding);
3902 * htmlCheckMeta:
3903 * @ctxt: an HTML parser context
3904 * @atts: the attributes values
3906 * Checks an attributes from a Meta tag
3908 static void
3909 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3910 int i;
3911 const xmlChar *att, *value;
3912 int http = 0;
3913 const xmlChar *content = NULL;
3915 if ((ctxt == NULL) || (atts == NULL))
3916 return;
3918 i = 0;
3919 att = atts[i++];
3920 while (att != NULL) {
3921 value = atts[i++];
3922 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3923 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3924 http = 1;
3925 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3926 htmlCheckEncodingDirect(ctxt, value);
3927 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3928 content = value;
3929 att = atts[i++];
3931 if ((http) && (content != NULL))
3932 htmlCheckEncoding(ctxt, content);
3937 * htmlParseStartTag:
3938 * @ctxt: an HTML parser context
3940 * parse a start of tag either for rule element or
3941 * EmptyElement. In both case we don't parse the tag closing chars.
3943 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3945 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3947 * With namespace:
3949 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3951 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3953 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3956 static int
3957 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3958 const xmlChar *name;
3959 const xmlChar *attname;
3960 xmlChar *attvalue;
3961 const xmlChar **atts;
3962 int nbatts = 0;
3963 int maxatts;
3964 int meta = 0;
3965 int i;
3966 int discardtag = 0;
3968 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3969 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3970 "htmlParseStartTag: context error\n", NULL, NULL);
3971 return -1;
3973 if (ctxt->instate == XML_PARSER_EOF)
3974 return(-1);
3975 if (CUR != '<') return -1;
3976 NEXT;
3978 atts = ctxt->atts;
3979 maxatts = ctxt->maxatts;
3981 GROW;
3982 name = htmlParseHTMLName(ctxt);
3983 if (name == NULL) {
3984 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3985 "htmlParseStartTag: invalid element name\n",
3986 NULL, NULL);
3987 /* Dump the bogus tag like browsers do */
3988 while ((CUR != 0) && (CUR != '>') &&
3989 (ctxt->instate != XML_PARSER_EOF))
3990 NEXT;
3991 return -1;
3993 if (xmlStrEqual(name, BAD_CAST"meta"))
3994 meta = 1;
3997 * Check for auto-closure of HTML elements.
3999 htmlAutoClose(ctxt, name);
4002 * Check for implied HTML elements.
4004 htmlCheckImplied(ctxt, name);
4007 * Avoid html at any level > 0, head at any level != 1
4008 * or any attempt to recurse body
4010 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4011 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012 "htmlParseStartTag: misplaced <html> tag\n",
4013 name, NULL);
4014 discardtag = 1;
4015 ctxt->depth++;
4017 if ((ctxt->nameNr != 1) &&
4018 (xmlStrEqual(name, BAD_CAST"head"))) {
4019 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4020 "htmlParseStartTag: misplaced <head> tag\n",
4021 name, NULL);
4022 discardtag = 1;
4023 ctxt->depth++;
4025 if (xmlStrEqual(name, BAD_CAST"body")) {
4026 int indx;
4027 for (indx = 0;indx < ctxt->nameNr;indx++) {
4028 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4029 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4030 "htmlParseStartTag: misplaced <body> tag\n",
4031 name, NULL);
4032 discardtag = 1;
4033 ctxt->depth++;
4039 * Now parse the attributes, it ends up with the ending
4041 * (S Attribute)* S?
4043 SKIP_BLANKS;
4044 while ((CUR != 0) &&
4045 (CUR != '>') &&
4046 ((CUR != '/') || (NXT(1) != '>')) &&
4047 (ctxt->instate != XML_PARSER_EOF)) {
4048 GROW;
4049 attname = htmlParseAttribute(ctxt, &attvalue);
4050 if (attname != NULL) {
4053 * Well formedness requires at most one declaration of an attribute
4055 for (i = 0; i < nbatts;i += 2) {
4056 if (xmlStrEqual(atts[i], attname)) {
4057 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4058 "Attribute %s redefined\n", attname, NULL);
4059 if (attvalue != NULL)
4060 xmlFree(attvalue);
4061 goto failed;
4066 * Add the pair to atts
4068 if (atts == NULL) {
4069 maxatts = 22; /* allow for 10 attrs by default */
4070 atts = (const xmlChar **)
4071 xmlMalloc(maxatts * sizeof(xmlChar *));
4072 if (atts == NULL) {
4073 htmlErrMemory(ctxt, NULL);
4074 if (attvalue != NULL)
4075 xmlFree(attvalue);
4076 goto failed;
4078 ctxt->atts = atts;
4079 ctxt->maxatts = maxatts;
4080 } else if (nbatts + 4 > maxatts) {
4081 const xmlChar **n;
4083 maxatts *= 2;
4084 n = (const xmlChar **) xmlRealloc((void *) atts,
4085 maxatts * sizeof(const xmlChar *));
4086 if (n == NULL) {
4087 htmlErrMemory(ctxt, NULL);
4088 if (attvalue != NULL)
4089 xmlFree(attvalue);
4090 goto failed;
4092 atts = n;
4093 ctxt->atts = atts;
4094 ctxt->maxatts = maxatts;
4096 atts[nbatts++] = attname;
4097 atts[nbatts++] = attvalue;
4098 atts[nbatts] = NULL;
4099 atts[nbatts + 1] = NULL;
4101 else {
4102 if (attvalue != NULL)
4103 xmlFree(attvalue);
4104 /* Dump the bogus attribute string up to the next blank or
4105 * the end of the tag. */
4106 while ((CUR != 0) &&
4107 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4108 ((CUR != '/') || (NXT(1) != '>')) &&
4109 (ctxt->instate != XML_PARSER_EOF))
4110 NEXT;
4113 failed:
4114 SKIP_BLANKS;
4118 * Handle specific association to the META tag
4120 if (meta && (nbatts != 0))
4121 htmlCheckMeta(ctxt, atts);
4124 * SAX: Start of Element !
4126 if (!discardtag) {
4127 htmlnamePush(ctxt, name);
4128 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4129 if (nbatts != 0)
4130 ctxt->sax->startElement(ctxt->userData, name, atts);
4131 else
4132 ctxt->sax->startElement(ctxt->userData, name, NULL);
4136 if (atts != NULL) {
4137 for (i = 1;i < nbatts;i += 2) {
4138 if (atts[i] != NULL)
4139 xmlFree((xmlChar *) atts[i]);
4143 return(discardtag);
4147 * htmlParseEndTag:
4148 * @ctxt: an HTML parser context
4150 * parse an end of tag
4152 * [42] ETag ::= '</' Name S? '>'
4154 * With namespace
4156 * [NS 9] ETag ::= '</' QName S? '>'
4158 * Returns 1 if the current level should be closed.
4161 static int
4162 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4164 const xmlChar *name;
4165 const xmlChar *oldname;
4166 int i, ret;
4168 if ((CUR != '<') || (NXT(1) != '/')) {
4169 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4170 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4171 return (0);
4173 SKIP(2);
4175 name = htmlParseHTMLName(ctxt);
4176 if (name == NULL)
4177 return (0);
4179 * We should definitely be at the ending "S? '>'" part
4181 SKIP_BLANKS;
4182 if (CUR != '>') {
4183 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4184 "End tag : expected '>'\n", NULL, NULL);
4185 /* Skip to next '>' */
4186 while ((CUR != 0) && (CUR != '>'))
4187 NEXT;
4189 if (CUR == '>')
4190 NEXT;
4193 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4194 * out now.
4196 if ((ctxt->depth > 0) &&
4197 (xmlStrEqual(name, BAD_CAST "html") ||
4198 xmlStrEqual(name, BAD_CAST "body") ||
4199 xmlStrEqual(name, BAD_CAST "head"))) {
4200 ctxt->depth--;
4201 return (0);
4205 * If the name read is not one of the element in the parsing stack
4206 * then return, it's just an error.
4208 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4209 if (xmlStrEqual(name, ctxt->nameTab[i]))
4210 break;
4212 if (i < 0) {
4213 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4214 "Unexpected end tag : %s\n", name, NULL);
4215 return (0);
4220 * Check for auto-closure of HTML elements.
4223 htmlAutoCloseOnClose(ctxt, name);
4226 * Well formedness constraints, opening and closing must match.
4227 * With the exception that the autoclose may have popped stuff out
4228 * of the stack.
4230 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4231 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4232 "Opening and ending tag mismatch: %s and %s\n",
4233 name, ctxt->name);
4237 * SAX: End of Tag
4239 oldname = ctxt->name;
4240 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4241 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4242 ctxt->sax->endElement(ctxt->userData, name);
4243 htmlNodeInfoPop(ctxt);
4244 htmlnamePop(ctxt);
4245 ret = 1;
4246 } else {
4247 ret = 0;
4250 return (ret);
4255 * htmlParseReference:
4256 * @ctxt: an HTML parser context
4258 * parse and handle entity references in content,
4259 * this will end-up in a call to character() since this is either a
4260 * CharRef, or a predefined entity.
4262 static void
4263 htmlParseReference(htmlParserCtxtPtr ctxt) {
4264 const htmlEntityDesc * ent;
4265 xmlChar out[6];
4266 const xmlChar *name;
4267 if (CUR != '&') return;
4269 if (NXT(1) == '#') {
4270 unsigned int c;
4271 int bits, i = 0;
4273 c = htmlParseCharRef(ctxt);
4274 if (c == 0)
4275 return;
4277 if (c < 0x80) { out[i++]= c; bits= -6; }
4278 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4279 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4280 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4282 for ( ; bits >= 0; bits-= 6) {
4283 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4285 out[i] = 0;
4287 htmlCheckParagraph(ctxt);
4288 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4289 ctxt->sax->characters(ctxt->userData, out, i);
4290 } else {
4291 ent = htmlParseEntityRef(ctxt, &name);
4292 if (name == NULL) {
4293 htmlCheckParagraph(ctxt);
4294 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4295 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4296 return;
4298 if ((ent == NULL) || !(ent->value > 0)) {
4299 htmlCheckParagraph(ctxt);
4300 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4301 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4302 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4303 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4305 } else {
4306 unsigned int c;
4307 int bits, i = 0;
4309 c = ent->value;
4310 if (c < 0x80)
4311 { out[i++]= c; bits= -6; }
4312 else if (c < 0x800)
4313 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4314 else if (c < 0x10000)
4315 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4316 else
4317 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4319 for ( ; bits >= 0; bits-= 6) {
4320 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4322 out[i] = 0;
4324 htmlCheckParagraph(ctxt);
4325 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4326 ctxt->sax->characters(ctxt->userData, out, i);
4332 * htmlParseContent:
4333 * @ctxt: an HTML parser context
4335 * Parse a content: comment, sub-element, reference or text.
4336 * Kept for compatibility with old code
4339 static void
4340 htmlParseContent(htmlParserCtxtPtr ctxt) {
4341 xmlChar *currentNode;
4342 int depth;
4343 const xmlChar *name;
4345 currentNode = xmlStrdup(ctxt->name);
4346 depth = ctxt->nameNr;
4347 while (1) {
4348 GROW;
4350 if (ctxt->instate == XML_PARSER_EOF)
4351 break;
4354 * Our tag or one of it's parent or children is ending.
4356 if ((CUR == '<') && (NXT(1) == '/')) {
4357 if (htmlParseEndTag(ctxt) &&
4358 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4359 if (currentNode != NULL)
4360 xmlFree(currentNode);
4361 return;
4363 continue; /* while */
4366 else if ((CUR == '<') &&
4367 ((IS_ASCII_LETTER(NXT(1))) ||
4368 (NXT(1) == '_') || (NXT(1) == ':'))) {
4369 name = htmlParseHTMLName_nonInvasive(ctxt);
4370 if (name == NULL) {
4371 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4372 "htmlParseStartTag: invalid element name\n",
4373 NULL, NULL);
4374 /* Dump the bogus tag like browsers do */
4375 while ((CUR != 0) && (CUR != '>'))
4376 NEXT;
4378 if (currentNode != NULL)
4379 xmlFree(currentNode);
4380 return;
4383 if (ctxt->name != NULL) {
4384 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4385 htmlAutoClose(ctxt, name);
4386 continue;
4392 * Has this node been popped out during parsing of
4393 * the next element
4395 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4396 (!xmlStrEqual(currentNode, ctxt->name)))
4398 if (currentNode != NULL) xmlFree(currentNode);
4399 return;
4402 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4403 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4405 * Handle SCRIPT/STYLE separately
4407 htmlParseScript(ctxt);
4410 else if ((CUR == '<') && (NXT(1) == '!')) {
4412 * Sometimes DOCTYPE arrives in the middle of the document
4414 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4415 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4416 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4417 (UPP(8) == 'E')) {
4418 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4419 "Misplaced DOCTYPE declaration\n",
4420 BAD_CAST "DOCTYPE" , NULL);
4421 htmlParseDocTypeDecl(ctxt);
4424 * First case : a comment
4426 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4427 htmlParseComment(ctxt);
4429 else {
4430 htmlSkipBogusComment(ctxt);
4435 * Second case : a Processing Instruction.
4437 else if ((CUR == '<') && (NXT(1) == '?')) {
4438 htmlParsePI(ctxt);
4442 * Third case : a sub-element.
4444 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4445 htmlParseElement(ctxt);
4447 else if (CUR == '<') {
4448 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4449 (ctxt->sax->characters != NULL))
4450 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4451 NEXT;
4455 * Fourth case : a reference. If if has not been resolved,
4456 * parsing returns it's Name, create the node
4458 else if (CUR == '&') {
4459 htmlParseReference(ctxt);
4463 * Fifth case : end of the resource
4465 else if (CUR == 0) {
4466 htmlAutoCloseOnEnd(ctxt);
4467 break;
4471 * Last case, text. Note that References are handled directly.
4473 else {
4474 htmlParseCharData(ctxt);
4477 SHRINK;
4478 GROW;
4480 if (currentNode != NULL) xmlFree(currentNode);
4484 * htmlParseElement:
4485 * @ctxt: an HTML parser context
4487 * DEPRECATED: Internal function, don't use.
4489 * parse an HTML element, this is highly recursive
4490 * this is kept for compatibility with previous code versions
4492 * [39] element ::= EmptyElemTag | STag content ETag
4494 * [41] Attribute ::= Name Eq AttValue
4497 void
4498 htmlParseElement(htmlParserCtxtPtr ctxt) {
4499 const xmlChar *name;
4500 xmlChar *currentNode = NULL;
4501 const htmlElemDesc * info;
4502 htmlParserNodeInfo node_info;
4503 int failed;
4504 int depth;
4505 const xmlChar *oldptr;
4507 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4508 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4509 "htmlParseElement: context error\n", NULL, NULL);
4510 return;
4513 if (ctxt->instate == XML_PARSER_EOF)
4514 return;
4516 /* Capture start position */
4517 if (ctxt->record_info) {
4518 node_info.begin_pos = ctxt->input->consumed +
4519 (CUR_PTR - ctxt->input->base);
4520 node_info.begin_line = ctxt->input->line;
4523 failed = htmlParseStartTag(ctxt);
4524 name = ctxt->name;
4525 if ((failed == -1) || (name == NULL)) {
4526 if (CUR == '>')
4527 NEXT;
4528 return;
4532 * Lookup the info for that element.
4534 info = htmlTagLookup(name);
4535 if (info == NULL) {
4536 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4537 "Tag %s invalid\n", name, NULL);
4541 * Check for an Empty Element labeled the XML/SGML way
4543 if ((CUR == '/') && (NXT(1) == '>')) {
4544 SKIP(2);
4545 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4546 ctxt->sax->endElement(ctxt->userData, name);
4547 htmlnamePop(ctxt);
4548 return;
4551 if (CUR == '>') {
4552 NEXT;
4553 } else {
4554 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4555 "Couldn't find end of Start Tag %s\n", name, NULL);
4558 * end of parsing of this node.
4560 if (xmlStrEqual(name, ctxt->name)) {
4561 nodePop(ctxt);
4562 htmlnamePop(ctxt);
4566 * Capture end position and add node
4568 if (ctxt->record_info) {
4569 node_info.end_pos = ctxt->input->consumed +
4570 (CUR_PTR - ctxt->input->base);
4571 node_info.end_line = ctxt->input->line;
4572 node_info.node = ctxt->node;
4573 xmlParserAddNodeInfo(ctxt, &node_info);
4575 return;
4579 * Check for an Empty Element from DTD definition
4581 if ((info != NULL) && (info->empty)) {
4582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4583 ctxt->sax->endElement(ctxt->userData, name);
4584 htmlnamePop(ctxt);
4585 return;
4589 * Parse the content of the element:
4591 currentNode = xmlStrdup(ctxt->name);
4592 depth = ctxt->nameNr;
4593 while (CUR != 0) {
4594 oldptr = ctxt->input->cur;
4595 htmlParseContent(ctxt);
4596 if (oldptr==ctxt->input->cur) break;
4597 if (ctxt->nameNr < depth) break;
4601 * Capture end position and add node
4603 if ( currentNode != NULL && ctxt->record_info ) {
4604 node_info.end_pos = ctxt->input->consumed +
4605 (CUR_PTR - ctxt->input->base);
4606 node_info.end_line = ctxt->input->line;
4607 node_info.node = ctxt->node;
4608 xmlParserAddNodeInfo(ctxt, &node_info);
4610 if (CUR == 0) {
4611 htmlAutoCloseOnEnd(ctxt);
4614 if (currentNode != NULL)
4615 xmlFree(currentNode);
4618 static void
4619 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4621 * Capture end position and add node
4623 if ( ctxt->node != NULL && ctxt->record_info ) {
4624 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4625 (CUR_PTR - ctxt->input->base);
4626 ctxt->nodeInfo->end_line = ctxt->input->line;
4627 ctxt->nodeInfo->node = ctxt->node;
4628 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4629 htmlNodeInfoPop(ctxt);
4631 if (CUR == 0) {
4632 htmlAutoCloseOnEnd(ctxt);
4637 * htmlParseElementInternal:
4638 * @ctxt: an HTML parser context
4640 * parse an HTML element, new version, non recursive
4642 * [39] element ::= EmptyElemTag | STag content ETag
4644 * [41] Attribute ::= Name Eq AttValue
4647 static void
4648 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4649 const xmlChar *name;
4650 const htmlElemDesc * info;
4651 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4652 int failed;
4654 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4655 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4656 "htmlParseElementInternal: context error\n", NULL, NULL);
4657 return;
4660 if (ctxt->instate == XML_PARSER_EOF)
4661 return;
4663 /* Capture start position */
4664 if (ctxt->record_info) {
4665 node_info.begin_pos = ctxt->input->consumed +
4666 (CUR_PTR - ctxt->input->base);
4667 node_info.begin_line = ctxt->input->line;
4670 failed = htmlParseStartTag(ctxt);
4671 name = ctxt->name;
4672 if ((failed == -1) || (name == NULL)) {
4673 if (CUR == '>')
4674 NEXT;
4675 return;
4679 * Lookup the info for that element.
4681 info = htmlTagLookup(name);
4682 if (info == NULL) {
4683 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4684 "Tag %s invalid\n", name, NULL);
4688 * Check for an Empty Element labeled the XML/SGML way
4690 if ((CUR == '/') && (NXT(1) == '>')) {
4691 SKIP(2);
4692 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4693 ctxt->sax->endElement(ctxt->userData, name);
4694 htmlnamePop(ctxt);
4695 return;
4698 if (CUR == '>') {
4699 NEXT;
4700 } else {
4701 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4702 "Couldn't find end of Start Tag %s\n", name, NULL);
4705 * end of parsing of this node.
4707 if (xmlStrEqual(name, ctxt->name)) {
4708 nodePop(ctxt);
4709 htmlnamePop(ctxt);
4712 if (ctxt->record_info)
4713 htmlNodeInfoPush(ctxt, &node_info);
4714 htmlParserFinishElementParsing(ctxt);
4715 return;
4719 * Check for an Empty Element from DTD definition
4721 if ((info != NULL) && (info->empty)) {
4722 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4723 ctxt->sax->endElement(ctxt->userData, name);
4724 htmlnamePop(ctxt);
4725 return;
4728 if (ctxt->record_info)
4729 htmlNodeInfoPush(ctxt, &node_info);
4733 * htmlParseContentInternal:
4734 * @ctxt: an HTML parser context
4736 * Parse a content: comment, sub-element, reference or text.
4737 * New version for non recursive htmlParseElementInternal
4740 static void
4741 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4742 xmlChar *currentNode;
4743 int depth;
4744 const xmlChar *name;
4746 depth = ctxt->nameNr;
4747 if (depth <= 0) {
4748 currentNode = NULL;
4749 } else {
4750 currentNode = xmlStrdup(ctxt->name);
4751 if (currentNode == NULL) {
4752 htmlErrMemory(ctxt, NULL);
4753 return;
4756 while (1) {
4757 GROW;
4759 if (ctxt->instate == XML_PARSER_EOF)
4760 break;
4763 * Our tag or one of it's parent or children is ending.
4765 if ((CUR == '<') && (NXT(1) == '/')) {
4766 if (htmlParseEndTag(ctxt) &&
4767 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4768 if (currentNode != NULL)
4769 xmlFree(currentNode);
4771 depth = ctxt->nameNr;
4772 if (depth <= 0) {
4773 currentNode = NULL;
4774 } else {
4775 currentNode = xmlStrdup(ctxt->name);
4776 if (currentNode == NULL) {
4777 htmlErrMemory(ctxt, NULL);
4778 break;
4782 continue; /* while */
4785 else if ((CUR == '<') &&
4786 ((IS_ASCII_LETTER(NXT(1))) ||
4787 (NXT(1) == '_') || (NXT(1) == ':'))) {
4788 name = htmlParseHTMLName_nonInvasive(ctxt);
4789 if (name == NULL) {
4790 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4791 "htmlParseStartTag: invalid element name\n",
4792 NULL, NULL);
4793 /* Dump the bogus tag like browsers do */
4794 while ((CUR == 0) && (CUR != '>'))
4795 NEXT;
4797 htmlParserFinishElementParsing(ctxt);
4798 if (currentNode != NULL)
4799 xmlFree(currentNode);
4801 currentNode = xmlStrdup(ctxt->name);
4802 if (currentNode == NULL) {
4803 htmlErrMemory(ctxt, NULL);
4804 break;
4806 depth = ctxt->nameNr;
4807 continue;
4810 if (ctxt->name != NULL) {
4811 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4812 htmlAutoClose(ctxt, name);
4813 continue;
4819 * Has this node been popped out during parsing of
4820 * the next element
4822 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4823 (!xmlStrEqual(currentNode, ctxt->name)))
4825 htmlParserFinishElementParsing(ctxt);
4826 if (currentNode != NULL) xmlFree(currentNode);
4828 currentNode = xmlStrdup(ctxt->name);
4829 if (currentNode == NULL) {
4830 htmlErrMemory(ctxt, NULL);
4831 break;
4833 depth = ctxt->nameNr;
4834 continue;
4837 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4838 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4840 * Handle SCRIPT/STYLE separately
4842 htmlParseScript(ctxt);
4845 else if ((CUR == '<') && (NXT(1) == '!')) {
4847 * Sometimes DOCTYPE arrives in the middle of the document
4849 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4850 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4851 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4852 (UPP(8) == 'E')) {
4853 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4854 "Misplaced DOCTYPE declaration\n",
4855 BAD_CAST "DOCTYPE" , NULL);
4856 htmlParseDocTypeDecl(ctxt);
4859 * First case : a comment
4861 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4862 htmlParseComment(ctxt);
4864 else {
4865 htmlSkipBogusComment(ctxt);
4870 * Second case : a Processing Instruction.
4872 else if ((CUR == '<') && (NXT(1) == '?')) {
4873 htmlParsePI(ctxt);
4877 * Third case : a sub-element.
4879 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4880 htmlParseElementInternal(ctxt);
4881 if (currentNode != NULL) xmlFree(currentNode);
4883 currentNode = xmlStrdup(ctxt->name);
4884 if (currentNode == NULL) {
4885 htmlErrMemory(ctxt, NULL);
4886 break;
4888 depth = ctxt->nameNr;
4890 else if (CUR == '<') {
4891 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4892 (ctxt->sax->characters != NULL))
4893 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4894 NEXT;
4898 * Fourth case : a reference. If if has not been resolved,
4899 * parsing returns it's Name, create the node
4901 else if (CUR == '&') {
4902 htmlParseReference(ctxt);
4906 * Fifth case : end of the resource
4908 else if (CUR == 0) {
4909 htmlAutoCloseOnEnd(ctxt);
4910 break;
4914 * Last case, text. Note that References are handled directly.
4916 else {
4917 htmlParseCharData(ctxt);
4920 SHRINK;
4921 GROW;
4923 if (currentNode != NULL) xmlFree(currentNode);
4927 * htmlParseContent:
4928 * @ctxt: an HTML parser context
4930 * Parse a content: comment, sub-element, reference or text.
4931 * This is the entry point when called from parser.c
4934 void
4935 __htmlParseContent(void *ctxt) {
4936 if (ctxt != NULL)
4937 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4941 * htmlParseDocument:
4942 * @ctxt: an HTML parser context
4944 * parse an HTML document (and build a tree if using the standard SAX
4945 * interface).
4947 * Returns 0, -1 in case of error. the parser context is augmented
4948 * as a result of the parsing.
4952 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4953 xmlChar start[4];
4954 xmlCharEncoding enc;
4955 xmlDtdPtr dtd;
4957 xmlInitParser();
4959 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4960 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4961 "htmlParseDocument: context error\n", NULL, NULL);
4962 return(XML_ERR_INTERNAL_ERROR);
4964 GROW;
4966 * SAX: beginning of the document processing.
4968 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4969 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4971 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4972 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4974 * Get the 4 first bytes and decode the charset
4975 * if enc != XML_CHAR_ENCODING_NONE
4976 * plug some encoding conversion routines.
4978 start[0] = RAW;
4979 start[1] = NXT(1);
4980 start[2] = NXT(2);
4981 start[3] = NXT(3);
4982 enc = xmlDetectCharEncoding(&start[0], 4);
4983 if (enc != XML_CHAR_ENCODING_NONE) {
4984 xmlSwitchEncoding(ctxt, enc);
4989 * Wipe out everything which is before the first '<'
4991 SKIP_BLANKS;
4992 if (CUR == 0) {
4993 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4994 "Document is empty\n", NULL, NULL);
4997 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4998 ctxt->sax->startDocument(ctxt->userData);
5002 * Parse possible comments and PIs before any content
5004 while (((CUR == '<') && (NXT(1) == '!') &&
5005 (NXT(2) == '-') && (NXT(3) == '-')) ||
5006 ((CUR == '<') && (NXT(1) == '?'))) {
5007 htmlParseComment(ctxt);
5008 htmlParsePI(ctxt);
5009 SKIP_BLANKS;
5014 * Then possibly doc type declaration(s) and more Misc
5015 * (doctypedecl Misc*)?
5017 if ((CUR == '<') && (NXT(1) == '!') &&
5018 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5019 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5020 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5021 (UPP(8) == 'E')) {
5022 htmlParseDocTypeDecl(ctxt);
5024 SKIP_BLANKS;
5027 * Parse possible comments and PIs before any content
5029 while (((CUR == '<') && (NXT(1) == '!') &&
5030 (NXT(2) == '-') && (NXT(3) == '-')) ||
5031 ((CUR == '<') && (NXT(1) == '?'))) {
5032 htmlParseComment(ctxt);
5033 htmlParsePI(ctxt);
5034 SKIP_BLANKS;
5038 * Time to start parsing the tree itself
5040 htmlParseContentInternal(ctxt);
5043 * autoclose
5045 if (CUR == 0)
5046 htmlAutoCloseOnEnd(ctxt);
5050 * SAX: end of the document processing.
5052 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5053 ctxt->sax->endDocument(ctxt->userData);
5055 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5056 dtd = xmlGetIntSubset(ctxt->myDoc);
5057 if (dtd == NULL)
5058 ctxt->myDoc->intSubset =
5059 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5060 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5061 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5063 if (! ctxt->wellFormed) return(-1);
5064 return(0);
5068 /************************************************************************
5070 * Parser contexts handling *
5072 ************************************************************************/
5075 * htmlInitParserCtxt:
5076 * @ctxt: an HTML parser context
5077 * @sax: SAX handler
5078 * @userData: user data
5080 * Initialize a parser context
5082 * Returns 0 in case of success and -1 in case of error
5085 static int
5086 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
5087 void *userData)
5089 if (ctxt == NULL) return(-1);
5090 memset(ctxt, 0, sizeof(htmlParserCtxt));
5092 ctxt->dict = xmlDictCreate();
5093 if (ctxt->dict == NULL) {
5094 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5095 return(-1);
5098 if (ctxt->sax == NULL)
5099 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5100 if (ctxt->sax == NULL) {
5101 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5102 return(-1);
5104 if (sax == NULL) {
5105 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
5106 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
5107 ctxt->userData = ctxt;
5108 } else {
5109 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5110 ctxt->userData = userData ? userData : ctxt;
5113 /* Allocate the Input stack */
5114 ctxt->inputTab = (htmlParserInputPtr *)
5115 xmlMalloc(5 * sizeof(htmlParserInputPtr));
5116 if (ctxt->inputTab == NULL) {
5117 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5118 ctxt->inputNr = 0;
5119 ctxt->inputMax = 0;
5120 ctxt->input = NULL;
5121 return(-1);
5123 ctxt->inputNr = 0;
5124 ctxt->inputMax = 5;
5125 ctxt->input = NULL;
5126 ctxt->version = NULL;
5127 ctxt->encoding = NULL;
5128 ctxt->standalone = -1;
5129 ctxt->instate = XML_PARSER_START;
5131 /* Allocate the Node stack */
5132 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5133 if (ctxt->nodeTab == NULL) {
5134 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5135 ctxt->nodeNr = 0;
5136 ctxt->nodeMax = 0;
5137 ctxt->node = NULL;
5138 ctxt->inputNr = 0;
5139 ctxt->inputMax = 0;
5140 ctxt->input = NULL;
5141 return(-1);
5143 ctxt->nodeNr = 0;
5144 ctxt->nodeMax = 10;
5145 ctxt->node = NULL;
5147 /* Allocate the Name stack */
5148 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5149 if (ctxt->nameTab == NULL) {
5150 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5151 ctxt->nameNr = 0;
5152 ctxt->nameMax = 0;
5153 ctxt->name = NULL;
5154 ctxt->nodeNr = 0;
5155 ctxt->nodeMax = 0;
5156 ctxt->node = NULL;
5157 ctxt->inputNr = 0;
5158 ctxt->inputMax = 0;
5159 ctxt->input = NULL;
5160 return(-1);
5162 ctxt->nameNr = 0;
5163 ctxt->nameMax = 10;
5164 ctxt->name = NULL;
5166 ctxt->nodeInfoTab = NULL;
5167 ctxt->nodeInfoNr = 0;
5168 ctxt->nodeInfoMax = 0;
5170 ctxt->myDoc = NULL;
5171 ctxt->wellFormed = 1;
5172 ctxt->replaceEntities = 0;
5173 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5174 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5175 ctxt->html = 1;
5176 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5177 ctxt->vctxt.userData = ctxt;
5178 ctxt->vctxt.error = xmlParserValidityError;
5179 ctxt->vctxt.warning = xmlParserValidityWarning;
5180 ctxt->record_info = 0;
5181 ctxt->validate = 0;
5182 ctxt->checkIndex = 0;
5183 ctxt->catalogs = NULL;
5184 xmlInitNodeInfoSeq(&ctxt->node_seq);
5185 return(0);
5189 * htmlFreeParserCtxt:
5190 * @ctxt: an HTML parser context
5192 * Free all the memory used by a parser context. However the parsed
5193 * document in ctxt->myDoc is not freed.
5196 void
5197 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5199 xmlFreeParserCtxt(ctxt);
5203 * htmlNewParserCtxt:
5205 * Allocate and initialize a new parser context.
5207 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5210 htmlParserCtxtPtr
5211 htmlNewParserCtxt(void)
5213 return(htmlNewSAXParserCtxt(NULL, NULL));
5217 * htmlNewSAXParserCtxt:
5218 * @sax: SAX handler
5219 * @userData: user data
5221 * Allocate and initialize a new SAX parser context. If userData is NULL,
5222 * the parser context will be passed as user data.
5224 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5227 htmlParserCtxtPtr
5228 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5230 xmlParserCtxtPtr ctxt;
5232 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5233 if (ctxt == NULL) {
5234 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5235 return(NULL);
5237 memset(ctxt, 0, sizeof(xmlParserCtxt));
5238 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5239 htmlFreeParserCtxt(ctxt);
5240 return(NULL);
5242 return(ctxt);
5246 * htmlCreateMemoryParserCtxt:
5247 * @buffer: a pointer to a char array
5248 * @size: the size of the array
5250 * Create a parser context for an HTML in-memory document.
5252 * Returns the new parser context or NULL
5254 htmlParserCtxtPtr
5255 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5256 xmlParserCtxtPtr ctxt;
5257 xmlParserInputPtr input;
5258 xmlParserInputBufferPtr buf;
5260 if (buffer == NULL)
5261 return(NULL);
5262 if (size <= 0)
5263 return(NULL);
5265 ctxt = htmlNewParserCtxt();
5266 if (ctxt == NULL)
5267 return(NULL);
5269 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5270 if (buf == NULL) {
5271 xmlFreeParserCtxt(ctxt);
5272 return(NULL);
5275 input = xmlNewInputStream(ctxt);
5276 if (input == NULL) {
5277 xmlFreeParserInputBuffer(buf);
5278 xmlFreeParserCtxt(ctxt);
5279 return(NULL);
5282 input->filename = NULL;
5283 input->buf = buf;
5284 xmlBufResetInput(buf->buffer, input);
5286 inputPush(ctxt, input);
5287 return(ctxt);
5291 * htmlCreateDocParserCtxt:
5292 * @cur: a pointer to an array of xmlChar
5293 * @encoding: a free form C string describing the HTML document encoding, or NULL
5295 * Create a parser context for an HTML document.
5297 * TODO: check the need to add encoding handling there
5299 * Returns the new parser context or NULL
5301 static htmlParserCtxtPtr
5302 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5303 int len;
5304 htmlParserCtxtPtr ctxt;
5306 if (cur == NULL)
5307 return(NULL);
5308 len = xmlStrlen(cur);
5309 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5310 if (ctxt == NULL)
5311 return(NULL);
5313 if (encoding != NULL) {
5314 xmlCharEncoding enc;
5315 xmlCharEncodingHandlerPtr handler;
5317 if (ctxt->input->encoding != NULL)
5318 xmlFree((xmlChar *) ctxt->input->encoding);
5319 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5321 enc = xmlParseCharEncoding(encoding);
5323 * registered set of known encodings
5325 if (enc != XML_CHAR_ENCODING_ERROR) {
5326 xmlSwitchEncoding(ctxt, enc);
5327 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5328 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5329 "Unsupported encoding %s\n",
5330 (const xmlChar *) encoding, NULL);
5332 } else {
5334 * fallback for unknown encodings
5336 handler = xmlFindCharEncodingHandler((const char *) encoding);
5337 if (handler != NULL) {
5338 xmlSwitchToEncoding(ctxt, handler);
5339 } else {
5340 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5341 "Unsupported encoding %s\n",
5342 (const xmlChar *) encoding, NULL);
5346 return(ctxt);
5349 #ifdef LIBXML_PUSH_ENABLED
5350 /************************************************************************
5352 * Progressive parsing interfaces *
5354 ************************************************************************/
5357 * htmlParseLookupSequence:
5358 * @ctxt: an HTML parser context
5359 * @first: the first char to lookup
5360 * @next: the next char to lookup or zero
5361 * @third: the next char to lookup or zero
5362 * @ignoreattrval: skip over attribute values
5364 * Try to find if a sequence (first, next, third) or just (first next) or
5365 * (first) is available in the input stream.
5366 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5367 * to avoid rescanning sequences of bytes, it DOES change the state of the
5368 * parser, do not use liberally.
5369 * This is basically similar to xmlParseLookupSequence()
5371 * Returns the index to the current parsing point if the full sequence
5372 * is available, -1 otherwise.
5374 static int
5375 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5376 xmlChar next, xmlChar third, int ignoreattrval)
5378 size_t base, len;
5379 htmlParserInputPtr in;
5380 const xmlChar *buf;
5381 int quote;
5383 in = ctxt->input;
5384 if (in == NULL)
5385 return (-1);
5387 base = ctxt->checkIndex;
5388 quote = ctxt->endCheckState;
5390 buf = in->cur;
5391 len = in->end - in->cur;
5393 /* take into account the sequence length */
5394 if (third)
5395 len -= 2;
5396 else if (next)
5397 len--;
5398 for (; base < len; base++) {
5399 if (base >= INT_MAX / 2) {
5400 ctxt->checkIndex = 0;
5401 ctxt->endCheckState = 0;
5402 return (base - 2);
5404 if (ignoreattrval) {
5405 if (quote) {
5406 if (buf[base] == quote)
5407 quote = 0;
5408 continue;
5410 if (buf[base] == '"' || buf[base] == '\'') {
5411 quote = buf[base];
5412 continue;
5415 if (buf[base] == first) {
5416 if (third != 0) {
5417 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5418 continue;
5419 } else if (next != 0) {
5420 if (buf[base + 1] != next)
5421 continue;
5423 ctxt->checkIndex = 0;
5424 ctxt->endCheckState = 0;
5425 return (base);
5428 ctxt->checkIndex = base;
5429 ctxt->endCheckState = quote;
5430 #ifdef DEBUG_PUSH
5431 if (next == 0)
5432 xmlGenericError(xmlGenericErrorContext,
5433 "HPP: lookup '%c' failed\n", first);
5434 else if (third == 0)
5435 xmlGenericError(xmlGenericErrorContext,
5436 "HPP: lookup '%c%c' failed\n", first, next);
5437 else
5438 xmlGenericError(xmlGenericErrorContext,
5439 "HPP: lookup '%c%c%c' failed\n", first, next,
5440 third);
5441 #endif
5442 return (-1);
5446 * htmlParseLookupCommentEnd:
5447 * @ctxt: an HTML parser context
5449 * Try to find a comment end tag in the input stream
5450 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5451 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5452 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5453 * to avoid rescanning sequences of bytes, it DOES change the state of the
5454 * parser, do not use liberally.
5455 * This wraps to htmlParseLookupSequence()
5457 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5459 static int
5460 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5462 int mark = 0;
5463 int offset;
5465 while (1) {
5466 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5467 if (mark < 0)
5468 break;
5469 if ((NXT(mark+2) == '>') ||
5470 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5471 ctxt->checkIndex = 0;
5472 break;
5474 offset = (NXT(mark+2) == '!') ? 3 : 2;
5475 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5476 ctxt->checkIndex = mark;
5477 return(-1);
5479 ctxt->checkIndex = mark + 1;
5481 return mark;
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Returns zero if no parsing was possible
 */
/*
 * htmlParseTryOrFinish: state-machine driver used by the push parser.
 *
 * Each loop iteration looks at ctxt->instate and at the bytes available
 * between in->cur and in->end, then either parses one construct
 * (DOCTYPE, comment, PI, start tag, end tag, script content, character
 * data), switches ctxt->instate, or jumps to the `done` label.
 *
 * ctxt:      the HTML parser context.
 * terminate: non-zero for the last chunk; in that case the look-ahead
 *            checks done through htmlParseLookupSequence() and
 *            htmlParseLookupCommentEnd() are skipped.
 *
 * Returns `ret`, which is initialised to 0 and is not assigned anywhere
 * else in this function.
 */
5494 static int
5495 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5496 int ret = 0;
5497 htmlParserInputPtr in;
5498 ptrdiff_t avail = 0;
5499 xmlChar cur, next;
5501 htmlParserNodeInfo node_info;
/* Optional trace of the state the parser is in on entry. */
5503 #ifdef DEBUG_PUSH
5504 switch (ctxt->instate) {
5505 case XML_PARSER_EOF:
5506 xmlGenericError(xmlGenericErrorContext,
5507 "HPP: try EOF\n"); break;
5508 case XML_PARSER_START:
5509 xmlGenericError(xmlGenericErrorContext,
5510 "HPP: try START\n"); break;
5511 case XML_PARSER_MISC:
5512 xmlGenericError(xmlGenericErrorContext,
5513 "HPP: try MISC\n");break;
5514 case XML_PARSER_COMMENT:
5515 xmlGenericError(xmlGenericErrorContext,
5516 "HPP: try COMMENT\n");break;
5517 case XML_PARSER_PROLOG:
5518 xmlGenericError(xmlGenericErrorContext,
5519 "HPP: try PROLOG\n");break;
5520 case XML_PARSER_START_TAG:
5521 xmlGenericError(xmlGenericErrorContext,
5522 "HPP: try START_TAG\n");break;
5523 case XML_PARSER_CONTENT:
5524 xmlGenericError(xmlGenericErrorContext,
5525 "HPP: try CONTENT\n");break;
5526 case XML_PARSER_CDATA_SECTION:
5527 xmlGenericError(xmlGenericErrorContext,
5528 "HPP: try CDATA_SECTION\n");break;
5529 case XML_PARSER_END_TAG:
5530 xmlGenericError(xmlGenericErrorContext,
5531 "HPP: try END_TAG\n");break;
5532 case XML_PARSER_ENTITY_DECL:
5533 xmlGenericError(xmlGenericErrorContext,
5534 "HPP: try ENTITY_DECL\n");break;
5535 case XML_PARSER_ENTITY_VALUE:
5536 xmlGenericError(xmlGenericErrorContext,
5537 "HPP: try ENTITY_VALUE\n");break;
5538 case XML_PARSER_ATTRIBUTE_VALUE:
5539 xmlGenericError(xmlGenericErrorContext,
5540 "HPP: try ATTRIBUTE_VALUE\n");break;
5541 case XML_PARSER_DTD:
5542 xmlGenericError(xmlGenericErrorContext,
5543 "HPP: try DTD\n");break;
5544 case XML_PARSER_EPILOG:
5545 xmlGenericError(xmlGenericErrorContext,
5546 "HPP: try EPILOG\n");break;
5547 case XML_PARSER_PI:
5548 xmlGenericError(xmlGenericErrorContext,
5549 "HPP: try PI\n");break;
5550 case XML_PARSER_SYSTEM_LITERAL:
5551 xmlGenericError(xmlGenericErrorContext,
5552 "HPP: try SYSTEM_LITERAL\n");break;
5554 #endif
/* Main loop: one state-machine step per iteration. */
5556 while (1) {
5558 in = ctxt->input;
5559 if (in == NULL) break;
5560 avail = in->end - in->cur;
/*
 * No bytes left on the final chunk: call htmlAutoCloseOnEnd() and, if
 * the name stack is empty and EOF was not reached yet, switch to EOF
 * and fire the SAX endDocument callback.
 */
5561 if ((avail == 0) && (terminate)) {
5562 htmlAutoCloseOnEnd(ctxt);
5563 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5565 * SAX: end of the document processing.
5567 ctxt->instate = XML_PARSER_EOF;
5568 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5569 ctxt->sax->endDocument(ctxt->userData);
5572 if (avail < 1)
5573 goto done;
5575 * This is done to make progress and avoid an infinite loop
5576 * if a parsing attempt was aborted by hitting a NUL byte. After
5577 * changing htmlCurrentChar, this probably isn't necessary anymore.
5578 * We should consider removing this check.
5580 cur = in->cur[0];
5581 if (cur == 0) {
5582 SKIP(1);
5583 continue;
5586 switch (ctxt->instate) {
5587 case XML_PARSER_EOF:
5589 * Document parsing is done !
5591 goto done;
/*
 * START: skip leading blanks, call setDocumentLocator and startDocument
 * when the SAX handler provides them, then parse a DOCTYPE if one is
 * next (state becomes PROLOG); otherwise the state becomes MISC.
 */
5592 case XML_PARSER_START:
5594 * Very first chars read from the document flow.
5596 cur = in->cur[0];
5597 if (IS_BLANK_CH(cur)) {
5598 SKIP_BLANKS;
5599 avail = in->end - in->cur;
5601 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5602 ctxt->sax->setDocumentLocator(ctxt->userData,
5603 &xmlDefaultSAXLocator);
5604 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5605 (!ctxt->disableSAX))
5606 ctxt->sax->startDocument(ctxt->userData);
5608 cur = in->cur[0];
5609 next = in->cur[1];
5610 if ((cur == '<') && (next == '!') &&
5611 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5612 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5613 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5614 (UPP(8) == 'E')) {
5615 if ((!terminate) &&
5616 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5617 goto done;
5618 #ifdef DEBUG_PUSH
5619 xmlGenericError(xmlGenericErrorContext,
5620 "HPP: Parsing internal subset\n");
5621 #endif
5622 htmlParseDocTypeDecl(ctxt);
5623 ctxt->instate = XML_PARSER_PROLOG;
5624 #ifdef DEBUG_PUSH
5625 xmlGenericError(xmlGenericErrorContext,
5626 "HPP: entering PROLOG\n");
5627 #endif
5628 } else {
5629 ctxt->instate = XML_PARSER_MISC;
5630 #ifdef DEBUG_PUSH
5631 xmlGenericError(xmlGenericErrorContext,
5632 "HPP: entering MISC\n");
5633 #endif
5635 break;
/*
 * MISC: comments and PIs are parsed and the state stays MISC; a DOCTYPE
 * is parsed and the state becomes PROLOG; "<!" with fewer than 9 bytes
 * available waits for more data; anything else switches to CONTENT.
 */
5636 case XML_PARSER_MISC:
5637 SKIP_BLANKS;
5638 avail = in->end - in->cur;
5640 * no chars in buffer
5642 if (avail < 1)
5643 goto done;
5645 * not enough chars in buffer
5647 if (avail < 2) {
5648 if (!terminate)
5649 goto done;
5650 else
5651 next = ' ';
5652 } else {
5653 next = in->cur[1];
5655 cur = in->cur[0];
5656 if ((cur == '<') && (next == '!') &&
5657 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5658 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5659 goto done;
5660 #ifdef DEBUG_PUSH
5661 xmlGenericError(xmlGenericErrorContext,
5662 "HPP: Parsing Comment\n");
5663 #endif
5664 htmlParseComment(ctxt);
5665 ctxt->instate = XML_PARSER_MISC;
5666 } else if ((cur == '<') && (next == '?')) {
5667 if ((!terminate) &&
5668 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5669 goto done;
5670 #ifdef DEBUG_PUSH
5671 xmlGenericError(xmlGenericErrorContext,
5672 "HPP: Parsing PI\n");
5673 #endif
5674 htmlParsePI(ctxt);
5675 ctxt->instate = XML_PARSER_MISC;
5676 } else if ((cur == '<') && (next == '!') &&
5677 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5678 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5679 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5680 (UPP(8) == 'E')) {
5681 if ((!terminate) &&
5682 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5683 goto done;
5684 #ifdef DEBUG_PUSH
5685 xmlGenericError(xmlGenericErrorContext,
5686 "HPP: Parsing internal subset\n");
5687 #endif
5688 htmlParseDocTypeDecl(ctxt);
5689 ctxt->instate = XML_PARSER_PROLOG;
5690 #ifdef DEBUG_PUSH
5691 xmlGenericError(xmlGenericErrorContext,
5692 "HPP: entering PROLOG\n");
5693 #endif
5694 } else if ((cur == '<') && (next == '!') &&
5695 (avail < 9)) {
5696 goto done;
5697 } else {
5698 ctxt->instate = XML_PARSER_CONTENT;
/* NOTE(review): the trace text below says START_TAG, but the state set
 * just above is XML_PARSER_CONTENT. */
5699 #ifdef DEBUG_PUSH
5700 xmlGenericError(xmlGenericErrorContext,
5701 "HPP: entering START_TAG\n");
5702 #endif
5704 break;
/*
 * PROLOG: needs at least two bytes. Comments and PIs are parsed and the
 * state stays PROLOG; "<!" with fewer than 4 bytes available waits for
 * more data; anything else switches to CONTENT.
 */
5705 case XML_PARSER_PROLOG:
5706 SKIP_BLANKS;
5707 avail = in->end - in->cur;
5708 if (avail < 2)
5709 goto done;
5710 cur = in->cur[0];
5711 next = in->cur[1];
5712 if ((cur == '<') && (next == '!') &&
5713 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5714 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5715 goto done;
5716 #ifdef DEBUG_PUSH
5717 xmlGenericError(xmlGenericErrorContext,
5718 "HPP: Parsing Comment\n");
5719 #endif
5720 htmlParseComment(ctxt);
5721 ctxt->instate = XML_PARSER_PROLOG;
5722 } else if ((cur == '<') && (next == '?')) {
5723 if ((!terminate) &&
5724 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5725 goto done;
5726 #ifdef DEBUG_PUSH
5727 xmlGenericError(xmlGenericErrorContext,
5728 "HPP: Parsing PI\n");
5729 #endif
5730 htmlParsePI(ctxt);
5731 ctxt->instate = XML_PARSER_PROLOG;
5732 } else if ((cur == '<') && (next == '!') &&
5733 (avail < 4)) {
5734 goto done;
5735 } else {
5736 ctxt->instate = XML_PARSER_CONTENT;
/* NOTE(review): the trace text below says START_TAG, but the state set
 * just above is XML_PARSER_CONTENT. */
5737 #ifdef DEBUG_PUSH
5738 xmlGenericError(xmlGenericErrorContext,
5739 "HPP: entering START_TAG\n");
5740 #endif
5742 break;
/*
 * EPILOG: a leading blank is handed to htmlParseCharData() and the
 * function leaves through `done`. Comments and PIs are parsed and the
 * state stays EPILOG. Any other content records XML_ERR_DOCUMENT_END,
 * clears wellFormed, switches to EOF and fires endDocument.
 */
5743 case XML_PARSER_EPILOG:
5744 avail = in->end - in->cur;
5745 if (avail < 1)
5746 goto done;
5747 cur = in->cur[0];
5748 if (IS_BLANK_CH(cur)) {
5749 htmlParseCharData(ctxt);
5750 goto done;
5752 if (avail < 2)
5753 goto done;
5754 next = in->cur[1];
5755 if ((cur == '<') && (next == '!') &&
5756 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5757 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5758 goto done;
5759 #ifdef DEBUG_PUSH
5760 xmlGenericError(xmlGenericErrorContext,
5761 "HPP: Parsing Comment\n");
5762 #endif
5763 htmlParseComment(ctxt);
5764 ctxt->instate = XML_PARSER_EPILOG;
5765 } else if ((cur == '<') && (next == '?')) {
5766 if ((!terminate) &&
5767 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5768 goto done;
5769 #ifdef DEBUG_PUSH
5770 xmlGenericError(xmlGenericErrorContext,
5771 "HPP: Parsing PI\n");
5772 #endif
5773 htmlParsePI(ctxt);
5774 ctxt->instate = XML_PARSER_EPILOG;
5775 } else if ((cur == '<') && (next == '!') &&
5776 (avail < 4)) {
5777 goto done;
5778 } else {
5779 ctxt->errNo = XML_ERR_DOCUMENT_END;
5780 ctxt->wellFormed = 0;
5781 ctxt->instate = XML_PARSER_EOF;
5782 #ifdef DEBUG_PUSH
5783 xmlGenericError(xmlGenericErrorContext,
5784 "HPP: entering EOF\n");
5785 #endif
5786 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5787 ctxt->sax->endDocument(ctxt->userData);
5788 goto done;
5790 break;
/*
 * START_TAG: anything not starting with '<' goes back to CONTENT and
 * "</" goes to END_TAG. Otherwise, unless terminating, wait until a '>'
 * is available, then call htmlParseStartTag(), look the element up with
 * htmlTagLookup(), and handle the "/>" form, a missing '>' and elements
 * whose descriptor is marked empty. Every path that gets past the
 * failure check ends in CONTENT.
 */
5791 case XML_PARSER_START_TAG: {
5792 const xmlChar *name;
5793 int failed;
5794 const htmlElemDesc * info;
5797 * no chars in buffer
5799 if (avail < 1)
5800 goto done;
5802 * not enough chars in buffer
5804 if (avail < 2) {
5805 if (!terminate)
5806 goto done;
5807 else
5808 next = ' ';
5809 } else {
5810 next = in->cur[1];
5812 cur = in->cur[0];
5813 if (cur != '<') {
5814 ctxt->instate = XML_PARSER_CONTENT;
5815 #ifdef DEBUG_PUSH
5816 xmlGenericError(xmlGenericErrorContext,
5817 "HPP: entering CONTENT\n");
5818 #endif
5819 break;
5821 if (next == '/') {
5822 ctxt->instate = XML_PARSER_END_TAG;
5823 ctxt->checkIndex = 0;
5824 #ifdef DEBUG_PUSH
5825 xmlGenericError(xmlGenericErrorContext,
5826 "HPP: entering END_TAG\n");
5827 #endif
5828 break;
5830 if ((!terminate) &&
5831 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5832 goto done;
5834 /* Capture start position */
5835 if (ctxt->record_info) {
5836 node_info.begin_pos = ctxt->input->consumed +
5837 (CUR_PTR - ctxt->input->base);
5838 node_info.begin_line = ctxt->input->line;
5842 failed = htmlParseStartTag(ctxt);
5843 name = ctxt->name;
5844 if ((failed == -1) ||
5845 (name == NULL)) {
5846 if (CUR == '>')
5847 NEXT;
5848 break;
5852 * Lookup the info for that element.
5854 info = htmlTagLookup(name);
5855 if (info == NULL) {
5856 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5857 "Tag %s invalid\n", name, NULL);
5861 * Check for an Empty Element labeled the XML/SGML way
5863 if ((CUR == '/') && (NXT(1) == '>')) {
5864 SKIP(2);
5865 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5866 ctxt->sax->endElement(ctxt->userData, name);
5867 htmlnamePop(ctxt);
5868 ctxt->instate = XML_PARSER_CONTENT;
5869 #ifdef DEBUG_PUSH
5870 xmlGenericError(xmlGenericErrorContext,
5871 "HPP: entering CONTENT\n");
5872 #endif
5873 break;
5876 if (CUR == '>') {
5877 NEXT;
5878 } else {
5879 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5880 "Couldn't find end of Start Tag %s\n",
5881 name, NULL);
5884 * end of parsing of this node.
5886 if (xmlStrEqual(name, ctxt->name)) {
5887 nodePop(ctxt);
5888 htmlnamePop(ctxt);
5891 if (ctxt->record_info)
5892 htmlNodeInfoPush(ctxt, &node_info);
5894 ctxt->instate = XML_PARSER_CONTENT;
5895 #ifdef DEBUG_PUSH
5896 xmlGenericError(xmlGenericErrorContext,
5897 "HPP: entering CONTENT\n");
5898 #endif
5899 break;
5903 * Check for an Empty Element from DTD definition
5905 if ((info != NULL) && (info->empty)) {
5906 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5907 ctxt->sax->endElement(ctxt->userData, name);
5908 htmlnamePop(ctxt);
5911 if (ctxt->record_info)
5912 htmlNodeInfoPush(ctxt, &node_info);
5914 ctxt->instate = XML_PARSER_CONTENT;
5915 #ifdef DEBUG_PUSH
5916 xmlGenericError(xmlGenericErrorContext,
5917 "HPP: entering CONTENT\n");
5918 #endif
5919 break;
/*
 * CONTENT: first flush a pending ctxt->token as one character, then
 * handle the single-trailing-byte case on the last chunk. With two or
 * more bytes available, dispatch on the current element name and on the
 * first two bytes: script/style content, "<!" (DOCTYPE, comment or
 * bogus comment), "<?" (PI), "</" (END_TAG), '<' + ASCII letter
 * (START_TAG), a lone '<' (reported as the character "<"), or
 * character data and references.
 */
5921 case XML_PARSER_CONTENT: {
5922 xmlChar chr[2] = { 0, 0 };
5925 * Handle preparsed entities and charRef
5927 if (ctxt->token != 0) {
5928 chr[0] = ctxt->token;
5929 htmlCheckParagraph(ctxt);
5930 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5931 ctxt->sax->characters(ctxt->userData, chr, 1);
5932 ctxt->token = 0;
5933 ctxt->checkIndex = 0;
5935 if ((avail == 1) && (terminate)) {
5936 cur = in->cur[0];
5937 if ((cur != '<') && (cur != '&')) {
5938 if (ctxt->sax != NULL) {
5939 chr[0] = cur;
5940 if (IS_BLANK_CH(cur)) {
5941 if (ctxt->keepBlanks) {
5942 if (ctxt->sax->characters != NULL)
5943 ctxt->sax->characters(
5944 ctxt->userData, chr, 1);
5945 } else {
5946 if (ctxt->sax->ignorableWhitespace != NULL)
5947 ctxt->sax->ignorableWhitespace(
5948 ctxt->userData, chr, 1);
5950 } else {
5951 htmlCheckParagraph(ctxt);
5952 if (ctxt->sax->characters != NULL)
5953 ctxt->sax->characters(
5954 ctxt->userData, chr, 1);
5957 ctxt->token = 0;
5958 ctxt->checkIndex = 0;
5959 in->cur++;
5960 break;
5963 if (avail < 2)
5964 goto done;
5965 cur = in->cur[0];
5966 next = in->cur[1];
5967 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5968 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5970 * Handle SCRIPT/STYLE separately
5972 if (!terminate) {
5973 int idx;
5974 xmlChar val;
5976 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5977 if (idx < 0)
5978 goto done;
5979 val = in->cur[idx + 2];
5980 if (val == 0) { /* bad cut of input */
5982 * FIXME: htmlParseScript checks for additional
5983 * characters after '</'.
5985 ctxt->checkIndex = idx;
5986 goto done;
5989 htmlParseScript(ctxt);
5990 if ((cur == '<') && (next == '/')) {
5991 ctxt->instate = XML_PARSER_END_TAG;
5992 ctxt->checkIndex = 0;
5993 #ifdef DEBUG_PUSH
5994 xmlGenericError(xmlGenericErrorContext,
5995 "HPP: entering END_TAG\n");
5996 #endif
5997 break;
5999 } else if ((cur == '<') && (next == '!')) {
6000 if (avail < 4)
6001 goto done;
6003 * Sometimes DOCTYPE arrives in the middle of the document
6005 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
6006 (UPP(4) == 'C') && (UPP(5) == 'T') &&
6007 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
6008 (UPP(8) == 'E')) {
6009 if ((!terminate) &&
6010 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
6011 goto done;
6012 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
6013 "Misplaced DOCTYPE declaration\n",
6014 BAD_CAST "DOCTYPE" , NULL);
6015 htmlParseDocTypeDecl(ctxt);
6016 } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
6017 if ((!terminate) &&
6018 (htmlParseLookupCommentEnd(ctxt) < 0))
6019 goto done;
6020 #ifdef DEBUG_PUSH
6021 xmlGenericError(xmlGenericErrorContext,
6022 "HPP: Parsing Comment\n");
6023 #endif
6024 htmlParseComment(ctxt);
6025 ctxt->instate = XML_PARSER_CONTENT;
6026 } else {
6027 if ((!terminate) &&
6028 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6029 goto done;
6030 htmlSkipBogusComment(ctxt);
6032 } else if ((cur == '<') && (next == '?')) {
6033 if ((!terminate) &&
6034 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6035 goto done;
6036 #ifdef DEBUG_PUSH
6037 xmlGenericError(xmlGenericErrorContext,
6038 "HPP: Parsing PI\n");
6039 #endif
6040 htmlParsePI(ctxt);
6041 ctxt->instate = XML_PARSER_CONTENT;
6042 } else if ((cur == '<') && (next == '/')) {
6043 ctxt->instate = XML_PARSER_END_TAG;
6044 ctxt->checkIndex = 0;
6045 #ifdef DEBUG_PUSH
6046 xmlGenericError(xmlGenericErrorContext,
6047 "HPP: entering END_TAG\n");
6048 #endif
6049 break;
6050 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6051 if ((!terminate) && (next == 0))
6052 goto done;
6053 ctxt->instate = XML_PARSER_START_TAG;
6054 ctxt->checkIndex = 0;
6055 #ifdef DEBUG_PUSH
6056 xmlGenericError(xmlGenericErrorContext,
6057 "HPP: entering START_TAG\n");
6058 #endif
6059 break;
6060 } else if (cur == '<') {
6061 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6062 (ctxt->sax->characters != NULL))
6063 ctxt->sax->characters(ctxt->userData,
6064 BAD_CAST "<", 1);
6065 NEXT;
6066 } else {
6068 * check that the text sequence is complete
6069 * before handing out the data to the parser
6070 * to avoid problems with erroneous end of
6071 * data detection.
6073 if ((!terminate) &&
6074 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6075 goto done;
6076 ctxt->checkIndex = 0;
6077 #ifdef DEBUG_PUSH
6078 xmlGenericError(xmlGenericErrorContext,
6079 "HPP: Parsing char data\n");
6080 #endif
6081 while ((ctxt->instate != XML_PARSER_EOF) &&
6082 (cur != '<') && (in->cur < in->end)) {
6083 if (cur == '&') {
6084 htmlParseReference(ctxt);
6085 } else {
6086 htmlParseCharData(ctxt);
6088 cur = in->cur[0];
6092 break;
/*
 * END_TAG: needs two bytes and, unless terminating, a '>' in the
 * buffer. After htmlParseEndTag() the state is EPILOG when the name
 * stack is empty, CONTENT otherwise.
 */
6094 case XML_PARSER_END_TAG:
6095 if (avail < 2)
6096 goto done;
6097 if ((!terminate) &&
6098 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6099 goto done;
6100 htmlParseEndTag(ctxt);
6101 if (ctxt->nameNr == 0) {
6102 ctxt->instate = XML_PARSER_EPILOG;
6103 } else {
6104 ctxt->instate = XML_PARSER_CONTENT;
6106 ctxt->checkIndex = 0;
/* NOTE(review): the trace text below always says CONTENT, although the
 * state set above may be XML_PARSER_EPILOG. */
6107 #ifdef DEBUG_PUSH
6108 xmlGenericError(xmlGenericErrorContext,
6109 "HPP: entering CONTENT\n");
6110 #endif
6111 break;
/*
 * Each remaining state reports XML_ERR_INTERNAL_ERROR, resets
 * checkIndex and falls back to CONTENT; ATTRIBUTE_VALUE falls back to
 * START_TAG instead.
 */
6112 case XML_PARSER_CDATA_SECTION:
6113 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6114 "HPP: internal error, state == CDATA\n",
6115 NULL, NULL);
6116 ctxt->instate = XML_PARSER_CONTENT;
6117 ctxt->checkIndex = 0;
6118 #ifdef DEBUG_PUSH
6119 xmlGenericError(xmlGenericErrorContext,
6120 "HPP: entering CONTENT\n");
6121 #endif
6122 break;
6123 case XML_PARSER_DTD:
6124 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6125 "HPP: internal error, state == DTD\n",
6126 NULL, NULL);
6127 ctxt->instate = XML_PARSER_CONTENT;
6128 ctxt->checkIndex = 0;
6129 #ifdef DEBUG_PUSH
6130 xmlGenericError(xmlGenericErrorContext,
6131 "HPP: entering CONTENT\n");
6132 #endif
6133 break;
6134 case XML_PARSER_COMMENT:
6135 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6136 "HPP: internal error, state == COMMENT\n",
6137 NULL, NULL);
6138 ctxt->instate = XML_PARSER_CONTENT;
6139 ctxt->checkIndex = 0;
6140 #ifdef DEBUG_PUSH
6141 xmlGenericError(xmlGenericErrorContext,
6142 "HPP: entering CONTENT\n");
6143 #endif
6144 break;
6145 case XML_PARSER_PI:
6146 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6147 "HPP: internal error, state == PI\n",
6148 NULL, NULL);
6149 ctxt->instate = XML_PARSER_CONTENT;
6150 ctxt->checkIndex = 0;
6151 #ifdef DEBUG_PUSH
6152 xmlGenericError(xmlGenericErrorContext,
6153 "HPP: entering CONTENT\n");
6154 #endif
6155 break;
6156 case XML_PARSER_ENTITY_DECL:
6157 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6158 "HPP: internal error, state == ENTITY_DECL\n",
6159 NULL, NULL);
6160 ctxt->instate = XML_PARSER_CONTENT;
6161 ctxt->checkIndex = 0;
6162 #ifdef DEBUG_PUSH
6163 xmlGenericError(xmlGenericErrorContext,
6164 "HPP: entering CONTENT\n");
6165 #endif
6166 break;
6167 case XML_PARSER_ENTITY_VALUE:
6168 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6169 "HPP: internal error, state == ENTITY_VALUE\n",
6170 NULL, NULL);
6171 ctxt->instate = XML_PARSER_CONTENT;
6172 ctxt->checkIndex = 0;
/* NOTE(review): the trace text below says DTD, but the state set above
 * is XML_PARSER_CONTENT. */
6173 #ifdef DEBUG_PUSH
6174 xmlGenericError(xmlGenericErrorContext,
6175 "HPP: entering DTD\n");
6176 #endif
6177 break;
6178 case XML_PARSER_ATTRIBUTE_VALUE:
6179 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6180 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6181 NULL, NULL);
6182 ctxt->instate = XML_PARSER_START_TAG;
6183 ctxt->checkIndex = 0;
6184 #ifdef DEBUG_PUSH
6185 xmlGenericError(xmlGenericErrorContext,
6186 "HPP: entering START_TAG\n");
6187 #endif
6188 break;
6189 case XML_PARSER_SYSTEM_LITERAL:
6190 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6191 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6192 NULL, NULL);
6193 ctxt->instate = XML_PARSER_CONTENT;
6194 ctxt->checkIndex = 0;
6195 #ifdef DEBUG_PUSH
6196 xmlGenericError(xmlGenericErrorContext,
6197 "HPP: entering CONTENT\n");
6198 #endif
6199 break;
6200 case XML_PARSER_IGNORE:
6201 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6202 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6203 NULL, NULL);
6204 ctxt->instate = XML_PARSER_CONTENT;
6205 ctxt->checkIndex = 0;
6206 #ifdef DEBUG_PUSH
6207 xmlGenericError(xmlGenericErrorContext,
6208 "HPP: entering CONTENT\n");
6209 #endif
6210 break;
6211 case XML_PARSER_PUBLIC_LITERAL:
6212 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6213 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6214 NULL, NULL);
6215 ctxt->instate = XML_PARSER_CONTENT;
6216 ctxt->checkIndex = 0;
6217 #ifdef DEBUG_PUSH
6218 xmlGenericError(xmlGenericErrorContext,
6219 "HPP: entering CONTENT\n");
6220 #endif
6221 break;
/*
 * Common exit. The end-of-input handling from the top of the loop is
 * repeated here, using the last value computed for `avail`. Then,
 * unless HTML_PARSE_NODEFDTD is set, a document that exists and has no
 * internal subset gets the HTML 4.0 Transitional one when terminating
 * or when the state is EOF or EPILOG.
 */
6225 done:
6226 if ((avail == 0) && (terminate)) {
6227 htmlAutoCloseOnEnd(ctxt);
6228 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6230 * SAX: end of the document processing.
6232 ctxt->instate = XML_PARSER_EOF;
6233 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6234 ctxt->sax->endDocument(ctxt->userData);
6237 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6238 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6239 (ctxt->instate == XML_PARSER_EPILOG))) {
6240 xmlDtdPtr dtd;
6241 dtd = xmlGetIntSubset(ctxt->myDoc);
6242 if (dtd == NULL)
6243 ctxt->myDoc->intSubset =
6244 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6245 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6246 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6248 #ifdef DEBUG_PUSH
6249 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6250 #endif
6251 return(ret);
6255 * htmlParseChunk:
6256 * @ctxt: an HTML parser context
6257 * @chunk: an char array
6258 * @size: the size in byte of the chunk
6259 * @terminate: last chunk indicator
6261 * Parse a Chunk of memory
6263 * Returns zero if no error, the xmlParserErrors otherwise.
6266 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6267 int terminate) {
6268 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6269 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6270 "htmlParseChunk: context error\n", NULL, NULL);
6271 return(XML_ERR_INTERNAL_ERROR);
6273 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6274 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6275 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6276 size_t cur = ctxt->input->cur - ctxt->input->base;
6277 int res;
6279 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6280 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6281 if (res < 0) {
6282 htmlErrMemory(ctxt, NULL);
6283 return (ctxt->errNo);
6285 #ifdef DEBUG_PUSH
6286 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6287 #endif
6289 #if 0
6290 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6291 htmlParseTryOrFinish(ctxt, terminate);
6292 #endif
6293 } else if (ctxt->instate != XML_PARSER_EOF) {
6294 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6295 xmlParserInputBufferPtr in = ctxt->input->buf;
6296 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6297 (in->raw != NULL)) {
6298 int nbchars;
6299 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6300 size_t current = ctxt->input->cur - ctxt->input->base;
6302 nbchars = xmlCharEncInput(in, terminate);
6303 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6304 if (nbchars < 0) {
6305 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6306 "encoder error\n", NULL, NULL);
6307 return(XML_ERR_INVALID_ENCODING);
6312 htmlParseTryOrFinish(ctxt, terminate);
6313 if (terminate) {
6314 if ((ctxt->instate != XML_PARSER_EOF) &&
6315 (ctxt->instate != XML_PARSER_EPILOG) &&
6316 (ctxt->instate != XML_PARSER_MISC)) {
6317 ctxt->errNo = XML_ERR_DOCUMENT_END;
6318 ctxt->wellFormed = 0;
6320 if (ctxt->instate != XML_PARSER_EOF) {
6321 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6322 ctxt->sax->endDocument(ctxt->userData);
6324 ctxt->instate = XML_PARSER_EOF;
6326 return((xmlParserErrors) ctxt->errNo);
/************************************************************************
 *									*
 *			User entry points				*
 *									*
 ************************************************************************/
6336 * htmlCreatePushParserCtxt:
6337 * @sax: a SAX handler
6338 * @user_data: The user data returned on SAX callbacks
6339 * @chunk: a pointer to an array of chars
6340 * @size: number of chars in the array
6341 * @filename: an optional file name or URI
6342 * @enc: an optional encoding
6344 * Create a parser context for using the HTML parser in push mode
6345 * The value of @filename is used for fetching external entities
6346 * and error/warning reports.
6348 * Returns the new parser context or NULL
6350 htmlParserCtxtPtr
6351 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6352 const char *chunk, int size, const char *filename,
6353 xmlCharEncoding enc) {
6354 htmlParserCtxtPtr ctxt;
6355 htmlParserInputPtr inputStream;
6356 xmlParserInputBufferPtr buf;
6358 xmlInitParser();
6360 buf = xmlAllocParserInputBuffer(enc);
6361 if (buf == NULL) return(NULL);
6363 ctxt = htmlNewSAXParserCtxt(sax, user_data);
6364 if (ctxt == NULL) {
6365 xmlFreeParserInputBuffer(buf);
6366 return(NULL);
6368 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6369 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6370 if (filename == NULL) {
6371 ctxt->directory = NULL;
6372 } else {
6373 ctxt->directory = xmlParserGetDirectory(filename);
6376 inputStream = htmlNewInputStream(ctxt);
6377 if (inputStream == NULL) {
6378 xmlFreeParserCtxt(ctxt);
6379 xmlFreeParserInputBuffer(buf);
6380 return(NULL);
6383 if (filename == NULL)
6384 inputStream->filename = NULL;
6385 else
6386 inputStream->filename = (char *)
6387 xmlCanonicPath((const xmlChar *) filename);
6388 inputStream->buf = buf;
6389 xmlBufResetInput(buf->buffer, inputStream);
6391 inputPush(ctxt, inputStream);
6393 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6394 (ctxt->input->buf != NULL)) {
6395 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6396 size_t cur = ctxt->input->cur - ctxt->input->base;
6398 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6400 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6401 #ifdef DEBUG_PUSH
6402 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6403 #endif
6405 ctxt->progressive = 1;
6407 return(ctxt);
6409 #endif /* LIBXML_PUSH_ENABLED */
6412 * htmlSAXParseDoc:
6413 * @cur: a pointer to an array of xmlChar
6414 * @encoding: a free form C string describing the HTML document encoding, or NULL
6415 * @sax: the SAX handler block
6416 * @userData: if using SAX, this pointer will be provided on callbacks.
6418 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6420 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6421 * to handle parse events. If sax is NULL, fallback to the default DOM
6422 * behavior and return a tree.
6424 * Returns the resulting document tree unless SAX is NULL or the document is
6425 * not well formed.
6428 htmlDocPtr
6429 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6430 htmlSAXHandlerPtr sax, void *userData) {
6431 htmlDocPtr ret;
6432 htmlParserCtxtPtr ctxt;
6434 xmlInitParser();
6436 if (cur == NULL) return(NULL);
6439 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6440 if (ctxt == NULL) return(NULL);
6441 if (sax != NULL) {
6442 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6443 ctxt->sax = sax;
6444 ctxt->userData = userData;
6447 htmlParseDocument(ctxt);
6448 ret = ctxt->myDoc;
6449 if (sax != NULL) {
6450 ctxt->sax = NULL;
6451 ctxt->userData = NULL;
6453 htmlFreeParserCtxt(ctxt);
6455 return(ret);
6459 * htmlParseDoc:
6460 * @cur: a pointer to an array of xmlChar
6461 * @encoding: a free form C string describing the HTML document encoding, or NULL
6463 * parse an HTML in-memory document and build a tree.
6465 * Returns the resulting document tree
6468 htmlDocPtr
6469 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6470 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6475 * htmlCreateFileParserCtxt:
6476 * @filename: the filename
6477 * @encoding: a free form C string describing the HTML document encoding, or NULL
6479 * Create a parser context for a file content.
6480 * Automatic support for ZLIB/Compress compressed document is provided
6481 * by default if found at compile-time.
6483 * Returns the new parser context or NULL
6485 htmlParserCtxtPtr
6486 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6488 htmlParserCtxtPtr ctxt;
6489 htmlParserInputPtr inputStream;
6490 char *canonicFilename;
6491 /* htmlCharEncoding enc; */
6492 xmlChar *content, *content_line = (xmlChar *) "charset=";
6494 if (filename == NULL)
6495 return(NULL);
6497 ctxt = htmlNewParserCtxt();
6498 if (ctxt == NULL) {
6499 return(NULL);
6501 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6502 if (canonicFilename == NULL) {
6503 xmlFreeParserCtxt(ctxt);
6504 return(NULL);
6507 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6508 xmlFree(canonicFilename);
6509 if (inputStream == NULL) {
6510 xmlFreeParserCtxt(ctxt);
6511 return(NULL);
6514 inputPush(ctxt, inputStream);
6516 /* set encoding */
6517 if (encoding) {
6518 size_t l = strlen(encoding);
6520 if (l < 1000) {
6521 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6522 if (content) {
6523 strcpy ((char *)content, (char *)content_line);
6524 strcat ((char *)content, (char *)encoding);
6525 htmlCheckEncoding (ctxt, content);
6526 xmlFree (content);
6531 return(ctxt);
6535 * htmlSAXParseFile:
6536 * @filename: the filename
6537 * @encoding: a free form C string describing the HTML document encoding, or NULL
6538 * @sax: the SAX handler block
6539 * @userData: if using SAX, this pointer will be provided on callbacks.
6541 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6543 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6544 * compressed document is provided by default if found at compile-time.
6545 * It use the given SAX function block to handle the parsing callback.
6546 * If sax is NULL, fallback to the default DOM tree building routines.
6548 * Returns the resulting document tree unless SAX is NULL or the document is
6549 * not well formed.
6552 htmlDocPtr
6553 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6554 void *userData) {
6555 htmlDocPtr ret;
6556 htmlParserCtxtPtr ctxt;
6557 htmlSAXHandlerPtr oldsax = NULL;
6559 xmlInitParser();
6561 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6562 if (ctxt == NULL) return(NULL);
6563 if (sax != NULL) {
6564 oldsax = ctxt->sax;
6565 ctxt->sax = sax;
6566 ctxt->userData = userData;
6569 htmlParseDocument(ctxt);
6571 ret = ctxt->myDoc;
6572 if (sax != NULL) {
6573 ctxt->sax = oldsax;
6574 ctxt->userData = NULL;
6576 htmlFreeParserCtxt(ctxt);
6578 return(ret);
6582 * htmlParseFile:
6583 * @filename: the filename
6584 * @encoding: a free form C string describing the HTML document encoding, or NULL
6586 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6587 * compressed document is provided by default if found at compile-time.
6589 * Returns the resulting document tree
6592 htmlDocPtr
6593 htmlParseFile(const char *filename, const char *encoding) {
6594 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6598 * htmlHandleOmittedElem:
6599 * @val: int 0 or 1
6601 * Set and return the previous value for handling HTML omitted tags.
6603 * Returns the last value for 0 for no handling, 1 for auto insertion.
6607 htmlHandleOmittedElem(int val) {
6608 int old = htmlOmittedDefaultValue;
6610 htmlOmittedDefaultValue = val;
6611 return(old);
6615 * htmlElementAllowedHere:
6616 * @parent: HTML parent element
6617 * @elt: HTML element
6619 * Checks whether an HTML element may be a direct child of a parent element.
6620 * Note - doesn't check for deprecated elements
6622 * Returns 1 if allowed; 0 otherwise.
6625 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6626 const char** p ;
6628 if ( ! elt || ! parent || ! parent->subelts )
6629 return 0 ;
6631 for ( p = parent->subelts; *p; ++p )
6632 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6633 return 1 ;
6635 return 0 ;
6638 * htmlElementStatusHere:
6639 * @parent: HTML parent element
6640 * @elt: HTML element
6642 * Checks whether an HTML element may be a direct child of a parent element.
6643 * and if so whether it is valid or deprecated.
6645 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6647 htmlStatus
6648 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6649 if ( ! parent || ! elt )
6650 return HTML_INVALID ;
6651 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6652 return HTML_INVALID ;
6654 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6657 * htmlAttrAllowed:
6658 * @elt: HTML element
6659 * @attr: HTML attribute
6660 * @legacy: whether to allow deprecated attributes
6662 * Checks whether an attribute is valid for an element
6663 * Has full knowledge of Required and Deprecated attributes
6665 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6667 htmlStatus
6668 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6669 const char** p ;
6671 if ( !elt || ! attr )
6672 return HTML_INVALID ;
6674 if ( elt->attrs_req )
6675 for ( p = elt->attrs_req; *p; ++p)
6676 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6677 return HTML_REQUIRED ;
6679 if ( elt->attrs_opt )
6680 for ( p = elt->attrs_opt; *p; ++p)
6681 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6682 return HTML_VALID ;
6684 if ( legacy && elt->attrs_depr )
6685 for ( p = elt->attrs_depr; *p; ++p)
6686 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6687 return HTML_DEPRECATED ;
6689 return HTML_INVALID ;
6692 * htmlNodeStatus:
6693 * @node: an htmlNodePtr in a tree
6694 * @legacy: whether to allow deprecated elements (YES is faster here
6695 * for Element nodes)
6697 * Checks whether the tree node is valid. Experimental (the author
6698 * only uses the HTML enhancements in a SAX parser)
6700 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6701 * legacy allowed) or htmlElementStatusHere (otherwise).
6702 * for Attribute nodes, a return from htmlAttrAllowed
6703 * for other nodes, HTML_NA (no checks performed)
6705 htmlStatus
6706 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6707 if ( ! node )
6708 return HTML_INVALID ;
6710 switch ( node->type ) {
6711 case XML_ELEMENT_NODE:
6712 return legacy
6713 ? ( htmlElementAllowedHere (
6714 htmlTagLookup(node->parent->name) , node->name
6715 ) ? HTML_VALID : HTML_INVALID )
6716 : htmlElementStatusHere(
6717 htmlTagLookup(node->parent->name) ,
6718 htmlTagLookup(node->name) )
6720 case XML_ATTRIBUTE_NODE:
6721 return htmlAttrAllowed(
6722 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6723 default: return HTML_NA ;
6726 /************************************************************************
6728 * New set (2.6.0) of simpler and more flexible APIs *
6730 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be followed by a semicolon anywhere a
 * statement is allowed, including an unbraced if/else branch.
 */
#define DICT_FREE(str) \
    do { \
        if ((str) && ((!dict) || \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
            xmlFree((char *)(str)); \
    } while (0)
6744 * htmlCtxtReset:
6745 * @ctxt: an HTML parser context
6747 * Reset a parser context
6749 void
6750 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6752 xmlParserInputPtr input;
6753 xmlDictPtr dict;
6755 if (ctxt == NULL)
6756 return;
6758 xmlInitParser();
6759 dict = ctxt->dict;
6761 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6762 xmlFreeInputStream(input);
6764 ctxt->inputNr = 0;
6765 ctxt->input = NULL;
6767 ctxt->spaceNr = 0;
6768 if (ctxt->spaceTab != NULL) {
6769 ctxt->spaceTab[0] = -1;
6770 ctxt->space = &ctxt->spaceTab[0];
6771 } else {
6772 ctxt->space = NULL;
6776 ctxt->nodeNr = 0;
6777 ctxt->node = NULL;
6779 ctxt->nameNr = 0;
6780 ctxt->name = NULL;
6782 ctxt->nsNr = 0;
6784 DICT_FREE(ctxt->version);
6785 ctxt->version = NULL;
6786 DICT_FREE(ctxt->encoding);
6787 ctxt->encoding = NULL;
6788 DICT_FREE(ctxt->directory);
6789 ctxt->directory = NULL;
6790 DICT_FREE(ctxt->extSubURI);
6791 ctxt->extSubURI = NULL;
6792 DICT_FREE(ctxt->extSubSystem);
6793 ctxt->extSubSystem = NULL;
6794 if (ctxt->myDoc != NULL)
6795 xmlFreeDoc(ctxt->myDoc);
6796 ctxt->myDoc = NULL;
6798 ctxt->standalone = -1;
6799 ctxt->hasExternalSubset = 0;
6800 ctxt->hasPErefs = 0;
6801 ctxt->html = 1;
6802 ctxt->external = 0;
6803 ctxt->instate = XML_PARSER_START;
6804 ctxt->token = 0;
6806 ctxt->wellFormed = 1;
6807 ctxt->nsWellFormed = 1;
6808 ctxt->disableSAX = 0;
6809 ctxt->valid = 1;
6810 ctxt->vctxt.userData = ctxt;
6811 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6812 ctxt->vctxt.error = xmlParserValidityError;
6813 ctxt->vctxt.warning = xmlParserValidityWarning;
6814 ctxt->record_info = 0;
6815 ctxt->checkIndex = 0;
6816 ctxt->endCheckState = 0;
6817 ctxt->inSubset = 0;
6818 ctxt->errNo = XML_ERR_OK;
6819 ctxt->depth = 0;
6820 ctxt->charset = XML_CHAR_ENCODING_NONE;
6821 ctxt->catalogs = NULL;
6822 xmlInitNodeInfoSeq(&ctxt->node_seq);
6824 if (ctxt->attsDefault != NULL) {
6825 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6826 ctxt->attsDefault = NULL;
6828 if (ctxt->attsSpecial != NULL) {
6829 xmlHashFree(ctxt->attsSpecial, NULL);
6830 ctxt->attsSpecial = NULL;
6833 ctxt->nbErrors = 0;
6834 ctxt->nbWarnings = 0;
6835 if (ctxt->lastError.code != XML_ERR_OK)
6836 xmlResetError(&ctxt->lastError);
6840 * htmlCtxtUseOptions:
6841 * @ctxt: an HTML parser context
6842 * @options: a combination of htmlParserOption(s)
6844 * Applies the options to the parser context
6846 * Returns 0 in case of success, the set of unknown or unimplemented options
6847 * in case of error.
6850 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6852 if (ctxt == NULL)
6853 return(-1);
6855 if (options & HTML_PARSE_NOWARNING) {
6856 ctxt->sax->warning = NULL;
6857 ctxt->vctxt.warning = NULL;
6858 options -= XML_PARSE_NOWARNING;
6859 ctxt->options |= XML_PARSE_NOWARNING;
6861 if (options & HTML_PARSE_NOERROR) {
6862 ctxt->sax->error = NULL;
6863 ctxt->vctxt.error = NULL;
6864 ctxt->sax->fatalError = NULL;
6865 options -= XML_PARSE_NOERROR;
6866 ctxt->options |= XML_PARSE_NOERROR;
6868 if (options & HTML_PARSE_PEDANTIC) {
6869 ctxt->pedantic = 1;
6870 options -= XML_PARSE_PEDANTIC;
6871 ctxt->options |= XML_PARSE_PEDANTIC;
6872 } else
6873 ctxt->pedantic = 0;
6874 if (options & XML_PARSE_NOBLANKS) {
6875 ctxt->keepBlanks = 0;
6876 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6877 options -= XML_PARSE_NOBLANKS;
6878 ctxt->options |= XML_PARSE_NOBLANKS;
6879 } else
6880 ctxt->keepBlanks = 1;
6881 if (options & HTML_PARSE_RECOVER) {
6882 ctxt->recovery = 1;
6883 options -= HTML_PARSE_RECOVER;
6884 } else
6885 ctxt->recovery = 0;
6886 if (options & HTML_PARSE_COMPACT) {
6887 ctxt->options |= HTML_PARSE_COMPACT;
6888 options -= HTML_PARSE_COMPACT;
6890 if (options & XML_PARSE_HUGE) {
6891 ctxt->options |= XML_PARSE_HUGE;
6892 options -= XML_PARSE_HUGE;
6894 if (options & HTML_PARSE_NODEFDTD) {
6895 ctxt->options |= HTML_PARSE_NODEFDTD;
6896 options -= HTML_PARSE_NODEFDTD;
6898 if (options & HTML_PARSE_IGNORE_ENC) {
6899 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6900 options -= HTML_PARSE_IGNORE_ENC;
6902 if (options & HTML_PARSE_NOIMPLIED) {
6903 ctxt->options |= HTML_PARSE_NOIMPLIED;
6904 options -= HTML_PARSE_NOIMPLIED;
6906 ctxt->dictNames = 0;
6907 ctxt->linenumbers = 1;
6908 return (options);
6912 * htmlDoRead:
6913 * @ctxt: an HTML parser context
6914 * @URL: the base URL to use for the document
6915 * @encoding: the document encoding, or NULL
6916 * @options: a combination of htmlParserOption(s)
6917 * @reuse: keep the context for reuse
6919 * Common front-end for the htmlRead functions
6921 * Returns the resulting document tree or NULL
6923 static htmlDocPtr
6924 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6925 int options, int reuse)
6927 htmlDocPtr ret;
6929 htmlCtxtUseOptions(ctxt, options);
6930 ctxt->html = 1;
6931 if (encoding != NULL) {
6932 xmlCharEncodingHandlerPtr hdlr;
6934 hdlr = xmlFindCharEncodingHandler(encoding);
6935 if (hdlr != NULL) {
6936 xmlSwitchToEncoding(ctxt, hdlr);
6937 if (ctxt->input->encoding != NULL)
6938 xmlFree((xmlChar *) ctxt->input->encoding);
6939 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6942 if ((URL != NULL) && (ctxt->input != NULL) &&
6943 (ctxt->input->filename == NULL))
6944 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6945 htmlParseDocument(ctxt);
6946 ret = ctxt->myDoc;
6947 ctxt->myDoc = NULL;
6948 if (!reuse) {
6949 if ((ctxt->dictNames) &&
6950 (ret != NULL) &&
6951 (ret->dict == ctxt->dict))
6952 ctxt->dict = NULL;
6953 xmlFreeParserCtxt(ctxt);
6955 return (ret);
6959 * htmlReadDoc:
6960 * @cur: a pointer to a zero terminated string
6961 * @URL: the base URL to use for the document
6962 * @encoding: the document encoding, or NULL
6963 * @options: a combination of htmlParserOption(s)
6965 * parse an XML in-memory document and build a tree.
6967 * Returns the resulting document tree
6969 htmlDocPtr
6970 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6972 htmlParserCtxtPtr ctxt;
6974 if (cur == NULL)
6975 return (NULL);
6977 xmlInitParser();
6978 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6979 if (ctxt == NULL)
6980 return (NULL);
6981 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6985 * htmlReadFile:
6986 * @filename: a file or URL
6987 * @encoding: the document encoding, or NULL
6988 * @options: a combination of htmlParserOption(s)
6990 * parse an XML file from the filesystem or the network.
6992 * Returns the resulting document tree
6994 htmlDocPtr
6995 htmlReadFile(const char *filename, const char *encoding, int options)
6997 htmlParserCtxtPtr ctxt;
6999 xmlInitParser();
7000 ctxt = htmlCreateFileParserCtxt(filename, encoding);
7001 if (ctxt == NULL)
7002 return (NULL);
7003 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
7007 * htmlReadMemory:
7008 * @buffer: a pointer to a char array
7009 * @size: the size of the array
7010 * @URL: the base URL to use for the document
7011 * @encoding: the document encoding, or NULL
7012 * @options: a combination of htmlParserOption(s)
7014 * parse an XML in-memory document and build a tree.
7016 * Returns the resulting document tree
7018 htmlDocPtr
7019 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
7021 htmlParserCtxtPtr ctxt;
7023 xmlInitParser();
7024 ctxt = htmlCreateMemoryParserCtxt(buffer, size);
7025 if (ctxt == NULL)
7026 return (NULL);
7027 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7031 * htmlReadFd:
7032 * @fd: an open file descriptor
7033 * @URL: the base URL to use for the document
7034 * @encoding: the document encoding, or NULL
7035 * @options: a combination of htmlParserOption(s)
7037 * parse an HTML from a file descriptor and build a tree.
7038 * NOTE that the file descriptor will not be closed when the
7039 * reader is closed or reset.
7041 * Returns the resulting document tree
7043 htmlDocPtr
7044 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7046 htmlParserCtxtPtr ctxt;
7047 xmlParserInputBufferPtr input;
7048 htmlParserInputPtr stream;
7050 if (fd < 0)
7051 return (NULL);
7053 xmlInitParser();
7054 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7055 if (input == NULL)
7056 return (NULL);
7057 input->closecallback = NULL;
7058 ctxt = htmlNewParserCtxt();
7059 if (ctxt == NULL) {
7060 xmlFreeParserInputBuffer(input);
7061 return (NULL);
7063 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7064 if (stream == NULL) {
7065 xmlFreeParserInputBuffer(input);
7066 htmlFreeParserCtxt(ctxt);
7067 return (NULL);
7069 inputPush(ctxt, stream);
7070 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7074 * htmlReadIO:
7075 * @ioread: an I/O read function
7076 * @ioclose: an I/O close function
7077 * @ioctx: an I/O handler
7078 * @URL: the base URL to use for the document
7079 * @encoding: the document encoding, or NULL
7080 * @options: a combination of htmlParserOption(s)
7082 * parse an HTML document from I/O functions and source and build a tree.
7084 * Returns the resulting document tree
7086 htmlDocPtr
7087 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7088 void *ioctx, const char *URL, const char *encoding, int options)
7090 htmlParserCtxtPtr ctxt;
7091 xmlParserInputBufferPtr input;
7092 xmlParserInputPtr stream;
7094 if (ioread == NULL)
7095 return (NULL);
7096 xmlInitParser();
7098 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7099 XML_CHAR_ENCODING_NONE);
7100 if (input == NULL) {
7101 if (ioclose != NULL)
7102 ioclose(ioctx);
7103 return (NULL);
7105 ctxt = htmlNewParserCtxt();
7106 if (ctxt == NULL) {
7107 xmlFreeParserInputBuffer(input);
7108 return (NULL);
7110 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7111 if (stream == NULL) {
7112 xmlFreeParserInputBuffer(input);
7113 xmlFreeParserCtxt(ctxt);
7114 return (NULL);
7116 inputPush(ctxt, stream);
7117 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7121 * htmlCtxtReadDoc:
7122 * @ctxt: an HTML parser context
7123 * @cur: a pointer to a zero terminated string
7124 * @URL: the base URL to use for the document
7125 * @encoding: the document encoding, or NULL
7126 * @options: a combination of htmlParserOption(s)
7128 * parse an XML in-memory document and build a tree.
7129 * This reuses the existing @ctxt parser context
7131 * Returns the resulting document tree
7133 htmlDocPtr
7134 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7135 const char *URL, const char *encoding, int options)
7137 if (cur == NULL)
7138 return (NULL);
7139 return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7140 encoding, options));
7144 * htmlCtxtReadFile:
7145 * @ctxt: an HTML parser context
7146 * @filename: a file or URL
7147 * @encoding: the document encoding, or NULL
7148 * @options: a combination of htmlParserOption(s)
7150 * parse an XML file from the filesystem or the network.
7151 * This reuses the existing @ctxt parser context
7153 * Returns the resulting document tree
7155 htmlDocPtr
7156 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7157 const char *encoding, int options)
7159 xmlParserInputPtr stream;
7161 if (filename == NULL)
7162 return (NULL);
7163 if (ctxt == NULL)
7164 return (NULL);
7165 xmlInitParser();
7167 htmlCtxtReset(ctxt);
7169 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7170 if (stream == NULL) {
7171 return (NULL);
7173 inputPush(ctxt, stream);
7174 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7178 * htmlCtxtReadMemory:
7179 * @ctxt: an HTML parser context
7180 * @buffer: a pointer to a char array
7181 * @size: the size of the array
7182 * @URL: the base URL to use for the document
7183 * @encoding: the document encoding, or NULL
7184 * @options: a combination of htmlParserOption(s)
7186 * parse an XML in-memory document and build a tree.
7187 * This reuses the existing @ctxt parser context
7189 * Returns the resulting document tree
7191 htmlDocPtr
7192 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7193 const char *URL, const char *encoding, int options)
7195 xmlParserInputBufferPtr input;
7196 xmlParserInputPtr stream;
7198 if (ctxt == NULL)
7199 return (NULL);
7200 if (buffer == NULL)
7201 return (NULL);
7202 xmlInitParser();
7204 htmlCtxtReset(ctxt);
7206 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7207 if (input == NULL) {
7208 return(NULL);
7211 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7212 if (stream == NULL) {
7213 xmlFreeParserInputBuffer(input);
7214 return(NULL);
7217 inputPush(ctxt, stream);
7218 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7222 * htmlCtxtReadFd:
7223 * @ctxt: an HTML parser context
7224 * @fd: an open file descriptor
7225 * @URL: the base URL to use for the document
7226 * @encoding: the document encoding, or NULL
7227 * @options: a combination of htmlParserOption(s)
7229 * parse an XML from a file descriptor and build a tree.
7230 * This reuses the existing @ctxt parser context
7232 * Returns the resulting document tree
7234 htmlDocPtr
7235 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7236 const char *URL, const char *encoding, int options)
7238 xmlParserInputBufferPtr input;
7239 xmlParserInputPtr stream;
7241 if (fd < 0)
7242 return (NULL);
7243 if (ctxt == NULL)
7244 return (NULL);
7245 xmlInitParser();
7247 htmlCtxtReset(ctxt);
7250 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7251 if (input == NULL)
7252 return (NULL);
7253 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7254 if (stream == NULL) {
7255 xmlFreeParserInputBuffer(input);
7256 return (NULL);
7258 inputPush(ctxt, stream);
7259 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7263 * htmlCtxtReadIO:
7264 * @ctxt: an HTML parser context
7265 * @ioread: an I/O read function
7266 * @ioclose: an I/O close function
7267 * @ioctx: an I/O handler
7268 * @URL: the base URL to use for the document
7269 * @encoding: the document encoding, or NULL
7270 * @options: a combination of htmlParserOption(s)
7272 * parse an HTML document from I/O functions and source and build a tree.
7273 * This reuses the existing @ctxt parser context
7275 * Returns the resulting document tree
7277 htmlDocPtr
7278 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7279 xmlInputCloseCallback ioclose, void *ioctx,
7280 const char *URL,
7281 const char *encoding, int options)
7283 xmlParserInputBufferPtr input;
7284 xmlParserInputPtr stream;
7286 if (ioread == NULL)
7287 return (NULL);
7288 if (ctxt == NULL)
7289 return (NULL);
7290 xmlInitParser();
7292 htmlCtxtReset(ctxt);
7294 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7295 XML_CHAR_ENCODING_NONE);
7296 if (input == NULL) {
7297 if (ioclose != NULL)
7298 ioclose(ioctx);
7299 return (NULL);
7301 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7302 if (stream == NULL) {
7303 xmlFreeParserInputBuffer(input);
7304 return (NULL);
7306 inputPush(ctxt, stream);
7307 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7310 #endif /* LIBXML_HTML_ENABLED */