libs/xml2/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef LIBXML_ZLIB_ENABLED
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #include "buf.h"
  48 #include "enc.h"
  49
/*
 * Size constants for this parser. Their users are outside this part of the
 * file; judging by the names they bound element name length and size the
 * parser's scratch buffers -- TODO confirm against the call sites.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-local flag, initialised to 1; it is read elsewhere in this file. */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for routines defined later in this file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
  62
  63 /************************************************************************
  64  *                                                                      *
  65  *              Some factorized error routines                          *
  66  *                                                                      *
  67  ************************************************************************/
  68
  69 /**
  70  * htmlErrMemory:
  71  * @ctxt:  an HTML parser context
  72  * @extra:  extra information
  73  *
  74  * Handle a redefinition of attribute error
  75  */
  76 static void
  77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  78 {
  79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  80         (ctxt->instate == XML_PARSER_EOF))
  81         return;
  82     if (ctxt != NULL) {
  83         ctxt->errNo = XML_ERR_NO_MEMORY;
  84         ctxt->instate = XML_PARSER_EOF;
  85         ctxt->disableSAX = 1;
  86     }
  87     if (extra)
  88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  90                         NULL, NULL, 0, 0,
  91                         "Memory allocation failed : %s\n", extra);
  92     else
  93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  96 }
  97
  98 /**
  99  * htmlParseErr:
 100  * @ctxt:  an HTML parser context
 101  * @error:  the error number
 102  * @msg:  the error message
 103  * @str1:  string infor
 104  * @str2:  string infor
 105  *
 106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 107  */
 108 static void LIBXML_ATTR_FORMAT(3,0)
 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 110              const char *msg, const xmlChar *str1, const xmlChar *str2)
 111 {
 112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 113         (ctxt->instate == XML_PARSER_EOF))
 114         return;
 115     if (ctxt != NULL)
 116         ctxt->errNo = error;
 117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 118                     XML_ERR_ERROR, NULL, 0,
 119                     (const char *) str1, (const char *) str2,
 120                     NULL, 0, 0,
 121                     msg, str1, str2);
 122     if (ctxt != NULL)
 123         ctxt->wellFormed = 0;
 124 }
 125
 126 /**
 127  * htmlParseErrInt:
 128  * @ctxt:  an HTML parser context
 129  * @error:  the error number
 130  * @msg:  the error message
 131  * @val:  integer info
 132  *
 133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 134  */
 135 static void LIBXML_ATTR_FORMAT(3,0)
 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 137              const char *msg, int val)
 138 {
 139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 140         (ctxt->instate == XML_PARSER_EOF))
 141         return;
 142     if (ctxt != NULL)
 143         ctxt->errNo = error;
 144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 146                     NULL, val, 0, msg, val);
 147     if (ctxt != NULL)
 148         ctxt->wellFormed = 0;
 149 }
 150
 151 /************************************************************************
 152  *                                                                      *
 153  *      Parser stacks related functions and macros              *
 154  *                                                                      *
 155  ************************************************************************/
 156
 157 /**
 158  * htmlnamePush:
 159  * @ctxt:  an HTML parser context
 160  * @value:  the element name
 161  *
 162  * Pushes a new element name on top of the name stack
 163  *
 164  * Returns 0 in case of error, the index in the stack otherwise
 165  */
 166 static int
 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 168 {
 169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 170         ctxt->html = 3;
 171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 172         ctxt->html = 10;
 173     if (ctxt->nameNr >= ctxt->nameMax) {
 174         ctxt->nameMax *= 2;
 175         ctxt->nameTab = (const xmlChar * *)
 176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 177                                     ctxt->nameMax *
 178                                     sizeof(ctxt->nameTab[0]));
 179         if (ctxt->nameTab == NULL) {
 180             htmlErrMemory(ctxt, NULL);
 181             return (0);
 182         }
 183     }
 184     ctxt->nameTab[ctxt->nameNr] = value;
 185     ctxt->name = value;
 186     return (ctxt->nameNr++);
 187 }
 188 /**
 189  * htmlnamePop:
 190  * @ctxt: an HTML parser context
 191  *
 192  * Pops the top element name from the name stack
 193  *
 194  * Returns the name just removed
 195  */
 196 static const xmlChar *
 197 htmlnamePop(htmlParserCtxtPtr ctxt)
 198 {
 199     const xmlChar *ret;
 200
 201     if (ctxt->nameNr <= 0)
 202         return (NULL);
 203     ctxt->nameNr--;
 204     if (ctxt->nameNr < 0)
 205         return (NULL);
 206     if (ctxt->nameNr > 0)
 207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 208     else
 209         ctxt->name = NULL;
 210     ret = ctxt->nameTab[ctxt->nameNr];
 211     ctxt->nameTab[ctxt->nameNr] = NULL;
 212     return (ret);
 213 }
 214
 215 /**
 216  * htmlNodeInfoPush:
 217  * @ctxt:  an HTML parser context
 218  * @value:  the node info
 219  *
 220  * Pushes a new element name on top of the node info stack
 221  *
 222  * Returns 0 in case of error, the index in the stack otherwise
 223  */
 224 static int
 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 226 {
 227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 228         if (ctxt->nodeInfoMax == 0)
 229                 ctxt->nodeInfoMax = 5;
 230         ctxt->nodeInfoMax *= 2;
 231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 233                                     ctxt->nodeInfoMax *
 234                                     sizeof(ctxt->nodeInfoTab[0]));
 235         if (ctxt->nodeInfoTab == NULL) {
 236             htmlErrMemory(ctxt, NULL);
 237             return (0);
 238         }
 239     }
 240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 242     return (ctxt->nodeInfoNr++);
 243 }
 244
 245 /**
 246  * htmlNodeInfoPop:
 247  * @ctxt:  an HTML parser context
 248  *
 249  * Pops the top element name from the node info stack
 250  *
 251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 252  */
 253 static htmlParserNodeInfo *
 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 255 {
 256     if (ctxt->nodeInfoNr <= 0)
 257         return (NULL);
 258     ctxt->nodeInfoNr--;
 259     if (ctxt->nodeInfoNr < 0)
 260         return (NULL);
 261     if (ctxt->nodeInfoNr > 0)
 262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 263     else
 264         ctxt->nodeInfo = NULL;
 265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 266 }
 267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Every macro below expands to code that refers to a variable named `ctxt`,
 * so they can only be used where such a parser context is in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them. They look at raw bytes and should only be used to compare against
 * ASCII values:
 *
 *   CUR_PTR  the current pointer into the input (ctxt->input->cur).
 *   BASE_PTR the base pointer of the input (ctxt->input->base).
 *   CUR      the byte at the current position, as an int.
 *   CURRENT  same expansion as CUR: the raw byte, no multi-byte decoding.
 *   RAW      -1 when ctxt->token is non-zero, otherwise the current byte.
 *   UPPER    the current byte passed through toupper().
 *   NXT(n)   the n'th next byte. Same as CUR it should be used only
 *            to compare on ASCII based substring.
 *   UPP(n)   the n'th next byte passed through toupper(). Same as CUR
 *            it should be used only to compare on ASCII based substring.
 *   SKIP(n)  advance both the input pointer and the column by n; must only
 *            be used to skip ASCII strings without newlines, since the line
 *            counter is not touched.
 *
 * Other helpers:
 *
 *   SHRINK   call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *            bytes lie behind the cursor and fewer than 2 * INPUT_CHUNK
 *            remain ahead of it.
 *   GROW     call xmlParserInputGrow() when not in progressive mode and
 *            fewer than INPUT_CHUNK bytes remain ahead of the cursor.
 *   SKIP_BLANKS  call htmlSkipBlankChars(ctxt).
 *   NEXT     call xmlNextChar(ctxt).
 *   NEXTL(l) skip the current character of l bytes: bump the line counter
 *            and reset the column to 1 if the current byte is a newline,
 *            otherwise bump the column by one; clear ctxt->token and
 *            advance the input pointer by l.
 *   CUR_CHAR(l)     htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v)  store char v of length l into b at index i and
 *            advance i: a direct byte store when l is 1, xmlCopyChar()
 *            otherwise.
 *
 * Caution: SKIP expands to an unparenthesized comma expression, SHRINK and
 * GROW expand to a bare `if` statement, and COPY_BUF expands to a bare
 * `if ... else`. None of them is wrapped in do { } while (0), so they are
 * only safe as complete statements and not next to an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&                            \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* Earlier, token-aware definition of CUR, kept for reference: */
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l;                             \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
 348
 349 /**
 350  * htmlFindEncoding:
 351  * @the HTML parser context
 352  *
 353  * Ty to find and encoding in the current data available in the input
 354  * buffer this is needed to try to switch to the proper encoding when
 355  * one face a character error.
 356  * That's an heuristic, since it's operating outside of parsing it could
 357  * try to use a meta which had been commented out, that's the reason it
 358  * should only be used in case of error, not as a default.
 359  *
 360  * Returns an encoding string or NULL if not found, the string need to
 361  *   be freed
 362  */
 363 static xmlChar *
 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 365     const xmlChar *start, *cur, *end;
 366
 367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 369         (ctxt->input->buf->encoder != NULL))
 370         return(NULL);
 371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 372         return(NULL);
 373
 374     start = ctxt->input->cur;
 375     end = ctxt->input->end;
 376     /* we also expect the input buffer to be zero terminated */
 377     if (*end != 0)
 378         return(NULL);
 379
 380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 381     if (cur == NULL)
 382         return(NULL);
 383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 384     if (cur == NULL)
 385         return(NULL);
 386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 387     if (cur == NULL)
 388         return(NULL);
 389     cur += 8;
 390     start = cur;
 391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 392            ((*cur >= 'a') && (*cur <= 'z')) ||
 393            ((*cur >= '0') && (*cur <= '9')) ||
 394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 395            cur++;
 396     if (cur == start)
 397         return(NULL);
 398     return(xmlStrndup(start, cur - start));
 399 }
 400
/**
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer.
 * If the context charset is not yet UTF-8 and a non-ASCII byte is found,
 * an encoding converter is plugged in automatically: the one guessed by
 * htmlFindEncoding() when there is one, ISO-Latin-1 otherwise.
 *
 * Special cases visible in the code:
 *  - in state XML_PARSER_EOF the function returns 0 and does not write
 *    to *@len;
 *  - when ctxt->token is set, it is returned with *@len set to 0;
 *  - a NUL byte located before the end of the input is reported as an
 *    invalid char and returned as a space;
 *  - on malformed UTF-8 an error is reported, the input may be switched to
 *    ISO-Latin-1, and the raw current byte is returned with *@len set to 1.
 *
 * Returns the current char value and its length
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    const unsigned char *cur;
    unsigned char c;
    unsigned int val;

    /* Parser stopped: *len is deliberately left untouched on this path. */
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    /* A pending token takes precedence and consumes no input bytes. */
    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }
    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
        xmlChar * guess;
        xmlCharEncodingHandlerPtr handler;

        /*
         * Assume it's a fixed length encoding (1) with
         * a compatible encoding for the ASCII set, since
         * HTML constructs only use < 128 chars
         */
        if ((int) *ctxt->input->cur < 0x80) {
            *len = 1;
            /* A NUL before the end of the data is invalid: map to space. */
            if ((*ctxt->input->cur == 0) &&
                (ctxt->input->cur < ctxt->input->end)) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                return(' ');
            }
            return((int) *ctxt->input->cur);
        }

        /*
         * Humm this is bad, do an automatic flow conversion
         */
        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            /*
             * htmlFindEncoding() only returns a guess when
             * input->encoding is NULL, so this free is purely defensive.
             */
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            /* The input now owns the guessed string. */
            ctxt->input->encoding = guess;
            handler = xmlFindCharEncodingHandler((const char *) guess);
            if (handler != NULL) {
                /*
                 * Don't use UTF-8 encoder which isn't required and
                 * can produce invalid UTF-8.
                 */
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
                    xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                             "Unsupported encoding %s", guess, NULL);
            }
        }
        /*
         * From here on the input is treated as UTF-8; fall through to the
         * decoder below, which re-reads the current pointer from the input.
         */
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
    }

    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
     *
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000 and above   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *
     * Overlong forms are rejected explicitly below; the range of the
     * decoded value is checked with IS_CHAR().
     */
    cur = ctxt->input->cur;
    c = *cur;
    if (c & 0x80) {
        /* 10xxxxxx is a continuation byte and cannot start a sequence. */
        if ((c & 0x40) == 0)
            goto encoding_error;
        /*
         * A zero byte where a continuation byte is expected may just be
         * the end of the buffered data: try to read more, then reload the
         * pointer since the grow call may have changed it.
         */
        if (cur[1] == 0) {
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
            cur = ctxt->input->cur;
        }
        if ((cur[1] & 0xc0) != 0x80)
            goto encoding_error;
        if ((c & 0xe0) == 0xe0) {

            if (cur[2] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
            if ((cur[2] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xf0) == 0xf0) {
                if (cur[3] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
                /* Lead bytes 0xF8 and above are not valid UTF-8. */
                if (((c & 0xf8) != 0xf0) ||
                    ((cur[3] & 0xc0) != 0x80))
                    goto encoding_error;
                /* 4-byte code */
                *len = 4;
                val = (cur[0] & 0x7) << 18;
                val |= (cur[1] & 0x3f) << 12;
                val |= (cur[2] & 0x3f) << 6;
                val |= cur[3] & 0x3f;
                /* Reject overlong encodings of smaller code points. */
                if (val < 0x10000)
                    goto encoding_error;
            } else {
              /* 3-byte code */
                *len = 3;
                val = (cur[0] & 0xf) << 12;
                val |= (cur[1] & 0x3f) << 6;
                val |= cur[2] & 0x3f;
                /* Reject overlong encodings of smaller code points. */
                if (val < 0x800)
                    goto encoding_error;
            }
        } else {
          /* 2-byte code */
            *len = 2;
            val = (cur[0] & 0x1f) << 6;
            val |= cur[1] & 0x3f;
            /* Reject overlong encodings of ASCII. */
            if (val < 0x80)
                goto encoding_error;
        }
        /* Out-of-range values are reported but still returned as is. */
        if (!IS_CHAR(val)) {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Char 0x%X out of allowed range\n", val);
        }
        return(val);
    } else {
        /* A NUL before the end of the data is invalid: map to space. */
        if ((*ctxt->input->cur == 0) &&
            (ctxt->input->cur < ctxt->input->end)) {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Char 0x%X out of allowed range\n", 0);
            *len = 1;
            return(' ');
        }
        /* 1-byte code */
        *len = 1;
        return((int) *ctxt->input->cur);
    }

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

        /* Dump four bytes only when that many are available in the input. */
        if (ctxt->input->end - ctxt->input->cur >= 4) {
            snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                            ctxt->input->cur[0], ctxt->input->cur[1],
                            ctxt->input->cur[2], ctxt->input->cur[3]);
        } else {
            snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
        }
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                     "Input is not proper UTF-8, indicate encoding !\n",
                     BAD_CAST buffer, NULL);
    }

    /*
     * Don't switch encodings twice. Note that if there's an encoder, we
     * shouldn't receive invalid UTF-8 anyway.
     *
     * Note that if ctxt->input->buf == NULL, switching encodings is
     * impossible, see Gitlab issue #34.
     */
    if ((ctxt->input->buf != NULL) &&
        (ctxt->input->buf->encoder == NULL))
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    /* Hand back the offending byte as a single-byte character. */
    *len = 1;
    return((int) *ctxt->input->cur);
}
 591
 592 /**
 593  * htmlSkipBlankChars:
 594  * @ctxt:  the HTML parser context
 595  *
 596  * skip all blanks character found at that point in the input streams.
 597  *
 598  * Returns the number of space chars skipped
 599  */
 600
 601 static int
 602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 603     int res = 0;
 604
 605     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 606         if ((*ctxt->input->cur == 0) &&
 607             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 608                 xmlPopInput(ctxt);
 609         } else {
 610             if (*(ctxt->input->cur) == '\n') {
 611                 ctxt->input->line++; ctxt->input->col = 1;
 612             } else ctxt->input->col++;
 613             ctxt->input->cur++;
 614             if (*ctxt->input->cur == 0)
 615                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 616         }
 617         if (res < INT_MAX)
 618             res++;
 619     }
 620     return(res);
 621 }
 622
 623
 624
 625 /************************************************************************
 626  *                                                                      *
 627  *      The list of HTML elements and their properties          *
 628  *                                                                      *
 629  ************************************************************************/
 630
 631 /*
 632  *  Start Tag: 1 means the start tag can be omitted
 633  *  End Tag:   1 means the end tag can be omitted
 634  *             2 means it's forbidden (empty elements)
 635  *             3 means the tag is stylistic and should be closed easily
 636  *  Depr:      this element is deprecated
 637  *  DTD:       1 means that this element is valid only in the Loose DTD
 638  *             2 means that this element is valid only in the Frameset DTD
 639  *
 640  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 641         , subElements , impliedsubelt , Attributes, userdata
 642  */
 643
 644 /* Definitions and a couple of vars for HTML Elements */
 645
 646 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 647 #define NB_FONTSTYLE 8
 648 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 649 #define NB_PHRASE 10
 650 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 651 #define NB_SPECIAL 16
 652 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 653 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 654 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 655 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 656 #define FORMCTRL "input", "select", "textarea", "label", "button"
 657 #define NB_FORMCTRL 5
 658 #define PCDATA
 659 #define NB_PCDATA 0
 660 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 661 #define NB_HEADING 6
 662 #define LIST "ul", "ol", "dir", "menu"
 663 #define NB_LIST 4
 664 #define MODIFIER
 665 #define NB_MODIFIER 0
 666 #define FLOW BLOCK,INLINE
 667 #define NB_FLOW NB_BLOCK + NB_INLINE
 668 #define EMPTY NULL
 669
 670
 671 static const char* const html_flow[] = { FLOW, NULL } ;
 672 static const char* const html_inline[] = { INLINE, NULL } ;
 673
 674 /* placeholders: elts with content but no subelements */
 675 static const char* const html_pcdata[] = { NULL } ;
 676 #define html_cdata html_pcdata
 677
 678
 679 /* ... and for HTML Attributes */
 680
 681 #define COREATTRS "id", "class", "style", "title"
 682 #define NB_COREATTRS 4
 683 #define I18N "lang", "dir"
 684 #define NB_I18N 2
 685 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 686 #define NB_EVENTS 9
 687 #define ATTRS COREATTRS,I18N,EVENTS
 688 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 689 #define CELLHALIGN "align", "char", "charoff"
 690 #define NB_CELLHALIGN 3
 691 #define CELLVALIGN "valign"
 692 #define NB_CELLVALIGN 1
 693
 694 static const char* const html_attrs[] = { ATTRS, NULL } ;
 695 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 696 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 697 static const char* const i18n_attrs[] = { I18N, NULL } ;
 698
 699
/*
 * Other declarations that should go inline ...
 *
 * Each array below is a NULL-terminated list of attribute names (*_attrs,
 * *_attr, *_depr) or element names (*_contents, *_elt, inline_p,
 * flow_param). They are not referenced in this part of the file;
 * presumably the element description table further down points at them --
 * TODO confirm.
 */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
/* All inline element names plus "p". */
static const char* const inline_p[] = { INLINE, "p", NULL } ;

/* All flow element names plus "param". */
static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
                "archive", "alt", "name", "height", "width", "align",
                "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
/* All flow element names plus "ins" and "del". */
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
/* compact_attrs is ATTRS plus "compact"; compact_attr is "compact" alone. */
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
 736 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * More NULL-terminated name lists: attribute names in *_attrs, *_attr and
 * *_depr, element names in *_contents, *_content and *_elt. They are not
 * referenced in this part of the file; presumably the element description
 * table further down points at them -- TODO confirm.
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
/* The BLOCK names without "dl" and "form", together with the inline names. */
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
/* label_attrs is an attribute list of its own, distinct from label_attr ("label" alone). */
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* MODIFIER expands to nothing, so "FLOW MODIFIER" contributes just the FLOW names. */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
/* The PHRASE names plus a hand-picked subset of the FONTSTYLE and SPECIAL names. */
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 783 static const char* const tr_elt[] = { "tr", NULL } ;
 784 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 785 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 786 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 787 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 788 static const char* const tr_contents[] = { "th", "td", NULL } ;
 789 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 790 static const char* const li_elt[] = { "li", NULL } ;
 791 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 792 static const char* const dir_attr[] = { "dir", NULL} ;
 793
 794 #define DECL (const char**)
 795
 796 static const htmlElemDesc
 797 html40ElementTable[] = {
 798 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 799         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 800 },
 801 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 802         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 803 },
 804 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 805         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 806 },
 807 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 808         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 809 },
 810 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 811         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 812 },
 813 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 814         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 815 },
 816 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 817         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 818 },
 819 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 820         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 821 },
 822 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 823         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 824 },
 825 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 826         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 827 },
 828 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 829         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 830 },
 831 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 832         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 833 },
 834 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 835         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 836 },
 837 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 838         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 839 },
 840 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 841         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 842 },
 843 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 844         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 845 },
 846 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 847         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 848 },
 849 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 850         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 851 },
 852 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 853         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 854 },
 855 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 856         EMPTY , NULL , DECL col_attrs , NULL, NULL
 857 },
 858 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 859         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 860 },
 861 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 862         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 863 },
 864 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 865         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 866 },
 867 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 868         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 869 },
 870 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 871         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 872 },
 873 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 874         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 875 },
 876 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 877         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
 878 },
 879 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 880         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 881 },
 882 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 883         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 884 },
 885 { "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
 886         EMPTY, NULL, DECL embed_attrs, NULL, NULL
 887 },
 888 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 889         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 890 },
 891 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 892         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 893 },
 894 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 895         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 896 },
 897 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 898         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 899 },
 900 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 901         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 902 },
 903 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 904         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 905 },
 906 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 907         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 908 },
 909 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 910         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 911 },
 912 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 913         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 914 },
 915 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 916         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 917 },
 918 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 919         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 920 },
 921 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 922         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 923 },
 924 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 925         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 926 },
 927 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 928         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 929 },
 930 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 931         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 932 },
 933 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 934         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 935 },
 936 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 937         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
 938 },
 939 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 940         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 941 },
 942 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 943         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 944 },
 945 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 946         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 947 },
 948 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 949         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 950 },
 951 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 952         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 953 },
 954 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 955         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 956 },
 957 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 958         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 959 },
 960 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 961         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 962 },
 963 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 964         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
 965 },
 966 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 967         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 968 },
 969 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 970         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 971 },
 972 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 973         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 974 },
 975 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 976         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 977 },
 978 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 979         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 980 },
 981 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 982         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 983 },
 984 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 985         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 986 },
 987 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 988         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 989 },
 990 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 991         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 992 },
 993 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 994         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
 995 },
 996 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 997         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 998 },
 999 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
1000         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1001 },
1002 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1003         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1004 },
1005 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1006         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007 },
1008 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
1009         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1010 },
1011 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
1012         DECL select_content, NULL, DECL select_attrs, NULL, NULL
1013 },
1014 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
1015         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1016 },
1017 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1018         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1019 },
1020 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
1021         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1022 },
1023 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1024         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1025 },
1026 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
1027         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1028 },
1029 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
1030         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1031 },
1032 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
1033         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1034 },
1035 { "table",      0, 0, 0, 0, 0, 0, 0, "",
1036         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1037 },
1038 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
1039         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1040 },
1041 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
1042         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1043 },
1044 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1045         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1046 },
1047 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
1048         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1049 },
1050 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
1051         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1052 },
1053 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
1054         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1055 },
1056 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
1057         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1058 },
1059 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
1060         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1061 },
1062 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1063         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1064 },
1065 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
1066         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1067 },
1068 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
1069         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1070 },
1071 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1072         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1073 }
1074 };
1075
/*
 * One auto-close rule: while an element named oldTag is the current open
 * element, a start tag named newTag implies that oldTag must be closed
 * (see htmlCheckAutoClose() / htmlAutoClose()).
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The array must stay sorted by oldTag and then by newTag in strcmp()
 * order: htmlCheckAutoClose() looks pairs up with bsearch() using
 * htmlCompareStartClose().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i, legend, li, link */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table sections, rows and cells */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt, u, ul */
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1337
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1350
/*
 * The HTML attributes whose content is of type %Script;.
 * This array has no NULL terminator, so its length has to be derived
 * with sizeof.
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    "onkeypress", "onkeydown", "onkeyup",
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1376
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Giving elements different priorities lets the parser decide what to do
 * with a stray end tag: an end tag may only close elements whose priority
 * is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The final entry (NULL name) both terminates the list and carries the
 * priority htmlGetEndPriority() returns for any name not listed here.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1404
1405 /************************************************************************
1406  *                                                                      *
1407  *      functions to handle HTML specific data                  *
1408  *                                                                      *
1409  ************************************************************************/
1410
/**
 * htmlInitAutoClose:
 *
 * Does nothing: the function body is intentionally empty.  The auto-close
 * rules (htmlStartClose) are a statically initialised table, so there is
 * nothing to set up at run time.
 */
void
htmlInitAutoClose(void)
{
}
1419
1420 static int
1421 htmlCompareTags(const void *key, const void *member) {
1422     const xmlChar *tag = (const xmlChar *) key;
1423     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1424
1425     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1426 }
1427
1428 /**
1429  * htmlTagLookup:
1430  * @tag:  The tag name in lowercase
1431  *
1432  * Lookup the HTML tag in the ElementTable
1433  *
1434  * Returns the related htmlElemDescPtr or NULL if not found.
1435  */
1436 const htmlElemDesc *
1437 htmlTagLookup(const xmlChar *tag) {
1438     if (tag == NULL)
1439         return(NULL);
1440
1441     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1442                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1443                 sizeof(htmlElemDesc), htmlCompareTags));
1444 }
1445
1446 /**
1447  * htmlGetEndPriority:
1448  * @name: The name of the element to look up the priority for.
1449  *
1450  * Return value: The "endtag" priority.
1451  **/
1452 static int
1453 htmlGetEndPriority (const xmlChar *name) {
1454     int i = 0;
1455
1456     while ((htmlEndPriority[i].name != NULL) &&
1457            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1458         i++;
1459
1460     return(htmlEndPriority[i].priority);
1461 }
1462
1463
1464 static int
1465 htmlCompareStartClose(const void *vkey, const void *member) {
1466     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1467     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1468     int ret;
1469
1470     ret = strcmp(key->oldTag, entry->oldTag);
1471     if (ret == 0)
1472         ret = strcmp(key->newTag, entry->newTag);
1473
1474     return(ret);
1475 }
1476
1477 /**
1478  * htmlCheckAutoClose:
1479  * @newtag:  The new tag name
1480  * @oldtag:  The old tag name
1481  *
1482  * Checks whether the new tag is one of the registered valid tags for
1483  * closing old.
1484  *
1485  * Returns 0 if no, 1 if yes.
1486  */
1487 static int
1488 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1489 {
1490     htmlStartCloseEntry key;
1491     void *res;
1492
1493     key.oldTag = (const char *) oldtag;
1494     key.newTag = (const char *) newtag;
1495     res = bsearch(&key, htmlStartClose,
1496             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1497             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1498     return(res != NULL);
1499 }
1500
1501 /**
1502  * htmlAutoCloseOnClose:
1503  * @ctxt:  an HTML parser context
1504  * @newtag:  The new tag name
1505  * @force:  force the tag closure
1506  *
1507  * The HTML DTD allows an ending tag to implicitly close other tags.
1508  */
1509 static void
1510 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1511 {
1512     const htmlElemDesc *info;
1513     int i, priority;
1514
1515     priority = htmlGetEndPriority(newtag);
1516
1517     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1518
1519         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1520             break;
1521         /*
1522          * A misplaced endtag can only close elements with lower
1523          * or equal priority, so if we find an element with higher
1524          * priority before we find an element with
1525          * matching name, we just ignore this endtag
1526          */
1527         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1528             return;
1529     }
1530     if (i < 0)
1531         return;
1532
1533     while (!xmlStrEqual(newtag, ctxt->name)) {
1534         info = htmlTagLookup(ctxt->name);
1535         if ((info != NULL) && (info->endTag == 3)) {
1536             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1537                          "Opening and ending tag mismatch: %s and %s\n",
1538                          newtag, ctxt->name);
1539         }
1540         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542         htmlnamePop(ctxt);
1543     }
1544 }
1545
1546 /**
1547  * htmlAutoCloseOnEnd:
1548  * @ctxt:  an HTML parser context
1549  *
1550  * Close all remaining tags at the end of the stream
1551  */
1552 static void
1553 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1554 {
1555     int i;
1556
1557     if (ctxt->nameNr == 0)
1558         return;
1559     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1560         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1561             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1562         htmlnamePop(ctxt);
1563     }
1564 }
1565
1566 /**
1567  * htmlAutoClose:
1568  * @ctxt:  an HTML parser context
1569  * @newtag:  The new tag name or NULL
1570  *
1571  * The HTML DTD allows a tag to implicitly close other tags.
1572  * The list is kept in htmlStartClose array. This function is
1573  * called when a new tag has been detected and generates the
1574  * appropriates closes if possible/needed.
1575  * If newtag is NULL this mean we are at the end of the resource
1576  * and we should check
1577  */
1578 static void
1579 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1580 {
1581     while ((newtag != NULL) && (ctxt->name != NULL) &&
1582            (htmlCheckAutoClose(newtag, ctxt->name))) {
1583         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1584             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1585         htmlnamePop(ctxt);
1586     }
1587     if (newtag == NULL) {
1588         htmlAutoCloseOnEnd(ctxt);
1589         return;
1590     }
1591     while ((newtag == NULL) && (ctxt->name != NULL) &&
1592            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1593             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1594             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1595         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1596             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1597         htmlnamePop(ctxt);
1598     }
1599 }
1600
1601 /**
1602  * htmlAutoCloseTag:
1603  * @doc:  the HTML document
1604  * @name:  The tag name
1605  * @elem:  the HTML element
1606  *
1607  * The HTML DTD allows a tag to implicitly close other tags.
1608  * The list is kept in htmlStartClose array. This function checks
1609  * if the element or one of it's children would autoclose the
1610  * given tag.
1611  *
1612  * Returns 1 if autoclose, 0 otherwise
1613  */
1614 int
1615 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1616     htmlNodePtr child;
1617
1618     if (elem == NULL) return(1);
1619     if (xmlStrEqual(name, elem->name)) return(0);
1620     if (htmlCheckAutoClose(elem->name, name)) return(1);
1621     child = elem->children;
1622     while (child != NULL) {
1623         if (htmlAutoCloseTag(doc, name, child)) return(1);
1624         child = child->next;
1625     }
1626     return(0);
1627 }
1628
1629 /**
1630  * htmlIsAutoClosed:
1631  * @doc:  the HTML document
1632  * @elem:  the HTML element
1633  *
1634  * The HTML DTD allows a tag to implicitly close other tags.
1635  * The list is kept in htmlStartClose array. This function checks
1636  * if a tag is autoclosed by one of it's child
1637  *
1638  * Returns 1 if autoclosed, 0 otherwise
1639  */
1640 int
1641 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1642     htmlNodePtr child;
1643
1644     if (elem == NULL) return(1);
1645     child = elem->children;
1646     while (child != NULL) {
1647         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1648         child = child->next;
1649     }
1650     return(0);
1651 }
1652
1653 /**
1654  * htmlCheckImplied:
1655  * @ctxt:  an HTML parser context
1656  * @newtag:  The new tag name
1657  *
1658  * The HTML DTD allows a tag to exists only implicitly
1659  * called when a new tag has been detected and generates the
1660  * appropriates implicit tags if missing
1661  */
1662 static void
1663 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1664     int i;
1665
1666     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1667         return;
1668     if (!htmlOmittedDefaultValue)
1669         return;
1670     if (xmlStrEqual(newtag, BAD_CAST"html"))
1671         return;
1672     if (ctxt->nameNr <= 0) {
1673         htmlnamePush(ctxt, BAD_CAST"html");
1674         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1675             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1676     }
1677     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1678         return;
1679     if ((ctxt->nameNr <= 1) &&
1680         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1681          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1682          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1683          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1684          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1685          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1686         if (ctxt->html >= 3) {
1687             /* we already saw or generated an <head> before */
1688             return;
1689         }
1690         /*
1691          * dropped OBJECT ... i you put it first BODY will be
1692          * assumed !
1693          */
1694         htmlnamePush(ctxt, BAD_CAST"head");
1695         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1696             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1697     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1698                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1699                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1700         if (ctxt->html >= 10) {
1701             /* we already saw or generated a <body> before */
1702             return;
1703         }
1704         for (i = 0;i < ctxt->nameNr;i++) {
1705             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1706                 return;
1707             }
1708             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1709                 return;
1710             }
1711         }
1712
1713         htmlnamePush(ctxt, BAD_CAST"body");
1714         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1715             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1716     }
1717 }
1718
1719 /**
1720  * htmlCheckParagraph
1721  * @ctxt:  an HTML parser context
1722  *
1723  * Check whether a p element need to be implied before inserting
1724  * characters in the current element.
1725  *
1726  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1727  *         in case of error.
1728  */
1729
1730 static int
1731 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1732     const xmlChar *tag;
1733     int i;
1734
1735     if (ctxt == NULL)
1736         return(-1);
1737     tag = ctxt->name;
1738     if (tag == NULL) {
1739         htmlAutoClose(ctxt, BAD_CAST"p");
1740         htmlCheckImplied(ctxt, BAD_CAST"p");
1741         htmlnamePush(ctxt, BAD_CAST"p");
1742         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1743             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1744         return(1);
1745     }
1746     if (!htmlOmittedDefaultValue)
1747         return(0);
1748     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1749         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1750             htmlAutoClose(ctxt, BAD_CAST"p");
1751             htmlCheckImplied(ctxt, BAD_CAST"p");
1752             htmlnamePush(ctxt, BAD_CAST"p");
1753             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1754                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1755             return(1);
1756         }
1757     }
1758     return(0);
1759 }
1760
1761 /**
1762  * htmlIsScriptAttribute:
1763  * @name:  an attribute name
1764  *
1765  * Check if an attribute is of content type Script
1766  *
1767  * Returns 1 is the attribute is a script 0 otherwise
1768  */
1769 int
1770 htmlIsScriptAttribute(const xmlChar *name) {
1771     unsigned int i;
1772
1773     if (name == NULL)
1774       return(0);
1775     /*
1776      * all script attributes start with 'on'
1777      */
1778     if ((name[0] != 'o') || (name[1] != 'n'))
1779       return(0);
1780     for (i = 0;
1781          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1782          i++) {
1783         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1784             return(1);
1785     }
1786     return(0);
1787 }
1788
1789 /************************************************************************
1790  *                                                                      *
1791  *      The list of HTML predefined entities                    *
1792  *                                                                      *
1793  ************************************************************************/
1794
1795
/*
 * Table of the predefined HTML entities. Each entry holds, in order,
 * the entity's Unicode value, its name (without the surrounding '&'
 * and ';') and a human-readable description string.
 *
 * NOTE: entries must stay sorted by ascending value:
 * htmlEntityValueLookup() stops scanning at the first entry whose
 * value is greater than the one it is looking for.
 */
1796 static const htmlEntityDesc  html40EntitiesTable[] = {
1797 /*
1798  * the 4 absolute ones, plus apostrophe.
1799  */
1800 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1801 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1802 { 39,   "apos", "single quote" },
1803 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1804 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1805
1806 /*
1807  * A bunch still in the 128-255 range.
1808  * Replacing them really depends on the charset used.
1809  */
1810 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1811 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1812 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1813 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1814 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1815 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1816 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1817 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1818 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1819 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1820 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1821 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1822 { 172,  "not",  "not sign, U+00AC ISOnum" },
1823 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1824 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1825 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1826 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1827 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1828 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1829 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1830 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1831 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1832 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1833 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1834 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1835 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1836 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1837 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1838 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1839 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1840 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1841 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1842 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1843 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1844 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1845 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1846 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1847 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1848 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1849 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1850 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1851 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1852 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1853 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1854 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1855 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1856 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1857 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1858 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1859 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1860 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1861 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1862 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1863 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1864 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1865 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1866 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1867 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1868 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1869 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1870 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1871 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1872 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1873 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1874 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1875 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1876 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1877 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1878 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1879 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1880 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1881 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1882 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1883 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1884 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1885 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1886 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1887 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1888 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1889 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1890 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1891 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1892 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1893 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1894 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1895 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1896 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1897 { 247,  "divide","division sign, U+00F7 ISOnum" },
1898 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1899 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1900 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1901 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1902 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1903 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1904 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1905 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1906
1907 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1908 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1909 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1910 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1911 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1912
1913 /*
1914  * Anything below should really be kept as entity references
1915  */
1916 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1917
1918 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1919 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1920
1921 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1922 { 914,  "Beta", "greek capital letter beta, U+0392" },
1923 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1924 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1925 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1926 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1927 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1928 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1929 { 921,  "Iota", "greek capital letter iota, U+0399" },
1930 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1931 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1932 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1933 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1934 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1935 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1936 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1937 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1938 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1939 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1940 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1941 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1942 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1943 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1944 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1945
1946 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1947 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1948 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1949 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1950 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1951 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1952 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1953 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1954 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1955 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1956 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1957 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1958 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1959 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1960 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1961 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1962 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1963 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1964 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1965 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1966 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1967 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1968 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1969 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1970 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1971 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1972 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1973 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1974
1975 { 8194, "ensp", "en space, U+2002 ISOpub" },
1976 { 8195, "emsp", "em space, U+2003 ISOpub" },
1977 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1978 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1979 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1980 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1981 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1982 { 8211, "ndash","en dash, U+2013 ISOpub" },
1983 { 8212, "mdash","em dash, U+2014 ISOpub" },
1984 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1985 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1986 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1987 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1988 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1989 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1990 { 8224, "dagger","dagger, U+2020 ISOpub" },
1991 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1992
1993 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1994 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1995
1996 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1997
1998 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1999 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
2000
2001 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2002 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2003
2004 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
2005 { 8260, "frasl","fraction slash, U+2044 NEW" },
2006
2007 { 8364, "euro", "euro sign, U+20AC NEW" },
2008
2009 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2010 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2011 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
2012 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
2013 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2014 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2015 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2016 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2017 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2018 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2019 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2020 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2021 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2022 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2023 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2024 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2025
2026 { 8704, "forall","for all, U+2200 ISOtech" },
2027 { 8706, "part", "partial differential, U+2202 ISOtech" },
2028 { 8707, "exist","there exists, U+2203 ISOtech" },
2029 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2030 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2031 { 8712, "isin", "element of, U+2208 ISOtech" },
2032 { 8713, "notin","not an element of, U+2209 ISOtech" },
2033 { 8715, "ni",   "contains as member, U+220B ISOtech" },
2034 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2035 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
2036 { 8722, "minus","minus sign, U+2212 ISOtech" },
2037 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2038 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2039 { 8733, "prop", "proportional to, U+221D ISOtech" },
2040 { 8734, "infin","infinity, U+221E ISOtech" },
2041 { 8736, "ang",  "angle, U+2220 ISOamso" },
2042 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
2043 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
2044 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
2045 { 8746, "cup",  "union = cup, U+222A ISOtech" },
2046 { 8747, "int",  "integral, U+222B ISOtech" },
2047 { 8756, "there4","therefore, U+2234 ISOtech" },
2048 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
2049 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2050 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2051 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
2052 { 8801, "equiv","identical to, U+2261 ISOtech" },
2053 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
2054 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
2055 { 8834, "sub",  "subset of, U+2282 ISOtech" },
2056 { 8835, "sup",  "superset of, U+2283 ISOtech" },
2057 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2058 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2059 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2060 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2061 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2062 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2063 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2064 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2065 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2066 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2067 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2068 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2069 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2070 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
2071
2072 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2073 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2074 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2075 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2076
2077 };
2078
2079 /************************************************************************
2080  *                                                                      *
2081  *              Commodity functions to handle entities                  *
2082  *                                                                      *
2083  ************************************************************************/
2084
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to that many xmlChar. When the reallocation fails the error is
 * reported through htmlErrMemory(), the old buffer is freed and the
 * ENCLOSING function returns NULL.
 *
 * The expansion relies on names supplied by the call site: a `ctxt`
 * variable and a `<buffer>_size` variable must be in scope.
 *
 * The body is wrapped in do { ... } while (0) so that
 * "growBuffer(buf);" behaves as exactly one statement, including
 * inside an unbraced if/else. Invocations must end with a semicolon.
 */
#define growBuffer(buffer) do {                                         \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
} while (0)
2099
2100 /**
2101  * htmlEntityLookup:
2102  * @name: the entity name
2103  *
2104  * Lookup the given entity in EntitiesTable
2105  *
2106  * TODO: the linear scan is really ugly, an hash table is really needed.
2107  *
2108  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2109  */
2110 const htmlEntityDesc *
2111 htmlEntityLookup(const xmlChar *name) {
2112     unsigned int i;
2113
2114     for (i = 0;i < (sizeof(html40EntitiesTable)/
2115                     sizeof(html40EntitiesTable[0]));i++) {
2116         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2117             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2118         }
2119     }
2120     return(NULL);
2121 }
2122
2123 /**
2124  * htmlEntityValueLookup:
2125  * @value: the entity's unicode value
2126  *
2127  * Lookup the given entity in EntitiesTable
2128  *
2129  * TODO: the linear scan is really ugly, an hash table is really needed.
2130  *
2131  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2132  */
2133 const htmlEntityDesc *
2134 htmlEntityValueLookup(unsigned int value) {
2135     unsigned int i;
2136
2137     for (i = 0;i < (sizeof(html40EntitiesTable)/
2138                     sizeof(html40EntitiesTable[0]));i++) {
2139         if (html40EntitiesTable[i].value >= value) {
2140             if (html40EntitiesTable[i].value > value)
2141                 break;
2142             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2143         }
2144     }
2145     return(NULL);
2146 }
2147
2148 /**
2149  * UTF8ToHtml:
2150  * @out:  a pointer to an array of bytes to store the result
2151  * @outlen:  the length of @out
2152  * @in:  a pointer to an array of UTF-8 chars
2153  * @inlen:  the length of @in
2154  *
2155  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2156  * plus HTML entities block of chars out.
2157  *
2158  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2159  * The value of @inlen after return is the number of octets consumed
2160  *     as the return value is positive, else unpredictable.
2161  * The value of @outlen after return is the number of octets consumed.
2162  */
2163 int
2164 UTF8ToHtml(unsigned char* out, int *outlen,
2165               const unsigned char* in, int *inlen) {
2166     const unsigned char* processed = in;
2167     const unsigned char* outend;
2168     const unsigned char* outstart = out;
2169     const unsigned char* instart = in;
2170     const unsigned char* inend;
2171     unsigned int c, d;
2172     int trailing;
2173
2174     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2175     if (in == NULL) {
2176         /*
2177          * initialization nothing to do
2178          */
2179         *outlen = 0;
2180         *inlen = 0;
2181         return(0);
2182     }
2183     inend = in + (*inlen);
2184     outend = out + (*outlen);
2185     while (in < inend) {
2186         d = *in++;
2187         if      (d < 0x80)  { c= d; trailing= 0; }
2188         else if (d < 0xC0) {
2189             /* trailing byte in leading position */
2190             *outlen = out - outstart;
2191             *inlen = processed - instart;
2192             return(-2);
2193         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2194         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2195         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2196         else {
2197             /* no chance for this in Ascii */
2198             *outlen = out - outstart;
2199             *inlen = processed - instart;
2200             return(-2);
2201         }
2202
2203         if (inend - in < trailing) {
2204             break;
2205         }
2206
2207         for ( ; trailing; trailing--) {
2208             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2209                 break;
2210             c <<= 6;
2211             c |= d & 0x3F;
2212         }
2213
2214         /* assertion: c is a single UTF-4 value */
2215         if (c < 0x80) {
2216             if (out + 1 >= outend)
2217                 break;
2218             *out++ = c;
2219         } else {
2220             int len;
2221             const htmlEntityDesc * ent;
2222             const char *cp;
2223             char nbuf[16];
2224
2225             /*
2226              * Try to lookup a predefined HTML entity for it
2227              */
2228
2229             ent = htmlEntityValueLookup(c);
2230             if (ent == NULL) {
2231               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2232               cp = nbuf;
2233             }
2234             else
2235               cp = ent->name;
2236             len = strlen(cp);
2237             if (out + 2 + len >= outend)
2238                 break;
2239             *out++ = '&';
2240             memcpy(out, cp, len);
2241             out += len;
2242             *out++ = ';';
2243         }
2244         processed = in;
2245     }
2246     *outlen = out - outstart;
2247     *inlen = processed - instart;
2248     return(0);
2249 }
2250
2251 /**
2252  * htmlEncodeEntities:
2253  * @out:  a pointer to an array of bytes to store the result
2254  * @outlen:  the length of @out
2255  * @in:  a pointer to an array of UTF-8 chars
2256  * @inlen:  the length of @in
2257  * @quoteChar: the quote character to escape (' or ") or zero.
2258  *
2259  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2260  * plus HTML entities block of chars out.
2261  *
2262  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2263  * The value of @inlen after return is the number of octets consumed
2264  *     as the return value is positive, else unpredictable.
2265  * The value of @outlen after return is the number of octets consumed.
2266  */
2267 int
2268 htmlEncodeEntities(unsigned char* out, int *outlen,
2269                    const unsigned char* in, int *inlen, int quoteChar) {
2270     const unsigned char* processed = in;
2271     const unsigned char* outend;
2272     const unsigned char* outstart = out;
2273     const unsigned char* instart = in;
2274     const unsigned char* inend;
2275     unsigned int c, d;
2276     int trailing;
2277
2278     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2279         return(-1);
2280     outend = out + (*outlen);
2281     inend = in + (*inlen);
2282     while (in < inend) {
2283         d = *in++;
2284         if      (d < 0x80)  { c= d; trailing= 0; }
2285         else if (d < 0xC0) {
2286             /* trailing byte in leading position */
2287             *outlen = out - outstart;
2288             *inlen = processed - instart;
2289             return(-2);
2290         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2291         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2292         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2293         else {
2294             /* no chance for this in Ascii */
2295             *outlen = out - outstart;
2296             *inlen = processed - instart;
2297             return(-2);
2298         }
2299
2300         if (inend - in < trailing)
2301             break;
2302
2303         while (trailing--) {
2304             if (((d= *in++) & 0xC0) != 0x80) {
2305                 *outlen = out - outstart;
2306                 *inlen = processed - instart;
2307                 return(-2);
2308             }
2309             c <<= 6;
2310             c |= d & 0x3F;
2311         }
2312
2313         /* assertion: c is a single UTF-4 value */
2314         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2315             (c != '&') && (c != '<') && (c != '>')) {
2316             if (out >= outend)
2317                 break;
2318             *out++ = c;
2319         } else {
2320             const htmlEntityDesc * ent;
2321             const char *cp;
2322             char nbuf[16];
2323             int len;
2324
2325             /*
2326              * Try to lookup a predefined HTML entity for it
2327              */
2328             ent = htmlEntityValueLookup(c);
2329             if (ent == NULL) {
2330                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2331                 cp = nbuf;
2332             }
2333             else
2334                 cp = ent->name;
2335             len = strlen(cp);
2336             if (out + 2 + len > outend)
2337                 break;
2338             *out++ = '&';
2339             memcpy(out, cp, len);
2340             out += len;
2341             *out++ = ';';
2342         }
2343         processed = in;
2344     }
2345     *outlen = out - outstart;
2346     *inlen = processed - instart;
2347     return(0);
2348 }
2349
2350 /************************************************************************
2351  *                                                                      *
2352  *              Commodity functions to handle streams                   *
2353  *                                                                      *
2354  ************************************************************************/
2355
2356 #ifdef LIBXML_PUSH_ENABLED
2357 /**
2358  * htmlNewInputStream:
2359  * @ctxt:  an HTML parser context
2360  *
2361  * Create a new input stream structure
2362  * Returns the new input stream or NULL
2363  */
2364 static htmlParserInputPtr
2365 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2366     htmlParserInputPtr input;
2367
2368     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2369     if (input == NULL) {
2370         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2371         return(NULL);
2372     }
2373     memset(input, 0, sizeof(htmlParserInput));
2374     input->filename = NULL;
2375     input->directory = NULL;
2376     input->base = NULL;
2377     input->cur = NULL;
2378     input->buf = NULL;
2379     input->line = 1;
2380     input->col = 1;
2381     input->buf = NULL;
2382     input->free = NULL;
2383     input->version = NULL;
2384     input->consumed = 0;
2385     input->length = 0;
2386     return(input);
2387 }
2388 #endif
2389
2390
2391 /************************************************************************
2392  *                                                                      *
2393  *              Commodity functions, cleanup needed ?                   *
2394  *                                                                      *
2395  ************************************************************************/
/*
 * Names of all the elements allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: this information would arguably be better kept in the
 * html40ElementTable array, but it lives in a separate table to avoid
 * any risk of binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2410
2411 /**
2412  * areBlanks:
2413  * @ctxt:  an HTML parser context
2414  * @str:  a xmlChar *
2415  * @len:  the size of @str
2416  *
2417  * Is this a sequence of blank chars that one can ignore ?
2418  *
2419  * Returns 1 if ignorable 0 otherwise.
2420  */
2421
2422 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2423     unsigned int i;
2424     int j;
2425     xmlNodePtr lastChild;
2426     xmlDtdPtr dtd;
2427
2428     for (j = 0;j < len;j++)
2429         if (!(IS_BLANK_CH(str[j]))) return(0);
2430
2431     if (CUR == 0) return(1);
2432     if (CUR != '<') return(0);
2433     if (ctxt->name == NULL)
2434         return(1);
2435     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2436         return(1);
2437     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2438         return(1);
2439
2440     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2441     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2442         dtd = xmlGetIntSubset(ctxt->myDoc);
2443         if (dtd != NULL && dtd->ExternalID != NULL) {
2444             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2445                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2446                 return(1);
2447         }
2448     }
2449
2450     if (ctxt->node == NULL) return(0);
2451     lastChild = xmlGetLastChild(ctxt->node);
2452     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2453         lastChild = lastChild->prev;
2454     if (lastChild == NULL) {
2455         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2456             (ctxt->node->content != NULL)) return(0);
2457         /* keep ws in constructs like ...<b> </b>...
2458            for all tags "b" allowing PCDATA */
2459         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2460             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2461                 return(0);
2462             }
2463         }
2464     } else if (xmlNodeIsText(lastChild)) {
2465         return(0);
2466     } else {
2467         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2468            for all tags "p" allowing PCDATA */
2469         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2470             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2471                 return(0);
2472             }
2473         }
2474     }
2475     return(1);
2476 }
2477
2478 /**
2479  * htmlNewDocNoDtD:
2480  * @URI:  URI for the dtd, or NULL
2481  * @ExternalID:  the external ID of the DTD, or NULL
2482  *
2483  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2484  * are NULL
2485  *
2486  * Returns a new document, do not initialize the DTD if not provided
2487  */
2488 htmlDocPtr
2489 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2490     xmlDocPtr cur;
2491
2492     /*
2493      * Allocate a new document and fill the fields.
2494      */
2495     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2496     if (cur == NULL) {
2497         htmlErrMemory(NULL, "HTML document creation failed\n");
2498         return(NULL);
2499     }
2500     memset(cur, 0, sizeof(xmlDoc));
2501
2502     cur->type = XML_HTML_DOCUMENT_NODE;
2503     cur->version = NULL;
2504     cur->intSubset = NULL;
2505     cur->doc = cur;
2506     cur->name = NULL;
2507     cur->children = NULL;
2508     cur->extSubset = NULL;
2509     cur->oldNs = NULL;
2510     cur->encoding = NULL;
2511     cur->standalone = 1;
2512     cur->compression = 0;
2513     cur->ids = NULL;
2514     cur->refs = NULL;
2515     cur->_private = NULL;
2516     cur->charset = XML_CHAR_ENCODING_UTF8;
2517     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2518     if ((ExternalID != NULL) ||
2519         (URI != NULL))
2520         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2521     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2522         xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2523     return(cur);
2524 }
2525
2526 /**
2527  * htmlNewDoc:
2528  * @URI:  URI for the dtd, or NULL
2529  * @ExternalID:  the external ID of the DTD, or NULL
2530  *
2531  * Creates a new HTML document
2532  *
2533  * Returns a new document
2534  */
2535 htmlDocPtr
2536 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2537     if ((URI == NULL) && (ExternalID == NULL))
2538         return(htmlNewDocNoDtD(
2539                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2540                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2541
2542     return(htmlNewDocNoDtD(URI, ExternalID));
2543 }
2544
2545
2546 /************************************************************************
2547  *                                                                      *
2548  *                      The parser itself                               *
2549  *      Relates to http://www.w3.org/TR/html40                          *
2550  *                                                                      *
2551  ************************************************************************/
2552
2553 /************************************************************************
2554  *                                                                      *
2555  *                      The parser itself                               *
2556  *                                                                      *
2557  ************************************************************************/
2558
2559 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2560
2561 /**
2562  * htmlParseHTMLName:
2563  * @ctxt:  an HTML parser context
2564  *
2565  * parse an HTML tag or attribute name, note that we convert it to lowercase
2566  * since HTML names are not case-sensitive.
2567  *
2568  * Returns the Tag Name parsed or NULL
2569  */
2570
2571 static const xmlChar *
2572 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2573     int i = 0;
2574     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2575
2576     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2577         (CUR != ':') && (CUR != '.')) return(NULL);
2578
2579     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2580            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2581            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2582            (CUR == '.'))) {
2583         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2584         else loc[i] = CUR;
2585         i++;
2586
2587         NEXT;
2588     }
2589
2590     return(xmlDictLookup(ctxt->dict, loc, i));
2591 }
2592
2593
2594 /**
2595  * htmlParseHTMLName_nonInvasive:
2596  * @ctxt:  an HTML parser context
2597  *
2598  * parse an HTML tag or attribute name, note that we convert it to lowercase
2599  * since HTML names are not case-sensitive, this doesn't consume the data
2600  * from the stream, it's a look-ahead
2601  *
2602  * Returns the Tag Name parsed or NULL
2603  */
2604
2605 static const xmlChar *
2606 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2607     int i = 0;
2608     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2609
2610     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2611         (NXT(1) != ':')) return(NULL);
2612
2613     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2614            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2615            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2616         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2617         else loc[i] = NXT(1+i);
2618         i++;
2619     }
2620
2621     return(xmlDictLookup(ctxt->dict, loc, i));
2622 }
2623
2624
2625 /**
2626  * htmlParseName:
2627  * @ctxt:  an HTML parser context
2628  *
2629  * parse an HTML name, this routine is case sensitive.
2630  *
2631  * Returns the Name parsed or NULL
2632  */
2633
2634 static const xmlChar *
2635 htmlParseName(htmlParserCtxtPtr ctxt) {
2636     const xmlChar *in;
2637     const xmlChar *ret;
2638     int count = 0;
2639
2640     GROW;
2641
2642     /*
2643      * Accelerator for simple ASCII names
2644      */
2645     in = ctxt->input->cur;
2646     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2647         ((*in >= 0x41) && (*in <= 0x5A)) ||
2648         (*in == '_') || (*in == ':')) {
2649         in++;
2650         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2651                ((*in >= 0x41) && (*in <= 0x5A)) ||
2652                ((*in >= 0x30) && (*in <= 0x39)) ||
2653                (*in == '_') || (*in == '-') ||
2654                (*in == ':') || (*in == '.'))
2655             in++;
2656
2657         if (in == ctxt->input->end)
2658             return(NULL);
2659
2660         if ((*in > 0) && (*in < 0x80)) {
2661             count = in - ctxt->input->cur;
2662             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2663             ctxt->input->cur = in;
2664             ctxt->input->col += count;
2665             return(ret);
2666         }
2667     }
2668     return(htmlParseNameComplex(ctxt));
2669 }
2670
2671 static const xmlChar *
2672 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2673     int len = 0, l;
2674     int c;
2675     int count = 0;
2676     const xmlChar *base = ctxt->input->base;
2677
2678     /*
2679      * Handler for more complex cases
2680      */
2681     GROW;
2682     c = CUR_CHAR(l);
2683     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2684         (!IS_LETTER(c) && (c != '_') &&
2685          (c != ':'))) {
2686         return(NULL);
2687     }
2688
2689     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2690            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2691             (c == '.') || (c == '-') ||
2692             (c == '_') || (c == ':') ||
2693             (IS_COMBINING(c)) ||
2694             (IS_EXTENDER(c)))) {
2695         if (count++ > 100) {
2696             count = 0;
2697             GROW;
2698         }
2699         len += l;
2700         NEXTL(l);
2701         c = CUR_CHAR(l);
2702         if (ctxt->input->base != base) {
2703             /*
2704              * We changed encoding from an unknown encoding
2705              * Input buffer changed location, so we better start again
2706              */
2707             return(htmlParseNameComplex(ctxt));
2708         }
2709     }
2710
2711     if (ctxt->input->cur - ctxt->input->base < len) {
2712         /* Sanity check */
2713         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2714                      "unexpected change of input buffer", NULL, NULL);
2715         return (NULL);
2716     }
2717
2718     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2719 }
2720
2721
2722 /**
2723  * htmlParseHTMLAttribute:
2724  * @ctxt:  an HTML parser context
2725  * @stop:  a char stop value
2726  *
2727  * parse an HTML attribute value till the stop (quote), if
2728  * stop is 0 then it stops at the first space
2729  *
2730  * Returns the attribute parsed or NULL
2731  */
2732
2733 static xmlChar *
2734 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2735     xmlChar *buffer = NULL;
2736     int buffer_size = 0;
2737     xmlChar *out = NULL;
2738     const xmlChar *name = NULL;
2739     const xmlChar *cur = NULL;
2740     const htmlEntityDesc * ent;
2741
2742     /*
2743      * allocate a translation buffer.
2744      */
2745     buffer_size = HTML_PARSER_BUFFER_SIZE;
2746     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2747     if (buffer == NULL) {
2748         htmlErrMemory(ctxt, "buffer allocation failed\n");
2749         return(NULL);
2750     }
2751     out = buffer;
2752
2753     /*
2754      * Ok loop until we reach one of the ending chars
2755      */
2756     while ((CUR != 0) && (CUR != stop)) {
2757         if ((stop == 0) && (CUR == '>')) break;
2758         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2759         if (CUR == '&') {
2760             if (NXT(1) == '#') {
2761                 unsigned int c;
2762                 int bits;
2763
2764                 c = htmlParseCharRef(ctxt);
2765                 if      (c <    0x80)
2766                         { *out++  = c;                bits= -6; }
2767                 else if (c <   0x800)
2768                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2769                 else if (c < 0x10000)
2770                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2771                 else
2772                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2773
2774                 for ( ; bits >= 0; bits-= 6) {
2775                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2776                 }
2777
2778                 if (out - buffer > buffer_size - 100) {
2779                         int indx = out - buffer;
2780
2781                         growBuffer(buffer);
2782                         out = &buffer[indx];
2783                 }
2784             } else {
2785                 ent = htmlParseEntityRef(ctxt, &name);
2786                 if (name == NULL) {
2787                     *out++ = '&';
2788                     if (out - buffer > buffer_size - 100) {
2789                         int indx = out - buffer;
2790
2791                         growBuffer(buffer);
2792                         out = &buffer[indx];
2793                     }
2794                 } else if (ent == NULL) {
2795                     *out++ = '&';
2796                     cur = name;
2797                     while (*cur != 0) {
2798                         if (out - buffer > buffer_size - 100) {
2799                             int indx = out - buffer;
2800
2801                             growBuffer(buffer);
2802                             out = &buffer[indx];
2803                         }
2804                         *out++ = *cur++;
2805                     }
2806                 } else {
2807                     unsigned int c;
2808                     int bits;
2809
2810                     if (out - buffer > buffer_size - 100) {
2811                         int indx = out - buffer;
2812
2813                         growBuffer(buffer);
2814                         out = &buffer[indx];
2815                     }
2816                     c = ent->value;
2817                     if      (c <    0x80)
2818                         { *out++  = c;                bits= -6; }
2819                     else if (c <   0x800)
2820                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2821                     else if (c < 0x10000)
2822                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2823                     else
2824                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2825
2826                     for ( ; bits >= 0; bits-= 6) {
2827                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2828                     }
2829                 }
2830             }
2831         } else {
2832             unsigned int c;
2833             int bits, l;
2834
2835             if (out - buffer > buffer_size - 100) {
2836                 int indx = out - buffer;
2837
2838                 growBuffer(buffer);
2839                 out = &buffer[indx];
2840             }
2841             c = CUR_CHAR(l);
2842             if      (c <    0x80)
2843                     { *out++  = c;                bits= -6; }
2844             else if (c <   0x800)
2845                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2846             else if (c < 0x10000)
2847                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2848             else
2849                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2850
2851             for ( ; bits >= 0; bits-= 6) {
2852                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2853             }
2854             NEXT;
2855         }
2856     }
2857     *out = 0;
2858     return(buffer);
2859 }
2860
2861 /**
2862  * htmlParseEntityRef:
2863  * @ctxt:  an HTML parser context
2864  * @str:  location to store the entity name
2865  *
2866  * parse an HTML ENTITY references
2867  *
2868  * [68] EntityRef ::= '&' Name ';'
2869  *
2870  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2871  *         if non-NULL *str will have to be freed by the caller.
2872  */
2873 const htmlEntityDesc *
2874 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2875     const xmlChar *name;
2876     const htmlEntityDesc * ent = NULL;
2877
2878     if (str != NULL) *str = NULL;
2879     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2880
2881     if (CUR == '&') {
2882         NEXT;
2883         name = htmlParseName(ctxt);
2884         if (name == NULL) {
2885             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2886                          "htmlParseEntityRef: no name\n", NULL, NULL);
2887         } else {
2888             GROW;
2889             if (CUR == ';') {
2890                 if (str != NULL)
2891                     *str = name;
2892
2893                 /*
2894                  * Lookup the entity in the table.
2895                  */
2896                 ent = htmlEntityLookup(name);
2897                 if (ent != NULL) /* OK that's ugly !!! */
2898                     NEXT;
2899             } else {
2900                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2901                              "htmlParseEntityRef: expecting ';'\n",
2902                              NULL, NULL);
2903                 if (str != NULL)
2904                     *str = name;
2905             }
2906         }
2907     }
2908     return(ent);
2909 }
2910
2911 /**
2912  * htmlParseAttValue:
2913  * @ctxt:  an HTML parser context
2914  *
2915  * parse a value for an attribute
2916  * Note: the parser won't do substitution of entities here, this
2917  * will be handled later in xmlStringGetNodeList, unless it was
2918  * asked for ctxt->replaceEntities != 0
2919  *
2920  * Returns the AttValue parsed or NULL.
2921  */
2922
2923 static xmlChar *
2924 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2925     xmlChar *ret = NULL;
2926
2927     if (CUR == '"') {
2928         NEXT;
2929         ret = htmlParseHTMLAttribute(ctxt, '"');
2930         if (CUR != '"') {
2931             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2932                          "AttValue: \" expected\n", NULL, NULL);
2933         } else
2934             NEXT;
2935     } else if (CUR == '\'') {
2936         NEXT;
2937         ret = htmlParseHTMLAttribute(ctxt, '\'');
2938         if (CUR != '\'') {
2939             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2940                          "AttValue: ' expected\n", NULL, NULL);
2941         } else
2942             NEXT;
2943     } else {
2944         /*
2945          * That's an HTMLism, the attribute value may not be quoted
2946          */
2947         ret = htmlParseHTMLAttribute(ctxt, 0);
2948         if (ret == NULL) {
2949             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2950                          "AttValue: no value found\n", NULL, NULL);
2951         }
2952     }
2953     return(ret);
2954 }
2955
2956 /**
2957  * htmlParseSystemLiteral:
2958  * @ctxt:  an HTML parser context
2959  *
2960  * parse an HTML Literal
2961  *
2962  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2963  *
2964  * Returns the SystemLiteral parsed or NULL
2965  */
2966
2967 static xmlChar *
2968 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2969     size_t len = 0, startPosition = 0;
2970     int err = 0;
2971     int quote;
2972     xmlChar *ret = NULL;
2973
2974     if ((CUR != '"') && (CUR != '\'')) {
2975         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2976                      "SystemLiteral \" or ' expected\n", NULL, NULL);
2977         return(NULL);
2978     }
2979     quote = CUR;
2980     NEXT;
2981
2982     if (CUR_PTR < BASE_PTR)
2983         return(ret);
2984     startPosition = CUR_PTR - BASE_PTR;
2985
2986     while ((CUR != 0) && (CUR != quote)) {
2987         /* TODO: Handle UTF-8 */
2988         if (!IS_CHAR_CH(CUR)) {
2989             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2990                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2991             err = 1;
2992         }
2993         NEXT;
2994         len++;
2995     }
2996     if (CUR != quote) {
2997         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2998                      "Unfinished SystemLiteral\n", NULL, NULL);
2999     } else {
3000         NEXT;
3001         if (err == 0)
3002             ret = xmlStrndup((BASE_PTR+startPosition), len);
3003     }
3004
3005     return(ret);
3006 }
3007
3008 /**
3009  * htmlParsePubidLiteral:
3010  * @ctxt:  an HTML parser context
3011  *
3012  * parse an HTML public literal
3013  *
3014  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3015  *
3016  * Returns the PubidLiteral parsed or NULL.
3017  */
3018
3019 static xmlChar *
3020 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3021     size_t len = 0, startPosition = 0;
3022     int err = 0;
3023     int quote;
3024     xmlChar *ret = NULL;
3025
3026     if ((CUR != '"') && (CUR != '\'')) {
3027         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3028                      "PubidLiteral \" or ' expected\n", NULL, NULL);
3029         return(NULL);
3030     }
3031     quote = CUR;
3032     NEXT;
3033
3034     /*
3035      * Name ::= (Letter | '_') (NameChar)*
3036      */
3037     if (CUR_PTR < BASE_PTR)
3038         return(ret);
3039     startPosition = CUR_PTR - BASE_PTR;
3040
3041     while ((CUR != 0) && (CUR != quote)) {
3042         if (!IS_PUBIDCHAR_CH(CUR)) {
3043             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3044                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3045             err = 1;
3046         }
3047         len++;
3048         NEXT;
3049     }
3050
3051     if (CUR != quote) {
3052         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3053                      "Unfinished PubidLiteral\n", NULL, NULL);
3054     } else {
3055         NEXT;
3056         if (err == 0)
3057             ret = xmlStrndup((BASE_PTR + startPosition), len);
3058     }
3059
3060     return(ret);
3061 }
3062
3063 /**
3064  * htmlParseScript:
3065  * @ctxt:  an HTML parser context
3066  *
3067  * parse the content of an HTML SCRIPT or STYLE element
3068  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3069  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3070  * http://www.w3.org/TR/html4/types.html#type-script
3071  * http://www.w3.org/TR/html4/types.html#h-6.15
3072  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3073  *
3074  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3075  * element and the value of intrinsic event attributes. User agents must
3076  * not evaluate script data as HTML markup but instead must pass it on as
3077  * data to a script engine.
3078  * NOTES:
3079  * - The content is passed like CDATA
3080  * - the attributes for style and scripting "onXXX" are also described
3081  *   as CDATA but SGML allows entities references in attributes so their
3082  *   processing is identical as other attributes
3083  */
3084 static void
3085 htmlParseScript(htmlParserCtxtPtr ctxt) {
3086     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3087     int nbchar = 0;
3088     int cur,l;
3089
3090     SHRINK;
3091     cur = CUR_CHAR(l);
3092     while (cur != 0) {
3093         if ((cur == '<') && (NXT(1) == '/')) {
3094             /*
3095              * One should break here, the specification is clear:
3096              * Authors should therefore escape "</" within the content.
3097              * Escape mechanisms are specific to each scripting or
3098              * style sheet language.
3099              *
3100              * In recovery mode, only break if end tag match the
3101              * current tag, effectively ignoring all tags inside the
3102              * script/style block and treating the entire block as
3103              * CDATA.
3104              */
3105             if (ctxt->recovery) {
3106                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3107                                    xmlStrlen(ctxt->name)) == 0)
3108                 {
3109                     break; /* while */
3110                 } else {
3111                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3112                                  "Element %s embeds close tag\n",
3113                                  ctxt->name, NULL);
3114                 }
3115             } else {
3116                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3117                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3118                 {
3119                     break; /* while */
3120                 }
3121             }
3122         }
3123         if (IS_CHAR(cur)) {
3124             COPY_BUF(l,buf,nbchar,cur);
3125         } else {
3126             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3127                             "Invalid char in CDATA 0x%X\n", cur);
3128         }
3129         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3130             buf[nbchar] = 0;
3131             if (ctxt->sax->cdataBlock!= NULL) {
3132                 /*
3133                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3134                  */
3135                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3136             } else if (ctxt->sax->characters != NULL) {
3137                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3138             }
3139             nbchar = 0;
3140         }
3141         GROW;
3142         NEXTL(l);
3143         cur = CUR_CHAR(l);
3144     }
3145
3146     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3147         buf[nbchar] = 0;
3148         if (ctxt->sax->cdataBlock!= NULL) {
3149             /*
3150              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3151              */
3152             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3153         } else if (ctxt->sax->characters != NULL) {
3154             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3155         }
3156     }
3157 }
3158
3159
3160 /**
3161  * htmlParseCharDataInternal:
3162  * @ctxt:  an HTML parser context
3163  * @readahead: optional read ahead character in ascii range
3164  *
3165  * parse a CharData section.
3166  * if we are within a CDATA section ']]>' marks an end of section.
3167  *
3168  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3169  */
3170
3171 static void
3172 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3173     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3174     int nbchar = 0;
3175     int cur, l;
3176     int chunk = 0;
3177
3178     if (readahead)
3179         buf[nbchar++] = readahead;
3180
3181     SHRINK;
3182     cur = CUR_CHAR(l);
3183     while (((cur != '<') || (ctxt->token == '<')) &&
3184            ((cur != '&') || (ctxt->token == '&')) &&
3185            (cur != 0)) {
3186         if (!(IS_CHAR(cur))) {
3187             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3188                         "Invalid char in CDATA 0x%X\n", cur);
3189         } else {
3190             COPY_BUF(l,buf,nbchar,cur);
3191         }
3192         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3193             buf[nbchar] = 0;
3194
3195             /*
3196              * Ok the segment is to be consumed as chars.
3197              */
3198             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3199                 if (areBlanks(ctxt, buf, nbchar)) {
3200                     if (ctxt->keepBlanks) {
3201                         if (ctxt->sax->characters != NULL)
3202                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3203                     } else {
3204                         if (ctxt->sax->ignorableWhitespace != NULL)
3205                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3206                                                            buf, nbchar);
3207                     }
3208                 } else {
3209                     htmlCheckParagraph(ctxt);
3210                     if (ctxt->sax->characters != NULL)
3211                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3212                 }
3213             }
3214             nbchar = 0;
3215         }
3216         NEXTL(l);
3217         chunk++;
3218         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3219             chunk = 0;
3220             SHRINK;
3221             GROW;
3222         }
3223         cur = CUR_CHAR(l);
3224         if (cur == 0) {
3225             SHRINK;
3226             GROW;
3227             cur = CUR_CHAR(l);
3228         }
3229     }
3230     if (nbchar != 0) {
3231         buf[nbchar] = 0;
3232
3233         /*
3234          * Ok the segment is to be consumed as chars.
3235          */
3236         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3237             if (areBlanks(ctxt, buf, nbchar)) {
3238                 if (ctxt->keepBlanks) {
3239                     if (ctxt->sax->characters != NULL)
3240                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3241                 } else {
3242                     if (ctxt->sax->ignorableWhitespace != NULL)
3243                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3244                                                        buf, nbchar);
3245                 }
3246             } else {
3247                 htmlCheckParagraph(ctxt);
3248                 if (ctxt->sax->characters != NULL)
3249                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3250             }
3251         }
3252     } else {
3253         /*
3254          * Loop detection
3255          */
3256         if (cur == 0)
3257             ctxt->instate = XML_PARSER_EOF;
3258     }
3259 }
3260
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * Parse a CharData section. This is a thin wrapper which forwards to
 * htmlParseCharDataInternal() with a readahead value of 0, i.e. without
 * any read ahead character.
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 */

static void
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    htmlParseCharDataInternal(ctxt, 0);
}
3275
3276 /**
3277  * htmlParseExternalID:
3278  * @ctxt:  an HTML parser context
3279  * @publicID:  a xmlChar** receiving PubidLiteral
3280  *
3281  * Parse an External ID or a Public ID
3282  *
3283  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3284  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3285  *
3286  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3287  *
3288  * Returns the function returns SystemLiteral and in the second
3289  *                case publicID receives PubidLiteral, is strict is off
3290  *                it is possible to return NULL and have publicID set.
3291  */
3292
3293 static xmlChar *
3294 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3295     xmlChar *URI = NULL;
3296
3297     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3298          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3299          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3300         SKIP(6);
3301         if (!IS_BLANK_CH(CUR)) {
3302             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3303                          "Space required after 'SYSTEM'\n", NULL, NULL);
3304         }
3305         SKIP_BLANKS;
3306         URI = htmlParseSystemLiteral(ctxt);
3307         if (URI == NULL) {
3308             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3309                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3310         }
3311     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3312                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3313                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3314         SKIP(6);
3315         if (!IS_BLANK_CH(CUR)) {
3316             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3317                          "Space required after 'PUBLIC'\n", NULL, NULL);
3318         }
3319         SKIP_BLANKS;
3320         *publicID = htmlParsePubidLiteral(ctxt);
3321         if (*publicID == NULL) {
3322             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3323                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3324                          NULL, NULL);
3325         }
3326         SKIP_BLANKS;
3327         if ((CUR == '"') || (CUR == '\'')) {
3328             URI = htmlParseSystemLiteral(ctxt);
3329         }
3330     }
3331     return(URI);
3332 }
3333
3334 /**
3335  * xmlParsePI:
3336  * @ctxt:  an XML parser context
3337  *
3338  * parse an XML Processing Instruction.
3339  *
3340  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3341  */
3342 static void
3343 htmlParsePI(htmlParserCtxtPtr ctxt) {
3344     xmlChar *buf = NULL;
3345     int len = 0;
3346     int size = HTML_PARSER_BUFFER_SIZE;
3347     int cur, l;
3348     const xmlChar *target;
3349     xmlParserInputState state;
3350     int count = 0;
3351
3352     if ((RAW == '<') && (NXT(1) == '?')) {
3353         state = ctxt->instate;
3354         ctxt->instate = XML_PARSER_PI;
3355         /*
3356          * this is a Processing Instruction.
3357          */
3358         SKIP(2);
3359         SHRINK;
3360
3361         /*
3362          * Parse the target name and check for special support like
3363          * namespace.
3364          */
3365         target = htmlParseName(ctxt);
3366         if (target != NULL) {
3367             if (RAW == '>') {
3368                 SKIP(1);
3369
3370                 /*
3371                  * SAX: PI detected.
3372                  */
3373                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3374                     (ctxt->sax->processingInstruction != NULL))
3375                     ctxt->sax->processingInstruction(ctxt->userData,
3376                                                      target, NULL);
3377                 ctxt->instate = state;
3378                 return;
3379             }
3380             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3381             if (buf == NULL) {
3382                 htmlErrMemory(ctxt, NULL);
3383                 ctxt->instate = state;
3384                 return;
3385             }
3386             cur = CUR;
3387             if (!IS_BLANK(cur)) {
3388                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3389                           "ParsePI: PI %s space expected\n", target, NULL);
3390             }
3391             SKIP_BLANKS;
3392             cur = CUR_CHAR(l);
3393             while ((cur != 0) && (cur != '>')) {
3394                 if (len + 5 >= size) {
3395                     xmlChar *tmp;
3396
3397                     size *= 2;
3398                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3399                     if (tmp == NULL) {
3400                         htmlErrMemory(ctxt, NULL);
3401                         xmlFree(buf);
3402                         ctxt->instate = state;
3403                         return;
3404                     }
3405                     buf = tmp;
3406                 }
3407                 count++;
3408                 if (count > 50) {
3409                     GROW;
3410                     count = 0;
3411                 }
3412                 if (IS_CHAR(cur)) {
3413                     COPY_BUF(l,buf,len,cur);
3414                 } else {
3415                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3416                                     "Invalid char in processing instruction "
3417                                     "0x%X\n", cur);
3418                 }
3419                 NEXTL(l);
3420                 cur = CUR_CHAR(l);
3421                 if (cur == 0) {
3422                     SHRINK;
3423                     GROW;
3424                     cur = CUR_CHAR(l);
3425                 }
3426             }
3427             buf[len] = 0;
3428             if (cur != '>') {
3429                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3430                       "ParsePI: PI %s never end ...\n", target, NULL);
3431             } else {
3432                 SKIP(1);
3433
3434                 /*
3435                  * SAX: PI detected.
3436                  */
3437                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3438                     (ctxt->sax->processingInstruction != NULL))
3439                     ctxt->sax->processingInstruction(ctxt->userData,
3440                                                      target, buf);
3441             }
3442             xmlFree(buf);
3443         } else {
3444             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3445                          "PI is not started correctly", NULL, NULL);
3446         }
3447         ctxt->instate = state;
3448     }
3449 }
3450
3451 /**
3452  * htmlParseComment:
3453  * @ctxt:  an HTML parser context
3454  *
3455  * Parse an XML (SGML) comment <!-- .... -->
3456  *
3457  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3458  */
3459 static void
3460 htmlParseComment(htmlParserCtxtPtr ctxt) {
3461     xmlChar *buf = NULL;
3462     int len;
3463     int size = HTML_PARSER_BUFFER_SIZE;
3464     int q, ql;
3465     int r, rl;
3466     int cur, l;
3467     int next, nl;
3468     xmlParserInputState state;
3469
3470     /*
3471      * Check that there is a comment right here.
3472      */
3473     if ((RAW != '<') || (NXT(1) != '!') ||
3474         (NXT(2) != '-') || (NXT(3) != '-')) return;
3475
3476     state = ctxt->instate;
3477     ctxt->instate = XML_PARSER_COMMENT;
3478     SHRINK;
3479     SKIP(4);
3480     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3481     if (buf == NULL) {
3482         htmlErrMemory(ctxt, "buffer allocation failed\n");
3483         ctxt->instate = state;
3484         return;
3485     }
3486     len = 0;
3487     buf[len] = 0;
3488     q = CUR_CHAR(ql);
3489     if (q == 0)
3490         goto unfinished;
3491     NEXTL(ql);
3492     r = CUR_CHAR(rl);
3493     if (r == 0)
3494         goto unfinished;
3495     NEXTL(rl);
3496     cur = CUR_CHAR(l);
3497     while ((cur != 0) &&
3498            ((cur != '>') ||
3499             (r != '-') || (q != '-'))) {
3500         NEXTL(l);
3501         next = CUR_CHAR(nl);
3502         if (next == 0) {
3503             SHRINK;
3504             GROW;
3505             next = CUR_CHAR(nl);
3506         }
3507
3508         if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3509           htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3510                        "Comment incorrectly closed by '--!>'", NULL, NULL);
3511           cur = '>';
3512           break;
3513         }
3514
3515         if (len + 5 >= size) {
3516             xmlChar *tmp;
3517
3518             size *= 2;
3519             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3520             if (tmp == NULL) {
3521                 xmlFree(buf);
3522                 htmlErrMemory(ctxt, "growing buffer failed\n");
3523                 ctxt->instate = state;
3524                 return;
3525             }
3526             buf = tmp;
3527         }
3528         if (IS_CHAR(q)) {
3529             COPY_BUF(ql,buf,len,q);
3530         } else {
3531             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3532                             "Invalid char in comment 0x%X\n", q);
3533         }
3534
3535         q = r;
3536         ql = rl;
3537         r = cur;
3538         rl = l;
3539         cur = next;
3540         l = nl;
3541     }
3542     buf[len] = 0;
3543     if (cur == '>') {
3544         NEXT;
3545         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3546             (!ctxt->disableSAX))
3547             ctxt->sax->comment(ctxt->userData, buf);
3548         xmlFree(buf);
3549         ctxt->instate = state;
3550         return;
3551     }
3552
3553 unfinished:
3554     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3555                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3556     xmlFree(buf);
3557 }
3558
3559 /**
3560  * htmlParseCharRef:
3561  * @ctxt:  an HTML parser context
3562  *
3563  * parse Reference declarations
3564  *
3565  * [66] CharRef ::= '&#' [0-9]+ ';' |
3566  *                  '&#x' [0-9a-fA-F]+ ';'
3567  *
3568  * Returns the value parsed (as an int)
3569  */
3570 int
3571 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3572     int val = 0;
3573
3574     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3575         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3576                      "htmlParseCharRef: context error\n",
3577                      NULL, NULL);
3578         return(0);
3579     }
3580     if ((CUR == '&') && (NXT(1) == '#') &&
3581         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3582         SKIP(3);
3583         while (CUR != ';') {
3584             if ((CUR >= '0') && (CUR <= '9')) {
3585                 if (val < 0x110000)
3586                     val = val * 16 + (CUR - '0');
3587             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3588                 if (val < 0x110000)
3589                     val = val * 16 + (CUR - 'a') + 10;
3590             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3591                 if (val < 0x110000)
3592                     val = val * 16 + (CUR - 'A') + 10;
3593             } else {
3594                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3595                              "htmlParseCharRef: missing semicolon\n",
3596                              NULL, NULL);
3597                 break;
3598             }
3599             NEXT;
3600         }
3601         if (CUR == ';')
3602             NEXT;
3603     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3604         SKIP(2);
3605         while (CUR != ';') {
3606             if ((CUR >= '0') && (CUR <= '9')) {
3607                 if (val < 0x110000)
3608                     val = val * 10 + (CUR - '0');
3609             } else {
3610                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3611                              "htmlParseCharRef: missing semicolon\n",
3612                              NULL, NULL);
3613                 break;
3614             }
3615             NEXT;
3616         }
3617         if (CUR == ';')
3618             NEXT;
3619     } else {
3620         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3621                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3622     }
3623     /*
3624      * Check the value IS_CHAR ...
3625      */
3626     if (IS_CHAR(val)) {
3627         return(val);
3628     } else if (val >= 0x110000) {
3629         htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3630                      "htmlParseCharRef: value too large\n", NULL, NULL);
3631     } else {
3632         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3633                         "htmlParseCharRef: invalid xmlChar value %d\n",
3634                         val);
3635     }
3636     return(0);
3637 }
3638
3639
3640 /**
3641  * htmlParseDocTypeDecl:
3642  * @ctxt:  an HTML parser context
3643  *
3644  * parse a DOCTYPE declaration
3645  *
3646  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3647  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3648  */
3649
3650 static void
3651 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3652     const xmlChar *name;
3653     xmlChar *ExternalID = NULL;
3654     xmlChar *URI = NULL;
3655
3656     /*
3657      * We know that '<!DOCTYPE' has been detected.
3658      */
3659     SKIP(9);
3660
3661     SKIP_BLANKS;
3662
3663     /*
3664      * Parse the DOCTYPE name.
3665      */
3666     name = htmlParseName(ctxt);
3667     if (name == NULL) {
3668         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3669                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3670                      NULL, NULL);
3671     }
3672     /*
3673      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3674      */
3675
3676     SKIP_BLANKS;
3677
3678     /*
3679      * Check for SystemID and ExternalID
3680      */
3681     URI = htmlParseExternalID(ctxt, &ExternalID);
3682     SKIP_BLANKS;
3683
3684     /*
3685      * We should be at the end of the DOCTYPE declaration.
3686      */
3687     if (CUR != '>') {
3688         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3689                      "DOCTYPE improperly terminated\n", NULL, NULL);
3690         /* Ignore bogus content */
3691         while ((CUR != 0) && (CUR != '>'))
3692             NEXT;
3693     }
3694     if (CUR == '>')
3695         NEXT;
3696
3697     /*
3698      * Create or update the document accordingly to the DOCTYPE
3699      */
3700     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3701         (!ctxt->disableSAX))
3702         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3703
3704     /*
3705      * Cleanup, since we don't use all those identifiers
3706      */
3707     if (URI != NULL) xmlFree(URI);
3708     if (ExternalID != NULL) xmlFree(ExternalID);
3709 }
3710
3711 /**
3712  * htmlParseAttribute:
3713  * @ctxt:  an HTML parser context
3714  * @value:  a xmlChar ** used to store the value of the attribute
3715  *
3716  * parse an attribute
3717  *
3718  * [41] Attribute ::= Name Eq AttValue
3719  *
3720  * [25] Eq ::= S? '=' S?
3721  *
3722  * With namespace:
3723  *
3724  * [NS 11] Attribute ::= QName Eq AttValue
3725  *
3726  * Also the case QName == xmlns:??? is handled independently as a namespace
3727  * definition.
3728  *
3729  * Returns the attribute name, and the value in *value.
3730  */
3731
3732 static const xmlChar *
3733 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3734     const xmlChar *name;
3735     xmlChar *val = NULL;
3736
3737     *value = NULL;
3738     name = htmlParseHTMLName(ctxt);
3739     if (name == NULL) {
3740         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3741                      "error parsing attribute name\n", NULL, NULL);
3742         return(NULL);
3743     }
3744
3745     /*
3746      * read the value
3747      */
3748     SKIP_BLANKS;
3749     if (CUR == '=') {
3750         NEXT;
3751         SKIP_BLANKS;
3752         val = htmlParseAttValue(ctxt);
3753     }
3754
3755     *value = val;
3756     return(name);
3757 }
3758
3759 /**
3760  * htmlCheckEncodingDirect:
3761  * @ctxt:  an HTML parser context
3762  * @attvalue: the attribute value
3763  *
3764  * Checks an attribute value to detect
3765  * the encoding
3766  * If a new encoding is detected the parser is switched to decode
3767  * it and pass UTF8
3768  */
3769 static void
3770 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3771
3772     if ((ctxt == NULL) || (encoding == NULL) ||
3773         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3774         return;
3775
3776     /* do not change encoding */
3777     if (ctxt->input->encoding != NULL)
3778         return;
3779
3780     if (encoding != NULL) {
3781         xmlCharEncoding enc;
3782         xmlCharEncodingHandlerPtr handler;
3783
3784         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3785
3786         if (ctxt->input->encoding != NULL)
3787             xmlFree((xmlChar *) ctxt->input->encoding);
3788         ctxt->input->encoding = xmlStrdup(encoding);
3789
3790         enc = xmlParseCharEncoding((const char *) encoding);
3791         /*
3792          * registered set of known encodings
3793          */
3794         if (enc != XML_CHAR_ENCODING_ERROR) {
3795             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3796                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3797                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3798                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3799                 (ctxt->input->buf != NULL) &&
3800                 (ctxt->input->buf->encoder == NULL)) {
3801                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3802                              "htmlCheckEncoding: wrong encoding meta\n",
3803                              NULL, NULL);
3804             } else {
3805                 xmlSwitchEncoding(ctxt, enc);
3806             }
3807             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3808         } else {
3809             /*
3810              * fallback for unknown encodings
3811              */
3812             handler = xmlFindCharEncodingHandler((const char *) encoding);
3813             if (handler != NULL) {
3814                 xmlSwitchToEncoding(ctxt, handler);
3815                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3816             } else {
3817                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3818                              "htmlCheckEncoding: unknown encoding %s\n",
3819                              encoding, NULL);
3820             }
3821         }
3822
3823         if ((ctxt->input->buf != NULL) &&
3824             (ctxt->input->buf->encoder != NULL) &&
3825             (ctxt->input->buf->raw != NULL) &&
3826             (ctxt->input->buf->buffer != NULL)) {
3827             int nbchars;
3828             int processed;
3829
3830             /*
3831              * convert as much as possible to the parser reading buffer.
3832              */
3833             processed = ctxt->input->cur - ctxt->input->base;
3834             xmlBufShrink(ctxt->input->buf->buffer, processed);
3835             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3836             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3837             if (nbchars < 0) {
3838                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3839                              "htmlCheckEncoding: encoder error\n",
3840                              NULL, NULL);
3841             }
3842         }
3843     }
3844 }
3845
3846 /**
3847  * htmlCheckEncoding:
3848  * @ctxt:  an HTML parser context
3849  * @attvalue: the attribute value
3850  *
3851  * Checks an http-equiv attribute from a Meta tag to detect
3852  * the encoding
3853  * If a new encoding is detected the parser is switched to decode
3854  * it and pass UTF8
3855  */
3856 static void
3857 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3858     const xmlChar *encoding;
3859
3860     if (!attvalue)
3861         return;
3862
3863     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3864     if (encoding != NULL) {
3865         encoding += 7;
3866     }
3867     /*
3868      * skip blank
3869      */
3870     if (encoding && IS_BLANK_CH(*encoding))
3871         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3872     if (encoding && *encoding == '=') {
3873         encoding ++;
3874         htmlCheckEncodingDirect(ctxt, encoding);
3875     }
3876 }
3877
3878 /**
3879  * htmlCheckMeta:
3880  * @ctxt:  an HTML parser context
3881  * @atts:  the attributes values
3882  *
3883  * Checks an attributes from a Meta tag
3884  */
3885 static void
3886 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3887     int i;
3888     const xmlChar *att, *value;
3889     int http = 0;
3890     const xmlChar *content = NULL;
3891
3892     if ((ctxt == NULL) || (atts == NULL))
3893         return;
3894
3895     i = 0;
3896     att = atts[i++];
3897     while (att != NULL) {
3898         value = atts[i++];
3899         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3900          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3901             http = 1;
3902         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3903             htmlCheckEncodingDirect(ctxt, value);
3904         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3905             content = value;
3906         att = atts[i++];
3907     }
3908     if ((http) && (content != NULL))
3909         htmlCheckEncoding(ctxt, content);
3910
3911 }
3912
3913 /**
3914  * htmlParseStartTag:
3915  * @ctxt:  an HTML parser context
3916  *
3917  * parse a start of tag either for rule element or
3918  * EmptyElement. In both case we don't parse the tag closing chars.
3919  *
3920  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3921  *
3922  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3923  *
3924  * With namespace:
3925  *
3926  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3927  *
3928  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3929  *
3930  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3931  */
3932
3933 static int
3934 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3935     const xmlChar *name;
3936     const xmlChar *attname;
3937     xmlChar *attvalue;
3938     const xmlChar **atts;
3939     int nbatts = 0;
3940     int maxatts;
3941     int meta = 0;
3942     int i;
3943     int discardtag = 0;
3944
3945     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3946         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3947                      "htmlParseStartTag: context error\n", NULL, NULL);
3948         return -1;
3949     }
3950     if (ctxt->instate == XML_PARSER_EOF)
3951         return(-1);
3952     if (CUR != '<') return -1;
3953     NEXT;
3954
3955     atts = ctxt->atts;
3956     maxatts = ctxt->maxatts;
3957
3958     GROW;
3959     name = htmlParseHTMLName(ctxt);
3960     if (name == NULL) {
3961         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3962                      "htmlParseStartTag: invalid element name\n",
3963                      NULL, NULL);
3964         /* Dump the bogus tag like browsers do */
3965         while ((CUR != 0) && (CUR != '>') &&
3966                (ctxt->instate != XML_PARSER_EOF))
3967             NEXT;
3968         return -1;
3969     }
3970     if (xmlStrEqual(name, BAD_CAST"meta"))
3971         meta = 1;
3972
3973     /*
3974      * Check for auto-closure of HTML elements.
3975      */
3976     htmlAutoClose(ctxt, name);
3977
3978     /*
3979      * Check for implied HTML elements.
3980      */
3981     htmlCheckImplied(ctxt, name);
3982
3983     /*
3984      * Avoid html at any level > 0, head at any level != 1
3985      * or any attempt to recurse body
3986      */
3987     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3988         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3989                      "htmlParseStartTag: misplaced <html> tag\n",
3990                      name, NULL);
3991         discardtag = 1;
3992         ctxt->depth++;
3993     }
3994     if ((ctxt->nameNr != 1) &&
3995         (xmlStrEqual(name, BAD_CAST"head"))) {
3996         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3997                      "htmlParseStartTag: misplaced <head> tag\n",
3998                      name, NULL);
3999         discardtag = 1;
4000         ctxt->depth++;
4001     }
4002     if (xmlStrEqual(name, BAD_CAST"body")) {
4003         int indx;
4004         for (indx = 0;indx < ctxt->nameNr;indx++) {
4005             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4006                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4007                              "htmlParseStartTag: misplaced <body> tag\n",
4008                              name, NULL);
4009                 discardtag = 1;
4010                 ctxt->depth++;
4011             }
4012         }
4013     }
4014
4015     /*
4016      * Now parse the attributes, it ends up with the ending
4017      *
4018      * (S Attribute)* S?
4019      */
4020     SKIP_BLANKS;
4021     while ((CUR != 0) &&
4022            (CUR != '>') &&
4023            ((CUR != '/') || (NXT(1) != '>'))) {
4024         GROW;
4025         attname = htmlParseAttribute(ctxt, &attvalue);
4026         if (attname != NULL) {
4027
4028             /*
4029              * Well formedness requires at most one declaration of an attribute
4030              */
4031             for (i = 0; i < nbatts;i += 2) {
4032                 if (xmlStrEqual(atts[i], attname)) {
4033                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4034                                  "Attribute %s redefined\n", attname, NULL);
4035                     if (attvalue != NULL)
4036                         xmlFree(attvalue);
4037                     goto failed;
4038                 }
4039             }
4040
4041             /*
4042              * Add the pair to atts
4043              */
4044             if (atts == NULL) {
4045                 maxatts = 22; /* allow for 10 attrs by default */
4046                 atts = (const xmlChar **)
4047                        xmlMalloc(maxatts * sizeof(xmlChar *));
4048                 if (atts == NULL) {
4049                     htmlErrMemory(ctxt, NULL);
4050                     if (attvalue != NULL)
4051                         xmlFree(attvalue);
4052                     goto failed;
4053                 }
4054                 ctxt->atts = atts;
4055                 ctxt->maxatts = maxatts;
4056             } else if (nbatts + 4 > maxatts) {
4057                 const xmlChar **n;
4058
4059                 maxatts *= 2;
4060                 n = (const xmlChar **) xmlRealloc((void *) atts,
4061                                              maxatts * sizeof(const xmlChar *));
4062                 if (n == NULL) {
4063                     htmlErrMemory(ctxt, NULL);
4064                     if (attvalue != NULL)
4065                         xmlFree(attvalue);
4066                     goto failed;
4067                 }
4068                 atts = n;
4069                 ctxt->atts = atts;
4070                 ctxt->maxatts = maxatts;
4071             }
4072             atts[nbatts++] = attname;
4073             atts[nbatts++] = attvalue;
4074             atts[nbatts] = NULL;
4075             atts[nbatts + 1] = NULL;
4076         }
4077         else {
4078             if (attvalue != NULL)
4079                 xmlFree(attvalue);
4080             /* Dump the bogus attribute string up to the next blank or
4081              * the end of the tag. */
4082             while ((CUR != 0) &&
4083                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4084                    ((CUR != '/') || (NXT(1) != '>')))
4085                 NEXT;
4086         }
4087
4088 failed:
4089         SKIP_BLANKS;
4090     }
4091
4092     /*
4093      * Handle specific association to the META tag
4094      */
4095     if (meta && (nbatts != 0))
4096         htmlCheckMeta(ctxt, atts);
4097
4098     /*
4099      * SAX: Start of Element !
4100      */
4101     if (!discardtag) {
4102         htmlnamePush(ctxt, name);
4103         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4104             if (nbatts != 0)
4105                 ctxt->sax->startElement(ctxt->userData, name, atts);
4106             else
4107                 ctxt->sax->startElement(ctxt->userData, name, NULL);
4108         }
4109     }
4110
4111     if (atts != NULL) {
4112         for (i = 1;i < nbatts;i += 2) {
4113             if (atts[i] != NULL)
4114                 xmlFree((xmlChar *) atts[i]);
4115         }
4116     }
4117
4118     return(discardtag);
4119 }
4120
4121 /**
4122  * htmlParseEndTag:
4123  * @ctxt:  an HTML parser context
4124  *
4125  * parse an end of tag
4126  *
4127  * [42] ETag ::= '</' Name S? '>'
4128  *
4129  * With namespace
4130  *
4131  * [NS 9] ETag ::= '</' QName S? '>'
4132  *
4133  * Returns 1 if the current level should be closed.
4134  */
4135
4136 static int
4137 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4138 {
4139     const xmlChar *name;
4140     const xmlChar *oldname;
4141     int i, ret;
4142
4143     if ((CUR != '<') || (NXT(1) != '/')) {
4144         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4145                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
4146         return (0);
4147     }
4148     SKIP(2);
4149
4150     name = htmlParseHTMLName(ctxt);
4151     if (name == NULL)
4152         return (0);
4153     /*
4154      * We should definitely be at the ending "S? '>'" part
4155      */
4156     SKIP_BLANKS;
4157     if (CUR != '>') {
4158         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4159                      "End tag : expected '>'\n", NULL, NULL);
4160         /* Skip to next '>' */
4161         while ((CUR != 0) && (CUR != '>'))
4162             NEXT;
4163     }
4164     if (CUR == '>')
4165         NEXT;
4166
4167     /*
4168      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4169      * out now.
4170      */
4171     if ((ctxt->depth > 0) &&
4172         (xmlStrEqual(name, BAD_CAST "html") ||
4173          xmlStrEqual(name, BAD_CAST "body") ||
4174          xmlStrEqual(name, BAD_CAST "head"))) {
4175         ctxt->depth--;
4176         return (0);
4177     }
4178
4179     /*
4180      * If the name read is not one of the element in the parsing stack
4181      * then return, it's just an error.
4182      */
4183     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4184         if (xmlStrEqual(name, ctxt->nameTab[i]))
4185             break;
4186     }
4187     if (i < 0) {
4188         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4189                      "Unexpected end tag : %s\n", name, NULL);
4190         return (0);
4191     }
4192
4193
4194     /*
4195      * Check for auto-closure of HTML elements.
4196      */
4197
4198     htmlAutoCloseOnClose(ctxt, name);
4199
4200     /*
4201      * Well formedness constraints, opening and closing must match.
4202      * With the exception that the autoclose may have popped stuff out
4203      * of the stack.
4204      */
4205     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4206         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4207                      "Opening and ending tag mismatch: %s and %s\n",
4208                      name, ctxt->name);
4209     }
4210
4211     /*
4212      * SAX: End of Tag
4213      */
4214     oldname = ctxt->name;
4215     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4216         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4217             ctxt->sax->endElement(ctxt->userData, name);
4218         htmlNodeInfoPop(ctxt);
4219         htmlnamePop(ctxt);
4220         ret = 1;
4221     } else {
4222         ret = 0;
4223     }
4224
4225     return (ret);
4226 }
4227
4228
4229 /**
4230  * htmlParseReference:
4231  * @ctxt:  an HTML parser context
4232  *
4233  * parse and handle entity references in content,
4234  * this will end-up in a call to character() since this is either a
4235  * CharRef, or a predefined entity.
4236  */
4237 static void
4238 htmlParseReference(htmlParserCtxtPtr ctxt) {
4239     const htmlEntityDesc * ent;
4240     xmlChar out[6];
4241     const xmlChar *name;
4242     if (CUR != '&') return;
4243
4244     if (NXT(1) == '#') {
4245         unsigned int c;
4246         int bits, i = 0;
4247
4248         c = htmlParseCharRef(ctxt);
4249         if (c == 0)
4250             return;
4251
4252         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4253         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4254         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4255         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4256
4257         for ( ; bits >= 0; bits-= 6) {
4258             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4259         }
4260         out[i] = 0;
4261
4262         htmlCheckParagraph(ctxt);
4263         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4264             ctxt->sax->characters(ctxt->userData, out, i);
4265     } else {
4266         ent = htmlParseEntityRef(ctxt, &name);
4267         if (name == NULL) {
4268             htmlCheckParagraph(ctxt);
4269             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4270                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4271             return;
4272         }
4273         if ((ent == NULL) || !(ent->value > 0)) {
4274             htmlCheckParagraph(ctxt);
4275             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4276                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4277                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4278                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4279             }
4280         } else {
4281             unsigned int c;
4282             int bits, i = 0;
4283
4284             c = ent->value;
4285             if      (c <    0x80)
4286                     { out[i++]= c;                bits= -6; }
4287             else if (c <   0x800)
4288                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4289             else if (c < 0x10000)
4290                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4291             else
4292                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4293
4294             for ( ; bits >= 0; bits-= 6) {
4295                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4296             }
4297             out[i] = 0;
4298
4299             htmlCheckParagraph(ctxt);
4300             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4301                 ctxt->sax->characters(ctxt->userData, out, i);
4302         }
4303     }
4304 }
4305
4306 /**
4307  * htmlParseContent:
4308  * @ctxt:  an HTML parser context
4309  *
4310  * Parse a content: comment, sub-element, reference or text.
4311  * Kept for compatibility with old code
4312  */
4313
4314 static void
4315 htmlParseContent(htmlParserCtxtPtr ctxt) {
4316     xmlChar *currentNode;
4317     int depth;
4318     const xmlChar *name;
4319
4320     currentNode = xmlStrdup(ctxt->name);
4321     depth = ctxt->nameNr;
4322     while (1) {
4323         GROW;
4324
4325         if (ctxt->instate == XML_PARSER_EOF)
4326             break;
4327
4328         /*
4329          * Our tag or one of it's parent or children is ending.
4330          */
4331         if ((CUR == '<') && (NXT(1) == '/')) {
4332             if (htmlParseEndTag(ctxt) &&
4333                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4334                 if (currentNode != NULL)
4335                     xmlFree(currentNode);
4336                 return;
4337             }
4338             continue; /* while */
4339         }
4340
4341         else if ((CUR == '<') &&
4342                  ((IS_ASCII_LETTER(NXT(1))) ||
4343                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4344             name = htmlParseHTMLName_nonInvasive(ctxt);
4345             if (name == NULL) {
4346                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4347                          "htmlParseStartTag: invalid element name\n",
4348                          NULL, NULL);
4349                 /* Dump the bogus tag like browsers do */
4350                 while ((CUR != 0) && (CUR != '>'))
4351                     NEXT;
4352
4353                 if (currentNode != NULL)
4354                     xmlFree(currentNode);
4355                 return;
4356             }
4357
4358             if (ctxt->name != NULL) {
4359                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4360                     htmlAutoClose(ctxt, name);
4361                     continue;
4362                 }
4363             }
4364         }
4365
4366         /*
4367          * Has this node been popped out during parsing of
4368          * the next element
4369          */
4370         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4371             (!xmlStrEqual(currentNode, ctxt->name)))
4372              {
4373             if (currentNode != NULL) xmlFree(currentNode);
4374             return;
4375         }
4376
4377         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4378             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4379             /*
4380              * Handle SCRIPT/STYLE separately
4381              */
4382             htmlParseScript(ctxt);
4383         } else {
4384             /*
4385              * Sometimes DOCTYPE arrives in the middle of the document
4386              */
4387             if ((CUR == '<') && (NXT(1) == '!') &&
4388                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4389                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4390                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4391                 (UPP(8) == 'E')) {
4392                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4393                              "Misplaced DOCTYPE declaration\n",
4394                              BAD_CAST "DOCTYPE" , NULL);
4395                 htmlParseDocTypeDecl(ctxt);
4396             }
4397
4398             /*
4399              * First case :  a comment
4400              */
4401             if ((CUR == '<') && (NXT(1) == '!') &&
4402                 (NXT(2) == '-') && (NXT(3) == '-')) {
4403                 htmlParseComment(ctxt);
4404             }
4405
4406             /*
4407              * Second case : a Processing Instruction.
4408              */
4409             else if ((CUR == '<') && (NXT(1) == '?')) {
4410                 htmlParsePI(ctxt);
4411             }
4412
4413             /*
4414              * Third case :  a sub-element.
4415              */
4416             else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4417                 htmlParseElement(ctxt);
4418             }
4419             else if (CUR == '<') {
4420                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4421                     (ctxt->sax->characters != NULL))
4422                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4423                 NEXT;
4424             }
4425
4426             /*
4427              * Fourth case : a reference. If if has not been resolved,
4428              *    parsing returns it's Name, create the node
4429              */
4430             else if (CUR == '&') {
4431                 htmlParseReference(ctxt);
4432             }
4433
4434             /*
4435              * Fifth case : end of the resource
4436              */
4437             else if (CUR == 0) {
4438                 htmlAutoCloseOnEnd(ctxt);
4439                 break;
4440             }
4441
4442             /*
4443              * Last case, text. Note that References are handled directly.
4444              */
4445             else {
4446                 htmlParseCharData(ctxt);
4447             }
4448         }
4449         GROW;
4450     }
4451     if (currentNode != NULL) xmlFree(currentNode);
4452 }
4453
4454 /**
4455  * htmlParseElement:
4456  * @ctxt:  an HTML parser context
4457  *
4458  * parse an HTML element, this is highly recursive
4459  * this is kept for compatibility with previous code versions
4460  *
4461  * [39] element ::= EmptyElemTag | STag content ETag
4462  *
4463  * [41] Attribute ::= Name Eq AttValue
4464  */
4465
4466 void
4467 htmlParseElement(htmlParserCtxtPtr ctxt) {
4468     const xmlChar *name;
4469     xmlChar *currentNode = NULL;
4470     const htmlElemDesc * info;
4471     htmlParserNodeInfo node_info;
4472     int failed;
4473     int depth;
4474     const xmlChar *oldptr;
4475
4476     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4477         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4478                      "htmlParseElement: context error\n", NULL, NULL);
4479         return;
4480     }
4481
4482     if (ctxt->instate == XML_PARSER_EOF)
4483         return;
4484
4485     /* Capture start position */
4486     if (ctxt->record_info) {
4487         node_info.begin_pos = ctxt->input->consumed +
4488                           (CUR_PTR - ctxt->input->base);
4489         node_info.begin_line = ctxt->input->line;
4490     }
4491
4492     failed = htmlParseStartTag(ctxt);
4493     name = ctxt->name;
4494     if ((failed == -1) || (name == NULL)) {
4495         if (CUR == '>')
4496             NEXT;
4497         return;
4498     }
4499
4500     /*
4501      * Lookup the info for that element.
4502      */
4503     info = htmlTagLookup(name);
4504     if (info == NULL) {
4505         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4506                      "Tag %s invalid\n", name, NULL);
4507     }
4508
4509     /*
4510      * Check for an Empty Element labeled the XML/SGML way
4511      */
4512     if ((CUR == '/') && (NXT(1) == '>')) {
4513         SKIP(2);
4514         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4515             ctxt->sax->endElement(ctxt->userData, name);
4516         htmlnamePop(ctxt);
4517         return;
4518     }
4519
4520     if (CUR == '>') {
4521         NEXT;
4522     } else {
4523         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4524                      "Couldn't find end of Start Tag %s\n", name, NULL);
4525
4526         /*
4527          * end of parsing of this node.
4528          */
4529         if (xmlStrEqual(name, ctxt->name)) {
4530             nodePop(ctxt);
4531             htmlnamePop(ctxt);
4532         }
4533
4534         /*
4535          * Capture end position and add node
4536          */
4537         if (ctxt->record_info) {
4538            node_info.end_pos = ctxt->input->consumed +
4539                               (CUR_PTR - ctxt->input->base);
4540            node_info.end_line = ctxt->input->line;
4541            node_info.node = ctxt->node;
4542            xmlParserAddNodeInfo(ctxt, &node_info);
4543         }
4544         return;
4545     }
4546
4547     /*
4548      * Check for an Empty Element from DTD definition
4549      */
4550     if ((info != NULL) && (info->empty)) {
4551         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4552             ctxt->sax->endElement(ctxt->userData, name);
4553         htmlnamePop(ctxt);
4554         return;
4555     }
4556
4557     /*
4558      * Parse the content of the element:
4559      */
4560     currentNode = xmlStrdup(ctxt->name);
4561     depth = ctxt->nameNr;
4562     while (CUR != 0) {
4563         oldptr = ctxt->input->cur;
4564         htmlParseContent(ctxt);
4565         if (oldptr==ctxt->input->cur) break;
4566         if (ctxt->nameNr < depth) break;
4567     }
4568
4569     /*
4570      * Capture end position and add node
4571      */
4572     if ( currentNode != NULL && ctxt->record_info ) {
4573        node_info.end_pos = ctxt->input->consumed +
4574                           (CUR_PTR - ctxt->input->base);
4575        node_info.end_line = ctxt->input->line;
4576        node_info.node = ctxt->node;
4577        xmlParserAddNodeInfo(ctxt, &node_info);
4578     }
4579     if (CUR == 0) {
4580         htmlAutoCloseOnEnd(ctxt);
4581     }
4582
4583     if (currentNode != NULL)
4584         xmlFree(currentNode);
4585 }
4586
4587 static void
4588 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4589     /*
4590      * Capture end position and add node
4591      */
4592     if ( ctxt->node != NULL && ctxt->record_info ) {
4593        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4594                                 (CUR_PTR - ctxt->input->base);
4595        ctxt->nodeInfo->end_line = ctxt->input->line;
4596        ctxt->nodeInfo->node = ctxt->node;
4597        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4598        htmlNodeInfoPop(ctxt);
4599     }
4600     if (CUR == 0) {
4601        htmlAutoCloseOnEnd(ctxt);
4602     }
4603 }
4604
4605 /**
4606  * htmlParseElementInternal:
4607  * @ctxt:  an HTML parser context
4608  *
4609  * parse an HTML element, new version, non recursive
4610  *
4611  * [39] element ::= EmptyElemTag | STag content ETag
4612  *
4613  * [41] Attribute ::= Name Eq AttValue
4614  */
4615
4616 static void
4617 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4618     const xmlChar *name;
4619     const htmlElemDesc * info;
4620     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4621     int failed;
4622
4623     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4624         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4625                      "htmlParseElementInternal: context error\n", NULL, NULL);
4626         return;
4627     }
4628
4629     if (ctxt->instate == XML_PARSER_EOF)
4630         return;
4631
4632     /* Capture start position */
4633     if (ctxt->record_info) {
4634         node_info.begin_pos = ctxt->input->consumed +
4635                           (CUR_PTR - ctxt->input->base);
4636         node_info.begin_line = ctxt->input->line;
4637     }
4638
4639     failed = htmlParseStartTag(ctxt);
4640     name = ctxt->name;
4641     if ((failed == -1) || (name == NULL)) {
4642         if (CUR == '>')
4643             NEXT;
4644         return;
4645     }
4646
4647     /*
4648      * Lookup the info for that element.
4649      */
4650     info = htmlTagLookup(name);
4651     if (info == NULL) {
4652         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4653                      "Tag %s invalid\n", name, NULL);
4654     }
4655
4656     /*
4657      * Check for an Empty Element labeled the XML/SGML way
4658      */
4659     if ((CUR == '/') && (NXT(1) == '>')) {
4660         SKIP(2);
4661         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4662             ctxt->sax->endElement(ctxt->userData, name);
4663         htmlnamePop(ctxt);
4664         return;
4665     }
4666
4667     if (CUR == '>') {
4668         NEXT;
4669     } else {
4670         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4671                      "Couldn't find end of Start Tag %s\n", name, NULL);
4672
4673         /*
4674          * end of parsing of this node.
4675          */
4676         if (xmlStrEqual(name, ctxt->name)) {
4677             nodePop(ctxt);
4678             htmlnamePop(ctxt);
4679         }
4680
4681         if (ctxt->record_info)
4682             htmlNodeInfoPush(ctxt, &node_info);
4683         htmlParserFinishElementParsing(ctxt);
4684         return;
4685     }
4686
4687     /*
4688      * Check for an Empty Element from DTD definition
4689      */
4690     if ((info != NULL) && (info->empty)) {
4691         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4692             ctxt->sax->endElement(ctxt->userData, name);
4693         htmlnamePop(ctxt);
4694         return;
4695     }
4696
4697     if (ctxt->record_info)
4698         htmlNodeInfoPush(ctxt, &node_info);
4699 }
4700
4701 /**
4702  * htmlParseContentInternal:
4703  * @ctxt:  an HTML parser context
4704  *
4705  * Parse a content: comment, sub-element, reference or text.
4706  * New version for non recursive htmlParseElementInternal
4707  */
4708
4709 static void
4710 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4711     xmlChar *currentNode;
4712     int depth;
4713     const xmlChar *name;
4714
4715     currentNode = xmlStrdup(ctxt->name);
4716     depth = ctxt->nameNr;
4717     while (1) {
4718         GROW;
4719
4720         if (ctxt->instate == XML_PARSER_EOF)
4721             break;
4722
4723         /*
4724          * Our tag or one of it's parent or children is ending.
4725          */
4726         if ((CUR == '<') && (NXT(1) == '/')) {
4727             if (htmlParseEndTag(ctxt) &&
4728                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4729                 if (currentNode != NULL)
4730                     xmlFree(currentNode);
4731
4732                 currentNode = xmlStrdup(ctxt->name);
4733                 depth = ctxt->nameNr;
4734             }
4735             continue; /* while */
4736         }
4737
4738         else if ((CUR == '<') &&
4739                  ((IS_ASCII_LETTER(NXT(1))) ||
4740                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4741             name = htmlParseHTMLName_nonInvasive(ctxt);
4742             if (name == NULL) {
4743                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4744                          "htmlParseStartTag: invalid element name\n",
4745                          NULL, NULL);
4746                 /* Dump the bogus tag like browsers do */
4747                 while ((CUR == 0) && (CUR != '>'))
4748                     NEXT;
4749
4750                 htmlParserFinishElementParsing(ctxt);
4751                 if (currentNode != NULL)
4752                     xmlFree(currentNode);
4753
4754                 currentNode = xmlStrdup(ctxt->name);
4755                 depth = ctxt->nameNr;
4756                 continue;
4757             }
4758
4759             if (ctxt->name != NULL) {
4760                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4761                     htmlAutoClose(ctxt, name);
4762                     continue;
4763                 }
4764             }
4765         }
4766
4767         /*
4768          * Has this node been popped out during parsing of
4769          * the next element
4770          */
4771         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4772             (!xmlStrEqual(currentNode, ctxt->name)))
4773              {
4774             htmlParserFinishElementParsing(ctxt);
4775             if (currentNode != NULL) xmlFree(currentNode);
4776
4777             currentNode = xmlStrdup(ctxt->name);
4778             depth = ctxt->nameNr;
4779             continue;
4780         }
4781
4782         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4783             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4784             /*
4785              * Handle SCRIPT/STYLE separately
4786              */
4787             htmlParseScript(ctxt);
4788         } else {
4789             /*
4790              * Sometimes DOCTYPE arrives in the middle of the document
4791              */
4792             if ((CUR == '<') && (NXT(1) == '!') &&
4793                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4794                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4795                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4796                 (UPP(8) == 'E')) {
4797                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4798                              "Misplaced DOCTYPE declaration\n",
4799                              BAD_CAST "DOCTYPE" , NULL);
4800                 htmlParseDocTypeDecl(ctxt);
4801             }
4802
4803             /*
4804              * First case :  a comment
4805              */
4806             if ((CUR == '<') && (NXT(1) == '!') &&
4807                 (NXT(2) == '-') && (NXT(3) == '-')) {
4808                 htmlParseComment(ctxt);
4809             }
4810
4811             /*
4812              * Second case : a Processing Instruction.
4813              */
4814             else if ((CUR == '<') && (NXT(1) == '?')) {
4815                 htmlParsePI(ctxt);
4816             }
4817
4818             /*
4819              * Third case :  a sub-element.
4820              */
4821             else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4822                 htmlParseElementInternal(ctxt);
4823                 if (currentNode != NULL) xmlFree(currentNode);
4824
4825                 currentNode = xmlStrdup(ctxt->name);
4826                 depth = ctxt->nameNr;
4827             }
4828             else if (CUR == '<') {
4829                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4830                     (ctxt->sax->characters != NULL))
4831                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4832                 NEXT;
4833             }
4834
4835             /*
4836              * Fourth case : a reference. If if has not been resolved,
4837              *    parsing returns it's Name, create the node
4838              */
4839             else if (CUR == '&') {
4840                 htmlParseReference(ctxt);
4841             }
4842
4843             /*
4844              * Fifth case : end of the resource
4845              */
4846             else if (CUR == 0) {
4847                 htmlAutoCloseOnEnd(ctxt);
4848                 break;
4849             }
4850
4851             /*
4852              * Last case, text. Note that References are handled directly.
4853              */
4854             else {
4855                 htmlParseCharData(ctxt);
4856             }
4857         }
4858         GROW;
4859     }
4860     if (currentNode != NULL) xmlFree(currentNode);
4861 }
4862
4863 /**
4864  * htmlParseContent:
4865  * @ctxt:  an HTML parser context
4866  *
4867  * Parse a content: comment, sub-element, reference or text.
4868  * This is the entry point when called from parser.c
4869  */
4870
4871 void
4872 __htmlParseContent(void *ctxt) {
4873     if (ctxt != NULL)
4874         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4875 }
4876
4877 /**
4878  * htmlParseDocument:
4879  * @ctxt:  an HTML parser context
4880  *
4881  * parse an HTML document (and build a tree if using the standard SAX
4882  * interface).
4883  *
4884  * Returns 0, -1 in case of error. the parser context is augmented
4885  *                as a result of the parsing.
4886  */
4887
4888 int
4889 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4890     xmlChar start[4];
4891     xmlCharEncoding enc;
4892     xmlDtdPtr dtd;
4893
4894     xmlInitParser();
4895
4896     htmlDefaultSAXHandlerInit();
4897
4898     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4899         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4900                      "htmlParseDocument: context error\n", NULL, NULL);
4901         return(XML_ERR_INTERNAL_ERROR);
4902     }
4903     ctxt->html = 1;
4904     ctxt->linenumbers = 1;
4905     GROW;
4906     /*
4907      * SAX: beginning of the document processing.
4908      */
4909     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4910         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4911
4912     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4913         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4914         /*
4915          * Get the 4 first bytes and decode the charset
4916          * if enc != XML_CHAR_ENCODING_NONE
4917          * plug some encoding conversion routines.
4918          */
4919         start[0] = RAW;
4920         start[1] = NXT(1);
4921         start[2] = NXT(2);
4922         start[3] = NXT(3);
4923         enc = xmlDetectCharEncoding(&start[0], 4);
4924         if (enc != XML_CHAR_ENCODING_NONE) {
4925             xmlSwitchEncoding(ctxt, enc);
4926         }
4927     }
4928
4929     /*
4930      * Wipe out everything which is before the first '<'
4931      */
4932     SKIP_BLANKS;
4933     if (CUR == 0) {
4934         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4935                      "Document is empty\n", NULL, NULL);
4936     }
4937
4938     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4939         ctxt->sax->startDocument(ctxt->userData);
4940
4941
4942     /*
4943      * Parse possible comments and PIs before any content
4944      */
4945     while (((CUR == '<') && (NXT(1) == '!') &&
4946             (NXT(2) == '-') && (NXT(3) == '-')) ||
4947            ((CUR == '<') && (NXT(1) == '?'))) {
4948         htmlParseComment(ctxt);
4949         htmlParsePI(ctxt);
4950         SKIP_BLANKS;
4951     }
4952
4953
4954     /*
4955      * Then possibly doc type declaration(s) and more Misc
4956      * (doctypedecl Misc*)?
4957      */
4958     if ((CUR == '<') && (NXT(1) == '!') &&
4959         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4960         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4961         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4962         (UPP(8) == 'E')) {
4963         htmlParseDocTypeDecl(ctxt);
4964     }
4965     SKIP_BLANKS;
4966
4967     /*
4968      * Parse possible comments and PIs before any content
4969      */
4970     while (((CUR == '<') && (NXT(1) == '!') &&
4971             (NXT(2) == '-') && (NXT(3) == '-')) ||
4972            ((CUR == '<') && (NXT(1) == '?'))) {
4973         htmlParseComment(ctxt);
4974         htmlParsePI(ctxt);
4975         SKIP_BLANKS;
4976     }
4977
4978     /*
4979      * Time to start parsing the tree itself
4980      */
4981     htmlParseContentInternal(ctxt);
4982
4983     /*
4984      * autoclose
4985      */
4986     if (CUR == 0)
4987         htmlAutoCloseOnEnd(ctxt);
4988
4989
4990     /*
4991      * SAX: end of the document processing.
4992      */
4993     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4994         ctxt->sax->endDocument(ctxt->userData);
4995
4996     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4997         dtd = xmlGetIntSubset(ctxt->myDoc);
4998         if (dtd == NULL)
4999             ctxt->myDoc->intSubset =
5000                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5001                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5002                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5003     }
5004     if (! ctxt->wellFormed) return(-1);
5005     return(0);
5006 }
5007
5008
5009 /************************************************************************
5010  *                                                                      *
5011  *                      Parser contexts handling                        *
5012  *                                                                      *
5013  ************************************************************************/
5014
5015 /**
5016  * htmlInitParserCtxt:
5017  * @ctxt:  an HTML parser context
5018  *
5019  * Initialize a parser context
5020  *
5021  * Returns 0 in case of success and -1 in case of error
5022  */
5023
5024 static int
5025 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5026 {
5027     htmlSAXHandler *sax;
5028
5029     if (ctxt == NULL) return(-1);
5030     memset(ctxt, 0, sizeof(htmlParserCtxt));
5031
5032     ctxt->dict = xmlDictCreate();
5033     if (ctxt->dict == NULL) {
5034         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5035         return(-1);
5036     }
5037     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5038     if (sax == NULL) {
5039         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5040         return(-1);
5041     }
5042     else
5043         memset(sax, 0, sizeof(htmlSAXHandler));
5044
5045     /* Allocate the Input stack */
5046     ctxt->inputTab = (htmlParserInputPtr *)
5047                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5048     if (ctxt->inputTab == NULL) {
5049         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5050         ctxt->inputNr = 0;
5051         ctxt->inputMax = 0;
5052         ctxt->input = NULL;
5053         return(-1);
5054     }
5055     ctxt->inputNr = 0;
5056     ctxt->inputMax = 5;
5057     ctxt->input = NULL;
5058     ctxt->version = NULL;
5059     ctxt->encoding = NULL;
5060     ctxt->standalone = -1;
5061     ctxt->instate = XML_PARSER_START;
5062
5063     /* Allocate the Node stack */
5064     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5065     if (ctxt->nodeTab == NULL) {
5066         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5067         ctxt->nodeNr = 0;
5068         ctxt->nodeMax = 0;
5069         ctxt->node = NULL;
5070         ctxt->inputNr = 0;
5071         ctxt->inputMax = 0;
5072         ctxt->input = NULL;
5073         return(-1);
5074     }
5075     ctxt->nodeNr = 0;
5076     ctxt->nodeMax = 10;
5077     ctxt->node = NULL;
5078
5079     /* Allocate the Name stack */
5080     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5081     if (ctxt->nameTab == NULL) {
5082         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5083         ctxt->nameNr = 0;
5084         ctxt->nameMax = 0;
5085         ctxt->name = NULL;
5086         ctxt->nodeNr = 0;
5087         ctxt->nodeMax = 0;
5088         ctxt->node = NULL;
5089         ctxt->inputNr = 0;
5090         ctxt->inputMax = 0;
5091         ctxt->input = NULL;
5092         return(-1);
5093     }
5094     ctxt->nameNr = 0;
5095     ctxt->nameMax = 10;
5096     ctxt->name = NULL;
5097
5098     ctxt->nodeInfoTab = NULL;
5099     ctxt->nodeInfoNr  = 0;
5100     ctxt->nodeInfoMax = 0;
5101
5102     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5103     else {
5104         ctxt->sax = sax;
5105         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5106     }
5107     ctxt->userData = ctxt;
5108     ctxt->myDoc = NULL;
5109     ctxt->wellFormed = 1;
5110     ctxt->replaceEntities = 0;
5111     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5112     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5113     ctxt->html = 1;
5114     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5115     ctxt->vctxt.userData = ctxt;
5116     ctxt->vctxt.error = xmlParserValidityError;
5117     ctxt->vctxt.warning = xmlParserValidityWarning;
5118     ctxt->record_info = 0;
5119     ctxt->validate = 0;
5120     ctxt->checkIndex = 0;
5121     ctxt->catalogs = NULL;
5122     xmlInitNodeInfoSeq(&ctxt->node_seq);
5123     return(0);
5124 }
5125
5126 /**
5127  * htmlFreeParserCtxt:
5128  * @ctxt:  an HTML parser context
5129  *
5130  * Free all the memory used by a parser context. However the parsed
5131  * document in ctxt->myDoc is not freed.
5132  */
5133
5134 void
5135 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5136 {
5137     xmlFreeParserCtxt(ctxt);
5138 }
5139
5140 /**
5141  * htmlNewParserCtxt:
5142  *
5143  * Allocate and initialize a new parser context.
5144  *
5145  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5146  */
5147
5148 htmlParserCtxtPtr
5149 htmlNewParserCtxt(void)
5150 {
5151     xmlParserCtxtPtr ctxt;
5152
5153     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5154     if (ctxt == NULL) {
5155         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5156         return(NULL);
5157     }
5158     memset(ctxt, 0, sizeof(xmlParserCtxt));
5159     if (htmlInitParserCtxt(ctxt) < 0) {
5160         htmlFreeParserCtxt(ctxt);
5161         return(NULL);
5162     }
5163     return(ctxt);
5164 }
5165
5166 /**
5167  * htmlCreateMemoryParserCtxt:
5168  * @buffer:  a pointer to a char array
5169  * @size:  the size of the array
5170  *
5171  * Create a parser context for an HTML in-memory document.
5172  *
5173  * Returns the new parser context or NULL
5174  */
5175 htmlParserCtxtPtr
5176 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5177     xmlParserCtxtPtr ctxt;
5178     xmlParserInputPtr input;
5179     xmlParserInputBufferPtr buf;
5180
5181     if (buffer == NULL)
5182         return(NULL);
5183     if (size <= 0)
5184         return(NULL);
5185
5186     ctxt = htmlNewParserCtxt();
5187     if (ctxt == NULL)
5188         return(NULL);
5189
5190     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5191     if (buf == NULL) return(NULL);
5192
5193     input = xmlNewInputStream(ctxt);
5194     if (input == NULL) {
5195         xmlFreeParserInputBuffer(buf);
5196         xmlFreeParserCtxt(ctxt);
5197         return(NULL);
5198     }
5199
5200     input->filename = NULL;
5201     input->buf = buf;
5202     xmlBufResetInput(buf->buffer, input);
5203
5204     inputPush(ctxt, input);
5205     return(ctxt);
5206 }
5207
5208 /**
5209  * htmlCreateDocParserCtxt:
5210  * @cur:  a pointer to an array of xmlChar
5211  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5212  *
5213  * Create a parser context for an HTML document.
5214  *
5215  * TODO: check the need to add encoding handling there
5216  *
5217  * Returns the new parser context or NULL
5218  */
5219 static htmlParserCtxtPtr
5220 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5221     int len;
5222     htmlParserCtxtPtr ctxt;
5223
5224     if (cur == NULL)
5225         return(NULL);
5226     len = xmlStrlen(cur);
5227     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5228     if (ctxt == NULL)
5229         return(NULL);
5230
5231     if (encoding != NULL) {
5232         xmlCharEncoding enc;
5233         xmlCharEncodingHandlerPtr handler;
5234
5235         if (ctxt->input->encoding != NULL)
5236             xmlFree((xmlChar *) ctxt->input->encoding);
5237         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5238
5239         enc = xmlParseCharEncoding(encoding);
5240         /*
5241          * registered set of known encodings
5242          */
5243         if (enc != XML_CHAR_ENCODING_ERROR) {
5244             xmlSwitchEncoding(ctxt, enc);
5245             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5246                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5247                              "Unsupported encoding %s\n",
5248                              (const xmlChar *) encoding, NULL);
5249             }
5250         } else {
5251             /*
5252              * fallback for unknown encodings
5253              */
5254             handler = xmlFindCharEncodingHandler((const char *) encoding);
5255             if (handler != NULL) {
5256                 xmlSwitchToEncoding(ctxt, handler);
5257             } else {
5258                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5259                              "Unsupported encoding %s\n",
5260                              (const xmlChar *) encoding, NULL);
5261             }
5262         }
5263     }
5264     return(ctxt);
5265 }
5266
5267 #ifdef LIBXML_PUSH_ENABLED
5268 /************************************************************************
5269  *                                                                      *
5270  *      Progressive parsing interfaces                          *
5271  *                                                                      *
5272  ************************************************************************/
5273
5274 /**
5275  * htmlParseLookupSequence:
5276  * @ctxt:  an HTML parser context
5277  * @first:  the first char to lookup
5278  * @next:  the next char to lookup or zero
5279  * @third:  the next char to lookup or zero
5280  * @ignoreattrval: skip over attribute values
5281  *
5282  * Try to find if a sequence (first, next, third) or  just (first next) or
5283  * (first) is available in the input stream.
5284  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5285  * to avoid rescanning sequences of bytes, it DOES change the state of the
5286  * parser, do not use liberally.
5287  * This is basically similar to xmlParseLookupSequence()
5288  *
5289  * Returns the index to the current parsing point if the full sequence
5290  *      is available, -1 otherwise.
5291  */
5292 static int
5293 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5294                         xmlChar next, xmlChar third, int ignoreattrval)
5295 {
5296     int base, len;
5297     htmlParserInputPtr in;
5298     const xmlChar *buf;
5299     int invalue = 0;
5300     char valdellim = 0x0;
5301
5302     in = ctxt->input;
5303     if (in == NULL)
5304         return (-1);
5305
5306     base = in->cur - in->base;
5307     if (base < 0)
5308         return (-1);
5309
5310     if (ctxt->checkIndex > base) {
5311         base = ctxt->checkIndex;
5312         /* Abuse hasPErefs member to restore current state. */
5313         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5314     }
5315
5316     if (in->buf == NULL) {
5317         buf = in->base;
5318         len = in->length;
5319     } else {
5320         buf = xmlBufContent(in->buf->buffer);
5321         len = xmlBufUse(in->buf->buffer);
5322     }
5323
5324     /* take into account the sequence length */
5325     if (third)
5326         len -= 2;
5327     else if (next)
5328         len--;
5329     for (; base < len; base++) {
5330         if (ignoreattrval) {
5331             if (buf[base] == '"' || buf[base] == '\'') {
5332                 if (invalue) {
5333                     if (buf[base] == valdellim) {
5334                         invalue = 0;
5335                         continue;
5336                     }
5337                 } else {
5338                     valdellim = buf[base];
5339                     invalue = 1;
5340                     continue;
5341                 }
5342             } else if (invalue) {
5343                 continue;
5344             }
5345         }
5346         if (buf[base] == first) {
5347             if (third != 0) {
5348                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5349                     continue;
5350             } else if (next != 0) {
5351                 if (buf[base + 1] != next)
5352                     continue;
5353             }
5354             ctxt->checkIndex = 0;
5355 #ifdef DEBUG_PUSH
5356             if (next == 0)
5357                 xmlGenericError(xmlGenericErrorContext,
5358                                 "HPP: lookup '%c' found at %d\n",
5359                                 first, base);
5360             else if (third == 0)
5361                 xmlGenericError(xmlGenericErrorContext,
5362                                 "HPP: lookup '%c%c' found at %d\n",
5363                                 first, next, base);
5364             else
5365                 xmlGenericError(xmlGenericErrorContext,
5366                                 "HPP: lookup '%c%c%c' found at %d\n",
5367                                 first, next, third, base);
5368 #endif
5369             return (base - (in->cur - in->base));
5370         }
5371     }
5372     ctxt->checkIndex = base;
5373     /* Abuse hasPErefs member to track current state. */
5374     if (invalue)
5375         ctxt->hasPErefs |= 1;
5376     else
5377         ctxt->hasPErefs &= ~1;
5378 #ifdef DEBUG_PUSH
5379     if (next == 0)
5380         xmlGenericError(xmlGenericErrorContext,
5381                         "HPP: lookup '%c' failed\n", first);
5382     else if (third == 0)
5383         xmlGenericError(xmlGenericErrorContext,
5384                         "HPP: lookup '%c%c' failed\n", first, next);
5385     else
5386         xmlGenericError(xmlGenericErrorContext,
5387                         "HPP: lookup '%c%c%c' failed\n", first, next,
5388                         third);
5389 #endif
5390     return (-1);
5391 }
5392
5393 /**
5394  * htmlParseLookupCommentEnd:
5395  * @ctxt: an HTML parser context
5396  *
5397  * Try to find a comment end tag in the input stream
5398  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5399  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5400  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5401  * to avoid rescanning sequences of bytes, it DOES change the state of the
5402  * parser, do not use liberally.
5403  * This wraps to htmlParseLookupSequence()
5404  *
5405  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5406  */
5407 static int
5408 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5409 {
5410     int mark = 0;
5411     int cur = CUR_PTR - BASE_PTR;
5412
5413     while (mark >= 0) {
5414         mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5415         if ((mark < 0) ||
5416             (NXT(mark+2) == '>') ||
5417             ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5418             return mark;
5419         }
5420         ctxt->checkIndex = cur + mark + 1;
5421     }
5422     return mark;
5423 }
5424
5425
5426 /**
5427  * htmlParseTryOrFinish:
5428  * @ctxt:  an HTML parser context
5429  * @terminate:  last chunk indicator
5430  *
5431  * Try to progress on parsing
5432  *
5433  * Returns zero if no parsing was possible
5434  */
5435 static int
5436 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5437     int ret = 0;
5438     htmlParserInputPtr in;
5439     ptrdiff_t avail = 0;
5440     xmlChar cur, next;
5441
5442     htmlParserNodeInfo node_info;
5443
5444 #ifdef DEBUG_PUSH
5445     switch (ctxt->instate) {
5446         case XML_PARSER_EOF:
5447             xmlGenericError(xmlGenericErrorContext,
5448                     "HPP: try EOF\n"); break;
5449         case XML_PARSER_START:
5450             xmlGenericError(xmlGenericErrorContext,
5451                     "HPP: try START\n"); break;
5452         case XML_PARSER_MISC:
5453             xmlGenericError(xmlGenericErrorContext,
5454                     "HPP: try MISC\n");break;
5455         case XML_PARSER_COMMENT:
5456             xmlGenericError(xmlGenericErrorContext,
5457                     "HPP: try COMMENT\n");break;
5458         case XML_PARSER_PROLOG:
5459             xmlGenericError(xmlGenericErrorContext,
5460                     "HPP: try PROLOG\n");break;
5461         case XML_PARSER_START_TAG:
5462             xmlGenericError(xmlGenericErrorContext,
5463                     "HPP: try START_TAG\n");break;
5464         case XML_PARSER_CONTENT:
5465             xmlGenericError(xmlGenericErrorContext,
5466                     "HPP: try CONTENT\n");break;
5467         case XML_PARSER_CDATA_SECTION:
5468             xmlGenericError(xmlGenericErrorContext,
5469                     "HPP: try CDATA_SECTION\n");break;
5470         case XML_PARSER_END_TAG:
5471             xmlGenericError(xmlGenericErrorContext,
5472                     "HPP: try END_TAG\n");break;
5473         case XML_PARSER_ENTITY_DECL:
5474             xmlGenericError(xmlGenericErrorContext,
5475                     "HPP: try ENTITY_DECL\n");break;
5476         case XML_PARSER_ENTITY_VALUE:
5477             xmlGenericError(xmlGenericErrorContext,
5478                     "HPP: try ENTITY_VALUE\n");break;
5479         case XML_PARSER_ATTRIBUTE_VALUE:
5480             xmlGenericError(xmlGenericErrorContext,
5481                     "HPP: try ATTRIBUTE_VALUE\n");break;
5482         case XML_PARSER_DTD:
5483             xmlGenericError(xmlGenericErrorContext,
5484                     "HPP: try DTD\n");break;
5485         case XML_PARSER_EPILOG:
5486             xmlGenericError(xmlGenericErrorContext,
5487                     "HPP: try EPILOG\n");break;
5488         case XML_PARSER_PI:
5489             xmlGenericError(xmlGenericErrorContext,
5490                     "HPP: try PI\n");break;
5491         case XML_PARSER_SYSTEM_LITERAL:
5492             xmlGenericError(xmlGenericErrorContext,
5493                     "HPP: try SYSTEM_LITERAL\n");break;
5494     }
5495 #endif
5496
5497     while (1) {
5498
5499         in = ctxt->input;
5500         if (in == NULL) break;
5501         if (in->buf == NULL)
5502             avail = in->length - (in->cur - in->base);
5503         else
5504             avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5505                     (in->cur - in->base);
5506         if ((avail == 0) && (terminate)) {
5507             htmlAutoCloseOnEnd(ctxt);
5508             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5509                 /*
5510                  * SAX: end of the document processing.
5511                  */
5512                 ctxt->instate = XML_PARSER_EOF;
5513                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5514                     ctxt->sax->endDocument(ctxt->userData);
5515             }
5516         }
5517         if (avail < 1)
5518             goto done;
5519         /*
5520          * This is done to make progress and avoid an infinite loop
5521          * if a parsing attempt was aborted by hitting a NUL byte. After
5522          * changing htmlCurrentChar, this probably isn't necessary anymore.
5523          * We should consider removing this check.
5524          */
5525         cur = in->cur[0];
5526         if (cur == 0) {
5527             SKIP(1);
5528             continue;
5529         }
5530
5531         switch (ctxt->instate) {
5532             case XML_PARSER_EOF:
5533                 /*
5534                  * Document parsing is done !
5535                  */
5536                 goto done;
5537             case XML_PARSER_START:
5538                 /*
5539                  * Very first chars read from the document flow.
5540                  */
5541                 cur = in->cur[0];
5542                 if (IS_BLANK_CH(cur)) {
5543                     SKIP_BLANKS;
5544                     if (in->buf == NULL)
5545                         avail = in->length - (in->cur - in->base);
5546                     else
5547                         avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5548                                 (in->cur - in->base);
5549                 }
5550                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5551                     ctxt->sax->setDocumentLocator(ctxt->userData,
5552                                                   &xmlDefaultSAXLocator);
5553                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5554                     (!ctxt->disableSAX))
5555                     ctxt->sax->startDocument(ctxt->userData);
5556
5557                 cur = in->cur[0];
5558                 next = in->cur[1];
5559                 if ((cur == '<') && (next == '!') &&
5560                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5561                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5562                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5563                     (UPP(8) == 'E')) {
5564                     if ((!terminate) &&
5565                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5566                         goto done;
5567 #ifdef DEBUG_PUSH
5568                     xmlGenericError(xmlGenericErrorContext,
5569                             "HPP: Parsing internal subset\n");
5570 #endif
5571                     htmlParseDocTypeDecl(ctxt);
5572                     ctxt->instate = XML_PARSER_PROLOG;
5573 #ifdef DEBUG_PUSH
5574                     xmlGenericError(xmlGenericErrorContext,
5575                             "HPP: entering PROLOG\n");
5576 #endif
5577                 } else {
5578                     ctxt->instate = XML_PARSER_MISC;
5579 #ifdef DEBUG_PUSH
5580                     xmlGenericError(xmlGenericErrorContext,
5581                             "HPP: entering MISC\n");
5582 #endif
5583                 }
5584                 break;
5585             case XML_PARSER_MISC:
5586                 SKIP_BLANKS;
5587                 if (in->buf == NULL)
5588                     avail = in->length - (in->cur - in->base);
5589                 else
5590                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5591                             (in->cur - in->base);
5592                 /*
5593                  * no chars in buffer
5594                  */
5595                 if (avail < 1)
5596                     goto done;
5597                 /*
5598                  * not enough chars in buffer
5599                  */
5600                 if (avail < 2) {
5601                     if (!terminate)
5602                         goto done;
5603                     else
5604                         next = ' ';
5605                 } else {
5606                     next = in->cur[1];
5607                 }
5608                 cur = in->cur[0];
5609                 if ((cur == '<') && (next == '!') &&
5610                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5611                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5612                         goto done;
5613 #ifdef DEBUG_PUSH
5614                     xmlGenericError(xmlGenericErrorContext,
5615                             "HPP: Parsing Comment\n");
5616 #endif
5617                     htmlParseComment(ctxt);
5618                     ctxt->instate = XML_PARSER_MISC;
5619                 } else if ((cur == '<') && (next == '?')) {
5620                     if ((!terminate) &&
5621                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5622                         goto done;
5623 #ifdef DEBUG_PUSH
5624                     xmlGenericError(xmlGenericErrorContext,
5625                             "HPP: Parsing PI\n");
5626 #endif
5627                     htmlParsePI(ctxt);
5628                     ctxt->instate = XML_PARSER_MISC;
5629                 } else if ((cur == '<') && (next == '!') &&
5630                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5631                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5632                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5633                     (UPP(8) == 'E')) {
5634                     if ((!terminate) &&
5635                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5636                         goto done;
5637 #ifdef DEBUG_PUSH
5638                     xmlGenericError(xmlGenericErrorContext,
5639                             "HPP: Parsing internal subset\n");
5640 #endif
5641                     htmlParseDocTypeDecl(ctxt);
5642                     ctxt->instate = XML_PARSER_PROLOG;
5643 #ifdef DEBUG_PUSH
5644                     xmlGenericError(xmlGenericErrorContext,
5645                             "HPP: entering PROLOG\n");
5646 #endif
5647                 } else if ((cur == '<') && (next == '!') &&
5648                            (avail < 9)) {
5649                     goto done;
5650                 } else {
5651                     ctxt->instate = XML_PARSER_CONTENT;
5652 #ifdef DEBUG_PUSH
5653                     xmlGenericError(xmlGenericErrorContext,
5654                             "HPP: entering START_TAG\n");
5655 #endif
5656                 }
5657                 break;
5658             case XML_PARSER_PROLOG:
5659                 SKIP_BLANKS;
5660                 if (in->buf == NULL)
5661                     avail = in->length - (in->cur - in->base);
5662                 else
5663                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5664                             (in->cur - in->base);
5665                 if (avail < 2)
5666                     goto done;
5667                 cur = in->cur[0];
5668                 next = in->cur[1];
5669                 if ((cur == '<') && (next == '!') &&
5670                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5671                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5672                         goto done;
5673 #ifdef DEBUG_PUSH
5674                     xmlGenericError(xmlGenericErrorContext,
5675                             "HPP: Parsing Comment\n");
5676 #endif
5677                     htmlParseComment(ctxt);
5678                     ctxt->instate = XML_PARSER_PROLOG;
5679                 } else if ((cur == '<') && (next == '?')) {
5680                     if ((!terminate) &&
5681                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5682                         goto done;
5683 #ifdef DEBUG_PUSH
5684                     xmlGenericError(xmlGenericErrorContext,
5685                             "HPP: Parsing PI\n");
5686 #endif
5687                     htmlParsePI(ctxt);
5688                     ctxt->instate = XML_PARSER_PROLOG;
5689                 } else if ((cur == '<') && (next == '!') &&
5690                            (avail < 4)) {
5691                     goto done;
5692                 } else {
5693                     ctxt->instate = XML_PARSER_CONTENT;
5694 #ifdef DEBUG_PUSH
5695                     xmlGenericError(xmlGenericErrorContext,
5696                             "HPP: entering START_TAG\n");
5697 #endif
5698                 }
5699                 break;
5700             case XML_PARSER_EPILOG:
5701                 if (in->buf == NULL)
5702                     avail = in->length - (in->cur - in->base);
5703                 else
5704                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5705                             (in->cur - in->base);
5706                 if (avail < 1)
5707                     goto done;
5708                 cur = in->cur[0];
5709                 if (IS_BLANK_CH(cur)) {
5710                     htmlParseCharData(ctxt);
5711                     goto done;
5712                 }
5713                 if (avail < 2)
5714                     goto done;
5715                 next = in->cur[1];
5716                 if ((cur == '<') && (next == '!') &&
5717                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5718                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5719                         goto done;
5720 #ifdef DEBUG_PUSH
5721                     xmlGenericError(xmlGenericErrorContext,
5722                             "HPP: Parsing Comment\n");
5723 #endif
5724                     htmlParseComment(ctxt);
5725                     ctxt->instate = XML_PARSER_EPILOG;
5726                 } else if ((cur == '<') && (next == '?')) {
5727                     if ((!terminate) &&
5728                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5729                         goto done;
5730 #ifdef DEBUG_PUSH
5731                     xmlGenericError(xmlGenericErrorContext,
5732                             "HPP: Parsing PI\n");
5733 #endif
5734                     htmlParsePI(ctxt);
5735                     ctxt->instate = XML_PARSER_EPILOG;
5736                 } else if ((cur == '<') && (next == '!') &&
5737                            (avail < 4)) {
5738                     goto done;
5739                 } else {
5740                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5741                     ctxt->wellFormed = 0;
5742                     ctxt->instate = XML_PARSER_EOF;
5743 #ifdef DEBUG_PUSH
5744                     xmlGenericError(xmlGenericErrorContext,
5745                             "HPP: entering EOF\n");
5746 #endif
5747                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5748                         ctxt->sax->endDocument(ctxt->userData);
5749                     goto done;
5750                 }
5751                 break;
5752             case XML_PARSER_START_TAG: {
5753                 const xmlChar *name;
5754                 int failed;
5755                 const htmlElemDesc * info;
5756
5757                 /*
5758                  * no chars in buffer
5759                  */
5760                 if (avail < 1)
5761                     goto done;
5762                 /*
5763                  * not enough chars in buffer
5764                  */
5765                 if (avail < 2) {
5766                     if (!terminate)
5767                         goto done;
5768                     else
5769                         next = ' ';
5770                 } else {
5771                     next = in->cur[1];
5772                 }
5773                 cur = in->cur[0];
5774                 if (cur != '<') {
5775                     ctxt->instate = XML_PARSER_CONTENT;
5776 #ifdef DEBUG_PUSH
5777                     xmlGenericError(xmlGenericErrorContext,
5778                             "HPP: entering CONTENT\n");
5779 #endif
5780                     break;
5781                 }
5782                 if (next == '/') {
5783                     ctxt->instate = XML_PARSER_END_TAG;
5784                     ctxt->checkIndex = 0;
5785 #ifdef DEBUG_PUSH
5786                     xmlGenericError(xmlGenericErrorContext,
5787                             "HPP: entering END_TAG\n");
5788 #endif
5789                     break;
5790                 }
5791                 if ((!terminate) &&
5792                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5793                     goto done;
5794
5795                 /* Capture start position */
5796                 if (ctxt->record_info) {
5797                      node_info.begin_pos = ctxt->input->consumed +
5798                                         (CUR_PTR - ctxt->input->base);
5799                      node_info.begin_line = ctxt->input->line;
5800                 }
5801
5802
5803                 failed = htmlParseStartTag(ctxt);
5804                 name = ctxt->name;
5805                 if ((failed == -1) ||
5806                     (name == NULL)) {
5807                     if (CUR == '>')
5808                         NEXT;
5809                     break;
5810                 }
5811
5812                 /*
5813                  * Lookup the info for that element.
5814                  */
5815                 info = htmlTagLookup(name);
5816                 if (info == NULL) {
5817                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5818                                  "Tag %s invalid\n", name, NULL);
5819                 }
5820
5821                 /*
5822                  * Check for an Empty Element labeled the XML/SGML way
5823                  */
5824                 if ((CUR == '/') && (NXT(1) == '>')) {
5825                     SKIP(2);
5826                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5827                         ctxt->sax->endElement(ctxt->userData, name);
5828                     htmlnamePop(ctxt);
5829                     ctxt->instate = XML_PARSER_CONTENT;
5830 #ifdef DEBUG_PUSH
5831                     xmlGenericError(xmlGenericErrorContext,
5832                             "HPP: entering CONTENT\n");
5833 #endif
5834                     break;
5835                 }
5836
5837                 if (CUR == '>') {
5838                     NEXT;
5839                 } else {
5840                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5841                                  "Couldn't find end of Start Tag %s\n",
5842                                  name, NULL);
5843
5844                     /*
5845                      * end of parsing of this node.
5846                      */
5847                     if (xmlStrEqual(name, ctxt->name)) {
5848                         nodePop(ctxt);
5849                         htmlnamePop(ctxt);
5850                     }
5851
5852                     if (ctxt->record_info)
5853                         htmlNodeInfoPush(ctxt, &node_info);
5854
5855                     ctxt->instate = XML_PARSER_CONTENT;
5856 #ifdef DEBUG_PUSH
5857                     xmlGenericError(xmlGenericErrorContext,
5858                             "HPP: entering CONTENT\n");
5859 #endif
5860                     break;
5861                 }
5862
5863                 /*
5864                  * Check for an Empty Element from DTD definition
5865                  */
5866                 if ((info != NULL) && (info->empty)) {
5867                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5868                         ctxt->sax->endElement(ctxt->userData, name);
5869                     htmlnamePop(ctxt);
5870                 }
5871
5872                 if (ctxt->record_info)
5873                     htmlNodeInfoPush(ctxt, &node_info);
5874
5875                 ctxt->instate = XML_PARSER_CONTENT;
5876 #ifdef DEBUG_PUSH
5877                 xmlGenericError(xmlGenericErrorContext,
5878                         "HPP: entering CONTENT\n");
5879 #endif
5880                 break;
5881             }
5882             case XML_PARSER_CONTENT: {
5883                 xmlChar chr[2] = { 0, 0 };
5884
5885                 /*
5886                  * Handle preparsed entities and charRef
5887                  */
5888                 if (ctxt->token != 0) {
5889                     chr[0] = (xmlChar) ctxt->token;
5890                     htmlCheckParagraph(ctxt);
5891                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5892                         ctxt->sax->characters(ctxt->userData, chr, 1);
5893                     ctxt->token = 0;
5894                     ctxt->checkIndex = 0;
5895                 }
5896                 if ((avail == 1) && (terminate)) {
5897                     cur = in->cur[0];
5898                     if ((cur != '<') && (cur != '&')) {
5899                         if (ctxt->sax != NULL) {
5900                             chr[0] = cur;
5901                             if (IS_BLANK_CH(cur)) {
5902                                 if (ctxt->keepBlanks) {
5903                                     if (ctxt->sax->characters != NULL)
5904                                         ctxt->sax->characters(
5905                                                 ctxt->userData, chr, 1);
5906                                 } else {
5907                                     if (ctxt->sax->ignorableWhitespace != NULL)
5908                                         ctxt->sax->ignorableWhitespace(
5909                                                 ctxt->userData, chr, 1);
5910                                 }
5911                             } else {
5912                                 htmlCheckParagraph(ctxt);
5913                                 if (ctxt->sax->characters != NULL)
5914                                     ctxt->sax->characters(
5915                                             ctxt->userData, chr, 1);
5916                             }
5917                         }
5918                         ctxt->token = 0;
5919                         ctxt->checkIndex = 0;
5920                         in->cur++;
5921                         break;
5922                     }
5923                 }
5924                 if (avail < 2)
5925                     goto done;
5926                 cur = in->cur[0];
5927                 next = in->cur[1];
5928                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5929                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5930                     /*
5931                      * Handle SCRIPT/STYLE separately
5932                      */
5933                     if (!terminate) {
5934                         int idx;
5935                         xmlChar val;
5936
5937                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5938                         if (idx < 0)
5939                             goto done;
5940                         val = in->cur[idx + 2];
5941                         if (val == 0) /* bad cut of input */
5942                             goto done;
5943                     }
5944                     htmlParseScript(ctxt);
5945                     if ((cur == '<') && (next == '/')) {
5946                         ctxt->instate = XML_PARSER_END_TAG;
5947                         ctxt->checkIndex = 0;
5948 #ifdef DEBUG_PUSH
5949                         xmlGenericError(xmlGenericErrorContext,
5950                                 "HPP: entering END_TAG\n");
5951 #endif
5952                         break;
5953                     }
5954                 } else {
5955                     /*
5956                      * Sometimes DOCTYPE arrives in the middle of the document
5957                      */
5958                     if ((cur == '<') && (next == '!') &&
5959                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5960                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5961                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5962                         (UPP(8) == 'E')) {
5963                         if ((!terminate) &&
5964                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5965                             goto done;
5966                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5967                                      "Misplaced DOCTYPE declaration\n",
5968                                      BAD_CAST "DOCTYPE" , NULL);
5969                         htmlParseDocTypeDecl(ctxt);
5970                     } else if ((cur == '<') && (next == '!') &&
5971                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
5972                         if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5973                             goto done;
5974 #ifdef DEBUG_PUSH
5975                         xmlGenericError(xmlGenericErrorContext,
5976                                 "HPP: Parsing Comment\n");
5977 #endif
5978                         htmlParseComment(ctxt);
5979                         ctxt->instate = XML_PARSER_CONTENT;
5980                     } else if ((cur == '<') && (next == '?')) {
5981                         if ((!terminate) &&
5982                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5983                             goto done;
5984 #ifdef DEBUG_PUSH
5985                         xmlGenericError(xmlGenericErrorContext,
5986                                 "HPP: Parsing PI\n");
5987 #endif
5988                         htmlParsePI(ctxt);
5989                         ctxt->instate = XML_PARSER_CONTENT;
5990                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5991                         goto done;
5992                     } else if ((cur == '<') && (next == '/')) {
5993                         ctxt->instate = XML_PARSER_END_TAG;
5994                         ctxt->checkIndex = 0;
5995 #ifdef DEBUG_PUSH
5996                         xmlGenericError(xmlGenericErrorContext,
5997                                 "HPP: entering END_TAG\n");
5998 #endif
5999                         break;
6000                     } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6001                         if ((!terminate) && (next == 0))
6002                             goto done;
6003                         ctxt->instate = XML_PARSER_START_TAG;
6004                         ctxt->checkIndex = 0;
6005 #ifdef DEBUG_PUSH
6006                         xmlGenericError(xmlGenericErrorContext,
6007                                 "HPP: entering START_TAG\n");
6008 #endif
6009                         break;
6010                     } else if (cur == '<') {
6011                         if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6012                             (ctxt->sax->characters != NULL))
6013                             ctxt->sax->characters(ctxt->userData,
6014                                                   BAD_CAST "<", 1);
6015                         NEXT;
6016                     } else {
6017                         /*
6018                          * check that the text sequence is complete
6019                          * before handing out the data to the parser
6020                          * to avoid problems with erroneous end of
6021                          * data detection.
6022                          */
6023                         if ((!terminate) &&
6024                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6025                             goto done;
6026                         ctxt->checkIndex = 0;
6027 #ifdef DEBUG_PUSH
6028                         xmlGenericError(xmlGenericErrorContext,
6029                                 "HPP: Parsing char data\n");
6030 #endif
6031                         while ((ctxt->instate != XML_PARSER_EOF) &&
6032                                (cur != '<') && (in->cur < in->end)) {
6033                             if (cur == '&') {
6034                                 htmlParseReference(ctxt);
6035                             } else {
6036                                 htmlParseCharData(ctxt);
6037                             }
6038                             cur = in->cur[0];
6039                         }
6040                     }
6041                 }
6042
6043                 break;
6044             }
6045             case XML_PARSER_END_TAG:
6046                 if (avail < 2)
6047                     goto done;
6048                 if ((!terminate) &&
6049                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6050                     goto done;
6051                 htmlParseEndTag(ctxt);
6052                 if (ctxt->nameNr == 0) {
6053                     ctxt->instate = XML_PARSER_EPILOG;
6054                 } else {
6055                     ctxt->instate = XML_PARSER_CONTENT;
6056                 }
6057                 ctxt->checkIndex = 0;
6058 #ifdef DEBUG_PUSH
6059                 xmlGenericError(xmlGenericErrorContext,
6060                         "HPP: entering CONTENT\n");
6061 #endif
6062                 break;
6063             case XML_PARSER_CDATA_SECTION:
6064                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6065                         "HPP: internal error, state == CDATA\n",
6066                              NULL, NULL);
6067                 ctxt->instate = XML_PARSER_CONTENT;
6068                 ctxt->checkIndex = 0;
6069 #ifdef DEBUG_PUSH
6070                 xmlGenericError(xmlGenericErrorContext,
6071                         "HPP: entering CONTENT\n");
6072 #endif
6073                 break;
6074             case XML_PARSER_DTD:
6075                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6076                         "HPP: internal error, state == DTD\n",
6077                              NULL, NULL);
6078                 ctxt->instate = XML_PARSER_CONTENT;
6079                 ctxt->checkIndex = 0;
6080 #ifdef DEBUG_PUSH
6081                 xmlGenericError(xmlGenericErrorContext,
6082                         "HPP: entering CONTENT\n");
6083 #endif
6084                 break;
6085             case XML_PARSER_COMMENT:
6086                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6087                         "HPP: internal error, state == COMMENT\n",
6088                              NULL, NULL);
6089                 ctxt->instate = XML_PARSER_CONTENT;
6090                 ctxt->checkIndex = 0;
6091 #ifdef DEBUG_PUSH
6092                 xmlGenericError(xmlGenericErrorContext,
6093                         "HPP: entering CONTENT\n");
6094 #endif
6095                 break;
6096             case XML_PARSER_PI:
6097                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6098                         "HPP: internal error, state == PI\n",
6099                              NULL, NULL);
6100                 ctxt->instate = XML_PARSER_CONTENT;
6101                 ctxt->checkIndex = 0;
6102 #ifdef DEBUG_PUSH
6103                 xmlGenericError(xmlGenericErrorContext,
6104                         "HPP: entering CONTENT\n");
6105 #endif
6106                 break;
6107             case XML_PARSER_ENTITY_DECL:
6108                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6109                         "HPP: internal error, state == ENTITY_DECL\n",
6110                              NULL, NULL);
6111                 ctxt->instate = XML_PARSER_CONTENT;
6112                 ctxt->checkIndex = 0;
6113 #ifdef DEBUG_PUSH
6114                 xmlGenericError(xmlGenericErrorContext,
6115                         "HPP: entering CONTENT\n");
6116 #endif
6117                 break;
6118             case XML_PARSER_ENTITY_VALUE:
6119                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6120                         "HPP: internal error, state == ENTITY_VALUE\n",
6121                              NULL, NULL);
6122                 ctxt->instate = XML_PARSER_CONTENT;
6123                 ctxt->checkIndex = 0;
6124 #ifdef DEBUG_PUSH
6125                 xmlGenericError(xmlGenericErrorContext,
6126                         "HPP: entering DTD\n");
6127 #endif
6128                 break;
6129             case XML_PARSER_ATTRIBUTE_VALUE:
6130                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6131                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6132                              NULL, NULL);
6133                 ctxt->instate = XML_PARSER_START_TAG;
6134                 ctxt->checkIndex = 0;
6135 #ifdef DEBUG_PUSH
6136                 xmlGenericError(xmlGenericErrorContext,
6137                         "HPP: entering START_TAG\n");
6138 #endif
6139                 break;
6140             case XML_PARSER_SYSTEM_LITERAL:
6141                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6142                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6143                              NULL, NULL);
6144                 ctxt->instate = XML_PARSER_CONTENT;
6145                 ctxt->checkIndex = 0;
6146 #ifdef DEBUG_PUSH
6147                 xmlGenericError(xmlGenericErrorContext,
6148                         "HPP: entering CONTENT\n");
6149 #endif
6150                 break;
6151             case XML_PARSER_IGNORE:
6152                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6153                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6154                              NULL, NULL);
6155                 ctxt->instate = XML_PARSER_CONTENT;
6156                 ctxt->checkIndex = 0;
6157 #ifdef DEBUG_PUSH
6158                 xmlGenericError(xmlGenericErrorContext,
6159                         "HPP: entering CONTENT\n");
6160 #endif
6161                 break;
6162             case XML_PARSER_PUBLIC_LITERAL:
6163                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6164                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6165                              NULL, NULL);
6166                 ctxt->instate = XML_PARSER_CONTENT;
6167                 ctxt->checkIndex = 0;
6168 #ifdef DEBUG_PUSH
6169                 xmlGenericError(xmlGenericErrorContext,
6170                         "HPP: entering CONTENT\n");
6171 #endif
6172                 break;
6173
6174         }
6175     }
6176 done:
6177     if ((avail == 0) && (terminate)) {
6178         htmlAutoCloseOnEnd(ctxt);
6179         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6180             /*
6181              * SAX: end of the document processing.
6182              */
6183             ctxt->instate = XML_PARSER_EOF;
6184             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6185                 ctxt->sax->endDocument(ctxt->userData);
6186         }
6187     }
6188     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6189         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6190          (ctxt->instate == XML_PARSER_EPILOG))) {
6191         xmlDtdPtr dtd;
6192         dtd = xmlGetIntSubset(ctxt->myDoc);
6193         if (dtd == NULL)
6194             ctxt->myDoc->intSubset =
6195                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6196                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6197                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6198     }
6199 #ifdef DEBUG_PUSH
6200     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6201 #endif
6202     return(ret);
6203 }
6204
6205 /**
6206  * htmlParseChunk:
6207  * @ctxt:  an HTML parser context
6208  * @chunk:  an char array
6209  * @size:  the size in byte of the chunk
6210  * @terminate:  last chunk indicator
6211  *
6212  * Parse a Chunk of memory
6213  *
6214  * Returns zero if no error, the xmlParserErrors otherwise.
6215  */
6216 int
6217 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6218               int terminate) {
6219     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6220         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6221                      "htmlParseChunk: context error\n", NULL, NULL);
6222         return(XML_ERR_INTERNAL_ERROR);
6223     }
6224     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6225         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6226         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6227         size_t cur = ctxt->input->cur - ctxt->input->base;
6228         int res;
6229
6230         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6231         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6232         if (res < 0) {
6233             ctxt->errNo = XML_PARSER_EOF;
6234             ctxt->disableSAX = 1;
6235             return (XML_PARSER_EOF);
6236         }
6237 #ifdef DEBUG_PUSH
6238         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6239 #endif
6240
6241 #if 0
6242         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6243             htmlParseTryOrFinish(ctxt, terminate);
6244 #endif
6245     } else if (ctxt->instate != XML_PARSER_EOF) {
6246         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6247             xmlParserInputBufferPtr in = ctxt->input->buf;
6248             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6249                     (in->raw != NULL)) {
6250                 int nbchars;
6251                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6252                 size_t current = ctxt->input->cur - ctxt->input->base;
6253
6254                 nbchars = xmlCharEncInput(in, terminate);
6255                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6256                 if (nbchars < 0) {
6257                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6258                                  "encoder error\n", NULL, NULL);
6259                     return(XML_ERR_INVALID_ENCODING);
6260                 }
6261             }
6262         }
6263     }
6264     htmlParseTryOrFinish(ctxt, terminate);
6265     if (terminate) {
6266         if ((ctxt->instate != XML_PARSER_EOF) &&
6267             (ctxt->instate != XML_PARSER_EPILOG) &&
6268             (ctxt->instate != XML_PARSER_MISC)) {
6269             ctxt->errNo = XML_ERR_DOCUMENT_END;
6270             ctxt->wellFormed = 0;
6271         }
6272         if (ctxt->instate != XML_PARSER_EOF) {
6273             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6274                 ctxt->sax->endDocument(ctxt->userData);
6275         }
6276         ctxt->instate = XML_PARSER_EOF;
6277     }
6278     return((xmlParserErrors) ctxt->errNo);
6279 }
6280
6281 /************************************************************************
6282  *                                                                      *
6283  *                      User entry points                               *
6284  *                                                                      *
6285  ************************************************************************/
6286
6287 /**
6288  * htmlCreatePushParserCtxt:
6289  * @sax:  a SAX handler
6290  * @user_data:  The user data returned on SAX callbacks
6291  * @chunk:  a pointer to an array of chars
6292  * @size:  number of chars in the array
6293  * @filename:  an optional file name or URI
6294  * @enc:  an optional encoding
6295  *
6296  * Create a parser context for using the HTML parser in push mode
6297  * The value of @filename is used for fetching external entities
6298  * and error/warning reports.
6299  *
6300  * Returns the new parser context or NULL
6301  */
6302 htmlParserCtxtPtr
6303 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6304                          const char *chunk, int size, const char *filename,
6305                          xmlCharEncoding enc) {
6306     htmlParserCtxtPtr ctxt;
6307     htmlParserInputPtr inputStream;
6308     xmlParserInputBufferPtr buf;
6309
6310     xmlInitParser();
6311
6312     buf = xmlAllocParserInputBuffer(enc);
6313     if (buf == NULL) return(NULL);
6314
6315     ctxt = htmlNewParserCtxt();
6316     if (ctxt == NULL) {
6317         xmlFreeParserInputBuffer(buf);
6318         return(NULL);
6319     }
6320     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6321         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6322     if (sax != NULL) {
6323         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6324             xmlFree(ctxt->sax);
6325         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6326         if (ctxt->sax == NULL) {
6327             xmlFree(buf);
6328             xmlFree(ctxt);
6329             return(NULL);
6330         }
6331         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6332         if (user_data != NULL)
6333             ctxt->userData = user_data;
6334     }
6335     if (filename == NULL) {
6336         ctxt->directory = NULL;
6337     } else {
6338         ctxt->directory = xmlParserGetDirectory(filename);
6339     }
6340
6341     inputStream = htmlNewInputStream(ctxt);
6342     if (inputStream == NULL) {
6343         xmlFreeParserCtxt(ctxt);
6344         xmlFree(buf);
6345         return(NULL);
6346     }
6347
6348     if (filename == NULL)
6349         inputStream->filename = NULL;
6350     else
6351         inputStream->filename = (char *)
6352             xmlCanonicPath((const xmlChar *) filename);
6353     inputStream->buf = buf;
6354     xmlBufResetInput(buf->buffer, inputStream);
6355
6356     inputPush(ctxt, inputStream);
6357
6358     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6359         (ctxt->input->buf != NULL))  {
6360         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6361         size_t cur = ctxt->input->cur - ctxt->input->base;
6362
6363         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6364
6365         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6366 #ifdef DEBUG_PUSH
6367         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6368 #endif
6369     }
6370     ctxt->progressive = 1;
6371
6372     return(ctxt);
6373 }
6374 #endif /* LIBXML_PUSH_ENABLED */
6375
6376 /**
6377  * htmlSAXParseDoc:
6378  * @cur:  a pointer to an array of xmlChar
6379  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6380  * @sax:  the SAX handler block
6381  * @userData: if using SAX, this pointer will be provided on callbacks.
6382  *
6383  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6384  * to handle parse events. If sax is NULL, fallback to the default DOM
6385  * behavior and return a tree.
6386  *
6387  * Returns the resulting document tree unless SAX is NULL or the document is
6388  *     not well formed.
6389  */
6390
6391 htmlDocPtr
6392 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6393                 htmlSAXHandlerPtr sax, void *userData) {
6394     htmlDocPtr ret;
6395     htmlParserCtxtPtr ctxt;
6396
6397     xmlInitParser();
6398
6399     if (cur == NULL) return(NULL);
6400
6401
6402     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6403     if (ctxt == NULL) return(NULL);
6404     if (sax != NULL) {
6405         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6406         ctxt->sax = sax;
6407         ctxt->userData = userData;
6408     }
6409
6410     htmlParseDocument(ctxt);
6411     ret = ctxt->myDoc;
6412     if (sax != NULL) {
6413         ctxt->sax = NULL;
6414         ctxt->userData = NULL;
6415     }
6416     htmlFreeParserCtxt(ctxt);
6417
6418     return(ret);
6419 }
6420
6421 /**
6422  * htmlParseDoc:
6423  * @cur:  a pointer to an array of xmlChar
6424  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6425  *
6426  * parse an HTML in-memory document and build a tree.
6427  *
6428  * Returns the resulting document tree
6429  */
6430
6431 htmlDocPtr
6432 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6433     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6434 }
6435
6436
6437 /**
6438  * htmlCreateFileParserCtxt:
6439  * @filename:  the filename
6440  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6441  *
6442  * Create a parser context for a file content.
6443  * Automatic support for ZLIB/Compress compressed document is provided
6444  * by default if found at compile-time.
6445  *
6446  * Returns the new parser context or NULL
6447  */
6448 htmlParserCtxtPtr
6449 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6450 {
6451     htmlParserCtxtPtr ctxt;
6452     htmlParserInputPtr inputStream;
6453     char *canonicFilename;
6454     /* htmlCharEncoding enc; */
6455     xmlChar *content, *content_line = (xmlChar *) "charset=";
6456
6457     if (filename == NULL)
6458         return(NULL);
6459
6460     ctxt = htmlNewParserCtxt();
6461     if (ctxt == NULL) {
6462         return(NULL);
6463     }
6464     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6465     if (canonicFilename == NULL) {
6466 #ifdef LIBXML_SAX1_ENABLED
6467         if (xmlDefaultSAXHandler.error != NULL) {
6468             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6469         }
6470 #endif
6471         xmlFreeParserCtxt(ctxt);
6472         return(NULL);
6473     }
6474
6475     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6476     xmlFree(canonicFilename);
6477     if (inputStream == NULL) {
6478         xmlFreeParserCtxt(ctxt);
6479         return(NULL);
6480     }
6481
6482     inputPush(ctxt, inputStream);
6483
6484     /* set encoding */
6485     if (encoding) {
6486         size_t l = strlen(encoding);
6487
6488         if (l < 1000) {
6489             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6490             if (content) {
6491                 strcpy ((char *)content, (char *)content_line);
6492                 strcat ((char *)content, (char *)encoding);
6493                 htmlCheckEncoding (ctxt, content);
6494                 xmlFree (content);
6495             }
6496         }
6497     }
6498
6499     return(ctxt);
6500 }
6501
6502 /**
6503  * htmlSAXParseFile:
6504  * @filename:  the filename
6505  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6506  * @sax:  the SAX handler block
6507  * @userData: if using SAX, this pointer will be provided on callbacks.
6508  *
6509  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6510  * compressed document is provided by default if found at compile-time.
6511  * It use the given SAX function block to handle the parsing callback.
6512  * If sax is NULL, fallback to the default DOM tree building routines.
6513  *
6514  * Returns the resulting document tree unless SAX is NULL or the document is
6515  *     not well formed.
6516  */
6517
6518 htmlDocPtr
6519 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6520                  void *userData) {
6521     htmlDocPtr ret;
6522     htmlParserCtxtPtr ctxt;
6523     htmlSAXHandlerPtr oldsax = NULL;
6524
6525     xmlInitParser();
6526
6527     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6528     if (ctxt == NULL) return(NULL);
6529     if (sax != NULL) {
6530         oldsax = ctxt->sax;
6531         ctxt->sax = sax;
6532         ctxt->userData = userData;
6533     }
6534
6535     htmlParseDocument(ctxt);
6536
6537     ret = ctxt->myDoc;
6538     if (sax != NULL) {
6539         ctxt->sax = oldsax;
6540         ctxt->userData = NULL;
6541     }
6542     htmlFreeParserCtxt(ctxt);
6543
6544     return(ret);
6545 }
6546
6547 /**
6548  * htmlParseFile:
6549  * @filename:  the filename
6550  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6551  *
6552  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6553  * compressed document is provided by default if found at compile-time.
6554  *
6555  * Returns the resulting document tree
6556  */
6557
6558 htmlDocPtr
6559 htmlParseFile(const char *filename, const char *encoding) {
6560     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6561 }
6562
6563 /**
6564  * htmlHandleOmittedElem:
6565  * @val:  int 0 or 1
6566  *
6567  * Set and return the previous value for handling HTML omitted tags.
6568  *
6569  * Returns the last value for 0 for no handling, 1 for auto insertion.
6570  */
6571
6572 int
6573 htmlHandleOmittedElem(int val) {
6574     int old = htmlOmittedDefaultValue;
6575
6576     htmlOmittedDefaultValue = val;
6577     return(old);
6578 }
6579
6580 /**
6581  * htmlElementAllowedHere:
6582  * @parent: HTML parent element
6583  * @elt: HTML element
6584  *
6585  * Checks whether an HTML element may be a direct child of a parent element.
6586  * Note - doesn't check for deprecated elements
6587  *
6588  * Returns 1 if allowed; 0 otherwise.
6589  */
6590 int
6591 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6592   const char** p ;
6593
6594   if ( ! elt || ! parent || ! parent->subelts )
6595         return 0 ;
6596
6597   for ( p = parent->subelts; *p; ++p )
6598     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6599       return 1 ;
6600
6601   return 0 ;
6602 }
6603 /**
6604  * htmlElementStatusHere:
6605  * @parent: HTML parent element
6606  * @elt: HTML element
6607  *
6608  * Checks whether an HTML element may be a direct child of a parent element.
6609  * and if so whether it is valid or deprecated.
6610  *
6611  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6612  */
6613 htmlStatus
6614 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6615   if ( ! parent || ! elt )
6616     return HTML_INVALID ;
6617   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6618     return HTML_INVALID ;
6619
6620   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6621 }
6622 /**
6623  * htmlAttrAllowed:
6624  * @elt: HTML element
6625  * @attr: HTML attribute
6626  * @legacy: whether to allow deprecated attributes
6627  *
6628  * Checks whether an attribute is valid for an element
6629  * Has full knowledge of Required and Deprecated attributes
6630  *
6631  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6632  */
6633 htmlStatus
6634 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6635   const char** p ;
6636
6637   if ( !elt || ! attr )
6638         return HTML_INVALID ;
6639
6640   if ( elt->attrs_req )
6641     for ( p = elt->attrs_req; *p; ++p)
6642       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6643         return HTML_REQUIRED ;
6644
6645   if ( elt->attrs_opt )
6646     for ( p = elt->attrs_opt; *p; ++p)
6647       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6648         return HTML_VALID ;
6649
6650   if ( legacy && elt->attrs_depr )
6651     for ( p = elt->attrs_depr; *p; ++p)
6652       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6653         return HTML_DEPRECATED ;
6654
6655   return HTML_INVALID ;
6656 }
6657 /**
6658  * htmlNodeStatus:
6659  * @node: an htmlNodePtr in a tree
6660  * @legacy: whether to allow deprecated elements (YES is faster here
6661  *      for Element nodes)
6662  *
6663  * Checks whether the tree node is valid.  Experimental (the author
6664  *     only uses the HTML enhancements in a SAX parser)
6665  *
6666  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6667  *      legacy allowed) or htmlElementStatusHere (otherwise).
6668  *      for Attribute nodes, a return from htmlAttrAllowed
6669  *      for other nodes, HTML_NA (no checks performed)
6670  */
6671 htmlStatus
6672 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6673   if ( ! node )
6674     return HTML_INVALID ;
6675
6676   switch ( node->type ) {
6677     case XML_ELEMENT_NODE:
6678       return legacy
6679         ? ( htmlElementAllowedHere (
6680                 htmlTagLookup(node->parent->name) , node->name
6681                 ) ? HTML_VALID : HTML_INVALID )
6682         : htmlElementStatusHere(
6683                 htmlTagLookup(node->parent->name) ,
6684                 htmlTagLookup(node->name) )
6685         ;
6686     case XML_ATTRIBUTE_NODE:
6687       return htmlAttrAllowed(
6688         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6689     default: return HTML_NA ;
6690   }
6691 }
6692 /************************************************************************
6693  *                                                                      *
6694  *      New set (2.6.0) of simpler and more flexible APIs               *
6695  *                                                                      *
6696  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope
 *
 * The expansion is wrapped in do { } while (0) so that it forms a single
 * statement: the call site supplies the terminating semicolon and the
 * macro stays correct when used as the body of an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6708
6709 /**
6710  * htmlCtxtReset:
6711  * @ctxt: an HTML parser context
6712  *
6713  * Reset a parser context
6714  */
6715 void
6716 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6717 {
6718     xmlParserInputPtr input;
6719     xmlDictPtr dict;
6720
6721     if (ctxt == NULL)
6722         return;
6723
6724     xmlInitParser();
6725     dict = ctxt->dict;
6726
6727     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6728         xmlFreeInputStream(input);
6729     }
6730     ctxt->inputNr = 0;
6731     ctxt->input = NULL;
6732
6733     ctxt->spaceNr = 0;
6734     if (ctxt->spaceTab != NULL) {
6735         ctxt->spaceTab[0] = -1;
6736         ctxt->space = &ctxt->spaceTab[0];
6737     } else {
6738         ctxt->space = NULL;
6739     }
6740
6741
6742     ctxt->nodeNr = 0;
6743     ctxt->node = NULL;
6744
6745     ctxt->nameNr = 0;
6746     ctxt->name = NULL;
6747
6748     DICT_FREE(ctxt->version);
6749     ctxt->version = NULL;
6750     DICT_FREE(ctxt->encoding);
6751     ctxt->encoding = NULL;
6752     DICT_FREE(ctxt->directory);
6753     ctxt->directory = NULL;
6754     DICT_FREE(ctxt->extSubURI);
6755     ctxt->extSubURI = NULL;
6756     DICT_FREE(ctxt->extSubSystem);
6757     ctxt->extSubSystem = NULL;
6758     if (ctxt->myDoc != NULL)
6759         xmlFreeDoc(ctxt->myDoc);
6760     ctxt->myDoc = NULL;
6761
6762     ctxt->standalone = -1;
6763     ctxt->hasExternalSubset = 0;
6764     ctxt->hasPErefs = 0;
6765     ctxt->html = 1;
6766     ctxt->external = 0;
6767     ctxt->instate = XML_PARSER_START;
6768     ctxt->token = 0;
6769
6770     ctxt->wellFormed = 1;
6771     ctxt->nsWellFormed = 1;
6772     ctxt->disableSAX = 0;
6773     ctxt->valid = 1;
6774     ctxt->vctxt.userData = ctxt;
6775     ctxt->vctxt.error = xmlParserValidityError;
6776     ctxt->vctxt.warning = xmlParserValidityWarning;
6777     ctxt->record_info = 0;
6778     ctxt->checkIndex = 0;
6779     ctxt->inSubset = 0;
6780     ctxt->errNo = XML_ERR_OK;
6781     ctxt->depth = 0;
6782     ctxt->charset = XML_CHAR_ENCODING_NONE;
6783     ctxt->catalogs = NULL;
6784     xmlInitNodeInfoSeq(&ctxt->node_seq);
6785
6786     if (ctxt->attsDefault != NULL) {
6787         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6788         ctxt->attsDefault = NULL;
6789     }
6790     if (ctxt->attsSpecial != NULL) {
6791         xmlHashFree(ctxt->attsSpecial, NULL);
6792         ctxt->attsSpecial = NULL;
6793     }
6794 }
6795
6796 /**
6797  * htmlCtxtUseOptions:
6798  * @ctxt: an HTML parser context
6799  * @options:  a combination of htmlParserOption(s)
6800  *
6801  * Applies the options to the parser context
6802  *
6803  * Returns 0 in case of success, the set of unknown or unimplemented options
6804  *         in case of error.
6805  */
6806 int
6807 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6808 {
6809     if (ctxt == NULL)
6810         return(-1);
6811
6812     if (options & HTML_PARSE_NOWARNING) {
6813         ctxt->sax->warning = NULL;
6814         ctxt->vctxt.warning = NULL;
6815         options -= XML_PARSE_NOWARNING;
6816         ctxt->options |= XML_PARSE_NOWARNING;
6817     }
6818     if (options & HTML_PARSE_NOERROR) {
6819         ctxt->sax->error = NULL;
6820         ctxt->vctxt.error = NULL;
6821         ctxt->sax->fatalError = NULL;
6822         options -= XML_PARSE_NOERROR;
6823         ctxt->options |= XML_PARSE_NOERROR;
6824     }
6825     if (options & HTML_PARSE_PEDANTIC) {
6826         ctxt->pedantic = 1;
6827         options -= XML_PARSE_PEDANTIC;
6828         ctxt->options |= XML_PARSE_PEDANTIC;
6829     } else
6830         ctxt->pedantic = 0;
6831     if (options & XML_PARSE_NOBLANKS) {
6832         ctxt->keepBlanks = 0;
6833         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6834         options -= XML_PARSE_NOBLANKS;
6835         ctxt->options |= XML_PARSE_NOBLANKS;
6836     } else
6837         ctxt->keepBlanks = 1;
6838     if (options & HTML_PARSE_RECOVER) {
6839         ctxt->recovery = 1;
6840         options -= HTML_PARSE_RECOVER;
6841     } else
6842         ctxt->recovery = 0;
6843     if (options & HTML_PARSE_COMPACT) {
6844         ctxt->options |= HTML_PARSE_COMPACT;
6845         options -= HTML_PARSE_COMPACT;
6846     }
6847     if (options & XML_PARSE_HUGE) {
6848         ctxt->options |= XML_PARSE_HUGE;
6849         options -= XML_PARSE_HUGE;
6850     }
6851     if (options & HTML_PARSE_NODEFDTD) {
6852         ctxt->options |= HTML_PARSE_NODEFDTD;
6853         options -= HTML_PARSE_NODEFDTD;
6854     }
6855     if (options & HTML_PARSE_IGNORE_ENC) {
6856         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6857         options -= HTML_PARSE_IGNORE_ENC;
6858     }
6859     if (options & HTML_PARSE_NOIMPLIED) {
6860         ctxt->options |= HTML_PARSE_NOIMPLIED;
6861         options -= HTML_PARSE_NOIMPLIED;
6862     }
6863     ctxt->dictNames = 0;
6864     return (options);
6865 }
6866
6867 /**
6868  * htmlDoRead:
6869  * @ctxt:  an HTML parser context
6870  * @URL:  the base URL to use for the document
6871  * @encoding:  the document encoding, or NULL
6872  * @options:  a combination of htmlParserOption(s)
6873  * @reuse:  keep the context for reuse
6874  *
6875  * Common front-end for the htmlRead functions
6876  *
6877  * Returns the resulting document tree or NULL
6878  */
6879 static htmlDocPtr
6880 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6881           int options, int reuse)
6882 {
6883     htmlDocPtr ret;
6884
6885     htmlCtxtUseOptions(ctxt, options);
6886     ctxt->html = 1;
6887     if (encoding != NULL) {
6888         xmlCharEncodingHandlerPtr hdlr;
6889
6890         hdlr = xmlFindCharEncodingHandler(encoding);
6891         if (hdlr != NULL) {
6892             xmlSwitchToEncoding(ctxt, hdlr);
6893             if (ctxt->input->encoding != NULL)
6894               xmlFree((xmlChar *) ctxt->input->encoding);
6895             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6896         }
6897     }
6898     if ((URL != NULL) && (ctxt->input != NULL) &&
6899         (ctxt->input->filename == NULL))
6900         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6901     htmlParseDocument(ctxt);
6902     ret = ctxt->myDoc;
6903     ctxt->myDoc = NULL;
6904     if (!reuse) {
6905         if ((ctxt->dictNames) &&
6906             (ret != NULL) &&
6907             (ret->dict == ctxt->dict))
6908             ctxt->dict = NULL;
6909         xmlFreeParserCtxt(ctxt);
6910     }
6911     return (ret);
6912 }
6913
6914 /**
6915  * htmlReadDoc:
6916  * @cur:  a pointer to a zero terminated string
6917  * @URL:  the base URL to use for the document
6918  * @encoding:  the document encoding, or NULL
6919  * @options:  a combination of htmlParserOption(s)
6920  *
6921  * parse an XML in-memory document and build a tree.
6922  *
6923  * Returns the resulting document tree
6924  */
6925 htmlDocPtr
6926 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6927 {
6928     htmlParserCtxtPtr ctxt;
6929
6930     if (cur == NULL)
6931         return (NULL);
6932
6933     xmlInitParser();
6934     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6935     if (ctxt == NULL)
6936         return (NULL);
6937     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6938 }
6939
6940 /**
6941  * htmlReadFile:
6942  * @filename:  a file or URL
6943  * @encoding:  the document encoding, or NULL
6944  * @options:  a combination of htmlParserOption(s)
6945  *
6946  * parse an XML file from the filesystem or the network.
6947  *
6948  * Returns the resulting document tree
6949  */
6950 htmlDocPtr
6951 htmlReadFile(const char *filename, const char *encoding, int options)
6952 {
6953     htmlParserCtxtPtr ctxt;
6954
6955     xmlInitParser();
6956     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6957     if (ctxt == NULL)
6958         return (NULL);
6959     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6960 }
6961
6962 /**
6963  * htmlReadMemory:
6964  * @buffer:  a pointer to a char array
6965  * @size:  the size of the array
6966  * @URL:  the base URL to use for the document
6967  * @encoding:  the document encoding, or NULL
6968  * @options:  a combination of htmlParserOption(s)
6969  *
6970  * parse an XML in-memory document and build a tree.
6971  *
6972  * Returns the resulting document tree
6973  */
6974 htmlDocPtr
6975 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6976 {
6977     htmlParserCtxtPtr ctxt;
6978
6979     xmlInitParser();
6980     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6981     if (ctxt == NULL)
6982         return (NULL);
6983     htmlDefaultSAXHandlerInit();
6984     if (ctxt->sax != NULL)
6985         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6986     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6987 }
6988
6989 /**
6990  * htmlReadFd:
6991  * @fd:  an open file descriptor
6992  * @URL:  the base URL to use for the document
6993  * @encoding:  the document encoding, or NULL
6994  * @options:  a combination of htmlParserOption(s)
6995  *
6996  * parse an HTML from a file descriptor and build a tree.
6997  * NOTE that the file descriptor will not be closed when the
6998  *      reader is closed or reset.
6999  *
7000  * Returns the resulting document tree
7001  */
7002 htmlDocPtr
7003 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7004 {
7005     htmlParserCtxtPtr ctxt;
7006     xmlParserInputBufferPtr input;
7007     htmlParserInputPtr stream;
7008
7009     if (fd < 0)
7010         return (NULL);
7011
7012     xmlInitParser();
7013     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7014     if (input == NULL)
7015         return (NULL);
7016     input->closecallback = NULL;
7017     ctxt = htmlNewParserCtxt();
7018     if (ctxt == NULL) {
7019         xmlFreeParserInputBuffer(input);
7020         return (NULL);
7021     }
7022     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7023     if (stream == NULL) {
7024         xmlFreeParserInputBuffer(input);
7025         htmlFreeParserCtxt(ctxt);
7026         return (NULL);
7027     }
7028     inputPush(ctxt, stream);
7029     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7030 }
7031
7032 /**
7033  * htmlReadIO:
7034  * @ioread:  an I/O read function
7035  * @ioclose:  an I/O close function
7036  * @ioctx:  an I/O handler
7037  * @URL:  the base URL to use for the document
7038  * @encoding:  the document encoding, or NULL
7039  * @options:  a combination of htmlParserOption(s)
7040  *
7041  * parse an HTML document from I/O functions and source and build a tree.
7042  *
7043  * Returns the resulting document tree
7044  */
7045 htmlDocPtr
7046 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7047           void *ioctx, const char *URL, const char *encoding, int options)
7048 {
7049     htmlParserCtxtPtr ctxt;
7050     xmlParserInputBufferPtr input;
7051     xmlParserInputPtr stream;
7052
7053     if (ioread == NULL)
7054         return (NULL);
7055     xmlInitParser();
7056
7057     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7058                                          XML_CHAR_ENCODING_NONE);
7059     if (input == NULL) {
7060         if (ioclose != NULL)
7061             ioclose(ioctx);
7062         return (NULL);
7063     }
7064     ctxt = htmlNewParserCtxt();
7065     if (ctxt == NULL) {
7066         xmlFreeParserInputBuffer(input);
7067         return (NULL);
7068     }
7069     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7070     if (stream == NULL) {
7071         xmlFreeParserInputBuffer(input);
7072         xmlFreeParserCtxt(ctxt);
7073         return (NULL);
7074     }
7075     inputPush(ctxt, stream);
7076     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7077 }
7078
7079 /**
7080  * htmlCtxtReadDoc:
7081  * @ctxt:  an HTML parser context
7082  * @cur:  a pointer to a zero terminated string
7083  * @URL:  the base URL to use for the document
7084  * @encoding:  the document encoding, or NULL
7085  * @options:  a combination of htmlParserOption(s)
7086  *
7087  * parse an XML in-memory document and build a tree.
7088  * This reuses the existing @ctxt parser context
7089  *
7090  * Returns the resulting document tree
7091  */
7092 htmlDocPtr
7093 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7094                const char *URL, const char *encoding, int options)
7095 {
7096     xmlParserInputPtr stream;
7097
7098     if (cur == NULL)
7099         return (NULL);
7100     if (ctxt == NULL)
7101         return (NULL);
7102     xmlInitParser();
7103
7104     htmlCtxtReset(ctxt);
7105
7106     stream = xmlNewStringInputStream(ctxt, cur);
7107     if (stream == NULL) {
7108         return (NULL);
7109     }
7110     inputPush(ctxt, stream);
7111     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7112 }
7113
7114 /**
7115  * htmlCtxtReadFile:
7116  * @ctxt:  an HTML parser context
7117  * @filename:  a file or URL
7118  * @encoding:  the document encoding, or NULL
7119  * @options:  a combination of htmlParserOption(s)
7120  *
7121  * parse an XML file from the filesystem or the network.
7122  * This reuses the existing @ctxt parser context
7123  *
7124  * Returns the resulting document tree
7125  */
7126 htmlDocPtr
7127 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7128                 const char *encoding, int options)
7129 {
7130     xmlParserInputPtr stream;
7131
7132     if (filename == NULL)
7133         return (NULL);
7134     if (ctxt == NULL)
7135         return (NULL);
7136     xmlInitParser();
7137
7138     htmlCtxtReset(ctxt);
7139
7140     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7141     if (stream == NULL) {
7142         return (NULL);
7143     }
7144     inputPush(ctxt, stream);
7145     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7146 }
7147
7148 /**
7149  * htmlCtxtReadMemory:
7150  * @ctxt:  an HTML parser context
7151  * @buffer:  a pointer to a char array
7152  * @size:  the size of the array
7153  * @URL:  the base URL to use for the document
7154  * @encoding:  the document encoding, or NULL
7155  * @options:  a combination of htmlParserOption(s)
7156  *
7157  * parse an XML in-memory document and build a tree.
7158  * This reuses the existing @ctxt parser context
7159  *
7160  * Returns the resulting document tree
7161  */
7162 htmlDocPtr
7163 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7164                   const char *URL, const char *encoding, int options)
7165 {
7166     xmlParserInputBufferPtr input;
7167     xmlParserInputPtr stream;
7168
7169     if (ctxt == NULL)
7170         return (NULL);
7171     if (buffer == NULL)
7172         return (NULL);
7173     xmlInitParser();
7174
7175     htmlCtxtReset(ctxt);
7176
7177     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7178     if (input == NULL) {
7179         return(NULL);
7180     }
7181
7182     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7183     if (stream == NULL) {
7184         xmlFreeParserInputBuffer(input);
7185         return(NULL);
7186     }
7187
7188     inputPush(ctxt, stream);
7189     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7190 }
7191
7192 /**
7193  * htmlCtxtReadFd:
7194  * @ctxt:  an HTML parser context
7195  * @fd:  an open file descriptor
7196  * @URL:  the base URL to use for the document
7197  * @encoding:  the document encoding, or NULL
7198  * @options:  a combination of htmlParserOption(s)
7199  *
7200  * parse an XML from a file descriptor and build a tree.
7201  * This reuses the existing @ctxt parser context
7202  *
7203  * Returns the resulting document tree
7204  */
7205 htmlDocPtr
7206 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7207               const char *URL, const char *encoding, int options)
7208 {
7209     xmlParserInputBufferPtr input;
7210     xmlParserInputPtr stream;
7211
7212     if (fd < 0)
7213         return (NULL);
7214     if (ctxt == NULL)
7215         return (NULL);
7216     xmlInitParser();
7217
7218     htmlCtxtReset(ctxt);
7219
7220
7221     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7222     if (input == NULL)
7223         return (NULL);
7224     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7225     if (stream == NULL) {
7226         xmlFreeParserInputBuffer(input);
7227         return (NULL);
7228     }
7229     inputPush(ctxt, stream);
7230     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7231 }
7232
7233 /**
7234  * htmlCtxtReadIO:
7235  * @ctxt:  an HTML parser context
7236  * @ioread:  an I/O read function
7237  * @ioclose:  an I/O close function
7238  * @ioctx:  an I/O handler
7239  * @URL:  the base URL to use for the document
7240  * @encoding:  the document encoding, or NULL
7241  * @options:  a combination of htmlParserOption(s)
7242  *
7243  * parse an HTML document from I/O functions and source and build a tree.
7244  * This reuses the existing @ctxt parser context
7245  *
7246  * Returns the resulting document tree
7247  */
7248 htmlDocPtr
7249 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7250               xmlInputCloseCallback ioclose, void *ioctx,
7251               const char *URL,
7252               const char *encoding, int options)
7253 {
7254     xmlParserInputBufferPtr input;
7255     xmlParserInputPtr stream;
7256
7257     if (ioread == NULL)
7258         return (NULL);
7259     if (ctxt == NULL)
7260         return (NULL);
7261     xmlInitParser();
7262
7263     htmlCtxtReset(ctxt);
7264
7265     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7266                                          XML_CHAR_ENCODING_NONE);
7267     if (input == NULL) {
7268         if (ioclose != NULL)
7269             ioclose(ioctx);
7270         return (NULL);
7271     }
7272     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7273     if (stream == NULL) {
7274         xmlFreeParserInputBuffer(input);
7275         return (NULL);
7276     }
7277     inputPush(ctxt, stream);
7278     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7279 }
7280
7281 #define bottom_HTMLparser
7282 #include "elfgcchack.h"
7283 #endif /* LIBXML_HTML_ENABLED */