libs/xml2/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef LIBXML_ZLIB_ENABLED
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #include "buf.h"
  48 #include "enc.h"
  49
  50 #define HTML_MAX_NAMELEN 1000
  51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  52 #define HTML_PARSER_BUFFER_SIZE 100
  53
  54 /* #define DEBUG */
  55 /* #define DEBUG_PUSH */
  56
  57 static int htmlOmittedDefaultValue = 1;
  58
  59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  60                              xmlChar end, xmlChar  end2, xmlChar end3);
  61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  62
  63 /************************************************************************
  64  *                                                                      *
  65  *              Some factorized error routines                          *
  66  *                                                                      *
  67  ************************************************************************/
  68
  69 /**
  70  * htmlErrMemory:
  71  * @ctxt:  an HTML parser context
  72  * @extra:  extra information
  73  *
  74  * Handle a redefinition of attribute error
  75  */
  76 static void
  77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  78 {
  79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  80         (ctxt->instate == XML_PARSER_EOF))
  81         return;
  82     if (ctxt != NULL) {
  83         ctxt->errNo = XML_ERR_NO_MEMORY;
  84         ctxt->instate = XML_PARSER_EOF;
  85         ctxt->disableSAX = 1;
  86     }
  87     if (extra)
  88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  90                         NULL, NULL, 0, 0,
  91                         "Memory allocation failed : %s\n", extra);
  92     else
  93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  96 }
  97
  98 /**
  99  * htmlParseErr:
 100  * @ctxt:  an HTML parser context
 101  * @error:  the error number
 102  * @msg:  the error message
 103  * @str1:  string infor
 104  * @str2:  string infor
 105  *
 106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 107  */
 108 static void LIBXML_ATTR_FORMAT(3,0)
 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 110              const char *msg, const xmlChar *str1, const xmlChar *str2)
 111 {
 112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 113         (ctxt->instate == XML_PARSER_EOF))
 114         return;
 115     if (ctxt != NULL)
 116         ctxt->errNo = error;
 117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 118                     XML_ERR_ERROR, NULL, 0,
 119                     (const char *) str1, (const char *) str2,
 120                     NULL, 0, 0,
 121                     msg, str1, str2);
 122     if (ctxt != NULL)
 123         ctxt->wellFormed = 0;
 124 }
 125
 126 /**
 127  * htmlParseErrInt:
 128  * @ctxt:  an HTML parser context
 129  * @error:  the error number
 130  * @msg:  the error message
 131  * @val:  integer info
 132  *
 133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 134  */
 135 static void LIBXML_ATTR_FORMAT(3,0)
 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 137              const char *msg, int val)
 138 {
 139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 140         (ctxt->instate == XML_PARSER_EOF))
 141         return;
 142     if (ctxt != NULL)
 143         ctxt->errNo = error;
 144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 146                     NULL, val, 0, msg, val);
 147     if (ctxt != NULL)
 148         ctxt->wellFormed = 0;
 149 }
 150
 151 /************************************************************************
 152  *                                                                      *
 153  *      Parser stacks related functions and macros              *
 154  *                                                                      *
 155  ************************************************************************/
 156
 157 /**
 158  * htmlnamePush:
 159  * @ctxt:  an HTML parser context
 160  * @value:  the element name
 161  *
 162  * Pushes a new element name on top of the name stack
 163  *
 164  * Returns 0 in case of error, the index in the stack otherwise
 165  */
 166 static int
 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 168 {
 169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 170         ctxt->html = 3;
 171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 172         ctxt->html = 10;
 173     if (ctxt->nameNr >= ctxt->nameMax) {
 174         ctxt->nameMax *= 2;
 175         ctxt->nameTab = (const xmlChar * *)
 176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 177                                     ctxt->nameMax *
 178                                     sizeof(ctxt->nameTab[0]));
 179         if (ctxt->nameTab == NULL) {
 180             htmlErrMemory(ctxt, NULL);
 181             return (0);
 182         }
 183     }
 184     ctxt->nameTab[ctxt->nameNr] = value;
 185     ctxt->name = value;
 186     return (ctxt->nameNr++);
 187 }
 188 /**
 189  * htmlnamePop:
 190  * @ctxt: an HTML parser context
 191  *
 192  * Pops the top element name from the name stack
 193  *
 194  * Returns the name just removed
 195  */
 196 static const xmlChar *
 197 htmlnamePop(htmlParserCtxtPtr ctxt)
 198 {
 199     const xmlChar *ret;
 200
 201     if (ctxt->nameNr <= 0)
 202         return (NULL);
 203     ctxt->nameNr--;
 204     if (ctxt->nameNr < 0)
 205         return (NULL);
 206     if (ctxt->nameNr > 0)
 207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 208     else
 209         ctxt->name = NULL;
 210     ret = ctxt->nameTab[ctxt->nameNr];
 211     ctxt->nameTab[ctxt->nameNr] = NULL;
 212     return (ret);
 213 }
 214
 215 /**
 216  * htmlNodeInfoPush:
 217  * @ctxt:  an HTML parser context
 218  * @value:  the node info
 219  *
 220  * Pushes a new element name on top of the node info stack
 221  *
 222  * Returns 0 in case of error, the index in the stack otherwise
 223  */
 224 static int
 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 226 {
 227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 228         if (ctxt->nodeInfoMax == 0)
 229                 ctxt->nodeInfoMax = 5;
 230         ctxt->nodeInfoMax *= 2;
 231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 233                                     ctxt->nodeInfoMax *
 234                                     sizeof(ctxt->nodeInfoTab[0]));
 235         if (ctxt->nodeInfoTab == NULL) {
 236             htmlErrMemory(ctxt, NULL);
 237             return (0);
 238         }
 239     }
 240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 242     return (ctxt->nodeInfoNr++);
 243 }
 244
 245 /**
 246  * htmlNodeInfoPop:
 247  * @ctxt:  an HTML parser context
 248  *
 249  * Pops the top element name from the node info stack
 250  *
 251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 252  */
 253 static htmlParserNodeInfo *
 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 255 {
 256     if (ctxt->nodeInfoNr <= 0)
 257         return (NULL);
 258     ctxt->nodeInfoNr--;
 259     if (ctxt->nodeInfoNr < 0)
 260         return (NULL);
 261     if (ctxt->nodeInfoNr > 0)
 262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 263     else
 264         ctxt->nodeInfo = NULL;
 265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 266 }
 267
 268 /*
 269  * Macros for accessing the content. Those should be used only by the parser,
 270  * and not exported.
 271  *
 272  * Dirty macros, i.e. one need to make assumption on the context to use them
 273  *
 274  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 275  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 276  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 277  *           in UNICODE mode. This should be used internally by the parser
 278  *           only to compare to ASCII values otherwise it would break when
 279  *           running with UTF-8 encoding.
 280  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
 281  *           to compare on ASCII based substring.
 282  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 283  *           it should be used only to compare on ASCII based substring.
 284  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 285  *           strings without newlines within the parser.
 286  *
 287  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 288  *
 289  *   CURRENT Returns the current char value, with the full decoding of
 290  *           UTF-8 if we are using this mode. It returns an int.
 291  *   NEXT    Skip to the next character, this does the proper decoding
 292  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 293  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 294  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 295  */
 296
 297 #define UPPER (toupper(*ctxt->input->cur))
 298
 299 #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
 300
 301 #define NXT(val) ctxt->input->cur[(val)]
 302
 303 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 304
 305 #define CUR_PTR ctxt->input->cur
 306 #define BASE_PTR ctxt->input->base
 307
 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 309                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 310         xmlParserInputShrink(ctxt->input)
 311
 312 #define GROW if ((ctxt->progressive == 0) &&                            \
 313                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 314         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 315
 316 #define CURRENT ((int) (*ctxt->input->cur))
 317
 318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 319
 320 /* Imported from XML */
 321
 322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 323 #define CUR ((int) (*ctxt->input->cur))
 324 #define NEXT xmlNextChar(ctxt)
 325
 326 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 327
 328
 329 #define NEXTL(l) do {                                                   \
 330     if (*(ctxt->input->cur) == '\n') {                                  \
 331         ctxt->input->line++; ctxt->input->col = 1;                      \
 332     } else ctxt->input->col++;                                          \
 333     ctxt->token = 0; ctxt->input->cur += l;                             \
 334   } while (0)
 335
 336 /************
 337     \
 338     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 339     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 340  ************/
 341
 342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 344
 345 #define COPY_BUF(l,b,i,v)                                               \
 346     if (l == 1) b[i++] = (xmlChar) v;                                   \
 347     else i += xmlCopyChar(l,&b[i],v)
 348
 349 /**
 350  * htmlFindEncoding:
 351  * @the HTML parser context
 352  *
 353  * Ty to find and encoding in the current data available in the input
 354  * buffer this is needed to try to switch to the proper encoding when
 355  * one face a character error.
 356  * That's an heuristic, since it's operating outside of parsing it could
 357  * try to use a meta which had been commented out, that's the reason it
 358  * should only be used in case of error, not as a default.
 359  *
 360  * Returns an encoding string or NULL if not found, the string need to
 361  *   be freed
 362  */
 363 static xmlChar *
 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 365     const xmlChar *start, *cur, *end;
 366
 367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 369         (ctxt->input->buf->encoder != NULL))
 370         return(NULL);
 371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 372         return(NULL);
 373
 374     start = ctxt->input->cur;
 375     end = ctxt->input->end;
 376     /* we also expect the input buffer to be zero terminated */
 377     if (*end != 0)
 378         return(NULL);
 379
 380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 381     if (cur == NULL)
 382         return(NULL);
 383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 384     if (cur == NULL)
 385         return(NULL);
 386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 387     if (cur == NULL)
 388         return(NULL);
 389     cur += 8;
 390     start = cur;
 391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 392            ((*cur >= 'a') && (*cur <= 'z')) ||
 393            ((*cur >= '0') && (*cur <= '9')) ||
 394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 395            cur++;
 396     if (cur == start)
 397         return(NULL);
 398     return(xmlStrndup(start, cur - start));
 399 }
 400
 401 /**
 402  * htmlCurrentChar:
 403  * @ctxt:  the HTML parser context
 404  * @len:  pointer to the length of the char read
 405  *
 406  * The current char value, if using UTF-8 this may actually span multiple
 407  * bytes in the input buffer. Implement the end of line normalization:
 408  * 2.11 End-of-Line Handling
 409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 410  * char, then the encoding converter is plugged in automatically.
 411  *
 412  * Returns the current char value and its length
 413  */
 414
 415 static int
 416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 417     const unsigned char *cur;
 418     unsigned char c;
 419     unsigned int val;
 420
 421     if (ctxt->instate == XML_PARSER_EOF)
 422         return(0);
 423
 424     if (ctxt->token != 0) {
 425         *len = 0;
 426         return(ctxt->token);
 427     }
 428     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
 429         xmlChar * guess;
 430         xmlCharEncodingHandlerPtr handler;
 431
 432         /*
 433          * Assume it's a fixed length encoding (1) with
 434          * a compatible encoding for the ASCII set, since
 435          * HTML constructs only use < 128 chars
 436          */
 437         if ((int) *ctxt->input->cur < 0x80) {
 438             *len = 1;
 439             if ((*ctxt->input->cur == 0) &&
 440                 (ctxt->input->cur < ctxt->input->end)) {
 441                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 442                                 "Char 0x%X out of allowed range\n", 0);
 443                 return(' ');
 444             }
 445             return((int) *ctxt->input->cur);
 446         }
 447
 448         /*
 449          * Humm this is bad, do an automatic flow conversion
 450          */
 451         guess = htmlFindEncoding(ctxt);
 452         if (guess == NULL) {
 453             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 454         } else {
 455             if (ctxt->input->encoding != NULL)
 456                 xmlFree((xmlChar *) ctxt->input->encoding);
 457             ctxt->input->encoding = guess;
 458             handler = xmlFindCharEncodingHandler((const char *) guess);
 459             if (handler != NULL) {
 460                 /*
 461                  * Don't use UTF-8 encoder which isn't required and
 462                  * can produce invalid UTF-8.
 463                  */
 464                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
 465                     xmlSwitchToEncoding(ctxt, handler);
 466             } else {
 467                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 468                              "Unsupported encoding %s", guess, NULL);
 469             }
 470         }
 471         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 472     }
 473
 474     /*
 475      * We are supposed to handle UTF8, check it's valid
 476      * From rfc2044: encoding of the Unicode values on UTF-8:
 477      *
 478      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 479      * 0000 0000-0000 007F   0xxxxxxx
 480      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 481      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 482      *
 483      * Check for the 0x110000 limit too
 484      */
 485     cur = ctxt->input->cur;
 486     c = *cur;
 487     if (c & 0x80) {
 488         if ((c & 0x40) == 0)
 489             goto encoding_error;
 490         if (cur[1] == 0) {
 491             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 492             cur = ctxt->input->cur;
 493         }
 494         if ((cur[1] & 0xc0) != 0x80)
 495             goto encoding_error;
 496         if ((c & 0xe0) == 0xe0) {
 497
 498             if (cur[2] == 0) {
 499                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 500                 cur = ctxt->input->cur;
 501             }
 502             if ((cur[2] & 0xc0) != 0x80)
 503                 goto encoding_error;
 504             if ((c & 0xf0) == 0xf0) {
 505                 if (cur[3] == 0) {
 506                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 507                     cur = ctxt->input->cur;
 508                 }
 509                 if (((c & 0xf8) != 0xf0) ||
 510                     ((cur[3] & 0xc0) != 0x80))
 511                     goto encoding_error;
 512                 /* 4-byte code */
 513                 *len = 4;
 514                 val = (cur[0] & 0x7) << 18;
 515                 val |= (cur[1] & 0x3f) << 12;
 516                 val |= (cur[2] & 0x3f) << 6;
 517                 val |= cur[3] & 0x3f;
 518                 if (val < 0x10000)
 519                     goto encoding_error;
 520             } else {
 521               /* 3-byte code */
 522                 *len = 3;
 523                 val = (cur[0] & 0xf) << 12;
 524                 val |= (cur[1] & 0x3f) << 6;
 525                 val |= cur[2] & 0x3f;
 526                 if (val < 0x800)
 527                     goto encoding_error;
 528             }
 529         } else {
 530           /* 2-byte code */
 531             *len = 2;
 532             val = (cur[0] & 0x1f) << 6;
 533             val |= cur[1] & 0x3f;
 534             if (val < 0x80)
 535                 goto encoding_error;
 536         }
 537         if (!IS_CHAR(val)) {
 538             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 539                             "Char 0x%X out of allowed range\n", val);
 540         }
 541         return(val);
 542     } else {
 543         if ((*ctxt->input->cur == 0) &&
 544             (ctxt->input->cur < ctxt->input->end)) {
 545             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 546                             "Char 0x%X out of allowed range\n", 0);
 547             *len = 1;
 548             return(' ');
 549         }
 550         /* 1-byte code */
 551         *len = 1;
 552         return((int) *ctxt->input->cur);
 553     }
 554
 555 encoding_error:
 556     /*
 557      * If we detect an UTF8 error that probably mean that the
 558      * input encoding didn't get properly advertised in the
 559      * declaration header. Report the error and switch the encoding
 560      * to ISO-Latin-1 (if you don't like this policy, just declare the
 561      * encoding !)
 562      */
 563     {
 564         char buffer[150];
 565
 566         if (ctxt->input->end - ctxt->input->cur >= 4) {
 567             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 568                             ctxt->input->cur[0], ctxt->input->cur[1],
 569                             ctxt->input->cur[2], ctxt->input->cur[3]);
 570         } else {
 571             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 572         }
 573         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 574                      "Input is not proper UTF-8, indicate encoding !\n",
 575                      BAD_CAST buffer, NULL);
 576     }
 577
 578     /*
 579      * Don't switch encodings twice. Note that if there's an encoder, we
 580      * shouldn't receive invalid UTF-8 anyway.
 581      *
 582      * Note that if ctxt->input->buf == NULL, switching encodings is
 583      * impossible, see Gitlab issue #34.
 584      */
 585     if ((ctxt->input->buf != NULL) &&
 586         (ctxt->input->buf->encoder == NULL))
 587         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 588     *len = 1;
 589     return((int) *ctxt->input->cur);
 590 }
 591
 592 /**
 593  * htmlSkipBlankChars:
 594  * @ctxt:  the HTML parser context
 595  *
 596  * skip all blanks character found at that point in the input streams.
 597  *
 598  * Returns the number of space chars skipped
 599  */
 600
 601 static int
 602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 603     int res = 0;
 604
 605     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 606         if ((*ctxt->input->cur == 0) &&
 607             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 608                 xmlPopInput(ctxt);
 609         } else {
 610             if (*(ctxt->input->cur) == '\n') {
 611                 ctxt->input->line++; ctxt->input->col = 1;
 612             } else ctxt->input->col++;
 613             ctxt->input->cur++;
 614             if (*ctxt->input->cur == 0)
 615                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 616         }
 617         res++;
 618     }
 619     return(res);
 620 }
 621
 622
 623
 624 /************************************************************************
 625  *                                                                      *
 626  *      The list of HTML elements and their properties          *
 627  *                                                                      *
 628  ************************************************************************/
 629
 630 /*
 631  *  Start Tag: 1 means the start tag can be omitted
 632  *  End Tag:   1 means the end tag can be omitted
 633  *             2 means it's forbidden (empty elements)
 634  *             3 means the tag is stylistic and should be closed easily
 635  *  Depr:      this element is deprecated
 636  *  DTD:       1 means that this element is valid only in the Loose DTD
 637  *             2 means that this element is valid only in the Frameset DTD
 638  *
 639  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 640         , subElements , impliedsubelt , Attributes, userdata
 641  */
 642
 643 /* Definitions and a couple of vars for HTML Elements */
 644
 645 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 646 #define NB_FONTSTYLE 8
 647 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 648 #define NB_PHRASE 10
 649 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 650 #define NB_SPECIAL 16
 651 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 652 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 653 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 654 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 655 #define FORMCTRL "input", "select", "textarea", "label", "button"
 656 #define NB_FORMCTRL 5
 657 #define PCDATA
 658 #define NB_PCDATA 0
 659 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 660 #define NB_HEADING 6
 661 #define LIST "ul", "ol", "dir", "menu"
 662 #define NB_LIST 4
 663 #define MODIFIER
 664 #define NB_MODIFIER 0
 665 #define FLOW BLOCK,INLINE
 666 #define NB_FLOW NB_BLOCK + NB_INLINE
 667 #define EMPTY NULL
 668
 669
 670 static const char* const html_flow[] = { FLOW, NULL } ;
 671 static const char* const html_inline[] = { INLINE, NULL } ;
 672
 673 /* placeholders: elts with content but no subelements */
 674 static const char* const html_pcdata[] = { NULL } ;
 675 #define html_cdata html_pcdata
 676
 677
 678 /* ... and for HTML Attributes */
 679
 680 #define COREATTRS "id", "class", "style", "title"
 681 #define NB_COREATTRS 4
 682 #define I18N "lang", "dir"
 683 #define NB_I18N 2
 684 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 685 #define NB_EVENTS 9
 686 #define ATTRS COREATTRS,I18N,EVENTS
 687 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 688 #define CELLHALIGN "align", "char", "charoff"
 689 #define NB_CELLHALIGN 3
 690 #define CELLVALIGN "valign"
 691 #define NB_CELLVALIGN 1
 692
 693 static const char* const html_attrs[] = { ATTRS, NULL } ;
 694 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 695 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 696 static const char* const i18n_attrs[] = { I18N, NULL } ;
 697
 698
 699 /* Other declarations that should go inline ... */
 700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 701         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 702         "tabindex", "onfocus", "onblur", NULL } ;
 703 static const char* const target_attr[] = { "target", NULL } ;
 704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 705 static const char* const alt_attr[] = { "alt", NULL } ;
 706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 707 static const char* const href_attrs[] = { "href", NULL } ;
 708 static const char* const clear_attrs[] = { "clear", NULL } ;
 709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 710
 711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 713                 "archive", "alt", "name", "height", "width", "align",
 714                 "hspace", "vspace", NULL } ;
 715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 716         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 717 static const char* const basefont_attrs[] =
 718         { "id", "size", "color", "face", NULL } ;
 719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 722 static const char* const body_depr[] = { "background", "bgcolor", "text",
 723         "link", "vlink", "alink", NULL } ;
 724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 725         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 726
 727
 728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 729 static const char* const col_elt[] = { "col", NULL } ;
 730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 733 static const char* const compact_attr[] = { "compact", NULL } ;
 734 static const char* const label_attr[] = { "label", NULL } ;
 735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 745 static const char* const version_attr[] = { "version", NULL } ;
 746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
 750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
 751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
 752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
 753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
 754 static const char* const align_attr[] = { "align", NULL } ;
 755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
 756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 757 static const char* const name_attr[] = { "name", NULL } ;
 758 static const char* const action_attr[] = { "action", NULL } ;
 759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
 760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
 761 static const char* const content_attr[] = { "content", NULL } ;
 762 static const char* const type_attr[] = { "type", NULL } ;
 763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
 764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
 765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
 766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
 767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
 768 static const char* const option_elt[] = { "option", NULL } ;
 769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
 770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
 771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
 772 static const char* const width_attr[] = { "width", NULL } ;
 773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
 774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
 775 static const char* const language_attr[] = { "language", NULL } ;
 776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
 777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
 778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
 779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
 780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
 781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 782 static const char* const tr_elt[] = { "tr", NULL } ;
 783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 787 static const char* const tr_contents[] = { "th", "td", NULL } ;
 788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 789 static const char* const li_elt[] = { "li", NULL } ;
 790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 791 static const char* const dir_attr[] = { "dir", NULL} ;
 792
 793 #define DECL (const char**)
 794
 795 static const htmlElemDesc
 796 html40ElementTable[] = {
 797 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 798         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 799 },
 800 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 801         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 802 },
 803 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 804         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 805 },
 806 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 807         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 808 },
 809 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 810         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 811 },
 812 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 813         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 814 },
 815 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 816         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 817 },
 818 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 819         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 820 },
 821 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 822         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 823 },
 824 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 825         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 826 },
 827 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 828         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 829 },
 830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 831         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 832 },
 833 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 834         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 835 },
 836 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 837         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 838 },
 839 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 840         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 841 },
 842 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 843         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 844 },
 845 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 846         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 847 },
 848 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 849         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 850 },
 851 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 852         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 853 },
 854 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 855         EMPTY , NULL , DECL col_attrs , NULL, NULL
 856 },
 857 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 858         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 859 },
 860 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 861         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 862 },
 863 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 864         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 865 },
 866 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 867         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 868 },
 869 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 870         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 871 },
 872 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 873         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 874 },
 875 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 876         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
 877 },
 878 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 879         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 880 },
 881 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 882         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 883 },
 884 { "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
 885         EMPTY, NULL, DECL embed_attrs, NULL, NULL
 886 },
 887 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 888         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 889 },
 890 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 891         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 892 },
 893 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 894         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 895 },
 896 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 897         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 898 },
 899 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 900         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 901 },
 902 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 903         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 904 },
 905 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 906         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 907 },
 908 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 909         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 910 },
 911 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 912         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 913 },
 914 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 915         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 916 },
 917 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 918         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 919 },
 920 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 921         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 922 },
 923 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 924         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 925 },
 926 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 927         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 928 },
 929 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 930         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 931 },
 932 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 933         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 934 },
 935 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 936         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
 937 },
 938 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 939         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 940 },
 941 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 942         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 943 },
 944 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 945         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 946 },
 947 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 948         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 949 },
 950 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 951         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 952 },
 953 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 954         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 955 },
 956 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 957         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 958 },
 959 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 960         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 961 },
 962 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 963         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
 964 },
 965 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 966         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 967 },
 968 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 969         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 970 },
 971 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 972         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 973 },
 974 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 975         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 976 },
 977 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 978         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 979 },
 980 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 981         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 982 },
 983 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 984         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 985 },
 986 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 987         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 988 },
 989 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 990         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 991 },
 992 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 993         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
 994 },
 995 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 996         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 997 },
 998 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
 999         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
1008         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
1011         DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
1014         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
1026         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
1029         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
1032         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table",      0, 0, 0, 0, 0, 0, 0, "",
1035         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
1038         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
1041         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
1047         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
1050         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
1053         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
1056         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
1059         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074
/* One auto-close rule: opening newTag closes a currently open oldTag. */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The pairs MUST stay sorted by oldTag, then by newTag (plain strcmp
 * order): htmlCheckAutoClose() looks rules up with bsearch().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend, li, link, listing */
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" },
    { "menu", "form" }, { "menu", "ul" },
    /* ol, option */
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* tbody, td, tfoot, th, thead, title, tr, tt */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    /* u, ul */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" },
    { "ul", "ol" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1336
/*
 * NULL-terminated list of the HTML elements that are not supposed to
 * hold CDATA content directly and for which a p element gets implied.
 *
 * TODO: extend the list from the implied-paragraph rules of the
 *       HTML SGML DTD.
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349
/*
 * HTML attributes whose content is of type %Script;.
 * The array has no NULL terminator.
 * NOTE: before adding a name, check htmlIsScriptAttribute(): it
 *       assumes that every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1375
/*
 * Priorities the HTML parser relies on when it meets broken pages with
 * stray end tags: an end tag may only close open elements whose
 * priority is lower than or equal to its own.
 */

/* Associates an element name with its end tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry with a NULL name terminates the table and carries the
 * priority used for every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1403
1404 /************************************************************************
1405  *                                                                      *
1406  *      functions to handle HTML specific data                  *
1407  *                                                                      *
1408  ************************************************************************/
1409
/**
 * htmlInitAutoClose:
 *
 * Does nothing: the function body is empty.
 */
void
htmlInitAutoClose(void)
{
}
1418
1419 static int  __cdecl
1420 htmlCompareTags(const void *key, const void *member) {
1421     const xmlChar *tag = (const xmlChar *) key;
1422     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423
1424     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426
1427 /**
1428  * htmlTagLookup:
1429  * @tag:  The tag name in lowercase
1430  *
1431  * Lookup the HTML tag in the ElementTable
1432  *
1433  * Returns the related htmlElemDescPtr or NULL if not found.
1434  */
1435 const htmlElemDesc *
1436 htmlTagLookup(const xmlChar *tag) {
1437     if (tag == NULL)
1438         return(NULL);
1439
1440     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442                 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444
1445 /**
1446  * htmlGetEndPriority:
1447  * @name: The name of the element to look up the priority for.
1448  *
1449  * Return value: The "endtag" priority.
1450  **/
1451 static int
1452 htmlGetEndPriority (const xmlChar *name) {
1453     int i = 0;
1454
1455     while ((htmlEndPriority[i].name != NULL) &&
1456            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457         i++;
1458
1459     return(htmlEndPriority[i].priority);
1460 }
1461
1462
1463 static int  __cdecl
1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467     int ret;
1468
1469     ret = strcmp(key->oldTag, entry->oldTag);
1470     if (ret == 0)
1471         ret = strcmp(key->newTag, entry->newTag);
1472
1473     return(ret);
1474 }
1475
1476 /**
1477  * htmlCheckAutoClose:
1478  * @newtag:  The new tag name
1479  * @oldtag:  The old tag name
1480  *
1481  * Checks whether the new tag is one of the registered valid tags for
1482  * closing old.
1483  *
1484  * Returns 0 if no, 1 if yes.
1485  */
1486 static int
1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489     htmlStartCloseEntry key;
1490     void *res;
1491
1492     key.oldTag = (const char *) oldtag;
1493     key.newTag = (const char *) newtag;
1494     res = bsearch(&key, htmlStartClose,
1495             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497     return(res != NULL);
1498 }
1499
1500 /**
1501  * htmlAutoCloseOnClose:
1502  * @ctxt:  an HTML parser context
1503  * @newtag:  The new tag name
1504  * @force:  force the tag closure
1505  *
1506  * The HTML DTD allows an ending tag to implicitly close other tags.
1507  */
1508 static void
1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511     const htmlElemDesc *info;
1512     int i, priority;
1513
1514     priority = htmlGetEndPriority(newtag);
1515
1516     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517
1518         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519             break;
1520         /*
1521          * A misplaced endtag can only close elements with lower
1522          * or equal priority, so if we find an element with higher
1523          * priority before we find an element with
1524          * matching name, we just ignore this endtag
1525          */
1526         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527             return;
1528     }
1529     if (i < 0)
1530         return;
1531
1532     while (!xmlStrEqual(newtag, ctxt->name)) {
1533         info = htmlTagLookup(ctxt->name);
1534         if ((info != NULL) && (info->endTag == 3)) {
1535             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536                          "Opening and ending tag mismatch: %s and %s\n",
1537                          newtag, ctxt->name);
1538         }
1539         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541         htmlnamePop(ctxt);
1542     }
1543 }
1544
1545 /**
1546  * htmlAutoCloseOnEnd:
1547  * @ctxt:  an HTML parser context
1548  *
1549  * Close all remaining tags at the end of the stream
1550  */
1551 static void
1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554     int i;
1555
1556     if (ctxt->nameNr == 0)
1557         return;
1558     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561         htmlnamePop(ctxt);
1562     }
1563 }
1564
1565 /**
1566  * htmlAutoClose:
1567  * @ctxt:  an HTML parser context
1568  * @newtag:  The new tag name or NULL
1569  *
1570  * The HTML DTD allows a tag to implicitly close other tags.
1571  * The list is kept in htmlStartClose array. This function is
1572  * called when a new tag has been detected and generates the
1573  * appropriates closes if possible/needed.
1574  * If newtag is NULL this mean we are at the end of the resource
1575  * and we should check
1576  */
1577 static void
1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580     while ((newtag != NULL) && (ctxt->name != NULL) &&
1581            (htmlCheckAutoClose(newtag, ctxt->name))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584         htmlnamePop(ctxt);
1585     }
1586     if (newtag == NULL) {
1587         htmlAutoCloseOnEnd(ctxt);
1588         return;
1589     }
1590     while ((newtag == NULL) && (ctxt->name != NULL) &&
1591            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596         htmlnamePop(ctxt);
1597     }
1598 }
1599
1600 /**
1601  * htmlAutoCloseTag:
1602  * @doc:  the HTML document
1603  * @name:  The tag name
1604  * @elem:  the HTML element
1605  *
1606  * The HTML DTD allows a tag to implicitly close other tags.
1607  * The list is kept in htmlStartClose array. This function checks
1608  * if the element or one of it's children would autoclose the
1609  * given tag.
1610  *
1611  * Returns 1 if autoclose, 0 otherwise
1612  */
1613 int
1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615     htmlNodePtr child;
1616
1617     if (elem == NULL) return(1);
1618     if (xmlStrEqual(name, elem->name)) return(0);
1619     if (htmlCheckAutoClose(elem->name, name)) return(1);
1620     child = elem->children;
1621     while (child != NULL) {
1622         if (htmlAutoCloseTag(doc, name, child)) return(1);
1623         child = child->next;
1624     }
1625     return(0);
1626 }
1627
1628 /**
1629  * htmlIsAutoClosed:
1630  * @doc:  the HTML document
1631  * @elem:  the HTML element
1632  *
1633  * The HTML DTD allows a tag to implicitly close other tags.
1634  * The list is kept in htmlStartClose array. This function checks
1635  * if a tag is autoclosed by one of it's child
1636  *
1637  * Returns 1 if autoclosed, 0 otherwise
1638  */
1639 int
1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641     htmlNodePtr child;
1642
1643     if (elem == NULL) return(1);
1644     child = elem->children;
1645     while (child != NULL) {
1646         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647         child = child->next;
1648     }
1649     return(0);
1650 }
1651
1652 /**
1653  * htmlCheckImplied:
1654  * @ctxt:  an HTML parser context
1655  * @newtag:  The new tag name
1656  *
1657  * The HTML DTD allows a tag to exists only implicitly
1658  * called when a new tag has been detected and generates the
1659  * appropriates implicit tags if missing
1660  */
1661 static void
1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663     int i;
1664
1665     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666         return;
1667     if (!htmlOmittedDefaultValue)
1668         return;
1669     if (xmlStrEqual(newtag, BAD_CAST"html"))
1670         return;
1671     if (ctxt->nameNr <= 0) {
1672         htmlnamePush(ctxt, BAD_CAST"html");
1673         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675     }
1676     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677         return;
1678     if ((ctxt->nameNr <= 1) &&
1679         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685         if (ctxt->html >= 3) {
1686             /* we already saw or generated an <head> before */
1687             return;
1688         }
1689         /*
1690          * dropped OBJECT ... i you put it first BODY will be
1691          * assumed !
1692          */
1693         htmlnamePush(ctxt, BAD_CAST"head");
1694         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699         if (ctxt->html >= 10) {
1700             /* we already saw or generated a <body> before */
1701             return;
1702         }
1703         for (i = 0;i < ctxt->nameNr;i++) {
1704             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705                 return;
1706             }
1707             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708                 return;
1709             }
1710         }
1711
1712         htmlnamePush(ctxt, BAD_CAST"body");
1713         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715     }
1716 }
1717
1718 /**
1719  * htmlCheckParagraph
1720  * @ctxt:  an HTML parser context
1721  *
1722  * Check whether a p element need to be implied before inserting
1723  * characters in the current element.
1724  *
1725  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726  *         in case of error.
1727  */
1728
1729 static int
1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731     const xmlChar *tag;
1732     int i;
1733
1734     if (ctxt == NULL)
1735         return(-1);
1736     tag = ctxt->name;
1737     if (tag == NULL) {
1738         htmlAutoClose(ctxt, BAD_CAST"p");
1739         htmlCheckImplied(ctxt, BAD_CAST"p");
1740         htmlnamePush(ctxt, BAD_CAST"p");
1741         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743         return(1);
1744     }
1745     if (!htmlOmittedDefaultValue)
1746         return(0);
1747     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749             htmlAutoClose(ctxt, BAD_CAST"p");
1750             htmlCheckImplied(ctxt, BAD_CAST"p");
1751             htmlnamePush(ctxt, BAD_CAST"p");
1752             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754             return(1);
1755         }
1756     }
1757     return(0);
1758 }
1759
1760 /**
1761  * htmlIsScriptAttribute:
1762  * @name:  an attribute name
1763  *
1764  * Check if an attribute is of content type Script
1765  *
1766  * Returns 1 is the attribute is a script 0 otherwise
1767  */
1768 int
1769 htmlIsScriptAttribute(const xmlChar *name) {
1770     unsigned int i;
1771
1772     if (name == NULL)
1773       return(0);
1774     /*
1775      * all script attributes start with 'on'
1776      */
1777     if ((name[0] != 'o') || (name[1] != 'n'))
1778       return(0);
1779     for (i = 0;
1780          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781          i++) {
1782         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783             return(1);
1784     }
1785     return(0);
1786 }
1787
1788 /************************************************************************
1789  *                                                                      *
1790  *      The list of HTML predefined entities                    *
1791  *                                                                      *
1792  ************************************************************************/
1793
1794
/*
 * html40EntitiesTable:
 *
 * The predefined HTML entities. Each entry lists, in order, the numeric
 * value of the entity, its name, and a descriptive string.
 *
 * The entries MUST be kept sorted by strictly increasing value:
 * htmlEntityValueLookup() relies on this ordering. htmlEntityLookup()
 * searches by name and does not depend on the order.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * The 4 absolute ones, plus apostrophe.
 */
{ 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38,   "amp",  "ampersand, U+0026 ISOnum" },
{ 39,   "apos", "single quote" },
{ 60,   "lt",   "less-than sign, U+003C ISOnum" },
{ 62,   "gt",   "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,  "not",  "not sign, U+00AC ISOnum" },
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,  "divide","division sign, U+00F7 ISOnum" },
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references.
 */
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,  "tilde","small tilde, U+02DC ISOdia" },

{ 913,  "Alpha","greek capital letter alpha, U+0391" },
{ 914,  "Beta", "greek capital letter beta, U+0392" },
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,  "Iota", "greek capital letter iota, U+0399" },
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,  "Mu",   "greek capital letter mu, U+039C" },
{ 925,  "Nu",   "greek capital letter nu, U+039D" },
{ 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
{ 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
{ 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
{ 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
{ 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni",   "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang",  "angle, U+2220 ISOamso" },
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or",   "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
{ 8747, "int",  "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne",   "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
2077
2078 /************************************************************************
2079  *                                                                      *
2080  *              Commodity functions to handle entities                  *
2081  *                                                                      *
2082  ************************************************************************/
2083
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer
 *
 * Macro used to grow the current buffer: doubles the companion
 * variable <buffer>_size and reallocates @buffer to that many xmlChar.
 *
 * The expansion site must have in scope:
 *   - a variable named <buffer>_size (built by token pasting);
 *   - a variable named ctxt, passed to htmlErrMemory().
 *
 * On allocation failure the macro reports the error, frees @buffer and
 * executes return(NULL) in the ENCLOSING function, so it can only be
 * used in functions returning a pointer. Note that <buffer>_size has
 * already been doubled when the failure is detected.
 *
 * The body is a plain { } block (not do { } while (0)).
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                          \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
2098
2099 /**
2100  * htmlEntityLookup:
2101  * @name: the entity name
2102  *
2103  * Lookup the given entity in EntitiesTable
2104  *
2105  * TODO: the linear scan is really ugly, an hash table is really needed.
2106  *
2107  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108  */
2109 const htmlEntityDesc *
2110 htmlEntityLookup(const xmlChar *name) {
2111     unsigned int i;
2112
2113     for (i = 0;i < (sizeof(html40EntitiesTable)/
2114                     sizeof(html40EntitiesTable[0]));i++) {
2115         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117         }
2118     }
2119     return(NULL);
2120 }
2121
2122 /**
2123  * htmlEntityValueLookup:
2124  * @value: the entity's unicode value
2125  *
2126  * Lookup the given entity in EntitiesTable
2127  *
2128  * TODO: the linear scan is really ugly, an hash table is really needed.
2129  *
2130  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131  */
2132 const htmlEntityDesc *
2133 htmlEntityValueLookup(unsigned int value) {
2134     unsigned int i;
2135
2136     for (i = 0;i < (sizeof(html40EntitiesTable)/
2137                     sizeof(html40EntitiesTable[0]));i++) {
2138         if (html40EntitiesTable[i].value >= value) {
2139             if (html40EntitiesTable[i].value > value)
2140                 break;
2141             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142         }
2143     }
2144     return(NULL);
2145 }
2146
2147 /**
2148  * UTF8ToHtml:
2149  * @out:  a pointer to an array of bytes to store the result
2150  * @outlen:  the length of @out
2151  * @in:  a pointer to an array of UTF-8 chars
2152  * @inlen:  the length of @in
2153  *
2154  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155  * plus HTML entities block of chars out.
2156  *
2157  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158  * The value of @inlen after return is the number of octets consumed
2159  *     as the return value is positive, else unpredictable.
2160  * The value of @outlen after return is the number of octets consumed.
2161  */
2162 int
2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164               const unsigned char* in, int *inlen) {
2165     const unsigned char* processed = in;
2166     const unsigned char* outend;
2167     const unsigned char* outstart = out;
2168     const unsigned char* instart = in;
2169     const unsigned char* inend;
2170     unsigned int c, d;
2171     int trailing;
2172
2173     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174     if (in == NULL) {
2175         /*
2176          * initialization nothing to do
2177          */
2178         *outlen = 0;
2179         *inlen = 0;
2180         return(0);
2181     }
2182     inend = in + (*inlen);
2183     outend = out + (*outlen);
2184     while (in < inend) {
2185         d = *in++;
2186         if      (d < 0x80)  { c= d; trailing= 0; }
2187         else if (d < 0xC0) {
2188             /* trailing byte in leading position */
2189             *outlen = out - outstart;
2190             *inlen = processed - instart;
2191             return(-2);
2192         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2193         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2194         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2195         else {
2196             /* no chance for this in Ascii */
2197             *outlen = out - outstart;
2198             *inlen = processed - instart;
2199             return(-2);
2200         }
2201
2202         if (inend - in < trailing) {
2203             break;
2204         }
2205
2206         for ( ; trailing; trailing--) {
2207             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208                 break;
2209             c <<= 6;
2210             c |= d & 0x3F;
2211         }
2212
2213         /* assertion: c is a single UTF-4 value */
2214         if (c < 0x80) {
2215             if (out + 1 >= outend)
2216                 break;
2217             *out++ = c;
2218         } else {
2219             int len;
2220             const htmlEntityDesc * ent;
2221             const char *cp;
2222             char nbuf[16];
2223
2224             /*
2225              * Try to lookup a predefined HTML entity for it
2226              */
2227
2228             ent = htmlEntityValueLookup(c);
2229             if (ent == NULL) {
2230               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231               cp = nbuf;
2232             }
2233             else
2234               cp = ent->name;
2235             len = strlen(cp);
2236             if (out + 2 + len >= outend)
2237                 break;
2238             *out++ = '&';
2239             memcpy(out, cp, len);
2240             out += len;
2241             *out++ = ';';
2242         }
2243         processed = in;
2244     }
2245     *outlen = out - outstart;
2246     *inlen = processed - instart;
2247     return(0);
2248 }
2249
2250 /**
2251  * htmlEncodeEntities:
2252  * @out:  a pointer to an array of bytes to store the result
2253  * @outlen:  the length of @out
2254  * @in:  a pointer to an array of UTF-8 chars
2255  * @inlen:  the length of @in
2256  * @quoteChar: the quote character to escape (' or ") or zero.
2257  *
2258  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259  * plus HTML entities block of chars out.
2260  *
2261  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262  * The value of @inlen after return is the number of octets consumed
2263  *     as the return value is positive, else unpredictable.
2264  * The value of @outlen after return is the number of octets consumed.
2265  */
2266 int
2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268                    const unsigned char* in, int *inlen, int quoteChar) {
2269     const unsigned char* processed = in;
2270     const unsigned char* outend;
2271     const unsigned char* outstart = out;
2272     const unsigned char* instart = in;
2273     const unsigned char* inend;
2274     unsigned int c, d;
2275     int trailing;
2276
2277     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278         return(-1);
2279     outend = out + (*outlen);
2280     inend = in + (*inlen);
2281     while (in < inend) {
2282         d = *in++;
2283         if      (d < 0x80)  { c= d; trailing= 0; }
2284         else if (d < 0xC0) {
2285             /* trailing byte in leading position */
2286             *outlen = out - outstart;
2287             *inlen = processed - instart;
2288             return(-2);
2289         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2290         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2291         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2292         else {
2293             /* no chance for this in Ascii */
2294             *outlen = out - outstart;
2295             *inlen = processed - instart;
2296             return(-2);
2297         }
2298
2299         if (inend - in < trailing)
2300             break;
2301
2302         while (trailing--) {
2303             if (((d= *in++) & 0xC0) != 0x80) {
2304                 *outlen = out - outstart;
2305                 *inlen = processed - instart;
2306                 return(-2);
2307             }
2308             c <<= 6;
2309             c |= d & 0x3F;
2310         }
2311
2312         /* assertion: c is a single UTF-4 value */
2313         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314             (c != '&') && (c != '<') && (c != '>')) {
2315             if (out >= outend)
2316                 break;
2317             *out++ = c;
2318         } else {
2319             const htmlEntityDesc * ent;
2320             const char *cp;
2321             char nbuf[16];
2322             int len;
2323
2324             /*
2325              * Try to lookup a predefined HTML entity for it
2326              */
2327             ent = htmlEntityValueLookup(c);
2328             if (ent == NULL) {
2329                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330                 cp = nbuf;
2331             }
2332             else
2333                 cp = ent->name;
2334             len = strlen(cp);
2335             if (out + 2 + len > outend)
2336                 break;
2337             *out++ = '&';
2338             memcpy(out, cp, len);
2339             out += len;
2340             *out++ = ';';
2341         }
2342         processed = in;
2343     }
2344     *outlen = out - outstart;
2345     *inlen = processed - instart;
2346     return(0);
2347 }
2348
2349 /************************************************************************
2350  *                                                                      *
2351  *              Commodity functions to handle streams                   *
2352  *                                                                      *
2353  ************************************************************************/
2354
2355 #ifdef LIBXML_PUSH_ENABLED
2356 /**
2357  * htmlNewInputStream:
2358  * @ctxt:  an HTML parser context
2359  *
2360  * Create a new input stream structure
2361  * Returns the new input stream or NULL
2362  */
2363 static htmlParserInputPtr
2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365     htmlParserInputPtr input;
2366
2367     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368     if (input == NULL) {
2369         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370         return(NULL);
2371     }
2372     memset(input, 0, sizeof(htmlParserInput));
2373     input->filename = NULL;
2374     input->directory = NULL;
2375     input->base = NULL;
2376     input->cur = NULL;
2377     input->buf = NULL;
2378     input->line = 1;
2379     input->col = 1;
2380     input->buf = NULL;
2381     input->free = NULL;
2382     input->version = NULL;
2383     input->consumed = 0;
2384     input->length = 0;
2385     return(input);
2386 }
2387 #endif
2388
2389
2390 /************************************************************************
2391  *                                                                      *
2392  *              Commodity functions, cleanup needed ?                   *
2393  *                                                                      *
2394  ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is only ever read, so both the strings and the pointers
 * to them are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2409
2410 /**
2411  * areBlanks:
2412  * @ctxt:  an HTML parser context
2413  * @str:  a xmlChar *
2414  * @len:  the size of @str
2415  *
2416  * Is this a sequence of blank chars that one can ignore ?
2417  *
2418  * Returns 1 if ignorable 0 otherwise.
2419  */
2420
2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422     unsigned int i;
2423     int j;
2424     xmlNodePtr lastChild;
2425     xmlDtdPtr dtd;
2426
2427     for (j = 0;j < len;j++)
2428         if (!(IS_BLANK_CH(str[j]))) return(0);
2429
2430     if (CUR == 0) return(1);
2431     if (CUR != '<') return(0);
2432     if (ctxt->name == NULL)
2433         return(1);
2434     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435         return(1);
2436     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437         return(1);
2438
2439     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441         dtd = xmlGetIntSubset(ctxt->myDoc);
2442         if (dtd != NULL && dtd->ExternalID != NULL) {
2443             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445                 return(1);
2446         }
2447     }
2448
2449     if (ctxt->node == NULL) return(0);
2450     lastChild = xmlGetLastChild(ctxt->node);
2451     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452         lastChild = lastChild->prev;
2453     if (lastChild == NULL) {
2454         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455             (ctxt->node->content != NULL)) return(0);
2456         /* keep ws in constructs like ...<b> </b>...
2457            for all tags "b" allowing PCDATA */
2458         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460                 return(0);
2461             }
2462         }
2463     } else if (xmlNodeIsText(lastChild)) {
2464         return(0);
2465     } else {
2466         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467            for all tags "p" allowing PCDATA */
2468         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470                 return(0);
2471             }
2472         }
2473     }
2474     return(1);
2475 }
2476
2477 /**
2478  * htmlNewDocNoDtD:
2479  * @URI:  URI for the dtd, or NULL
2480  * @ExternalID:  the external ID of the DTD, or NULL
2481  *
2482  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483  * are NULL
2484  *
2485  * Returns a new document, do not initialize the DTD if not provided
2486  */
2487 htmlDocPtr
2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489     xmlDocPtr cur;
2490
2491     /*
2492      * Allocate a new document and fill the fields.
2493      */
2494     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495     if (cur == NULL) {
2496         htmlErrMemory(NULL, "HTML document creation failed\n");
2497         return(NULL);
2498     }
2499     memset(cur, 0, sizeof(xmlDoc));
2500
2501     cur->type = XML_HTML_DOCUMENT_NODE;
2502     cur->version = NULL;
2503     cur->intSubset = NULL;
2504     cur->doc = cur;
2505     cur->name = NULL;
2506     cur->children = NULL;
2507     cur->extSubset = NULL;
2508     cur->oldNs = NULL;
2509     cur->encoding = NULL;
2510     cur->standalone = 1;
2511     cur->compression = 0;
2512     cur->ids = NULL;
2513     cur->refs = NULL;
2514     cur->_private = NULL;
2515     cur->charset = XML_CHAR_ENCODING_UTF8;
2516     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517     if ((ExternalID != NULL) ||
2518         (URI != NULL))
2519         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520     return(cur);
2521 }
2522
2523 /**
2524  * htmlNewDoc:
2525  * @URI:  URI for the dtd, or NULL
2526  * @ExternalID:  the external ID of the DTD, or NULL
2527  *
2528  * Creates a new HTML document
2529  *
2530  * Returns a new document
2531  */
2532 htmlDocPtr
2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2534     if ((URI == NULL) && (ExternalID == NULL))
2535         return(htmlNewDocNoDtD(
2536                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2537                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2538
2539     return(htmlNewDocNoDtD(URI, ExternalID));
2540 }
2541
2542
2543 /************************************************************************
2544  *                                                                      *
2545  *                      The parser itself                               *
2546  *      Relates to http://www.w3.org/TR/html40                          *
2547  *                                                                      *
2548  ************************************************************************/
2549
2550 /************************************************************************
2551  *                                                                      *
2552  *                      The parser itself                               *
2553  *                                                                      *
2554  ************************************************************************/
2555
2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2557
2558 /**
2559  * htmlParseHTMLName:
2560  * @ctxt:  an HTML parser context
2561  *
2562  * parse an HTML tag or attribute name, note that we convert it to lowercase
2563  * since HTML names are not case-sensitive.
2564  *
2565  * Returns the Tag Name parsed or NULL
2566  */
2567
2568 static const xmlChar *
2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2570     int i = 0;
2571     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2572
2573     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2574         (CUR != ':') && (CUR != '.')) return(NULL);
2575
2576     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2577            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2578            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2579            (CUR == '.'))) {
2580         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2581         else loc[i] = CUR;
2582         i++;
2583
2584         NEXT;
2585     }
2586
2587     return(xmlDictLookup(ctxt->dict, loc, i));
2588 }
2589
2590
2591 /**
2592  * htmlParseHTMLName_nonInvasive:
2593  * @ctxt:  an HTML parser context
2594  *
2595  * parse an HTML tag or attribute name, note that we convert it to lowercase
2596  * since HTML names are not case-sensitive, this doesn't consume the data
2597  * from the stream, it's a look-ahead
2598  *
2599  * Returns the Tag Name parsed or NULL
2600  */
2601
2602 static const xmlChar *
2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2604     int i = 0;
2605     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2606
2607     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2608         (NXT(1) != ':')) return(NULL);
2609
2610     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2611            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2612            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2613         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2614         else loc[i] = NXT(1+i);
2615         i++;
2616     }
2617
2618     return(xmlDictLookup(ctxt->dict, loc, i));
2619 }
2620
2621
2622 /**
2623  * htmlParseName:
2624  * @ctxt:  an HTML parser context
2625  *
2626  * parse an HTML name, this routine is case sensitive.
2627  *
2628  * Returns the Name parsed or NULL
2629  */
2630
2631 static const xmlChar *
2632 htmlParseName(htmlParserCtxtPtr ctxt) {
2633     const xmlChar *in;
2634     const xmlChar *ret;
2635     int count = 0;
2636
2637     GROW;
2638
2639     /*
2640      * Accelerator for simple ASCII names
2641      */
2642     in = ctxt->input->cur;
2643     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2644         ((*in >= 0x41) && (*in <= 0x5A)) ||
2645         (*in == '_') || (*in == ':')) {
2646         in++;
2647         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2648                ((*in >= 0x41) && (*in <= 0x5A)) ||
2649                ((*in >= 0x30) && (*in <= 0x39)) ||
2650                (*in == '_') || (*in == '-') ||
2651                (*in == ':') || (*in == '.'))
2652             in++;
2653
2654         if (in == ctxt->input->end)
2655             return(NULL);
2656
2657         if ((*in > 0) && (*in < 0x80)) {
2658             count = in - ctxt->input->cur;
2659             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2660             ctxt->input->cur = in;
2661             ctxt->input->col += count;
2662             return(ret);
2663         }
2664     }
2665     return(htmlParseNameComplex(ctxt));
2666 }
2667
2668 static const xmlChar *
2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2670     int len = 0, l;
2671     int c;
2672     int count = 0;
2673     const xmlChar *base = ctxt->input->base;
2674
2675     /*
2676      * Handler for more complex cases
2677      */
2678     GROW;
2679     c = CUR_CHAR(l);
2680     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2681         (!IS_LETTER(c) && (c != '_') &&
2682          (c != ':'))) {
2683         return(NULL);
2684     }
2685
2686     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2687            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2688             (c == '.') || (c == '-') ||
2689             (c == '_') || (c == ':') ||
2690             (IS_COMBINING(c)) ||
2691             (IS_EXTENDER(c)))) {
2692         if (count++ > 100) {
2693             count = 0;
2694             GROW;
2695         }
2696         len += l;
2697         NEXTL(l);
2698         c = CUR_CHAR(l);
2699         if (ctxt->input->base != base) {
2700             /*
2701              * We changed encoding from an unknown encoding
2702              * Input buffer changed location, so we better start again
2703              */
2704             return(htmlParseNameComplex(ctxt));
2705         }
2706     }
2707
2708     if (ctxt->input->cur - ctxt->input->base < len) {
2709         /* Sanity check */
2710         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2711                      "unexpected change of input buffer", NULL, NULL);
2712         return (NULL);
2713     }
2714
2715     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2716 }
2717
2718
2719 /**
2720  * htmlParseHTMLAttribute:
2721  * @ctxt:  an HTML parser context
2722  * @stop:  a char stop value
2723  *
2724  * parse an HTML attribute value till the stop (quote), if
2725  * stop is 0 then it stops at the first space
2726  *
2727  * Returns the attribute parsed or NULL
2728  */
2729
2730 static xmlChar *
2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2732     xmlChar *buffer = NULL;
2733     int buffer_size = 0;
2734     xmlChar *out = NULL;
2735     const xmlChar *name = NULL;
2736     const xmlChar *cur = NULL;
2737     const htmlEntityDesc * ent;
2738
2739     /*
2740      * allocate a translation buffer.
2741      */
2742     buffer_size = HTML_PARSER_BUFFER_SIZE;
2743     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2744     if (buffer == NULL) {
2745         htmlErrMemory(ctxt, "buffer allocation failed\n");
2746         return(NULL);
2747     }
2748     out = buffer;
2749
2750     /*
2751      * Ok loop until we reach one of the ending chars
2752      */
2753     while ((CUR != 0) && (CUR != stop)) {
2754         if ((stop == 0) && (CUR == '>')) break;
2755         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2756         if (CUR == '&') {
2757             if (NXT(1) == '#') {
2758                 unsigned int c;
2759                 int bits;
2760
2761                 c = htmlParseCharRef(ctxt);
2762                 if      (c <    0x80)
2763                         { *out++  = c;                bits= -6; }
2764                 else if (c <   0x800)
2765                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2766                 else if (c < 0x10000)
2767                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2768                 else
2769                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2770
2771                 for ( ; bits >= 0; bits-= 6) {
2772                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2773                 }
2774
2775                 if (out - buffer > buffer_size - 100) {
2776                         int indx = out - buffer;
2777
2778                         growBuffer(buffer);
2779                         out = &buffer[indx];
2780                 }
2781             } else {
2782                 ent = htmlParseEntityRef(ctxt, &name);
2783                 if (name == NULL) {
2784                     *out++ = '&';
2785                     if (out - buffer > buffer_size - 100) {
2786                         int indx = out - buffer;
2787
2788                         growBuffer(buffer);
2789                         out = &buffer[indx];
2790                     }
2791                 } else if (ent == NULL) {
2792                     *out++ = '&';
2793                     cur = name;
2794                     while (*cur != 0) {
2795                         if (out - buffer > buffer_size - 100) {
2796                             int indx = out - buffer;
2797
2798                             growBuffer(buffer);
2799                             out = &buffer[indx];
2800                         }
2801                         *out++ = *cur++;
2802                     }
2803                 } else {
2804                     unsigned int c;
2805                     int bits;
2806
2807                     if (out - buffer > buffer_size - 100) {
2808                         int indx = out - buffer;
2809
2810                         growBuffer(buffer);
2811                         out = &buffer[indx];
2812                     }
2813                     c = ent->value;
2814                     if      (c <    0x80)
2815                         { *out++  = c;                bits= -6; }
2816                     else if (c <   0x800)
2817                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2818                     else if (c < 0x10000)
2819                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2820                     else
2821                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2822
2823                     for ( ; bits >= 0; bits-= 6) {
2824                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2825                     }
2826                 }
2827             }
2828         } else {
2829             unsigned int c;
2830             int bits, l;
2831
2832             if (out - buffer > buffer_size - 100) {
2833                 int indx = out - buffer;
2834
2835                 growBuffer(buffer);
2836                 out = &buffer[indx];
2837             }
2838             c = CUR_CHAR(l);
2839             if      (c <    0x80)
2840                     { *out++  = c;                bits= -6; }
2841             else if (c <   0x800)
2842                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2843             else if (c < 0x10000)
2844                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2845             else
2846                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2847
2848             for ( ; bits >= 0; bits-= 6) {
2849                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2850             }
2851             NEXT;
2852         }
2853     }
2854     *out = 0;
2855     return(buffer);
2856 }
2857
2858 /**
2859  * htmlParseEntityRef:
2860  * @ctxt:  an HTML parser context
2861  * @str:  location to store the entity name
2862  *
2863  * parse an HTML ENTITY references
2864  *
2865  * [68] EntityRef ::= '&' Name ';'
2866  *
2867  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2868  *         if non-NULL *str will have to be freed by the caller.
2869  */
2870 const htmlEntityDesc *
2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2872     const xmlChar *name;
2873     const htmlEntityDesc * ent = NULL;
2874
2875     if (str != NULL) *str = NULL;
2876     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2877
2878     if (CUR == '&') {
2879         NEXT;
2880         name = htmlParseName(ctxt);
2881         if (name == NULL) {
2882             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2883                          "htmlParseEntityRef: no name\n", NULL, NULL);
2884         } else {
2885             GROW;
2886             if (CUR == ';') {
2887                 if (str != NULL)
2888                     *str = name;
2889
2890                 /*
2891                  * Lookup the entity in the table.
2892                  */
2893                 ent = htmlEntityLookup(name);
2894                 if (ent != NULL) /* OK that's ugly !!! */
2895                     NEXT;
2896             } else {
2897                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2898                              "htmlParseEntityRef: expecting ';'\n",
2899                              NULL, NULL);
2900                 if (str != NULL)
2901                     *str = name;
2902             }
2903         }
2904     }
2905     return(ent);
2906 }
2907
2908 /**
2909  * htmlParseAttValue:
2910  * @ctxt:  an HTML parser context
2911  *
2912  * parse a value for an attribute
2913  * Note: the parser won't do substitution of entities here, this
2914  * will be handled later in xmlStringGetNodeList, unless it was
2915  * asked for ctxt->replaceEntities != 0
2916  *
2917  * Returns the AttValue parsed or NULL.
2918  */
2919
2920 static xmlChar *
2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2922     xmlChar *ret = NULL;
2923
2924     if (CUR == '"') {
2925         NEXT;
2926         ret = htmlParseHTMLAttribute(ctxt, '"');
2927         if (CUR != '"') {
2928             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2929                          "AttValue: \" expected\n", NULL, NULL);
2930         } else
2931             NEXT;
2932     } else if (CUR == '\'') {
2933         NEXT;
2934         ret = htmlParseHTMLAttribute(ctxt, '\'');
2935         if (CUR != '\'') {
2936             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2937                          "AttValue: ' expected\n", NULL, NULL);
2938         } else
2939             NEXT;
2940     } else {
2941         /*
2942          * That's an HTMLism, the attribute value may not be quoted
2943          */
2944         ret = htmlParseHTMLAttribute(ctxt, 0);
2945         if (ret == NULL) {
2946             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2947                          "AttValue: no value found\n", NULL, NULL);
2948         }
2949     }
2950     return(ret);
2951 }
2952
2953 /**
2954  * htmlParseSystemLiteral:
2955  * @ctxt:  an HTML parser context
2956  *
2957  * parse an HTML Literal
2958  *
2959  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2960  *
2961  * Returns the SystemLiteral parsed or NULL
2962  */
2963
2964 static xmlChar *
2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2966     size_t len = 0, startPosition = 0;
2967     int err = 0;
2968     int quote;
2969     xmlChar *ret = NULL;
2970
2971     if ((CUR != '"') && (CUR != '\'')) {
2972         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2973                      "SystemLiteral \" or ' expected\n", NULL, NULL);
2974         return(NULL);
2975     }
2976     quote = CUR;
2977     NEXT;
2978
2979     if (CUR_PTR < BASE_PTR)
2980         return(ret);
2981     startPosition = CUR_PTR - BASE_PTR;
2982
2983     while ((CUR != 0) && (CUR != quote)) {
2984         /* TODO: Handle UTF-8 */
2985         if (!IS_CHAR_CH(CUR)) {
2986             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2987                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2988             err = 1;
2989         }
2990         NEXT;
2991         len++;
2992     }
2993     if (CUR != quote) {
2994         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2995                      "Unfinished SystemLiteral\n", NULL, NULL);
2996     } else {
2997         NEXT;
2998         if (err == 0)
2999             ret = xmlStrndup((BASE_PTR+startPosition), len);
3000     }
3001
3002     return(ret);
3003 }
3004
3005 /**
3006  * htmlParsePubidLiteral:
3007  * @ctxt:  an HTML parser context
3008  *
3009  * parse an HTML public literal
3010  *
3011  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3012  *
3013  * Returns the PubidLiteral parsed or NULL.
3014  */
3015
3016 static xmlChar *
3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3018     size_t len = 0, startPosition = 0;
3019     int err = 0;
3020     int quote;
3021     xmlChar *ret = NULL;
3022
3023     if ((CUR != '"') && (CUR != '\'')) {
3024         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3025                      "PubidLiteral \" or ' expected\n", NULL, NULL);
3026         return(NULL);
3027     }
3028     quote = CUR;
3029     NEXT;
3030
3031     /*
3032      * Name ::= (Letter | '_') (NameChar)*
3033      */
3034     if (CUR_PTR < BASE_PTR)
3035         return(ret);
3036     startPosition = CUR_PTR - BASE_PTR;
3037
3038     while ((CUR != 0) && (CUR != quote)) {
3039         if (!IS_PUBIDCHAR_CH(CUR)) {
3040             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3041                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3042             err = 1;
3043         }
3044         len++;
3045         NEXT;
3046     }
3047
3048     if (CUR != '"') {
3049         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3050                      "Unfinished PubidLiteral\n", NULL, NULL);
3051     } else {
3052         NEXT;
3053         if (err == 0)
3054             ret = xmlStrndup((BASE_PTR + startPosition), len);
3055     }
3056
3057     return(ret);
3058 }
3059
3060 /**
3061  * htmlParseScript:
3062  * @ctxt:  an HTML parser context
3063  *
3064  * parse the content of an HTML SCRIPT or STYLE element
3065  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3066  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3067  * http://www.w3.org/TR/html4/types.html#type-script
3068  * http://www.w3.org/TR/html4/types.html#h-6.15
3069  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3070  *
3071  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3072  * element and the value of intrinsic event attributes. User agents must
3073  * not evaluate script data as HTML markup but instead must pass it on as
3074  * data to a script engine.
3075  * NOTES:
3076  * - The content is passed like CDATA
3077  * - the attributes for style and scripting "onXXX" are also described
3078  *   as CDATA but SGML allows entities references in attributes so their
3079  *   processing is identical as other attributes
3080  */
3081 static void
3082 htmlParseScript(htmlParserCtxtPtr ctxt) {
3083     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3084     int nbchar = 0;
3085     int cur,l;
3086
3087     SHRINK;
3088     cur = CUR_CHAR(l);
3089     while (cur != 0) {
3090         if ((cur == '<') && (NXT(1) == '/')) {
3091             /*
3092              * One should break here, the specification is clear:
3093              * Authors should therefore escape "</" within the content.
3094              * Escape mechanisms are specific to each scripting or
3095              * style sheet language.
3096              *
3097              * In recovery mode, only break if end tag match the
3098              * current tag, effectively ignoring all tags inside the
3099              * script/style block and treating the entire block as
3100              * CDATA.
3101              */
3102             if (ctxt->recovery) {
3103                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3104                                    xmlStrlen(ctxt->name)) == 0)
3105                 {
3106                     break; /* while */
3107                 } else {
3108                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3109                                  "Element %s embeds close tag\n",
3110                                  ctxt->name, NULL);
3111                 }
3112             } else {
3113                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3114                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3115                 {
3116                     break; /* while */
3117                 }
3118             }
3119         }
3120         if (IS_CHAR(cur)) {
3121             COPY_BUF(l,buf,nbchar,cur);
3122         } else {
3123             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3124                             "Invalid char in CDATA 0x%X\n", cur);
3125         }
3126         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3127             buf[nbchar] = 0;
3128             if (ctxt->sax->cdataBlock!= NULL) {
3129                 /*
3130                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3131                  */
3132                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3133             } else if (ctxt->sax->characters != NULL) {
3134                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3135             }
3136             nbchar = 0;
3137         }
3138         GROW;
3139         NEXTL(l);
3140         cur = CUR_CHAR(l);
3141     }
3142
3143     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144         buf[nbchar] = 0;
3145         if (ctxt->sax->cdataBlock!= NULL) {
3146             /*
3147              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3148              */
3149             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3150         } else if (ctxt->sax->characters != NULL) {
3151             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152         }
3153     }
3154 }
3155
3156
3157 /**
3158  * htmlParseCharDataInternal:
3159  * @ctxt:  an HTML parser context
3160  * @readahead: optional read ahead character in ascii range
3161  *
3162  * parse a CharData section.
3163  * if we are within a CDATA section ']]>' marks an end of section.
3164  *
3165  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3166  */
3167
3168 static void
3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3170     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3171     int nbchar = 0;
3172     int cur, l;
3173     int chunk = 0;
3174
3175     if (readahead)
3176         buf[nbchar++] = readahead;
3177
3178     SHRINK;
3179     cur = CUR_CHAR(l);
3180     while (((cur != '<') || (ctxt->token == '<')) &&
3181            ((cur != '&') || (ctxt->token == '&')) &&
3182            (cur != 0)) {
3183         if (!(IS_CHAR(cur))) {
3184             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3185                         "Invalid char in CDATA 0x%X\n", cur);
3186         } else {
3187             COPY_BUF(l,buf,nbchar,cur);
3188         }
3189         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3190             buf[nbchar] = 0;
3191
3192             /*
3193              * Ok the segment is to be consumed as chars.
3194              */
3195             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3196                 if (areBlanks(ctxt, buf, nbchar)) {
3197                     if (ctxt->keepBlanks) {
3198                         if (ctxt->sax->characters != NULL)
3199                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3200                     } else {
3201                         if (ctxt->sax->ignorableWhitespace != NULL)
3202                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3203                                                            buf, nbchar);
3204                     }
3205                 } else {
3206                     htmlCheckParagraph(ctxt);
3207                     if (ctxt->sax->characters != NULL)
3208                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3209                 }
3210             }
3211             nbchar = 0;
3212         }
3213         NEXTL(l);
3214         chunk++;
3215         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3216             chunk = 0;
3217             SHRINK;
3218             GROW;
3219         }
3220         cur = CUR_CHAR(l);
3221         if (cur == 0) {
3222             SHRINK;
3223             GROW;
3224             cur = CUR_CHAR(l);
3225         }
3226     }
3227     if (nbchar != 0) {
3228         buf[nbchar] = 0;
3229
3230         /*
3231          * Ok the segment is to be consumed as chars.
3232          */
3233         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234             if (areBlanks(ctxt, buf, nbchar)) {
3235                 if (ctxt->keepBlanks) {
3236                     if (ctxt->sax->characters != NULL)
3237                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3238                 } else {
3239                     if (ctxt->sax->ignorableWhitespace != NULL)
3240                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3241                                                        buf, nbchar);
3242                 }
3243             } else {
3244                 htmlCheckParagraph(ctxt);
3245                 if (ctxt->sax->characters != NULL)
3246                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3247             }
3248         }
3249     } else {
3250         /*
3251          * Loop detection
3252          */
3253         if (cur == 0)
3254             ctxt->instate = XML_PARSER_EOF;
3255     }
3256 }
3257
3258 /**
3259  * htmlParseCharData:
3260  * @ctxt:  an HTML parser context
3261  *
3262  * parse a CharData section.
3263  * if we are within a CDATA section ']]>' marks an end of section.
3264  *
3265  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3266  */
3267
3268 static void
3269 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3270     htmlParseCharDataInternal(ctxt, 0);
3271 }
3272
3273 /**
3274  * htmlParseExternalID:
3275  * @ctxt:  an HTML parser context
3276  * @publicID:  a xmlChar** receiving PubidLiteral
3277  *
3278  * Parse an External ID or a Public ID
3279  *
3280  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3281  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3282  *
3283  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3284  *
3285  * Returns the function returns SystemLiteral and in the second
3286  *                case publicID receives PubidLiteral, is strict is off
3287  *                it is possible to return NULL and have publicID set.
3288  */
3289
3290 static xmlChar *
3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3292     xmlChar *URI = NULL;
3293
3294     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3295          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3296          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3297         SKIP(6);
3298         if (!IS_BLANK_CH(CUR)) {
3299             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3300                          "Space required after 'SYSTEM'\n", NULL, NULL);
3301         }
3302         SKIP_BLANKS;
3303         URI = htmlParseSystemLiteral(ctxt);
3304         if (URI == NULL) {
3305             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3306                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3307         }
3308     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3309                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3310                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3311         SKIP(6);
3312         if (!IS_BLANK_CH(CUR)) {
3313             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3314                          "Space required after 'PUBLIC'\n", NULL, NULL);
3315         }
3316         SKIP_BLANKS;
3317         *publicID = htmlParsePubidLiteral(ctxt);
3318         if (*publicID == NULL) {
3319             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3320                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3321                          NULL, NULL);
3322         }
3323         SKIP_BLANKS;
3324         if ((CUR == '"') || (CUR == '\'')) {
3325             URI = htmlParseSystemLiteral(ctxt);
3326         }
3327     }
3328     return(URI);
3329 }
3330
3331 /**
3332  * xmlParsePI:
3333  * @ctxt:  an XML parser context
3334  *
3335  * parse an XML Processing Instruction.
3336  *
3337  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3338  */
3339 static void
3340 htmlParsePI(htmlParserCtxtPtr ctxt) {
3341     xmlChar *buf = NULL;
3342     int len = 0;
3343     int size = HTML_PARSER_BUFFER_SIZE;
3344     int cur, l;
3345     const xmlChar *target;
3346     xmlParserInputState state;
3347     int count = 0;
3348
3349     if ((RAW == '<') && (NXT(1) == '?')) {
3350         state = ctxt->instate;
3351         ctxt->instate = XML_PARSER_PI;
3352         /*
3353          * this is a Processing Instruction.
3354          */
3355         SKIP(2);
3356         SHRINK;
3357
3358         /*
3359          * Parse the target name and check for special support like
3360          * namespace.
3361          */
3362         target = htmlParseName(ctxt);
3363         if (target != NULL) {
3364             if (RAW == '>') {
3365                 SKIP(1);
3366
3367                 /*
3368                  * SAX: PI detected.
3369                  */
3370                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3371                     (ctxt->sax->processingInstruction != NULL))
3372                     ctxt->sax->processingInstruction(ctxt->userData,
3373                                                      target, NULL);
3374                 ctxt->instate = state;
3375                 return;
3376             }
3377             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3378             if (buf == NULL) {
3379                 htmlErrMemory(ctxt, NULL);
3380                 ctxt->instate = state;
3381                 return;
3382             }
3383             cur = CUR;
3384             if (!IS_BLANK(cur)) {
3385                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3386                           "ParsePI: PI %s space expected\n", target, NULL);
3387             }
3388             SKIP_BLANKS;
3389             cur = CUR_CHAR(l);
3390             while ((cur != 0) && (cur != '>')) {
3391                 if (len + 5 >= size) {
3392                     xmlChar *tmp;
3393
3394                     size *= 2;
3395                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3396                     if (tmp == NULL) {
3397                         htmlErrMemory(ctxt, NULL);
3398                         xmlFree(buf);
3399                         ctxt->instate = state;
3400                         return;
3401                     }
3402                     buf = tmp;
3403                 }
3404                 count++;
3405                 if (count > 50) {
3406                     GROW;
3407                     count = 0;
3408                 }
3409                 if (IS_CHAR(cur)) {
3410                     COPY_BUF(l,buf,len,cur);
3411                 } else {
3412                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3413                                     "Invalid char in processing instruction "
3414                                     "0x%X\n", cur);
3415                 }
3416                 NEXTL(l);
3417                 cur = CUR_CHAR(l);
3418                 if (cur == 0) {
3419                     SHRINK;
3420                     GROW;
3421                     cur = CUR_CHAR(l);
3422                 }
3423             }
3424             buf[len] = 0;
3425             if (cur != '>') {
3426                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3427                       "ParsePI: PI %s never end ...\n", target, NULL);
3428             } else {
3429                 SKIP(1);
3430
3431                 /*
3432                  * SAX: PI detected.
3433                  */
3434                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3435                     (ctxt->sax->processingInstruction != NULL))
3436                     ctxt->sax->processingInstruction(ctxt->userData,
3437                                                      target, buf);
3438             }
3439             xmlFree(buf);
3440         } else {
3441             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3442                          "PI is not started correctly", NULL, NULL);
3443         }
3444         ctxt->instate = state;
3445     }
3446 }
3447
3448 /**
3449  * htmlParseComment:
3450  * @ctxt:  an HTML parser context
3451  *
3452  * Parse an XML (SGML) comment <!-- .... -->
3453  *
3454  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3455  */
3456 static void
3457 htmlParseComment(htmlParserCtxtPtr ctxt) {
3458     xmlChar *buf = NULL;
3459     int len;
3460     int size = HTML_PARSER_BUFFER_SIZE;
3461     int q, ql;
3462     int r, rl;
3463     int cur, l;
3464     int next, nl;
3465     xmlParserInputState state;
3466
3467     /*
3468      * Check that there is a comment right here.
3469      */
3470     if ((RAW != '<') || (NXT(1) != '!') ||
3471         (NXT(2) != '-') || (NXT(3) != '-')) return;
3472
3473     state = ctxt->instate;
3474     ctxt->instate = XML_PARSER_COMMENT;
3475     SHRINK;
3476     SKIP(4);
3477     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3478     if (buf == NULL) {
3479         htmlErrMemory(ctxt, "buffer allocation failed\n");
3480         ctxt->instate = state;
3481         return;
3482     }
3483     len = 0;
3484     buf[len] = 0;
3485     q = CUR_CHAR(ql);
3486     if (q == 0)
3487         goto unfinished;
3488     NEXTL(ql);
3489     r = CUR_CHAR(rl);
3490     if (r == 0)
3491         goto unfinished;
3492     NEXTL(rl);
3493     cur = CUR_CHAR(l);
3494     while ((cur != 0) &&
3495            ((cur != '>') ||
3496             (r != '-') || (q != '-'))) {
3497         NEXTL(l);
3498         next = CUR_CHAR(nl);
3499         if (next == 0) {
3500             SHRINK;
3501             GROW;
3502             next = CUR_CHAR(nl);
3503         }
3504
3505         if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3506           htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3507                        "Comment incorrectly closed by '--!>'", NULL, NULL);
3508           cur = '>';
3509           break;
3510         }
3511
3512         if (len + 5 >= size) {
3513             xmlChar *tmp;
3514
3515             size *= 2;
3516             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3517             if (tmp == NULL) {
3518                 xmlFree(buf);
3519                 htmlErrMemory(ctxt, "growing buffer failed\n");
3520                 ctxt->instate = state;
3521                 return;
3522             }
3523             buf = tmp;
3524         }
3525         if (IS_CHAR(q)) {
3526             COPY_BUF(ql,buf,len,q);
3527         } else {
3528             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3529                             "Invalid char in comment 0x%X\n", q);
3530         }
3531
3532         q = r;
3533         ql = rl;
3534         r = cur;
3535         rl = l;
3536         cur = next;
3537         l = nl;
3538     }
3539     buf[len] = 0;
3540     if (cur == '>') {
3541         NEXT;
3542         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3543             (!ctxt->disableSAX))
3544             ctxt->sax->comment(ctxt->userData, buf);
3545         xmlFree(buf);
3546         ctxt->instate = state;
3547         return;
3548     }
3549
3550 unfinished:
3551     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3552                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3553     xmlFree(buf);
3554 }
3555
3556 /**
3557  * htmlParseCharRef:
3558  * @ctxt:  an HTML parser context
3559  *
3560  * parse Reference declarations
3561  *
3562  * [66] CharRef ::= '&#' [0-9]+ ';' |
3563  *                  '&#x' [0-9a-fA-F]+ ';'
3564  *
3565  * Returns the value parsed (as an int)
3566  */
3567 int
3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3569     int val = 0;
3570
3571     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3572         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3573                      "htmlParseCharRef: context error\n",
3574                      NULL, NULL);
3575         return(0);
3576     }
3577     if ((CUR == '&') && (NXT(1) == '#') &&
3578         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3579         SKIP(3);
3580         while (CUR != ';') {
3581             if ((CUR >= '0') && (CUR <= '9')) {
3582                 if (val < 0x110000)
3583                     val = val * 16 + (CUR - '0');
3584             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3585                 if (val < 0x110000)
3586                     val = val * 16 + (CUR - 'a') + 10;
3587             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3588                 if (val < 0x110000)
3589                     val = val * 16 + (CUR - 'A') + 10;
3590             } else {
3591                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3592                              "htmlParseCharRef: missing semicolon\n",
3593                              NULL, NULL);
3594                 break;
3595             }
3596             NEXT;
3597         }
3598         if (CUR == ';')
3599             NEXT;
3600     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3601         SKIP(2);
3602         while (CUR != ';') {
3603             if ((CUR >= '0') && (CUR <= '9')) {
3604                 if (val < 0x110000)
3605                     val = val * 10 + (CUR - '0');
3606             } else {
3607                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3608                              "htmlParseCharRef: missing semicolon\n",
3609                              NULL, NULL);
3610                 break;
3611             }
3612             NEXT;
3613         }
3614         if (CUR == ';')
3615             NEXT;
3616     } else {
3617         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3618                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3619     }
3620     /*
3621      * Check the value IS_CHAR ...
3622      */
3623     if (IS_CHAR(val)) {
3624         return(val);
3625     } else if (val >= 0x110000) {
3626         htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3627                      "htmlParseCharRef: value too large\n", NULL, NULL);
3628     } else {
3629         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3630                         "htmlParseCharRef: invalid xmlChar value %d\n",
3631                         val);
3632     }
3633     return(0);
3634 }
3635
3636
3637 /**
3638  * htmlParseDocTypeDecl:
3639  * @ctxt:  an HTML parser context
3640  *
3641  * parse a DOCTYPE declaration
3642  *
3643  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3644  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3645  */
3646
3647 static void
3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3649     const xmlChar *name;
3650     xmlChar *ExternalID = NULL;
3651     xmlChar *URI = NULL;
3652
3653     /*
3654      * We know that '<!DOCTYPE' has been detected.
3655      */
3656     SKIP(9);
3657
3658     SKIP_BLANKS;
3659
3660     /*
3661      * Parse the DOCTYPE name.
3662      */
3663     name = htmlParseName(ctxt);
3664     if (name == NULL) {
3665         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3666                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3667                      NULL, NULL);
3668     }
3669     /*
3670      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3671      */
3672
3673     SKIP_BLANKS;
3674
3675     /*
3676      * Check for SystemID and ExternalID
3677      */
3678     URI = htmlParseExternalID(ctxt, &ExternalID);
3679     SKIP_BLANKS;
3680
3681     /*
3682      * We should be at the end of the DOCTYPE declaration.
3683      */
3684     if (CUR != '>') {
3685         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3686                      "DOCTYPE improperly terminated\n", NULL, NULL);
3687         /* Ignore bogus content */
3688         while ((CUR != 0) && (CUR != '>'))
3689             NEXT;
3690     }
3691     if (CUR == '>')
3692         NEXT;
3693
3694     /*
3695      * Create or update the document accordingly to the DOCTYPE
3696      */
3697     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3698         (!ctxt->disableSAX))
3699         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3700
3701     /*
3702      * Cleanup, since we don't use all those identifiers
3703      */
3704     if (URI != NULL) xmlFree(URI);
3705     if (ExternalID != NULL) xmlFree(ExternalID);
3706 }
3707
3708 /**
3709  * htmlParseAttribute:
3710  * @ctxt:  an HTML parser context
3711  * @value:  a xmlChar ** used to store the value of the attribute
3712  *
3713  * parse an attribute
3714  *
3715  * [41] Attribute ::= Name Eq AttValue
3716  *
3717  * [25] Eq ::= S? '=' S?
3718  *
3719  * With namespace:
3720  *
3721  * [NS 11] Attribute ::= QName Eq AttValue
3722  *
3723  * Also the case QName == xmlns:??? is handled independently as a namespace
3724  * definition.
3725  *
3726  * Returns the attribute name, and the value in *value.
3727  */
3728
3729 static const xmlChar *
3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3731     const xmlChar *name;
3732     xmlChar *val = NULL;
3733
3734     *value = NULL;
3735     name = htmlParseHTMLName(ctxt);
3736     if (name == NULL) {
3737         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3738                      "error parsing attribute name\n", NULL, NULL);
3739         return(NULL);
3740     }
3741
3742     /*
3743      * read the value
3744      */
3745     SKIP_BLANKS;
3746     if (CUR == '=') {
3747         NEXT;
3748         SKIP_BLANKS;
3749         val = htmlParseAttValue(ctxt);
3750     }
3751
3752     *value = val;
3753     return(name);
3754 }
3755
3756 /**
3757  * htmlCheckEncodingDirect:
3758  * @ctxt:  an HTML parser context
3759  * @attvalue: the attribute value
3760  *
3761  * Checks an attribute value to detect
3762  * the encoding
3763  * If a new encoding is detected the parser is switched to decode
3764  * it and pass UTF8
3765  */
3766 static void
3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3768
3769     if ((ctxt == NULL) || (encoding == NULL) ||
3770         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3771         return;
3772
3773     /* do not change encoding */
3774     if (ctxt->input->encoding != NULL)
3775         return;
3776
3777     if (encoding != NULL) {
3778         xmlCharEncoding enc;
3779         xmlCharEncodingHandlerPtr handler;
3780
3781         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3782
3783         if (ctxt->input->encoding != NULL)
3784             xmlFree((xmlChar *) ctxt->input->encoding);
3785         ctxt->input->encoding = xmlStrdup(encoding);
3786
3787         enc = xmlParseCharEncoding((const char *) encoding);
3788         /*
3789          * registered set of known encodings
3790          */
3791         if (enc != XML_CHAR_ENCODING_ERROR) {
3792             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3793                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3794                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3795                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3796                 (ctxt->input->buf != NULL) &&
3797                 (ctxt->input->buf->encoder == NULL)) {
3798                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3799                              "htmlCheckEncoding: wrong encoding meta\n",
3800                              NULL, NULL);
3801             } else {
3802                 xmlSwitchEncoding(ctxt, enc);
3803             }
3804             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3805         } else {
3806             /*
3807              * fallback for unknown encodings
3808              */
3809             handler = xmlFindCharEncodingHandler((const char *) encoding);
3810             if (handler != NULL) {
3811                 xmlSwitchToEncoding(ctxt, handler);
3812                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3813             } else {
3814                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3815                              "htmlCheckEncoding: unknown encoding %s\n",
3816                              encoding, NULL);
3817             }
3818         }
3819
3820         if ((ctxt->input->buf != NULL) &&
3821             (ctxt->input->buf->encoder != NULL) &&
3822             (ctxt->input->buf->raw != NULL) &&
3823             (ctxt->input->buf->buffer != NULL)) {
3824             int nbchars;
3825             int processed;
3826
3827             /*
3828              * convert as much as possible to the parser reading buffer.
3829              */
3830             processed = ctxt->input->cur - ctxt->input->base;
3831             xmlBufShrink(ctxt->input->buf->buffer, processed);
3832             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3833             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3834             if (nbchars < 0) {
3835                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3836                              "htmlCheckEncoding: encoder error\n",
3837                              NULL, NULL);
3838             }
3839         }
3840     }
3841 }
3842
3843 /**
3844  * htmlCheckEncoding:
3845  * @ctxt:  an HTML parser context
3846  * @attvalue: the attribute value
3847  *
3848  * Checks an http-equiv attribute from a Meta tag to detect
3849  * the encoding
3850  * If a new encoding is detected the parser is switched to decode
3851  * it and pass UTF8
3852  */
3853 static void
3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3855     const xmlChar *encoding;
3856
3857     if (!attvalue)
3858         return;
3859
3860     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3861     if (encoding != NULL) {
3862         encoding += 7;
3863     }
3864     /*
3865      * skip blank
3866      */
3867     if (encoding && IS_BLANK_CH(*encoding))
3868         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3869     if (encoding && *encoding == '=') {
3870         encoding ++;
3871         htmlCheckEncodingDirect(ctxt, encoding);
3872     }
3873 }
3874
3875 /**
3876  * htmlCheckMeta:
3877  * @ctxt:  an HTML parser context
3878  * @atts:  the attributes values
3879  *
3880  * Checks an attributes from a Meta tag
3881  */
3882 static void
3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3884     int i;
3885     const xmlChar *att, *value;
3886     int http = 0;
3887     const xmlChar *content = NULL;
3888
3889     if ((ctxt == NULL) || (atts == NULL))
3890         return;
3891
3892     i = 0;
3893     att = atts[i++];
3894     while (att != NULL) {
3895         value = atts[i++];
3896         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3897          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3898             http = 1;
3899         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3900             htmlCheckEncodingDirect(ctxt, value);
3901         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3902             content = value;
3903         att = atts[i++];
3904     }
3905     if ((http) && (content != NULL))
3906         htmlCheckEncoding(ctxt, content);
3907
3908 }
3909
3910 /**
3911  * htmlParseStartTag:
3912  * @ctxt:  an HTML parser context
3913  *
3914  * parse a start of tag either for rule element or
3915  * EmptyElement. In both case we don't parse the tag closing chars.
3916  *
3917  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3918  *
3919  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3920  *
3921  * With namespace:
3922  *
3923  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3924  *
3925  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3926  *
3927  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3928  */
3929
3930 static int
3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3932     const xmlChar *name;
3933     const xmlChar *attname;
3934     xmlChar *attvalue;
3935     const xmlChar **atts;
3936     int nbatts = 0;
3937     int maxatts;
3938     int meta = 0;
3939     int i;
3940     int discardtag = 0;
3941
3942     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3943         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3944                      "htmlParseStartTag: context error\n", NULL, NULL);
3945         return -1;
3946     }
3947     if (ctxt->instate == XML_PARSER_EOF)
3948         return(-1);
3949     if (CUR != '<') return -1;
3950     NEXT;
3951
3952     atts = ctxt->atts;
3953     maxatts = ctxt->maxatts;
3954
3955     GROW;
3956     name = htmlParseHTMLName(ctxt);
3957     if (name == NULL) {
3958         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959                      "htmlParseStartTag: invalid element name\n",
3960                      NULL, NULL);
3961         /* if recover preserve text on classic misconstructs */
3962         if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3963             (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3964             htmlParseCharDataInternal(ctxt, '<');
3965             return(-1);
3966         }
3967
3968
3969         /* Dump the bogus tag like browsers do */
3970         while ((CUR != 0) && (CUR != '>') &&
3971                (ctxt->instate != XML_PARSER_EOF))
3972             NEXT;
3973         return -1;
3974     }
3975     if (xmlStrEqual(name, BAD_CAST"meta"))
3976         meta = 1;
3977
3978     /*
3979      * Check for auto-closure of HTML elements.
3980      */
3981     htmlAutoClose(ctxt, name);
3982
3983     /*
3984      * Check for implied HTML elements.
3985      */
3986     htmlCheckImplied(ctxt, name);
3987
3988     /*
3989      * Avoid html at any level > 0, head at any level != 1
3990      * or any attempt to recurse body
3991      */
3992     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3993         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3994                      "htmlParseStartTag: misplaced <html> tag\n",
3995                      name, NULL);
3996         discardtag = 1;
3997         ctxt->depth++;
3998     }
3999     if ((ctxt->nameNr != 1) &&
4000         (xmlStrEqual(name, BAD_CAST"head"))) {
4001         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002                      "htmlParseStartTag: misplaced <head> tag\n",
4003                      name, NULL);
4004         discardtag = 1;
4005         ctxt->depth++;
4006     }
4007     if (xmlStrEqual(name, BAD_CAST"body")) {
4008         int indx;
4009         for (indx = 0;indx < ctxt->nameNr;indx++) {
4010             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4011                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012                              "htmlParseStartTag: misplaced <body> tag\n",
4013                              name, NULL);
4014                 discardtag = 1;
4015                 ctxt->depth++;
4016             }
4017         }
4018     }
4019
4020     /*
4021      * Now parse the attributes, it ends up with the ending
4022      *
4023      * (S Attribute)* S?
4024      */
4025     SKIP_BLANKS;
4026     while ((CUR != 0) &&
4027            (CUR != '>') &&
4028            ((CUR != '/') || (NXT(1) != '>'))) {
4029         GROW;
4030         attname = htmlParseAttribute(ctxt, &attvalue);
4031         if (attname != NULL) {
4032
4033             /*
4034              * Well formedness requires at most one declaration of an attribute
4035              */
4036             for (i = 0; i < nbatts;i += 2) {
4037                 if (xmlStrEqual(atts[i], attname)) {
4038                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4039                                  "Attribute %s redefined\n", attname, NULL);
4040                     if (attvalue != NULL)
4041                         xmlFree(attvalue);
4042                     goto failed;
4043                 }
4044             }
4045
4046             /*
4047              * Add the pair to atts
4048              */
4049             if (atts == NULL) {
4050                 maxatts = 22; /* allow for 10 attrs by default */
4051                 atts = (const xmlChar **)
4052                        xmlMalloc(maxatts * sizeof(xmlChar *));
4053                 if (atts == NULL) {
4054                     htmlErrMemory(ctxt, NULL);
4055                     if (attvalue != NULL)
4056                         xmlFree(attvalue);
4057                     goto failed;
4058                 }
4059                 ctxt->atts = atts;
4060                 ctxt->maxatts = maxatts;
4061             } else if (nbatts + 4 > maxatts) {
4062                 const xmlChar **n;
4063
4064                 maxatts *= 2;
4065                 n = (const xmlChar **) xmlRealloc((void *) atts,
4066                                              maxatts * sizeof(const xmlChar *));
4067                 if (n == NULL) {
4068                     htmlErrMemory(ctxt, NULL);
4069                     if (attvalue != NULL)
4070                         xmlFree(attvalue);
4071                     goto failed;
4072                 }
4073                 atts = n;
4074                 ctxt->atts = atts;
4075                 ctxt->maxatts = maxatts;
4076             }
4077             atts[nbatts++] = attname;
4078             atts[nbatts++] = attvalue;
4079             atts[nbatts] = NULL;
4080             atts[nbatts + 1] = NULL;
4081         }
4082         else {
4083             if (attvalue != NULL)
4084                 xmlFree(attvalue);
4085             /* Dump the bogus attribute string up to the next blank or
4086              * the end of the tag. */
4087             while ((CUR != 0) &&
4088                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4089                    ((CUR != '/') || (NXT(1) != '>')))
4090                 NEXT;
4091         }
4092
4093 failed:
4094         SKIP_BLANKS;
4095     }
4096
4097     /*
4098      * Handle specific association to the META tag
4099      */
4100     if (meta && (nbatts != 0))
4101         htmlCheckMeta(ctxt, atts);
4102
4103     /*
4104      * SAX: Start of Element !
4105      */
4106     if (!discardtag) {
4107         htmlnamePush(ctxt, name);
4108         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4109             if (nbatts != 0)
4110                 ctxt->sax->startElement(ctxt->userData, name, atts);
4111             else
4112                 ctxt->sax->startElement(ctxt->userData, name, NULL);
4113         }
4114     }
4115
4116     if (atts != NULL) {
4117         for (i = 1;i < nbatts;i += 2) {
4118             if (atts[i] != NULL)
4119                 xmlFree((xmlChar *) atts[i]);
4120         }
4121     }
4122
4123     return(discardtag);
4124 }
4125
4126 /**
4127  * htmlParseEndTag:
4128  * @ctxt:  an HTML parser context
4129  *
4130  * parse an end of tag
4131  *
4132  * [42] ETag ::= '</' Name S? '>'
4133  *
4134  * With namespace
4135  *
4136  * [NS 9] ETag ::= '</' QName S? '>'
4137  *
4138  * Returns 1 if the current level should be closed.
4139  */
4140
4141 static int
4142 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4143 {
4144     const xmlChar *name;
4145     const xmlChar *oldname;
4146     int i, ret;
4147
4148     if ((CUR != '<') || (NXT(1) != '/')) {
4149         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4150                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
4151         return (0);
4152     }
4153     SKIP(2);
4154
4155     name = htmlParseHTMLName(ctxt);
4156     if (name == NULL)
4157         return (0);
4158     /*
4159      * We should definitely be at the ending "S? '>'" part
4160      */
4161     SKIP_BLANKS;
4162     if (CUR != '>') {
4163         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4164                      "End tag : expected '>'\n", NULL, NULL);
4165         /* Skip to next '>' */
4166         while ((CUR != 0) && (CUR != '>'))
4167             NEXT;
4168     }
4169     if (CUR == '>')
4170         NEXT;
4171
4172     /*
4173      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4174      * out now.
4175      */
4176     if ((ctxt->depth > 0) &&
4177         (xmlStrEqual(name, BAD_CAST "html") ||
4178          xmlStrEqual(name, BAD_CAST "body") ||
4179          xmlStrEqual(name, BAD_CAST "head"))) {
4180         ctxt->depth--;
4181         return (0);
4182     }
4183
4184     /*
4185      * If the name read is not one of the element in the parsing stack
4186      * then return, it's just an error.
4187      */
4188     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4189         if (xmlStrEqual(name, ctxt->nameTab[i]))
4190             break;
4191     }
4192     if (i < 0) {
4193         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4194                      "Unexpected end tag : %s\n", name, NULL);
4195         return (0);
4196     }
4197
4198
4199     /*
4200      * Check for auto-closure of HTML elements.
4201      */
4202
4203     htmlAutoCloseOnClose(ctxt, name);
4204
4205     /*
4206      * Well formedness constraints, opening and closing must match.
4207      * With the exception that the autoclose may have popped stuff out
4208      * of the stack.
4209      */
4210     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4211         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4212                      "Opening and ending tag mismatch: %s and %s\n",
4213                      name, ctxt->name);
4214     }
4215
4216     /*
4217      * SAX: End of Tag
4218      */
4219     oldname = ctxt->name;
4220     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4221         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4222             ctxt->sax->endElement(ctxt->userData, name);
4223         htmlNodeInfoPop(ctxt);
4224         htmlnamePop(ctxt);
4225         ret = 1;
4226     } else {
4227         ret = 0;
4228     }
4229
4230     return (ret);
4231 }
4232
4233
4234 /**
4235  * htmlParseReference:
4236  * @ctxt:  an HTML parser context
4237  *
4238  * parse and handle entity references in content,
4239  * this will end-up in a call to character() since this is either a
4240  * CharRef, or a predefined entity.
4241  */
4242 static void
4243 htmlParseReference(htmlParserCtxtPtr ctxt) {
4244     const htmlEntityDesc * ent;
4245     xmlChar out[6];
4246     const xmlChar *name;
4247     if (CUR != '&') return;
4248
4249     if (NXT(1) == '#') {
4250         unsigned int c;
4251         int bits, i = 0;
4252
4253         c = htmlParseCharRef(ctxt);
4254         if (c == 0)
4255             return;
4256
4257         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4258         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4259         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4260         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4261
4262         for ( ; bits >= 0; bits-= 6) {
4263             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4264         }
4265         out[i] = 0;
4266
4267         htmlCheckParagraph(ctxt);
4268         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269             ctxt->sax->characters(ctxt->userData, out, i);
4270     } else {
4271         ent = htmlParseEntityRef(ctxt, &name);
4272         if (name == NULL) {
4273             htmlCheckParagraph(ctxt);
4274             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276             return;
4277         }
4278         if ((ent == NULL) || !(ent->value > 0)) {
4279             htmlCheckParagraph(ctxt);
4280             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4281                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4283                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4284             }
4285         } else {
4286             unsigned int c;
4287             int bits, i = 0;
4288
4289             c = ent->value;
4290             if      (c <    0x80)
4291                     { out[i++]= c;                bits= -6; }
4292             else if (c <   0x800)
4293                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4294             else if (c < 0x10000)
4295                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4296             else
4297                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4298
4299             for ( ; bits >= 0; bits-= 6) {
4300                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4301             }
4302             out[i] = 0;
4303
4304             htmlCheckParagraph(ctxt);
4305             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4306                 ctxt->sax->characters(ctxt->userData, out, i);
4307         }
4308     }
4309 }
4310
4311 /**
4312  * htmlParseContent:
4313  * @ctxt:  an HTML parser context
4314  *
4315  * Parse a content: comment, sub-element, reference or text.
4316  * Kept for compatibility with old code
4317  */
4318
4319 static void
4320 htmlParseContent(htmlParserCtxtPtr ctxt) {
4321     xmlChar *currentNode;
4322     int depth;
4323     const xmlChar *name;
4324
4325     currentNode = xmlStrdup(ctxt->name);
4326     depth = ctxt->nameNr;
4327     while (1) {
4328         GROW;
4329
4330         if (ctxt->instate == XML_PARSER_EOF)
4331             break;
4332
4333         /*
4334          * Our tag or one of it's parent or children is ending.
4335          */
4336         if ((CUR == '<') && (NXT(1) == '/')) {
4337             if (htmlParseEndTag(ctxt) &&
4338                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4339                 if (currentNode != NULL)
4340                     xmlFree(currentNode);
4341                 return;
4342             }
4343             continue; /* while */
4344         }
4345
4346         else if ((CUR == '<') &&
4347                  ((IS_ASCII_LETTER(NXT(1))) ||
4348                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4349             name = htmlParseHTMLName_nonInvasive(ctxt);
4350             if (name == NULL) {
4351                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4352                          "htmlParseStartTag: invalid element name\n",
4353                          NULL, NULL);
4354                 /* Dump the bogus tag like browsers do */
4355                 while ((CUR != 0) && (CUR != '>'))
4356                     NEXT;
4357
4358                 if (currentNode != NULL)
4359                     xmlFree(currentNode);
4360                 return;
4361             }
4362
4363             if (ctxt->name != NULL) {
4364                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4365                     htmlAutoClose(ctxt, name);
4366                     continue;
4367                 }
4368             }
4369         }
4370
4371         /*
4372          * Has this node been popped out during parsing of
4373          * the next element
4374          */
4375         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4376             (!xmlStrEqual(currentNode, ctxt->name)))
4377              {
4378             if (currentNode != NULL) xmlFree(currentNode);
4379             return;
4380         }
4381
4382         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4383             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4384             /*
4385              * Handle SCRIPT/STYLE separately
4386              */
4387             htmlParseScript(ctxt);
4388         } else {
4389             /*
4390              * Sometimes DOCTYPE arrives in the middle of the document
4391              */
4392             if ((CUR == '<') && (NXT(1) == '!') &&
4393                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4394                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4395                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4396                 (UPP(8) == 'E')) {
4397                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4398                              "Misplaced DOCTYPE declaration\n",
4399                              BAD_CAST "DOCTYPE" , NULL);
4400                 htmlParseDocTypeDecl(ctxt);
4401             }
4402
4403             /*
4404              * First case :  a comment
4405              */
4406             if ((CUR == '<') && (NXT(1) == '!') &&
4407                 (NXT(2) == '-') && (NXT(3) == '-')) {
4408                 htmlParseComment(ctxt);
4409             }
4410
4411             /*
4412              * Second case : a Processing Instruction.
4413              */
4414             else if ((CUR == '<') && (NXT(1) == '?')) {
4415                 htmlParsePI(ctxt);
4416             }
4417
4418             /*
4419              * Third case :  a sub-element.
4420              */
4421             else if (CUR == '<') {
4422                 htmlParseElement(ctxt);
4423             }
4424
4425             /*
4426              * Fourth case : a reference. If if has not been resolved,
4427              *    parsing returns it's Name, create the node
4428              */
4429             else if (CUR == '&') {
4430                 htmlParseReference(ctxt);
4431             }
4432
4433             /*
4434              * Fifth case : end of the resource
4435              */
4436             else if (CUR == 0) {
4437                 htmlAutoCloseOnEnd(ctxt);
4438                 break;
4439             }
4440
4441             /*
4442              * Last case, text. Note that References are handled directly.
4443              */
4444             else {
4445                 htmlParseCharData(ctxt);
4446             }
4447         }
4448         GROW;
4449     }
4450     if (currentNode != NULL) xmlFree(currentNode);
4451 }
4452
4453 /**
4454  * htmlParseElement:
4455  * @ctxt:  an HTML parser context
4456  *
4457  * parse an HTML element, this is highly recursive
4458  * this is kept for compatibility with previous code versions
4459  *
4460  * [39] element ::= EmptyElemTag | STag content ETag
4461  *
4462  * [41] Attribute ::= Name Eq AttValue
4463  */
4464
4465 void
4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467     const xmlChar *name;
4468     xmlChar *currentNode = NULL;
4469     const htmlElemDesc * info;
4470     htmlParserNodeInfo node_info;
4471     int failed;
4472     int depth;
4473     const xmlChar *oldptr;
4474
4475     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477                      "htmlParseElement: context error\n", NULL, NULL);
4478         return;
4479     }
4480
4481     if (ctxt->instate == XML_PARSER_EOF)
4482         return;
4483
4484     /* Capture start position */
4485     if (ctxt->record_info) {
4486         node_info.begin_pos = ctxt->input->consumed +
4487                           (CUR_PTR - ctxt->input->base);
4488         node_info.begin_line = ctxt->input->line;
4489     }
4490
4491     failed = htmlParseStartTag(ctxt);
4492     name = ctxt->name;
4493     if ((failed == -1) || (name == NULL)) {
4494         if (CUR == '>')
4495             NEXT;
4496         return;
4497     }
4498
4499     /*
4500      * Lookup the info for that element.
4501      */
4502     info = htmlTagLookup(name);
4503     if (info == NULL) {
4504         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505                      "Tag %s invalid\n", name, NULL);
4506     }
4507
4508     /*
4509      * Check for an Empty Element labeled the XML/SGML way
4510      */
4511     if ((CUR == '/') && (NXT(1) == '>')) {
4512         SKIP(2);
4513         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514             ctxt->sax->endElement(ctxt->userData, name);
4515         htmlnamePop(ctxt);
4516         return;
4517     }
4518
4519     if (CUR == '>') {
4520         NEXT;
4521     } else {
4522         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523                      "Couldn't find end of Start Tag %s\n", name, NULL);
4524
4525         /*
4526          * end of parsing of this node.
4527          */
4528         if (xmlStrEqual(name, ctxt->name)) {
4529             nodePop(ctxt);
4530             htmlnamePop(ctxt);
4531         }
4532
4533         /*
4534          * Capture end position and add node
4535          */
4536         if (ctxt->record_info) {
4537            node_info.end_pos = ctxt->input->consumed +
4538                               (CUR_PTR - ctxt->input->base);
4539            node_info.end_line = ctxt->input->line;
4540            node_info.node = ctxt->node;
4541            xmlParserAddNodeInfo(ctxt, &node_info);
4542         }
4543         return;
4544     }
4545
4546     /*
4547      * Check for an Empty Element from DTD definition
4548      */
4549     if ((info != NULL) && (info->empty)) {
4550         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551             ctxt->sax->endElement(ctxt->userData, name);
4552         htmlnamePop(ctxt);
4553         return;
4554     }
4555
4556     /*
4557      * Parse the content of the element:
4558      */
4559     currentNode = xmlStrdup(ctxt->name);
4560     depth = ctxt->nameNr;
4561     while (CUR != 0) {
4562         oldptr = ctxt->input->cur;
4563         htmlParseContent(ctxt);
4564         if (oldptr==ctxt->input->cur) break;
4565         if (ctxt->nameNr < depth) break;
4566     }
4567
4568     /*
4569      * Capture end position and add node
4570      */
4571     if ( currentNode != NULL && ctxt->record_info ) {
4572        node_info.end_pos = ctxt->input->consumed +
4573                           (CUR_PTR - ctxt->input->base);
4574        node_info.end_line = ctxt->input->line;
4575        node_info.node = ctxt->node;
4576        xmlParserAddNodeInfo(ctxt, &node_info);
4577     }
4578     if (CUR == 0) {
4579         htmlAutoCloseOnEnd(ctxt);
4580     }
4581
4582     if (currentNode != NULL)
4583         xmlFree(currentNode);
4584 }
4585
4586 static void
4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588     /*
4589      * Capture end position and add node
4590      */
4591     if ( ctxt->node != NULL && ctxt->record_info ) {
4592        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593                                 (CUR_PTR - ctxt->input->base);
4594        ctxt->nodeInfo->end_line = ctxt->input->line;
4595        ctxt->nodeInfo->node = ctxt->node;
4596        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597        htmlNodeInfoPop(ctxt);
4598     }
4599     if (CUR == 0) {
4600        htmlAutoCloseOnEnd(ctxt);
4601     }
4602 }
4603
4604 /**
4605  * htmlParseElementInternal:
4606  * @ctxt:  an HTML parser context
4607  *
4608  * parse an HTML element, new version, non recursive
4609  *
4610  * [39] element ::= EmptyElemTag | STag content ETag
4611  *
4612  * [41] Attribute ::= Name Eq AttValue
4613  */
4614
4615 static void
4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617     const xmlChar *name;
4618     const htmlElemDesc * info;
4619     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620     int failed;
4621
4622     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624                      "htmlParseElementInternal: context error\n", NULL, NULL);
4625         return;
4626     }
4627
4628     if (ctxt->instate == XML_PARSER_EOF)
4629         return;
4630
4631     /* Capture start position */
4632     if (ctxt->record_info) {
4633         node_info.begin_pos = ctxt->input->consumed +
4634                           (CUR_PTR - ctxt->input->base);
4635         node_info.begin_line = ctxt->input->line;
4636     }
4637
4638     failed = htmlParseStartTag(ctxt);
4639     name = ctxt->name;
4640     if ((failed == -1) || (name == NULL)) {
4641         if (CUR == '>')
4642             NEXT;
4643         return;
4644     }
4645
4646     /*
4647      * Lookup the info for that element.
4648      */
4649     info = htmlTagLookup(name);
4650     if (info == NULL) {
4651         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652                      "Tag %s invalid\n", name, NULL);
4653     }
4654
4655     /*
4656      * Check for an Empty Element labeled the XML/SGML way
4657      */
4658     if ((CUR == '/') && (NXT(1) == '>')) {
4659         SKIP(2);
4660         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661             ctxt->sax->endElement(ctxt->userData, name);
4662         htmlnamePop(ctxt);
4663         return;
4664     }
4665
4666     if (CUR == '>') {
4667         NEXT;
4668     } else {
4669         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670                      "Couldn't find end of Start Tag %s\n", name, NULL);
4671
4672         /*
4673          * end of parsing of this node.
4674          */
4675         if (xmlStrEqual(name, ctxt->name)) {
4676             nodePop(ctxt);
4677             htmlnamePop(ctxt);
4678         }
4679
4680         if (ctxt->record_info)
4681             htmlNodeInfoPush(ctxt, &node_info);
4682         htmlParserFinishElementParsing(ctxt);
4683         return;
4684     }
4685
4686     /*
4687      * Check for an Empty Element from DTD definition
4688      */
4689     if ((info != NULL) && (info->empty)) {
4690         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691             ctxt->sax->endElement(ctxt->userData, name);
4692         htmlnamePop(ctxt);
4693         return;
4694     }
4695
4696     if (ctxt->record_info)
4697         htmlNodeInfoPush(ctxt, &node_info);
4698 }
4699
4700 /**
4701  * htmlParseContentInternal:
4702  * @ctxt:  an HTML parser context
4703  *
4704  * Parse a content: comment, sub-element, reference or text.
4705  * New version for non recursive htmlParseElementInternal
4706  */
4707
4708 static void
4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710     xmlChar *currentNode;
4711     int depth;
4712     const xmlChar *name;
4713
4714     currentNode = xmlStrdup(ctxt->name);
4715     depth = ctxt->nameNr;
4716     while (1) {
4717         GROW;
4718
4719         if (ctxt->instate == XML_PARSER_EOF)
4720             break;
4721
4722         /*
4723          * Our tag or one of it's parent or children is ending.
4724          */
4725         if ((CUR == '<') && (NXT(1) == '/')) {
4726             if (htmlParseEndTag(ctxt) &&
4727                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728                 if (currentNode != NULL)
4729                     xmlFree(currentNode);
4730
4731                 currentNode = xmlStrdup(ctxt->name);
4732                 depth = ctxt->nameNr;
4733             }
4734             continue; /* while */
4735         }
4736
4737         else if ((CUR == '<') &&
4738                  ((IS_ASCII_LETTER(NXT(1))) ||
4739                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4740             name = htmlParseHTMLName_nonInvasive(ctxt);
4741             if (name == NULL) {
4742                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743                          "htmlParseStartTag: invalid element name\n",
4744                          NULL, NULL);
4745                 /* Dump the bogus tag like browsers do */
4746                 while ((CUR == 0) && (CUR != '>'))
4747                     NEXT;
4748
4749                 htmlParserFinishElementParsing(ctxt);
4750                 if (currentNode != NULL)
4751                     xmlFree(currentNode);
4752
4753                 currentNode = xmlStrdup(ctxt->name);
4754                 depth = ctxt->nameNr;
4755                 continue;
4756             }
4757
4758             if (ctxt->name != NULL) {
4759                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760                     htmlAutoClose(ctxt, name);
4761                     continue;
4762                 }
4763             }
4764         }
4765
4766         /*
4767          * Has this node been popped out during parsing of
4768          * the next element
4769          */
4770         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771             (!xmlStrEqual(currentNode, ctxt->name)))
4772              {
4773             htmlParserFinishElementParsing(ctxt);
4774             if (currentNode != NULL) xmlFree(currentNode);
4775
4776             currentNode = xmlStrdup(ctxt->name);
4777             depth = ctxt->nameNr;
4778             continue;
4779         }
4780
4781         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783             /*
4784              * Handle SCRIPT/STYLE separately
4785              */
4786             htmlParseScript(ctxt);
4787         } else {
4788             /*
4789              * Sometimes DOCTYPE arrives in the middle of the document
4790              */
4791             if ((CUR == '<') && (NXT(1) == '!') &&
4792                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4793                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4794                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795                 (UPP(8) == 'E')) {
4796                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797                              "Misplaced DOCTYPE declaration\n",
4798                              BAD_CAST "DOCTYPE" , NULL);
4799                 htmlParseDocTypeDecl(ctxt);
4800             }
4801
4802             /*
4803              * First case :  a comment
4804              */
4805             if ((CUR == '<') && (NXT(1) == '!') &&
4806                 (NXT(2) == '-') && (NXT(3) == '-')) {
4807                 htmlParseComment(ctxt);
4808             }
4809
4810             /*
4811              * Second case : a Processing Instruction.
4812              */
4813             else if ((CUR == '<') && (NXT(1) == '?')) {
4814                 htmlParsePI(ctxt);
4815             }
4816
4817             /*
4818              * Third case :  a sub-element.
4819              */
4820             else if (CUR == '<') {
4821                 htmlParseElementInternal(ctxt);
4822                 if (currentNode != NULL) xmlFree(currentNode);
4823
4824                 currentNode = xmlStrdup(ctxt->name);
4825                 depth = ctxt->nameNr;
4826             }
4827
4828             /*
4829              * Fourth case : a reference. If if has not been resolved,
4830              *    parsing returns it's Name, create the node
4831              */
4832             else if (CUR == '&') {
4833                 htmlParseReference(ctxt);
4834             }
4835
4836             /*
4837              * Fifth case : end of the resource
4838              */
4839             else if (CUR == 0) {
4840                 htmlAutoCloseOnEnd(ctxt);
4841                 break;
4842             }
4843
4844             /*
4845              * Last case, text. Note that References are handled directly.
4846              */
4847             else {
4848                 htmlParseCharData(ctxt);
4849             }
4850         }
4851         GROW;
4852     }
4853     if (currentNode != NULL) xmlFree(currentNode);
4854 }
4855
4856 /**
4857  * htmlParseContent:
4858  * @ctxt:  an HTML parser context
4859  *
4860  * Parse a content: comment, sub-element, reference or text.
4861  * This is the entry point when called from parser.c
4862  */
4863
4864 void
4865 __htmlParseContent(void *ctxt) {
4866     if (ctxt != NULL)
4867         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4868 }
4869
4870 /**
4871  * htmlParseDocument:
4872  * @ctxt:  an HTML parser context
4873  *
4874  * parse an HTML document (and build a tree if using the standard SAX
4875  * interface).
4876  *
4877  * Returns 0, -1 in case of error. the parser context is augmented
4878  *                as a result of the parsing.
4879  */
4880
4881 int
4882 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4883     xmlChar start[4];
4884     xmlCharEncoding enc;
4885     xmlDtdPtr dtd;
4886
4887     xmlInitParser();
4888
4889     htmlDefaultSAXHandlerInit();
4890
4891     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4892         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893                      "htmlParseDocument: context error\n", NULL, NULL);
4894         return(XML_ERR_INTERNAL_ERROR);
4895     }
4896     ctxt->html = 1;
4897     ctxt->linenumbers = 1;
4898     GROW;
4899     /*
4900      * SAX: beginning of the document processing.
4901      */
4902     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4903         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4904
4905     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4906         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4907         /*
4908          * Get the 4 first bytes and decode the charset
4909          * if enc != XML_CHAR_ENCODING_NONE
4910          * plug some encoding conversion routines.
4911          */
4912         start[0] = RAW;
4913         start[1] = NXT(1);
4914         start[2] = NXT(2);
4915         start[3] = NXT(3);
4916         enc = xmlDetectCharEncoding(&start[0], 4);
4917         if (enc != XML_CHAR_ENCODING_NONE) {
4918             xmlSwitchEncoding(ctxt, enc);
4919         }
4920     }
4921
4922     /*
4923      * Wipe out everything which is before the first '<'
4924      */
4925     SKIP_BLANKS;
4926     if (CUR == 0) {
4927         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4928                      "Document is empty\n", NULL, NULL);
4929     }
4930
4931     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4932         ctxt->sax->startDocument(ctxt->userData);
4933
4934
4935     /*
4936      * Parse possible comments and PIs before any content
4937      */
4938     while (((CUR == '<') && (NXT(1) == '!') &&
4939             (NXT(2) == '-') && (NXT(3) == '-')) ||
4940            ((CUR == '<') && (NXT(1) == '?'))) {
4941         htmlParseComment(ctxt);
4942         htmlParsePI(ctxt);
4943         SKIP_BLANKS;
4944     }
4945
4946
4947     /*
4948      * Then possibly doc type declaration(s) and more Misc
4949      * (doctypedecl Misc*)?
4950      */
4951     if ((CUR == '<') && (NXT(1) == '!') &&
4952         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4953         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4954         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4955         (UPP(8) == 'E')) {
4956         htmlParseDocTypeDecl(ctxt);
4957     }
4958     SKIP_BLANKS;
4959
4960     /*
4961      * Parse possible comments and PIs before any content
4962      */
4963     while (((CUR == '<') && (NXT(1) == '!') &&
4964             (NXT(2) == '-') && (NXT(3) == '-')) ||
4965            ((CUR == '<') && (NXT(1) == '?'))) {
4966         htmlParseComment(ctxt);
4967         htmlParsePI(ctxt);
4968         SKIP_BLANKS;
4969     }
4970
4971     /*
4972      * Time to start parsing the tree itself
4973      */
4974     htmlParseContentInternal(ctxt);
4975
4976     /*
4977      * autoclose
4978      */
4979     if (CUR == 0)
4980         htmlAutoCloseOnEnd(ctxt);
4981
4982
4983     /*
4984      * SAX: end of the document processing.
4985      */
4986     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4987         ctxt->sax->endDocument(ctxt->userData);
4988
4989     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4990         dtd = xmlGetIntSubset(ctxt->myDoc);
4991         if (dtd == NULL)
4992             ctxt->myDoc->intSubset =
4993                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4994                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4995                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4996     }
4997     if (! ctxt->wellFormed) return(-1);
4998     return(0);
4999 }
5000
5001
5002 /************************************************************************
5003  *                                                                      *
5004  *                      Parser contexts handling                        *
5005  *                                                                      *
5006  ************************************************************************/
5007
5008 /**
5009  * htmlInitParserCtxt:
5010  * @ctxt:  an HTML parser context
5011  *
5012  * Initialize a parser context
5013  *
5014  * Returns 0 in case of success and -1 in case of error
5015  */
5016
5017 static int
5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5019 {
5020     htmlSAXHandler *sax;
5021
5022     if (ctxt == NULL) return(-1);
5023     memset(ctxt, 0, sizeof(htmlParserCtxt));
5024
5025     ctxt->dict = xmlDictCreate();
5026     if (ctxt->dict == NULL) {
5027         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5028         return(-1);
5029     }
5030     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5031     if (sax == NULL) {
5032         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5033         return(-1);
5034     }
5035     else
5036         memset(sax, 0, sizeof(htmlSAXHandler));
5037
5038     /* Allocate the Input stack */
5039     ctxt->inputTab = (htmlParserInputPtr *)
5040                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5041     if (ctxt->inputTab == NULL) {
5042         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5043         ctxt->inputNr = 0;
5044         ctxt->inputMax = 0;
5045         ctxt->input = NULL;
5046         return(-1);
5047     }
5048     ctxt->inputNr = 0;
5049     ctxt->inputMax = 5;
5050     ctxt->input = NULL;
5051     ctxt->version = NULL;
5052     ctxt->encoding = NULL;
5053     ctxt->standalone = -1;
5054     ctxt->instate = XML_PARSER_START;
5055
5056     /* Allocate the Node stack */
5057     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5058     if (ctxt->nodeTab == NULL) {
5059         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5060         ctxt->nodeNr = 0;
5061         ctxt->nodeMax = 0;
5062         ctxt->node = NULL;
5063         ctxt->inputNr = 0;
5064         ctxt->inputMax = 0;
5065         ctxt->input = NULL;
5066         return(-1);
5067     }
5068     ctxt->nodeNr = 0;
5069     ctxt->nodeMax = 10;
5070     ctxt->node = NULL;
5071
5072     /* Allocate the Name stack */
5073     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5074     if (ctxt->nameTab == NULL) {
5075         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5076         ctxt->nameNr = 0;
5077         ctxt->nameMax = 0;
5078         ctxt->name = NULL;
5079         ctxt->nodeNr = 0;
5080         ctxt->nodeMax = 0;
5081         ctxt->node = NULL;
5082         ctxt->inputNr = 0;
5083         ctxt->inputMax = 0;
5084         ctxt->input = NULL;
5085         return(-1);
5086     }
5087     ctxt->nameNr = 0;
5088     ctxt->nameMax = 10;
5089     ctxt->name = NULL;
5090
5091     ctxt->nodeInfoTab = NULL;
5092     ctxt->nodeInfoNr  = 0;
5093     ctxt->nodeInfoMax = 0;
5094
5095     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5096     else {
5097         ctxt->sax = sax;
5098         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5099     }
5100     ctxt->userData = ctxt;
5101     ctxt->myDoc = NULL;
5102     ctxt->wellFormed = 1;
5103     ctxt->replaceEntities = 0;
5104     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5105     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5106     ctxt->html = 1;
5107     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5108     ctxt->vctxt.userData = ctxt;
5109     ctxt->vctxt.error = xmlParserValidityError;
5110     ctxt->vctxt.warning = xmlParserValidityWarning;
5111     ctxt->record_info = 0;
5112     ctxt->validate = 0;
5113     ctxt->checkIndex = 0;
5114     ctxt->catalogs = NULL;
5115     xmlInitNodeInfoSeq(&ctxt->node_seq);
5116     return(0);
5117 }
5118
5119 /**
5120  * htmlFreeParserCtxt:
5121  * @ctxt:  an HTML parser context
5122  *
5123  * Free all the memory used by a parser context. However the parsed
5124  * document in ctxt->myDoc is not freed.
5125  */
5126
5127 void
5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5129 {
5130     xmlFreeParserCtxt(ctxt);
5131 }
5132
5133 /**
5134  * htmlNewParserCtxt:
5135  *
5136  * Allocate and initialize a new parser context.
5137  *
5138  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5139  */
5140
5141 htmlParserCtxtPtr
5142 htmlNewParserCtxt(void)
5143 {
5144     xmlParserCtxtPtr ctxt;
5145
5146     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5147     if (ctxt == NULL) {
5148         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5149         return(NULL);
5150     }
5151     memset(ctxt, 0, sizeof(xmlParserCtxt));
5152     if (htmlInitParserCtxt(ctxt) < 0) {
5153         htmlFreeParserCtxt(ctxt);
5154         return(NULL);
5155     }
5156     return(ctxt);
5157 }
5158
5159 /**
5160  * htmlCreateMemoryParserCtxt:
5161  * @buffer:  a pointer to a char array
5162  * @size:  the size of the array
5163  *
5164  * Create a parser context for an HTML in-memory document.
5165  *
5166  * Returns the new parser context or NULL
5167  */
5168 htmlParserCtxtPtr
5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5170     xmlParserCtxtPtr ctxt;
5171     xmlParserInputPtr input;
5172     xmlParserInputBufferPtr buf;
5173
5174     if (buffer == NULL)
5175         return(NULL);
5176     if (size <= 0)
5177         return(NULL);
5178
5179     ctxt = htmlNewParserCtxt();
5180     if (ctxt == NULL)
5181         return(NULL);
5182
5183     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5184     if (buf == NULL) return(NULL);
5185
5186     input = xmlNewInputStream(ctxt);
5187     if (input == NULL) {
5188         xmlFreeParserCtxt(ctxt);
5189         return(NULL);
5190     }
5191
5192     input->filename = NULL;
5193     input->buf = buf;
5194     xmlBufResetInput(buf->buffer, input);
5195
5196     inputPush(ctxt, input);
5197     return(ctxt);
5198 }
5199
5200 /**
5201  * htmlCreateDocParserCtxt:
5202  * @cur:  a pointer to an array of xmlChar
5203  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5204  *
5205  * Create a parser context for an HTML document.
5206  *
5207  * TODO: check the need to add encoding handling there
5208  *
5209  * Returns the new parser context or NULL
5210  */
5211 static htmlParserCtxtPtr
5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5213     int len;
5214     htmlParserCtxtPtr ctxt;
5215
5216     if (cur == NULL)
5217         return(NULL);
5218     len = xmlStrlen(cur);
5219     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5220     if (ctxt == NULL)
5221         return(NULL);
5222
5223     if (encoding != NULL) {
5224         xmlCharEncoding enc;
5225         xmlCharEncodingHandlerPtr handler;
5226
5227         if (ctxt->input->encoding != NULL)
5228             xmlFree((xmlChar *) ctxt->input->encoding);
5229         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5230
5231         enc = xmlParseCharEncoding(encoding);
5232         /*
5233          * registered set of known encodings
5234          */
5235         if (enc != XML_CHAR_ENCODING_ERROR) {
5236             xmlSwitchEncoding(ctxt, enc);
5237             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5238                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5239                              "Unsupported encoding %s\n",
5240                              (const xmlChar *) encoding, NULL);
5241             }
5242         } else {
5243             /*
5244              * fallback for unknown encodings
5245              */
5246             handler = xmlFindCharEncodingHandler((const char *) encoding);
5247             if (handler != NULL) {
5248                 xmlSwitchToEncoding(ctxt, handler);
5249             } else {
5250                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5251                              "Unsupported encoding %s\n",
5252                              (const xmlChar *) encoding, NULL);
5253             }
5254         }
5255     }
5256     return(ctxt);
5257 }
5258
5259 #ifdef LIBXML_PUSH_ENABLED
5260 /************************************************************************
5261  *                                                                      *
5262  *      Progressive parsing interfaces                          *
5263  *                                                                      *
5264  ************************************************************************/
5265
5266 /**
5267  * htmlParseLookupSequence:
5268  * @ctxt:  an HTML parser context
5269  * @first:  the first char to lookup
5270  * @next:  the next char to lookup or zero
5271  * @third:  the next char to lookup or zero
5272  * @ignoreattrval: skip over attribute values
5273  *
5274  * Try to find if a sequence (first, next, third) or  just (first next) or
5275  * (first) is available in the input stream.
5276  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5277  * to avoid rescanning sequences of bytes, it DOES change the state of the
5278  * parser, do not use liberally.
5279  * This is basically similar to xmlParseLookupSequence()
5280  *
5281  * Returns the index to the current parsing point if the full sequence
5282  *      is available, -1 otherwise.
5283  */
5284 static int
5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5286                         xmlChar next, xmlChar third, int ignoreattrval)
5287 {
5288     int base, len;
5289     htmlParserInputPtr in;
5290     const xmlChar *buf;
5291     int invalue = 0;
5292     char valdellim = 0x0;
5293
5294     in = ctxt->input;
5295     if (in == NULL)
5296         return (-1);
5297
5298     base = in->cur - in->base;
5299     if (base < 0)
5300         return (-1);
5301
5302     if (ctxt->checkIndex > base) {
5303         base = ctxt->checkIndex;
5304         /* Abuse hasPErefs member to restore current state. */
5305         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5306     }
5307
5308     if (in->buf == NULL) {
5309         buf = in->base;
5310         len = in->length;
5311     } else {
5312         buf = xmlBufContent(in->buf->buffer);
5313         len = xmlBufUse(in->buf->buffer);
5314     }
5315
5316     /* take into account the sequence length */
5317     if (third)
5318         len -= 2;
5319     else if (next)
5320         len--;
5321     for (; base < len; base++) {
5322         if (ignoreattrval) {
5323             if (buf[base] == '"' || buf[base] == '\'') {
5324                 if (invalue) {
5325                     if (buf[base] == valdellim) {
5326                         invalue = 0;
5327                         continue;
5328                     }
5329                 } else {
5330                     valdellim = buf[base];
5331                     invalue = 1;
5332                     continue;
5333                 }
5334             } else if (invalue) {
5335                 continue;
5336             }
5337         }
5338         if (buf[base] == first) {
5339             if (third != 0) {
5340                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5341                     continue;
5342             } else if (next != 0) {
5343                 if (buf[base + 1] != next)
5344                     continue;
5345             }
5346             ctxt->checkIndex = 0;
5347 #ifdef DEBUG_PUSH
5348             if (next == 0)
5349                 xmlGenericError(xmlGenericErrorContext,
5350                                 "HPP: lookup '%c' found at %d\n",
5351                                 first, base);
5352             else if (third == 0)
5353                 xmlGenericError(xmlGenericErrorContext,
5354                                 "HPP: lookup '%c%c' found at %d\n",
5355                                 first, next, base);
5356             else
5357                 xmlGenericError(xmlGenericErrorContext,
5358                                 "HPP: lookup '%c%c%c' found at %d\n",
5359                                 first, next, third, base);
5360 #endif
5361             return (base - (in->cur - in->base));
5362         }
5363     }
5364     ctxt->checkIndex = base;
5365     /* Abuse hasPErefs member to track current state. */
5366     if (invalue)
5367         ctxt->hasPErefs |= 1;
5368     else
5369         ctxt->hasPErefs &= ~1;
5370 #ifdef DEBUG_PUSH
5371     if (next == 0)
5372         xmlGenericError(xmlGenericErrorContext,
5373                         "HPP: lookup '%c' failed\n", first);
5374     else if (third == 0)
5375         xmlGenericError(xmlGenericErrorContext,
5376                         "HPP: lookup '%c%c' failed\n", first, next);
5377     else
5378         xmlGenericError(xmlGenericErrorContext,
5379                         "HPP: lookup '%c%c%c' failed\n", first, next,
5380                         third);
5381 #endif
5382     return (-1);
5383 }
5384
5385 /**
5386  * htmlParseLookupCommentEnd:
5387  * @ctxt: an HTML parser context
5388  *
5389  * Try to find a comment end tag in the input stream
5390  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5391  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5392  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5393  * to avoid rescanning sequences of bytes, it DOES change the state of the
5394  * parser, do not use liberally.
5395  * This wraps to htmlParseLookupSequence()
5396  *
5397  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5398  */
5399 static int
5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5401 {
5402     int mark = 0;
5403     int cur = CUR_PTR - BASE_PTR;
5404
5405     while (mark >= 0) {
5406         mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5407         if ((mark < 0) ||
5408             (NXT(mark+2) == '>') ||
5409             ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5410             return mark;
5411         }
5412         ctxt->checkIndex = cur + mark + 1;
5413     }
5414     return mark;
5415 }
5416
5417
5418 /**
5419  * htmlParseTryOrFinish:
5420  * @ctxt:  an HTML parser context
5421  * @terminate:  last chunk indicator
5422  *
5423  * Try to progress on parsing
5424  *
5425  * Returns zero if no parsing was possible
5426  */
5427 static int
5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5429     int ret = 0;
5430     htmlParserInputPtr in;
5431     ptrdiff_t avail = 0;
5432     xmlChar cur, next;
5433
5434     htmlParserNodeInfo node_info;
5435
5436 #ifdef DEBUG_PUSH
5437     switch (ctxt->instate) {
5438         case XML_PARSER_EOF:
5439             xmlGenericError(xmlGenericErrorContext,
5440                     "HPP: try EOF\n"); break;
5441         case XML_PARSER_START:
5442             xmlGenericError(xmlGenericErrorContext,
5443                     "HPP: try START\n"); break;
5444         case XML_PARSER_MISC:
5445             xmlGenericError(xmlGenericErrorContext,
5446                     "HPP: try MISC\n");break;
5447         case XML_PARSER_COMMENT:
5448             xmlGenericError(xmlGenericErrorContext,
5449                     "HPP: try COMMENT\n");break;
5450         case XML_PARSER_PROLOG:
5451             xmlGenericError(xmlGenericErrorContext,
5452                     "HPP: try PROLOG\n");break;
5453         case XML_PARSER_START_TAG:
5454             xmlGenericError(xmlGenericErrorContext,
5455                     "HPP: try START_TAG\n");break;
5456         case XML_PARSER_CONTENT:
5457             xmlGenericError(xmlGenericErrorContext,
5458                     "HPP: try CONTENT\n");break;
5459         case XML_PARSER_CDATA_SECTION:
5460             xmlGenericError(xmlGenericErrorContext,
5461                     "HPP: try CDATA_SECTION\n");break;
5462         case XML_PARSER_END_TAG:
5463             xmlGenericError(xmlGenericErrorContext,
5464                     "HPP: try END_TAG\n");break;
5465         case XML_PARSER_ENTITY_DECL:
5466             xmlGenericError(xmlGenericErrorContext,
5467                     "HPP: try ENTITY_DECL\n");break;
5468         case XML_PARSER_ENTITY_VALUE:
5469             xmlGenericError(xmlGenericErrorContext,
5470                     "HPP: try ENTITY_VALUE\n");break;
5471         case XML_PARSER_ATTRIBUTE_VALUE:
5472             xmlGenericError(xmlGenericErrorContext,
5473                     "HPP: try ATTRIBUTE_VALUE\n");break;
5474         case XML_PARSER_DTD:
5475             xmlGenericError(xmlGenericErrorContext,
5476                     "HPP: try DTD\n");break;
5477         case XML_PARSER_EPILOG:
5478             xmlGenericError(xmlGenericErrorContext,
5479                     "HPP: try EPILOG\n");break;
5480         case XML_PARSER_PI:
5481             xmlGenericError(xmlGenericErrorContext,
5482                     "HPP: try PI\n");break;
5483         case XML_PARSER_SYSTEM_LITERAL:
5484             xmlGenericError(xmlGenericErrorContext,
5485                     "HPP: try SYSTEM_LITERAL\n");break;
5486     }
5487 #endif
5488
5489     while (1) {
5490
5491         in = ctxt->input;
5492         if (in == NULL) break;
5493         if (in->buf == NULL)
5494             avail = in->length - (in->cur - in->base);
5495         else
5496             avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5497                     (in->cur - in->base);
5498         if ((avail == 0) && (terminate)) {
5499             htmlAutoCloseOnEnd(ctxt);
5500             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5501                 /*
5502                  * SAX: end of the document processing.
5503                  */
5504                 ctxt->instate = XML_PARSER_EOF;
5505                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5506                     ctxt->sax->endDocument(ctxt->userData);
5507             }
5508         }
5509         if (avail < 1)
5510             goto done;
5511         /*
5512          * This is done to make progress and avoid an infinite loop
5513          * if a parsing attempt was aborted by hitting a NUL byte. After
5514          * changing htmlCurrentChar, this probably isn't necessary anymore.
5515          * We should consider removing this check.
5516          */
5517         cur = in->cur[0];
5518         if (cur == 0) {
5519             SKIP(1);
5520             continue;
5521         }
5522
5523         switch (ctxt->instate) {
5524             case XML_PARSER_EOF:
5525                 /*
5526                  * Document parsing is done !
5527                  */
5528                 goto done;
5529             case XML_PARSER_START:
5530                 /*
5531                  * Very first chars read from the document flow.
5532                  */
5533                 cur = in->cur[0];
5534                 if (IS_BLANK_CH(cur)) {
5535                     SKIP_BLANKS;
5536                     if (in->buf == NULL)
5537                         avail = in->length - (in->cur - in->base);
5538                     else
5539                         avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5540                                 (in->cur - in->base);
5541                 }
5542                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5543                     ctxt->sax->setDocumentLocator(ctxt->userData,
5544                                                   &xmlDefaultSAXLocator);
5545                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5546                     (!ctxt->disableSAX))
5547                     ctxt->sax->startDocument(ctxt->userData);
5548
5549                 cur = in->cur[0];
5550                 next = in->cur[1];
5551                 if ((cur == '<') && (next == '!') &&
5552                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5553                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5554                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5555                     (UPP(8) == 'E')) {
5556                     if ((!terminate) &&
5557                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5558                         goto done;
5559 #ifdef DEBUG_PUSH
5560                     xmlGenericError(xmlGenericErrorContext,
5561                             "HPP: Parsing internal subset\n");
5562 #endif
5563                     htmlParseDocTypeDecl(ctxt);
5564                     ctxt->instate = XML_PARSER_PROLOG;
5565 #ifdef DEBUG_PUSH
5566                     xmlGenericError(xmlGenericErrorContext,
5567                             "HPP: entering PROLOG\n");
5568 #endif
5569                 } else {
5570                     ctxt->instate = XML_PARSER_MISC;
5571 #ifdef DEBUG_PUSH
5572                     xmlGenericError(xmlGenericErrorContext,
5573                             "HPP: entering MISC\n");
5574 #endif
5575                 }
5576                 break;
5577             case XML_PARSER_MISC:
5578                 SKIP_BLANKS;
5579                 if (in->buf == NULL)
5580                     avail = in->length - (in->cur - in->base);
5581                 else
5582                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5583                             (in->cur - in->base);
5584                 /*
5585                  * no chars in buffer
5586                  */
5587                 if (avail < 1)
5588                     goto done;
5589                 /*
5590                  * not enough chars in buffer
5591                  */
5592                 if (avail < 2) {
5593                     if (!terminate)
5594                         goto done;
5595                     else
5596                         next = ' ';
5597                 } else {
5598                     next = in->cur[1];
5599                 }
5600                 cur = in->cur[0];
5601                 if ((cur == '<') && (next == '!') &&
5602                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5603                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5604                         goto done;
5605 #ifdef DEBUG_PUSH
5606                     xmlGenericError(xmlGenericErrorContext,
5607                             "HPP: Parsing Comment\n");
5608 #endif
5609                     htmlParseComment(ctxt);
5610                     ctxt->instate = XML_PARSER_MISC;
5611                 } else if ((cur == '<') && (next == '?')) {
5612                     if ((!terminate) &&
5613                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5614                         goto done;
5615 #ifdef DEBUG_PUSH
5616                     xmlGenericError(xmlGenericErrorContext,
5617                             "HPP: Parsing PI\n");
5618 #endif
5619                     htmlParsePI(ctxt);
5620                     ctxt->instate = XML_PARSER_MISC;
5621                 } else if ((cur == '<') && (next == '!') &&
5622                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5623                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5624                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5625                     (UPP(8) == 'E')) {
5626                     if ((!terminate) &&
5627                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5628                         goto done;
5629 #ifdef DEBUG_PUSH
5630                     xmlGenericError(xmlGenericErrorContext,
5631                             "HPP: Parsing internal subset\n");
5632 #endif
5633                     htmlParseDocTypeDecl(ctxt);
5634                     ctxt->instate = XML_PARSER_PROLOG;
5635 #ifdef DEBUG_PUSH
5636                     xmlGenericError(xmlGenericErrorContext,
5637                             "HPP: entering PROLOG\n");
5638 #endif
5639                 } else if ((cur == '<') && (next == '!') &&
5640                            (avail < 9)) {
5641                     goto done;
5642                 } else {
5643                     ctxt->instate = XML_PARSER_CONTENT;
5644 #ifdef DEBUG_PUSH
5645                     xmlGenericError(xmlGenericErrorContext,
5646                             "HPP: entering START_TAG\n");
5647 #endif
5648                 }
5649                 break;
5650             case XML_PARSER_PROLOG:
5651                 SKIP_BLANKS;
5652                 if (in->buf == NULL)
5653                     avail = in->length - (in->cur - in->base);
5654                 else
5655                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5656                             (in->cur - in->base);
5657                 if (avail < 2)
5658                     goto done;
5659                 cur = in->cur[0];
5660                 next = in->cur[1];
5661                 if ((cur == '<') && (next == '!') &&
5662                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5663                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5664                         goto done;
5665 #ifdef DEBUG_PUSH
5666                     xmlGenericError(xmlGenericErrorContext,
5667                             "HPP: Parsing Comment\n");
5668 #endif
5669                     htmlParseComment(ctxt);
5670                     ctxt->instate = XML_PARSER_PROLOG;
5671                 } else if ((cur == '<') && (next == '?')) {
5672                     if ((!terminate) &&
5673                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5674                         goto done;
5675 #ifdef DEBUG_PUSH
5676                     xmlGenericError(xmlGenericErrorContext,
5677                             "HPP: Parsing PI\n");
5678 #endif
5679                     htmlParsePI(ctxt);
5680                     ctxt->instate = XML_PARSER_PROLOG;
5681                 } else if ((cur == '<') && (next == '!') &&
5682                            (avail < 4)) {
5683                     goto done;
5684                 } else {
5685                     ctxt->instate = XML_PARSER_CONTENT;
5686 #ifdef DEBUG_PUSH
5687                     xmlGenericError(xmlGenericErrorContext,
5688                             "HPP: entering START_TAG\n");
5689 #endif
5690                 }
5691                 break;
5692             case XML_PARSER_EPILOG:
5693                 if (in->buf == NULL)
5694                     avail = in->length - (in->cur - in->base);
5695                 else
5696                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5697                             (in->cur - in->base);
5698                 if (avail < 1)
5699                     goto done;
5700                 cur = in->cur[0];
5701                 if (IS_BLANK_CH(cur)) {
5702                     htmlParseCharData(ctxt);
5703                     goto done;
5704                 }
5705                 if (avail < 2)
5706                     goto done;
5707                 next = in->cur[1];
5708                 if ((cur == '<') && (next == '!') &&
5709                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5710                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5711                         goto done;
5712 #ifdef DEBUG_PUSH
5713                     xmlGenericError(xmlGenericErrorContext,
5714                             "HPP: Parsing Comment\n");
5715 #endif
5716                     htmlParseComment(ctxt);
5717                     ctxt->instate = XML_PARSER_EPILOG;
5718                 } else if ((cur == '<') && (next == '?')) {
5719                     if ((!terminate) &&
5720                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5721                         goto done;
5722 #ifdef DEBUG_PUSH
5723                     xmlGenericError(xmlGenericErrorContext,
5724                             "HPP: Parsing PI\n");
5725 #endif
5726                     htmlParsePI(ctxt);
5727                     ctxt->instate = XML_PARSER_EPILOG;
5728                 } else if ((cur == '<') && (next == '!') &&
5729                            (avail < 4)) {
5730                     goto done;
5731                 } else {
5732                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5733                     ctxt->wellFormed = 0;
5734                     ctxt->instate = XML_PARSER_EOF;
5735 #ifdef DEBUG_PUSH
5736                     xmlGenericError(xmlGenericErrorContext,
5737                             "HPP: entering EOF\n");
5738 #endif
5739                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5740                         ctxt->sax->endDocument(ctxt->userData);
5741                     goto done;
5742                 }
5743                 break;
5744             case XML_PARSER_START_TAG: {
5745                 const xmlChar *name;
5746                 int failed;
5747                 const htmlElemDesc * info;
5748
5749                 /*
5750                  * no chars in buffer
5751                  */
5752                 if (avail < 1)
5753                     goto done;
5754                 /*
5755                  * not enough chars in buffer
5756                  */
5757                 if (avail < 2) {
5758                     if (!terminate)
5759                         goto done;
5760                     else
5761                         next = ' ';
5762                 } else {
5763                     next = in->cur[1];
5764                 }
5765                 cur = in->cur[0];
5766                 if (cur != '<') {
5767                     ctxt->instate = XML_PARSER_CONTENT;
5768 #ifdef DEBUG_PUSH
5769                     xmlGenericError(xmlGenericErrorContext,
5770                             "HPP: entering CONTENT\n");
5771 #endif
5772                     break;
5773                 }
5774                 if (next == '/') {
5775                     ctxt->instate = XML_PARSER_END_TAG;
5776                     ctxt->checkIndex = 0;
5777 #ifdef DEBUG_PUSH
5778                     xmlGenericError(xmlGenericErrorContext,
5779                             "HPP: entering END_TAG\n");
5780 #endif
5781                     break;
5782                 }
5783                 if ((!terminate) &&
5784                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5785                     goto done;
5786
5787                 /* Capture start position */
5788                 if (ctxt->record_info) {
5789                      node_info.begin_pos = ctxt->input->consumed +
5790                                         (CUR_PTR - ctxt->input->base);
5791                      node_info.begin_line = ctxt->input->line;
5792                 }
5793
5794
5795                 failed = htmlParseStartTag(ctxt);
5796                 name = ctxt->name;
5797                 if ((failed == -1) ||
5798                     (name == NULL)) {
5799                     if (CUR == '>')
5800                         NEXT;
5801                     break;
5802                 }
5803
5804                 /*
5805                  * Lookup the info for that element.
5806                  */
5807                 info = htmlTagLookup(name);
5808                 if (info == NULL) {
5809                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5810                                  "Tag %s invalid\n", name, NULL);
5811                 }
5812
5813                 /*
5814                  * Check for an Empty Element labeled the XML/SGML way
5815                  */
5816                 if ((CUR == '/') && (NXT(1) == '>')) {
5817                     SKIP(2);
5818                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5819                         ctxt->sax->endElement(ctxt->userData, name);
5820                     htmlnamePop(ctxt);
5821                     ctxt->instate = XML_PARSER_CONTENT;
5822 #ifdef DEBUG_PUSH
5823                     xmlGenericError(xmlGenericErrorContext,
5824                             "HPP: entering CONTENT\n");
5825 #endif
5826                     break;
5827                 }
5828
5829                 if (CUR == '>') {
5830                     NEXT;
5831                 } else {
5832                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5833                                  "Couldn't find end of Start Tag %s\n",
5834                                  name, NULL);
5835
5836                     /*
5837                      * end of parsing of this node.
5838                      */
5839                     if (xmlStrEqual(name, ctxt->name)) {
5840                         nodePop(ctxt);
5841                         htmlnamePop(ctxt);
5842                     }
5843
5844                     if (ctxt->record_info)
5845                         htmlNodeInfoPush(ctxt, &node_info);
5846
5847                     ctxt->instate = XML_PARSER_CONTENT;
5848 #ifdef DEBUG_PUSH
5849                     xmlGenericError(xmlGenericErrorContext,
5850                             "HPP: entering CONTENT\n");
5851 #endif
5852                     break;
5853                 }
5854
5855                 /*
5856                  * Check for an Empty Element from DTD definition
5857                  */
5858                 if ((info != NULL) && (info->empty)) {
5859                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5860                         ctxt->sax->endElement(ctxt->userData, name);
5861                     htmlnamePop(ctxt);
5862                 }
5863
5864                 if (ctxt->record_info)
5865                     htmlNodeInfoPush(ctxt, &node_info);
5866
5867                 ctxt->instate = XML_PARSER_CONTENT;
5868 #ifdef DEBUG_PUSH
5869                 xmlGenericError(xmlGenericErrorContext,
5870                         "HPP: entering CONTENT\n");
5871 #endif
5872                 break;
5873             }
5874             case XML_PARSER_CONTENT: {
5875                 xmlChar chr[2] = { 0, 0 };
5876
5877                 /*
5878                  * Handle preparsed entities and charRef
5879                  */
5880                 if (ctxt->token != 0) {
5881                     chr[0] = (xmlChar) ctxt->token;
5882                     htmlCheckParagraph(ctxt);
5883                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5884                         ctxt->sax->characters(ctxt->userData, chr, 1);
5885                     ctxt->token = 0;
5886                     ctxt->checkIndex = 0;
5887                 }
5888                 if ((avail == 1) && (terminate)) {
5889                     cur = in->cur[0];
5890                     if ((cur != '<') && (cur != '&')) {
5891                         if (ctxt->sax != NULL) {
5892                             chr[0] = cur;
5893                             if (IS_BLANK_CH(cur)) {
5894                                 if (ctxt->keepBlanks) {
5895                                     if (ctxt->sax->characters != NULL)
5896                                         ctxt->sax->characters(
5897                                                 ctxt->userData, chr, 1);
5898                                 } else {
5899                                     if (ctxt->sax->ignorableWhitespace != NULL)
5900                                         ctxt->sax->ignorableWhitespace(
5901                                                 ctxt->userData, chr, 1);
5902                                 }
5903                             } else {
5904                                 htmlCheckParagraph(ctxt);
5905                                 if (ctxt->sax->characters != NULL)
5906                                     ctxt->sax->characters(
5907                                             ctxt->userData, chr, 1);
5908                             }
5909                         }
5910                         ctxt->token = 0;
5911                         ctxt->checkIndex = 0;
5912                         in->cur++;
5913                         break;
5914                     }
5915                 }
5916                 if (avail < 2)
5917                     goto done;
5918                 cur = in->cur[0];
5919                 next = in->cur[1];
5920                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5921                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5922                     /*
5923                      * Handle SCRIPT/STYLE separately
5924                      */
5925                     if (!terminate) {
5926                         int idx;
5927                         xmlChar val;
5928
5929                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5930                         if (idx < 0)
5931                             goto done;
5932                         val = in->cur[idx + 2];
5933                         if (val == 0) /* bad cut of input */
5934                             goto done;
5935                     }
5936                     htmlParseScript(ctxt);
5937                     if ((cur == '<') && (next == '/')) {
5938                         ctxt->instate = XML_PARSER_END_TAG;
5939                         ctxt->checkIndex = 0;
5940 #ifdef DEBUG_PUSH
5941                         xmlGenericError(xmlGenericErrorContext,
5942                                 "HPP: entering END_TAG\n");
5943 #endif
5944                         break;
5945                     }
5946                 } else {
5947                     /*
5948                      * Sometimes DOCTYPE arrives in the middle of the document
5949                      */
5950                     if ((cur == '<') && (next == '!') &&
5951                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5952                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5953                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5954                         (UPP(8) == 'E')) {
5955                         if ((!terminate) &&
5956                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5957                             goto done;
5958                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5959                                      "Misplaced DOCTYPE declaration\n",
5960                                      BAD_CAST "DOCTYPE" , NULL);
5961                         htmlParseDocTypeDecl(ctxt);
5962                     } else if ((cur == '<') && (next == '!') &&
5963                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
5964                         if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5965                             goto done;
5966 #ifdef DEBUG_PUSH
5967                         xmlGenericError(xmlGenericErrorContext,
5968                                 "HPP: Parsing Comment\n");
5969 #endif
5970                         htmlParseComment(ctxt);
5971                         ctxt->instate = XML_PARSER_CONTENT;
5972                     } else if ((cur == '<') && (next == '?')) {
5973                         if ((!terminate) &&
5974                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5975                             goto done;
5976 #ifdef DEBUG_PUSH
5977                         xmlGenericError(xmlGenericErrorContext,
5978                                 "HPP: Parsing PI\n");
5979 #endif
5980                         htmlParsePI(ctxt);
5981                         ctxt->instate = XML_PARSER_CONTENT;
5982                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5983                         goto done;
5984                     } else if ((cur == '<') && (next == '/')) {
5985                         ctxt->instate = XML_PARSER_END_TAG;
5986                         ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988                         xmlGenericError(xmlGenericErrorContext,
5989                                 "HPP: entering END_TAG\n");
5990 #endif
5991                         break;
5992                     } else if (cur == '<') {
5993                         if ((!terminate) && (next == 0))
5994                             goto done;
5995                         /*
5996                          * Only switch to START_TAG if the next character
5997                          * starts a valid name. Otherwise, htmlParseStartTag
5998                          * might return without consuming all characters
5999                          * up to the final '>'.
6000                          */
6001                         if ((IS_ASCII_LETTER(next)) ||
6002                             (next == '_') || (next == ':') || (next == '.')) {
6003                             ctxt->instate = XML_PARSER_START_TAG;
6004                             ctxt->checkIndex = 0;
6005 #ifdef DEBUG_PUSH
6006                             xmlGenericError(xmlGenericErrorContext,
6007                                     "HPP: entering START_TAG\n");
6008 #endif
6009                         } else {
6010                             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
6011                                          "htmlParseTryOrFinish: "
6012                                          "invalid element name\n",
6013                                          NULL, NULL);
6014                             htmlCheckParagraph(ctxt);
6015                             if ((ctxt->sax != NULL) &&
6016                                 (ctxt->sax->characters != NULL))
6017                                 ctxt->sax->characters(ctxt->userData,
6018                                                       in->cur, 1);
6019                             NEXT;
6020                         }
6021                         break;
6022                     } else {
6023                         /*
6024                          * check that the text sequence is complete
6025                          * before handing out the data to the parser
6026                          * to avoid problems with erroneous end of
6027                          * data detection.
6028                          */
6029                         if ((!terminate) &&
6030                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6031                             goto done;
6032                         ctxt->checkIndex = 0;
6033 #ifdef DEBUG_PUSH
6034                         xmlGenericError(xmlGenericErrorContext,
6035                                 "HPP: Parsing char data\n");
6036 #endif
6037                         while ((ctxt->instate != XML_PARSER_EOF) &&
6038                                (cur != '<') && (in->cur < in->end)) {
6039                             if (cur == '&') {
6040                                 htmlParseReference(ctxt);
6041                             } else {
6042                                 htmlParseCharData(ctxt);
6043                             }
6044                             cur = in->cur[0];
6045                         }
6046                     }
6047                 }
6048
6049                 break;
6050             }
6051             case XML_PARSER_END_TAG:
6052                 if (avail < 2)
6053                     goto done;
6054                 if ((!terminate) &&
6055                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6056                     goto done;
6057                 htmlParseEndTag(ctxt);
6058                 if (ctxt->nameNr == 0) {
6059                     ctxt->instate = XML_PARSER_EPILOG;
6060                 } else {
6061                     ctxt->instate = XML_PARSER_CONTENT;
6062                 }
6063                 ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065                 xmlGenericError(xmlGenericErrorContext,
6066                         "HPP: entering CONTENT\n");
6067 #endif
6068                 break;
6069             case XML_PARSER_CDATA_SECTION:
6070                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071                         "HPP: internal error, state == CDATA\n",
6072                              NULL, NULL);
6073                 ctxt->instate = XML_PARSER_CONTENT;
6074                 ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076                 xmlGenericError(xmlGenericErrorContext,
6077                         "HPP: entering CONTENT\n");
6078 #endif
6079                 break;
6080             case XML_PARSER_DTD:
6081                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082                         "HPP: internal error, state == DTD\n",
6083                              NULL, NULL);
6084                 ctxt->instate = XML_PARSER_CONTENT;
6085                 ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087                 xmlGenericError(xmlGenericErrorContext,
6088                         "HPP: entering CONTENT\n");
6089 #endif
6090                 break;
6091             case XML_PARSER_COMMENT:
6092                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6093                         "HPP: internal error, state == COMMENT\n",
6094                              NULL, NULL);
6095                 ctxt->instate = XML_PARSER_CONTENT;
6096                 ctxt->checkIndex = 0;
6097 #ifdef DEBUG_PUSH
6098                 xmlGenericError(xmlGenericErrorContext,
6099                         "HPP: entering CONTENT\n");
6100 #endif
6101                 break;
6102             case XML_PARSER_PI:
6103                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6104                         "HPP: internal error, state == PI\n",
6105                              NULL, NULL);
6106                 ctxt->instate = XML_PARSER_CONTENT;
6107                 ctxt->checkIndex = 0;
6108 #ifdef DEBUG_PUSH
6109                 xmlGenericError(xmlGenericErrorContext,
6110                         "HPP: entering CONTENT\n");
6111 #endif
6112                 break;
6113             case XML_PARSER_ENTITY_DECL:
6114                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6115                         "HPP: internal error, state == ENTITY_DECL\n",
6116                              NULL, NULL);
6117                 ctxt->instate = XML_PARSER_CONTENT;
6118                 ctxt->checkIndex = 0;
6119 #ifdef DEBUG_PUSH
6120                 xmlGenericError(xmlGenericErrorContext,
6121                         "HPP: entering CONTENT\n");
6122 #endif
6123                 break;
6124             case XML_PARSER_ENTITY_VALUE:
6125                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6126                         "HPP: internal error, state == ENTITY_VALUE\n",
6127                              NULL, NULL);
6128                 ctxt->instate = XML_PARSER_CONTENT;
6129                 ctxt->checkIndex = 0;
6130 #ifdef DEBUG_PUSH
6131                 xmlGenericError(xmlGenericErrorContext,
6132                         "HPP: entering DTD\n");
6133 #endif
6134                 break;
6135             case XML_PARSER_ATTRIBUTE_VALUE:
6136                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6137                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6138                              NULL, NULL);
6139                 ctxt->instate = XML_PARSER_START_TAG;
6140                 ctxt->checkIndex = 0;
6141 #ifdef DEBUG_PUSH
6142                 xmlGenericError(xmlGenericErrorContext,
6143                         "HPP: entering START_TAG\n");
6144 #endif
6145                 break;
6146             case XML_PARSER_SYSTEM_LITERAL:
6147                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6148                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6149                              NULL, NULL);
6150                 ctxt->instate = XML_PARSER_CONTENT;
6151                 ctxt->checkIndex = 0;
6152 #ifdef DEBUG_PUSH
6153                 xmlGenericError(xmlGenericErrorContext,
6154                         "HPP: entering CONTENT\n");
6155 #endif
6156                 break;
6157             case XML_PARSER_IGNORE:
6158                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6159                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6160                              NULL, NULL);
6161                 ctxt->instate = XML_PARSER_CONTENT;
6162                 ctxt->checkIndex = 0;
6163 #ifdef DEBUG_PUSH
6164                 xmlGenericError(xmlGenericErrorContext,
6165                         "HPP: entering CONTENT\n");
6166 #endif
6167                 break;
6168             case XML_PARSER_PUBLIC_LITERAL:
6169                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6170                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6171                              NULL, NULL);
6172                 ctxt->instate = XML_PARSER_CONTENT;
6173                 ctxt->checkIndex = 0;
6174 #ifdef DEBUG_PUSH
6175                 xmlGenericError(xmlGenericErrorContext,
6176                         "HPP: entering CONTENT\n");
6177 #endif
6178                 break;
6179
6180         }
6181     }
6182 done:
6183     if ((avail == 0) && (terminate)) {
6184         htmlAutoCloseOnEnd(ctxt);
6185         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6186             /*
6187              * SAX: end of the document processing.
6188              */
6189             ctxt->instate = XML_PARSER_EOF;
6190             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6191                 ctxt->sax->endDocument(ctxt->userData);
6192         }
6193     }
6194     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6195         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6196          (ctxt->instate == XML_PARSER_EPILOG))) {
6197         xmlDtdPtr dtd;
6198         dtd = xmlGetIntSubset(ctxt->myDoc);
6199         if (dtd == NULL)
6200             ctxt->myDoc->intSubset =
6201                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6202                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6203                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6204     }
6205 #ifdef DEBUG_PUSH
6206     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6207 #endif
6208     return(ret);
6209 }
6210
6211 /**
6212  * htmlParseChunk:
6213  * @ctxt:  an HTML parser context
6214  * @chunk:  an char array
6215  * @size:  the size in byte of the chunk
6216  * @terminate:  last chunk indicator
6217  *
6218  * Parse a Chunk of memory
6219  *
6220  * Returns zero if no error, the xmlParserErrors otherwise.
6221  */
6222 int
6223 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6224               int terminate) {
6225     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6226         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6227                      "htmlParseChunk: context error\n", NULL, NULL);
6228         return(XML_ERR_INTERNAL_ERROR);
6229     }
6230     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6231         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6232         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6233         size_t cur = ctxt->input->cur - ctxt->input->base;
6234         int res;
6235
6236         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6237         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6238         if (res < 0) {
6239             ctxt->errNo = XML_PARSER_EOF;
6240             ctxt->disableSAX = 1;
6241             return (XML_PARSER_EOF);
6242         }
6243 #ifdef DEBUG_PUSH
6244         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6245 #endif
6246
6247 #if 0
6248         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6249             htmlParseTryOrFinish(ctxt, terminate);
6250 #endif
6251     } else if (ctxt->instate != XML_PARSER_EOF) {
6252         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6253             xmlParserInputBufferPtr in = ctxt->input->buf;
6254             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6255                     (in->raw != NULL)) {
6256                 int nbchars;
6257                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6258                 size_t current = ctxt->input->cur - ctxt->input->base;
6259
6260                 nbchars = xmlCharEncInput(in, terminate);
6261                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6262                 if (nbchars < 0) {
6263                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6264                                  "encoder error\n", NULL, NULL);
6265                     return(XML_ERR_INVALID_ENCODING);
6266                 }
6267             }
6268         }
6269     }
6270     htmlParseTryOrFinish(ctxt, terminate);
6271     if (terminate) {
6272         if ((ctxt->instate != XML_PARSER_EOF) &&
6273             (ctxt->instate != XML_PARSER_EPILOG) &&
6274             (ctxt->instate != XML_PARSER_MISC)) {
6275             ctxt->errNo = XML_ERR_DOCUMENT_END;
6276             ctxt->wellFormed = 0;
6277         }
6278         if (ctxt->instate != XML_PARSER_EOF) {
6279             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6280                 ctxt->sax->endDocument(ctxt->userData);
6281         }
6282         ctxt->instate = XML_PARSER_EOF;
6283     }
6284     return((xmlParserErrors) ctxt->errNo);
6285 }
6286
6287 /************************************************************************
6288  *                                                                      *
6289  *                      User entry points                               *
6290  *                                                                      *
6291  ************************************************************************/
6292
6293 /**
6294  * htmlCreatePushParserCtxt:
6295  * @sax:  a SAX handler
6296  * @user_data:  The user data returned on SAX callbacks
6297  * @chunk:  a pointer to an array of chars
6298  * @size:  number of chars in the array
6299  * @filename:  an optional file name or URI
6300  * @enc:  an optional encoding
6301  *
6302  * Create a parser context for using the HTML parser in push mode
6303  * The value of @filename is used for fetching external entities
6304  * and error/warning reports.
6305  *
6306  * Returns the new parser context or NULL
6307  */
6308 htmlParserCtxtPtr
6309 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6310                          const char *chunk, int size, const char *filename,
6311                          xmlCharEncoding enc) {
6312     htmlParserCtxtPtr ctxt;
6313     htmlParserInputPtr inputStream;
6314     xmlParserInputBufferPtr buf;
6315
6316     xmlInitParser();
6317
6318     buf = xmlAllocParserInputBuffer(enc);
6319     if (buf == NULL) return(NULL);
6320
6321     ctxt = htmlNewParserCtxt();
6322     if (ctxt == NULL) {
6323         xmlFreeParserInputBuffer(buf);
6324         return(NULL);
6325     }
6326     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6327         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6328     if (sax != NULL) {
6329         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6330             xmlFree(ctxt->sax);
6331         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6332         if (ctxt->sax == NULL) {
6333             xmlFree(buf);
6334             xmlFree(ctxt);
6335             return(NULL);
6336         }
6337         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6338         if (user_data != NULL)
6339             ctxt->userData = user_data;
6340     }
6341     if (filename == NULL) {
6342         ctxt->directory = NULL;
6343     } else {
6344         ctxt->directory = xmlParserGetDirectory(filename);
6345     }
6346
6347     inputStream = htmlNewInputStream(ctxt);
6348     if (inputStream == NULL) {
6349         xmlFreeParserCtxt(ctxt);
6350         xmlFree(buf);
6351         return(NULL);
6352     }
6353
6354     if (filename == NULL)
6355         inputStream->filename = NULL;
6356     else
6357         inputStream->filename = (char *)
6358             xmlCanonicPath((const xmlChar *) filename);
6359     inputStream->buf = buf;
6360     xmlBufResetInput(buf->buffer, inputStream);
6361
6362     inputPush(ctxt, inputStream);
6363
6364     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6365         (ctxt->input->buf != NULL))  {
6366         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6367         size_t cur = ctxt->input->cur - ctxt->input->base;
6368
6369         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6370
6371         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6372 #ifdef DEBUG_PUSH
6373         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6374 #endif
6375     }
6376     ctxt->progressive = 1;
6377
6378     return(ctxt);
6379 }
6380 #endif /* LIBXML_PUSH_ENABLED */
6381
6382 /**
6383  * htmlSAXParseDoc:
6384  * @cur:  a pointer to an array of xmlChar
6385  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6386  * @sax:  the SAX handler block
6387  * @userData: if using SAX, this pointer will be provided on callbacks.
6388  *
6389  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6390  * to handle parse events. If sax is NULL, fallback to the default DOM
6391  * behavior and return a tree.
6392  *
6393  * Returns the resulting document tree unless SAX is NULL or the document is
6394  *     not well formed.
6395  */
6396
6397 htmlDocPtr
6398 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6399                 htmlSAXHandlerPtr sax, void *userData) {
6400     htmlDocPtr ret;
6401     htmlParserCtxtPtr ctxt;
6402
6403     xmlInitParser();
6404
6405     if (cur == NULL) return(NULL);
6406
6407
6408     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6409     if (ctxt == NULL) return(NULL);
6410     if (sax != NULL) {
6411         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6412         ctxt->sax = sax;
6413         ctxt->userData = userData;
6414     }
6415
6416     htmlParseDocument(ctxt);
6417     ret = ctxt->myDoc;
6418     if (sax != NULL) {
6419         ctxt->sax = NULL;
6420         ctxt->userData = NULL;
6421     }
6422     htmlFreeParserCtxt(ctxt);
6423
6424     return(ret);
6425 }
6426
6427 /**
6428  * htmlParseDoc:
6429  * @cur:  a pointer to an array of xmlChar
6430  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6431  *
6432  * parse an HTML in-memory document and build a tree.
6433  *
6434  * Returns the resulting document tree
6435  */
6436
6437 htmlDocPtr
6438 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6439     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6440 }
6441
6442
6443 /**
6444  * htmlCreateFileParserCtxt:
6445  * @filename:  the filename
6446  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6447  *
6448  * Create a parser context for a file content.
6449  * Automatic support for ZLIB/Compress compressed document is provided
6450  * by default if found at compile-time.
6451  *
6452  * Returns the new parser context or NULL
6453  */
6454 htmlParserCtxtPtr
6455 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6456 {
6457     htmlParserCtxtPtr ctxt;
6458     htmlParserInputPtr inputStream;
6459     char *canonicFilename;
6460     /* htmlCharEncoding enc; */
6461     xmlChar *content, *content_line = (xmlChar *) "charset=";
6462
6463     if (filename == NULL)
6464         return(NULL);
6465
6466     ctxt = htmlNewParserCtxt();
6467     if (ctxt == NULL) {
6468         return(NULL);
6469     }
6470     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6471     if (canonicFilename == NULL) {
6472 #ifdef LIBXML_SAX1_ENABLED
6473         if (xmlDefaultSAXHandler.error != NULL) {
6474             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6475         }
6476 #endif
6477         xmlFreeParserCtxt(ctxt);
6478         return(NULL);
6479     }
6480
6481     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6482     xmlFree(canonicFilename);
6483     if (inputStream == NULL) {
6484         xmlFreeParserCtxt(ctxt);
6485         return(NULL);
6486     }
6487
6488     inputPush(ctxt, inputStream);
6489
6490     /* set encoding */
6491     if (encoding) {
6492         size_t l = strlen(encoding);
6493
6494         if (l < 1000) {
6495             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6496             if (content) {
6497                 strcpy ((char *)content, (char *)content_line);
6498                 strcat ((char *)content, (char *)encoding);
6499                 htmlCheckEncoding (ctxt, content);
6500                 xmlFree (content);
6501             }
6502         }
6503     }
6504
6505     return(ctxt);
6506 }
6507
6508 /**
6509  * htmlSAXParseFile:
6510  * @filename:  the filename
6511  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6512  * @sax:  the SAX handler block
6513  * @userData: if using SAX, this pointer will be provided on callbacks.
6514  *
6515  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6516  * compressed document is provided by default if found at compile-time.
6517  * It use the given SAX function block to handle the parsing callback.
6518  * If sax is NULL, fallback to the default DOM tree building routines.
6519  *
6520  * Returns the resulting document tree unless SAX is NULL or the document is
6521  *     not well formed.
6522  */
6523
6524 htmlDocPtr
6525 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6526                  void *userData) {
6527     htmlDocPtr ret;
6528     htmlParserCtxtPtr ctxt;
6529     htmlSAXHandlerPtr oldsax = NULL;
6530
6531     xmlInitParser();
6532
6533     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6534     if (ctxt == NULL) return(NULL);
6535     if (sax != NULL) {
6536         oldsax = ctxt->sax;
6537         ctxt->sax = sax;
6538         ctxt->userData = userData;
6539     }
6540
6541     htmlParseDocument(ctxt);
6542
6543     ret = ctxt->myDoc;
6544     if (sax != NULL) {
6545         ctxt->sax = oldsax;
6546         ctxt->userData = NULL;
6547     }
6548     htmlFreeParserCtxt(ctxt);
6549
6550     return(ret);
6551 }
6552
6553 /**
6554  * htmlParseFile:
6555  * @filename:  the filename
6556  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6557  *
6558  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6559  * compressed document is provided by default if found at compile-time.
6560  *
6561  * Returns the resulting document tree
6562  */
6563
6564 htmlDocPtr
6565 htmlParseFile(const char *filename, const char *encoding) {
6566     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6567 }
6568
6569 /**
6570  * htmlHandleOmittedElem:
6571  * @val:  int 0 or 1
6572  *
6573  * Set and return the previous value for handling HTML omitted tags.
6574  *
6575  * Returns the last value for 0 for no handling, 1 for auto insertion.
6576  */
6577
6578 int
6579 htmlHandleOmittedElem(int val) {
6580     int old = htmlOmittedDefaultValue;
6581
6582     htmlOmittedDefaultValue = val;
6583     return(old);
6584 }
6585
6586 /**
6587  * htmlElementAllowedHere:
6588  * @parent: HTML parent element
6589  * @elt: HTML element
6590  *
6591  * Checks whether an HTML element may be a direct child of a parent element.
6592  * Note - doesn't check for deprecated elements
6593  *
6594  * Returns 1 if allowed; 0 otherwise.
6595  */
6596 int
6597 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6598   const char** p ;
6599
6600   if ( ! elt || ! parent || ! parent->subelts )
6601         return 0 ;
6602
6603   for ( p = parent->subelts; *p; ++p )
6604     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6605       return 1 ;
6606
6607   return 0 ;
6608 }
6609 /**
6610  * htmlElementStatusHere:
6611  * @parent: HTML parent element
6612  * @elt: HTML element
6613  *
6614  * Checks whether an HTML element may be a direct child of a parent element.
6615  * and if so whether it is valid or deprecated.
6616  *
6617  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6618  */
6619 htmlStatus
6620 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6621   if ( ! parent || ! elt )
6622     return HTML_INVALID ;
6623   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6624     return HTML_INVALID ;
6625
6626   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6627 }
6628 /**
6629  * htmlAttrAllowed:
6630  * @elt: HTML element
6631  * @attr: HTML attribute
6632  * @legacy: whether to allow deprecated attributes
6633  *
6634  * Checks whether an attribute is valid for an element
6635  * Has full knowledge of Required and Deprecated attributes
6636  *
6637  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6638  */
6639 htmlStatus
6640 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6641   const char** p ;
6642
6643   if ( !elt || ! attr )
6644         return HTML_INVALID ;
6645
6646   if ( elt->attrs_req )
6647     for ( p = elt->attrs_req; *p; ++p)
6648       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649         return HTML_REQUIRED ;
6650
6651   if ( elt->attrs_opt )
6652     for ( p = elt->attrs_opt; *p; ++p)
6653       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654         return HTML_VALID ;
6655
6656   if ( legacy && elt->attrs_depr )
6657     for ( p = elt->attrs_depr; *p; ++p)
6658       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6659         return HTML_DEPRECATED ;
6660
6661   return HTML_INVALID ;
6662 }
6663 /**
6664  * htmlNodeStatus:
6665  * @node: an htmlNodePtr in a tree
6666  * @legacy: whether to allow deprecated elements (YES is faster here
6667  *      for Element nodes)
6668  *
6669  * Checks whether the tree node is valid.  Experimental (the author
6670  *     only uses the HTML enhancements in a SAX parser)
6671  *
6672  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6673  *      legacy allowed) or htmlElementStatusHere (otherwise).
6674  *      for Attribute nodes, a return from htmlAttrAllowed
6675  *      for other nodes, HTML_NA (no checks performed)
6676  */
6677 htmlStatus
6678 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6679   if ( ! node )
6680     return HTML_INVALID ;
6681
6682   switch ( node->type ) {
6683     case XML_ELEMENT_NODE:
6684       return legacy
6685         ? ( htmlElementAllowedHere (
6686                 htmlTagLookup(node->parent->name) , node->name
6687                 ) ? HTML_VALID : HTML_INVALID )
6688         : htmlElementStatusHere(
6689                 htmlTagLookup(node->parent->name) ,
6690                 htmlTagLookup(node->name) )
6691         ;
6692     case XML_ATTRIBUTE_NODE:
6693       return htmlAttrAllowed(
6694         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6695     default: return HTML_NA ;
6696   }
6697 }
6698 /************************************************************************
6699  *                                                                      *
6700  *      New set (2.6.0) of simpler and more flexible APIs               *
6701  *                                                                      *
6702  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope (a variable named dict must be visible at the point
 * of use; a NULL dict means the string is always freed).
 *
 * The expansion is a single do { } while (0) statement so the macro
 * can be used safely as the body of an unbraced if/else; the caller
 * supplies the terminating semicolon.
 */
#define DICT_FREE(str)                                          \
    do {                                                        \
        if ((str) && ((!dict) ||                                \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
            xmlFree((char *)(str));                             \
    } while (0)
6714
6715 /**
6716  * htmlCtxtReset:
6717  * @ctxt: an HTML parser context
6718  *
6719  * Reset a parser context
6720  */
6721 void
6722 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6723 {
6724     xmlParserInputPtr input;
6725     xmlDictPtr dict;
6726
6727     if (ctxt == NULL)
6728         return;
6729
6730     xmlInitParser();
6731     dict = ctxt->dict;
6732
6733     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6734         xmlFreeInputStream(input);
6735     }
6736     ctxt->inputNr = 0;
6737     ctxt->input = NULL;
6738
6739     ctxt->spaceNr = 0;
6740     if (ctxt->spaceTab != NULL) {
6741         ctxt->spaceTab[0] = -1;
6742         ctxt->space = &ctxt->spaceTab[0];
6743     } else {
6744         ctxt->space = NULL;
6745     }
6746
6747
6748     ctxt->nodeNr = 0;
6749     ctxt->node = NULL;
6750
6751     ctxt->nameNr = 0;
6752     ctxt->name = NULL;
6753
6754     DICT_FREE(ctxt->version);
6755     ctxt->version = NULL;
6756     DICT_FREE(ctxt->encoding);
6757     ctxt->encoding = NULL;
6758     DICT_FREE(ctxt->directory);
6759     ctxt->directory = NULL;
6760     DICT_FREE(ctxt->extSubURI);
6761     ctxt->extSubURI = NULL;
6762     DICT_FREE(ctxt->extSubSystem);
6763     ctxt->extSubSystem = NULL;
6764     if (ctxt->myDoc != NULL)
6765         xmlFreeDoc(ctxt->myDoc);
6766     ctxt->myDoc = NULL;
6767
6768     ctxt->standalone = -1;
6769     ctxt->hasExternalSubset = 0;
6770     ctxt->hasPErefs = 0;
6771     ctxt->html = 1;
6772     ctxt->external = 0;
6773     ctxt->instate = XML_PARSER_START;
6774     ctxt->token = 0;
6775
6776     ctxt->wellFormed = 1;
6777     ctxt->nsWellFormed = 1;
6778     ctxt->disableSAX = 0;
6779     ctxt->valid = 1;
6780     ctxt->vctxt.userData = ctxt;
6781     ctxt->vctxt.error = xmlParserValidityError;
6782     ctxt->vctxt.warning = xmlParserValidityWarning;
6783     ctxt->record_info = 0;
6784     ctxt->checkIndex = 0;
6785     ctxt->inSubset = 0;
6786     ctxt->errNo = XML_ERR_OK;
6787     ctxt->depth = 0;
6788     ctxt->charset = XML_CHAR_ENCODING_NONE;
6789     ctxt->catalogs = NULL;
6790     xmlInitNodeInfoSeq(&ctxt->node_seq);
6791
6792     if (ctxt->attsDefault != NULL) {
6793         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6794         ctxt->attsDefault = NULL;
6795     }
6796     if (ctxt->attsSpecial != NULL) {
6797         xmlHashFree(ctxt->attsSpecial, NULL);
6798         ctxt->attsSpecial = NULL;
6799     }
6800 }
6801
6802 /**
6803  * htmlCtxtUseOptions:
6804  * @ctxt: an HTML parser context
6805  * @options:  a combination of htmlParserOption(s)
6806  *
6807  * Applies the options to the parser context
6808  *
6809  * Returns 0 in case of success, the set of unknown or unimplemented options
6810  *         in case of error.
6811  */
6812 int
6813 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6814 {
6815     if (ctxt == NULL)
6816         return(-1);
6817
6818     if (options & HTML_PARSE_NOWARNING) {
6819         ctxt->sax->warning = NULL;
6820         ctxt->vctxt.warning = NULL;
6821         options -= XML_PARSE_NOWARNING;
6822         ctxt->options |= XML_PARSE_NOWARNING;
6823     }
6824     if (options & HTML_PARSE_NOERROR) {
6825         ctxt->sax->error = NULL;
6826         ctxt->vctxt.error = NULL;
6827         ctxt->sax->fatalError = NULL;
6828         options -= XML_PARSE_NOERROR;
6829         ctxt->options |= XML_PARSE_NOERROR;
6830     }
6831     if (options & HTML_PARSE_PEDANTIC) {
6832         ctxt->pedantic = 1;
6833         options -= XML_PARSE_PEDANTIC;
6834         ctxt->options |= XML_PARSE_PEDANTIC;
6835     } else
6836         ctxt->pedantic = 0;
6837     if (options & XML_PARSE_NOBLANKS) {
6838         ctxt->keepBlanks = 0;
6839         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6840         options -= XML_PARSE_NOBLANKS;
6841         ctxt->options |= XML_PARSE_NOBLANKS;
6842     } else
6843         ctxt->keepBlanks = 1;
6844     if (options & HTML_PARSE_RECOVER) {
6845         ctxt->recovery = 1;
6846         options -= HTML_PARSE_RECOVER;
6847     } else
6848         ctxt->recovery = 0;
6849     if (options & HTML_PARSE_COMPACT) {
6850         ctxt->options |= HTML_PARSE_COMPACT;
6851         options -= HTML_PARSE_COMPACT;
6852     }
6853     if (options & XML_PARSE_HUGE) {
6854         ctxt->options |= XML_PARSE_HUGE;
6855         options -= XML_PARSE_HUGE;
6856     }
6857     if (options & HTML_PARSE_NODEFDTD) {
6858         ctxt->options |= HTML_PARSE_NODEFDTD;
6859         options -= HTML_PARSE_NODEFDTD;
6860     }
6861     if (options & HTML_PARSE_IGNORE_ENC) {
6862         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6863         options -= HTML_PARSE_IGNORE_ENC;
6864     }
6865     if (options & HTML_PARSE_NOIMPLIED) {
6866         ctxt->options |= HTML_PARSE_NOIMPLIED;
6867         options -= HTML_PARSE_NOIMPLIED;
6868     }
6869     ctxt->dictNames = 0;
6870     return (options);
6871 }
6872
6873 /**
6874  * htmlDoRead:
6875  * @ctxt:  an HTML parser context
6876  * @URL:  the base URL to use for the document
6877  * @encoding:  the document encoding, or NULL
6878  * @options:  a combination of htmlParserOption(s)
6879  * @reuse:  keep the context for reuse
6880  *
6881  * Common front-end for the htmlRead functions
6882  *
6883  * Returns the resulting document tree or NULL
6884  */
6885 static htmlDocPtr
6886 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6887           int options, int reuse)
6888 {
6889     htmlDocPtr ret;
6890
6891     htmlCtxtUseOptions(ctxt, options);
6892     ctxt->html = 1;
6893     if (encoding != NULL) {
6894         xmlCharEncodingHandlerPtr hdlr;
6895
6896         hdlr = xmlFindCharEncodingHandler(encoding);
6897         if (hdlr != NULL) {
6898             xmlSwitchToEncoding(ctxt, hdlr);
6899             if (ctxt->input->encoding != NULL)
6900               xmlFree((xmlChar *) ctxt->input->encoding);
6901             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6902         }
6903     }
6904     if ((URL != NULL) && (ctxt->input != NULL) &&
6905         (ctxt->input->filename == NULL))
6906         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6907     htmlParseDocument(ctxt);
6908     ret = ctxt->myDoc;
6909     ctxt->myDoc = NULL;
6910     if (!reuse) {
6911         if ((ctxt->dictNames) &&
6912             (ret != NULL) &&
6913             (ret->dict == ctxt->dict))
6914             ctxt->dict = NULL;
6915         xmlFreeParserCtxt(ctxt);
6916     }
6917     return (ret);
6918 }
6919
6920 /**
6921  * htmlReadDoc:
6922  * @cur:  a pointer to a zero terminated string
6923  * @URL:  the base URL to use for the document
6924  * @encoding:  the document encoding, or NULL
6925  * @options:  a combination of htmlParserOption(s)
6926  *
6927  * parse an XML in-memory document and build a tree.
6928  *
6929  * Returns the resulting document tree
6930  */
6931 htmlDocPtr
6932 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6933 {
6934     htmlParserCtxtPtr ctxt;
6935
6936     if (cur == NULL)
6937         return (NULL);
6938
6939     xmlInitParser();
6940     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6941     if (ctxt == NULL)
6942         return (NULL);
6943     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6944 }
6945
6946 /**
6947  * htmlReadFile:
6948  * @filename:  a file or URL
6949  * @encoding:  the document encoding, or NULL
6950  * @options:  a combination of htmlParserOption(s)
6951  *
6952  * parse an XML file from the filesystem or the network.
6953  *
6954  * Returns the resulting document tree
6955  */
6956 htmlDocPtr
6957 htmlReadFile(const char *filename, const char *encoding, int options)
6958 {
6959     htmlParserCtxtPtr ctxt;
6960
6961     xmlInitParser();
6962     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6963     if (ctxt == NULL)
6964         return (NULL);
6965     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6966 }
6967
6968 /**
6969  * htmlReadMemory:
6970  * @buffer:  a pointer to a char array
6971  * @size:  the size of the array
6972  * @URL:  the base URL to use for the document
6973  * @encoding:  the document encoding, or NULL
6974  * @options:  a combination of htmlParserOption(s)
6975  *
6976  * parse an XML in-memory document and build a tree.
6977  *
6978  * Returns the resulting document tree
6979  */
6980 htmlDocPtr
6981 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6982 {
6983     htmlParserCtxtPtr ctxt;
6984
6985     xmlInitParser();
6986     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6987     if (ctxt == NULL)
6988         return (NULL);
6989     htmlDefaultSAXHandlerInit();
6990     if (ctxt->sax != NULL)
6991         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6992     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6993 }
6994
6995 /**
6996  * htmlReadFd:
6997  * @fd:  an open file descriptor
6998  * @URL:  the base URL to use for the document
6999  * @encoding:  the document encoding, or NULL
7000  * @options:  a combination of htmlParserOption(s)
7001  *
7002  * parse an XML from a file descriptor and build a tree.
7003  *
7004  * Returns the resulting document tree
7005  */
7006 htmlDocPtr
7007 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7008 {
7009     htmlParserCtxtPtr ctxt;
7010     xmlParserInputBufferPtr input;
7011     xmlParserInputPtr stream;
7012
7013     if (fd < 0)
7014         return (NULL);
7015     xmlInitParser();
7016
7017     xmlInitParser();
7018     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7019     if (input == NULL)
7020         return (NULL);
7021     ctxt = xmlNewParserCtxt();
7022     if (ctxt == NULL) {
7023         xmlFreeParserInputBuffer(input);
7024         return (NULL);
7025     }
7026     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7027     if (stream == NULL) {
7028         xmlFreeParserInputBuffer(input);
7029         xmlFreeParserCtxt(ctxt);
7030         return (NULL);
7031     }
7032     inputPush(ctxt, stream);
7033     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7034 }
7035
7036 /**
7037  * htmlReadIO:
7038  * @ioread:  an I/O read function
7039  * @ioclose:  an I/O close function
7040  * @ioctx:  an I/O handler
7041  * @URL:  the base URL to use for the document
7042  * @encoding:  the document encoding, or NULL
7043  * @options:  a combination of htmlParserOption(s)
7044  *
7045  * parse an HTML document from I/O functions and source and build a tree.
7046  *
7047  * Returns the resulting document tree
7048  */
7049 htmlDocPtr
7050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7051           void *ioctx, const char *URL, const char *encoding, int options)
7052 {
7053     htmlParserCtxtPtr ctxt;
7054     xmlParserInputBufferPtr input;
7055     xmlParserInputPtr stream;
7056
7057     if (ioread == NULL)
7058         return (NULL);
7059     xmlInitParser();
7060
7061     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7062                                          XML_CHAR_ENCODING_NONE);
7063     if (input == NULL) {
7064         if (ioclose != NULL)
7065             ioclose(ioctx);
7066         return (NULL);
7067     }
7068     ctxt = htmlNewParserCtxt();
7069     if (ctxt == NULL) {
7070         xmlFreeParserInputBuffer(input);
7071         return (NULL);
7072     }
7073     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7074     if (stream == NULL) {
7075         xmlFreeParserInputBuffer(input);
7076         xmlFreeParserCtxt(ctxt);
7077         return (NULL);
7078     }
7079     inputPush(ctxt, stream);
7080     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7081 }
7082
7083 /**
7084  * htmlCtxtReadDoc:
7085  * @ctxt:  an HTML parser context
7086  * @cur:  a pointer to a zero terminated string
7087  * @URL:  the base URL to use for the document
7088  * @encoding:  the document encoding, or NULL
7089  * @options:  a combination of htmlParserOption(s)
7090  *
7091  * parse an XML in-memory document and build a tree.
7092  * This reuses the existing @ctxt parser context
7093  *
7094  * Returns the resulting document tree
7095  */
7096 htmlDocPtr
7097 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7098                const char *URL, const char *encoding, int options)
7099 {
7100     xmlParserInputPtr stream;
7101
7102     if (cur == NULL)
7103         return (NULL);
7104     if (ctxt == NULL)
7105         return (NULL);
7106     xmlInitParser();
7107
7108     htmlCtxtReset(ctxt);
7109
7110     stream = xmlNewStringInputStream(ctxt, cur);
7111     if (stream == NULL) {
7112         return (NULL);
7113     }
7114     inputPush(ctxt, stream);
7115     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7116 }
7117
7118 /**
7119  * htmlCtxtReadFile:
7120  * @ctxt:  an HTML parser context
7121  * @filename:  a file or URL
7122  * @encoding:  the document encoding, or NULL
7123  * @options:  a combination of htmlParserOption(s)
7124  *
7125  * parse an XML file from the filesystem or the network.
7126  * This reuses the existing @ctxt parser context
7127  *
7128  * Returns the resulting document tree
7129  */
7130 htmlDocPtr
7131 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7132                 const char *encoding, int options)
7133 {
7134     xmlParserInputPtr stream;
7135
7136     if (filename == NULL)
7137         return (NULL);
7138     if (ctxt == NULL)
7139         return (NULL);
7140     xmlInitParser();
7141
7142     htmlCtxtReset(ctxt);
7143
7144     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7145     if (stream == NULL) {
7146         return (NULL);
7147     }
7148     inputPush(ctxt, stream);
7149     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7150 }
7151
7152 /**
7153  * htmlCtxtReadMemory:
7154  * @ctxt:  an HTML parser context
7155  * @buffer:  a pointer to a char array
7156  * @size:  the size of the array
7157  * @URL:  the base URL to use for the document
7158  * @encoding:  the document encoding, or NULL
7159  * @options:  a combination of htmlParserOption(s)
7160  *
7161  * parse an XML in-memory document and build a tree.
7162  * This reuses the existing @ctxt parser context
7163  *
7164  * Returns the resulting document tree
7165  */
7166 htmlDocPtr
7167 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7168                   const char *URL, const char *encoding, int options)
7169 {
7170     xmlParserInputBufferPtr input;
7171     xmlParserInputPtr stream;
7172
7173     if (ctxt == NULL)
7174         return (NULL);
7175     if (buffer == NULL)
7176         return (NULL);
7177     xmlInitParser();
7178
7179     htmlCtxtReset(ctxt);
7180
7181     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7182     if (input == NULL) {
7183         return(NULL);
7184     }
7185
7186     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7187     if (stream == NULL) {
7188         xmlFreeParserInputBuffer(input);
7189         return(NULL);
7190     }
7191
7192     inputPush(ctxt, stream);
7193     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7194 }
7195
7196 /**
7197  * htmlCtxtReadFd:
7198  * @ctxt:  an HTML parser context
7199  * @fd:  an open file descriptor
7200  * @URL:  the base URL to use for the document
7201  * @encoding:  the document encoding, or NULL
7202  * @options:  a combination of htmlParserOption(s)
7203  *
7204  * parse an XML from a file descriptor and build a tree.
7205  * This reuses the existing @ctxt parser context
7206  *
7207  * Returns the resulting document tree
7208  */
7209 htmlDocPtr
7210 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7211               const char *URL, const char *encoding, int options)
7212 {
7213     xmlParserInputBufferPtr input;
7214     xmlParserInputPtr stream;
7215
7216     if (fd < 0)
7217         return (NULL);
7218     if (ctxt == NULL)
7219         return (NULL);
7220     xmlInitParser();
7221
7222     htmlCtxtReset(ctxt);
7223
7224
7225     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7226     if (input == NULL)
7227         return (NULL);
7228     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7229     if (stream == NULL) {
7230         xmlFreeParserInputBuffer(input);
7231         return (NULL);
7232     }
7233     inputPush(ctxt, stream);
7234     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7235 }
7236
7237 /**
7238  * htmlCtxtReadIO:
7239  * @ctxt:  an HTML parser context
7240  * @ioread:  an I/O read function
7241  * @ioclose:  an I/O close function
7242  * @ioctx:  an I/O handler
7243  * @URL:  the base URL to use for the document
7244  * @encoding:  the document encoding, or NULL
7245  * @options:  a combination of htmlParserOption(s)
7246  *
7247  * parse an HTML document from I/O functions and source and build a tree.
7248  * This reuses the existing @ctxt parser context
7249  *
7250  * Returns the resulting document tree
7251  */
7252 htmlDocPtr
7253 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7254               xmlInputCloseCallback ioclose, void *ioctx,
7255               const char *URL,
7256               const char *encoding, int options)
7257 {
7258     xmlParserInputBufferPtr input;
7259     xmlParserInputPtr stream;
7260
7261     if (ioread == NULL)
7262         return (NULL);
7263     if (ctxt == NULL)
7264         return (NULL);
7265     xmlInitParser();
7266
7267     htmlCtxtReset(ctxt);
7268
7269     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7270                                          XML_CHAR_ENCODING_NONE);
7271     if (input == NULL) {
7272         if (ioclose != NULL)
7273             ioclose(ioctx);
7274         return (NULL);
7275     }
7276     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7277     if (stream == NULL) {
7278         xmlFreeParserInputBuffer(input);
7279         return (NULL);
7280     }
7281     inputPush(ctxt, stream);
7282     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7283 }
7284
7285 #define bottom_HTMLparser
7286 #include "elfgcchack.h"
7287 #endif /* LIBXML_HTML_ENABLED */