libs/xml2/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #include <ctype.h>
  15 #include <stdlib.h>
  16
  17 #include <libxml/xmlmemory.h>
  18 #include <libxml/tree.h>
  19 #include <libxml/parser.h>
  20 #include <libxml/parserInternals.h>
  21 #include <libxml/xmlerror.h>
  22 #include <libxml/HTMLparser.h>
  23 #include <libxml/HTMLtree.h>
  24 #include <libxml/entities.h>
  25 #include <libxml/encoding.h>
  26 #include <libxml/valid.h>
  27 #include <libxml/xmlIO.h>
  28 #include <libxml/globals.h>
  29 #include <libxml/uri.h>
  30
  31 #include "buf.h"
  32 #include "enc.h"
  33
   34 #define HTML_MAX_NAMELEN 1000 /* NOTE(review): presumably a cap on parsed name length; its users are outside this excerpt */
   35 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 /* NOTE(review): buffer size constant; users not visible here */
   36 #define HTML_PARSER_BUFFER_SIZE 100 /* NOTE(review): buffer size constant; users not visible here */
   37
   38 /* #define DEBUG */
   39 /* #define DEBUG_PUSH */
   40
   41 static int htmlOmittedDefaultValue = 1; /* file-local flag, initialised to 1; readers/writers are outside this excerpt */
   42
   43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, /* forward declaration (external linkage) */
   44                              xmlChar end, xmlChar  end2, xmlChar end3);
   45 static void htmlParseComment(htmlParserCtxtPtr ctxt); /* forward declaration of a file-local function */
  46
  47 /************************************************************************
  48  *                                                                      *
  49  *              Some factorized error routines                          *
  50  *                                                                      *
  51  ************************************************************************/
  52
  53 /**
  54  * htmlErrMemory:
  55  * @ctxt:  an HTML parser context
  56  * @extra:  extra information
  57  *
  58  * Handle a redefinition of attribute error
  59  */
  60 static void
  61 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  62 {
  63     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  64         (ctxt->instate == XML_PARSER_EOF))
  65         return;
  66     if (ctxt != NULL) {
  67         ctxt->errNo = XML_ERR_NO_MEMORY;
  68         ctxt->instate = XML_PARSER_EOF;
  69         ctxt->disableSAX = 1;
  70     }
  71     if (extra)
  72         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  73                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  74                         NULL, NULL, 0, 0,
  75                         "Memory allocation failed : %s\n", extra);
  76     else
  77         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  78                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  79                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  80 }
  81
  82 /**
  83  * htmlParseErr:
  84  * @ctxt:  an HTML parser context
  85  * @error:  the error number
  86  * @msg:  the error message
  87  * @str1:  string infor
  88  * @str2:  string infor
  89  *
  90  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  91  */
  92 static void LIBXML_ATTR_FORMAT(3,0)
  93 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  94              const char *msg, const xmlChar *str1, const xmlChar *str2)
  95 {
  96     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  97         (ctxt->instate == XML_PARSER_EOF))
  98         return;
  99     if (ctxt != NULL)
 100         ctxt->errNo = error;
 101     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 102                     XML_ERR_ERROR, NULL, 0,
 103                     (const char *) str1, (const char *) str2,
 104                     NULL, 0, 0,
 105                     msg, str1, str2);
 106     if (ctxt != NULL)
 107         ctxt->wellFormed = 0;
 108 }
 109
 110 /**
 111  * htmlParseErrInt:
 112  * @ctxt:  an HTML parser context
 113  * @error:  the error number
 114  * @msg:  the error message
 115  * @val:  integer info
 116  *
 117  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 118  */
 119 static void LIBXML_ATTR_FORMAT(3,0)
 120 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 121              const char *msg, int val)
 122 {
 123     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 124         (ctxt->instate == XML_PARSER_EOF))
 125         return;
 126     if (ctxt != NULL)
 127         ctxt->errNo = error;
 128     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 129                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 130                     NULL, val, 0, msg, val);
 131     if (ctxt != NULL)
 132         ctxt->wellFormed = 0;
 133 }
 134
 135 /************************************************************************
 136  *                                                                      *
 137  *      Parser stacks related functions and macros              *
 138  *                                                                      *
 139  ************************************************************************/
 140
 141 /**
 142  * htmlnamePush:
 143  * @ctxt:  an HTML parser context
 144  * @value:  the element name
 145  *
 146  * Pushes a new element name on top of the name stack
 147  *
 148  * Returns 0 in case of error, the index in the stack otherwise
 149  */
 150 static int
 151 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 152 {
 153     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 154         ctxt->html = 3;
 155     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 156         ctxt->html = 10;
 157     if (ctxt->nameNr >= ctxt->nameMax) {
 158         ctxt->nameMax *= 2;
 159         ctxt->nameTab = (const xmlChar * *)
 160                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 161                                     ctxt->nameMax *
 162                                     sizeof(ctxt->nameTab[0]));
 163         if (ctxt->nameTab == NULL) {
 164             htmlErrMemory(ctxt, NULL);
 165             return (0);
 166         }
 167     }
 168     ctxt->nameTab[ctxt->nameNr] = value;
 169     ctxt->name = value;
 170     return (ctxt->nameNr++);
 171 }
 172 /**
 173  * htmlnamePop:
 174  * @ctxt: an HTML parser context
 175  *
 176  * Pops the top element name from the name stack
 177  *
 178  * Returns the name just removed
 179  */
 180 static const xmlChar *
 181 htmlnamePop(htmlParserCtxtPtr ctxt)
 182 {
 183     const xmlChar *ret;
 184
 185     if (ctxt->nameNr <= 0)
 186         return (NULL);
 187     ctxt->nameNr--;
 188     if (ctxt->nameNr < 0)
 189         return (NULL);
 190     if (ctxt->nameNr > 0)
 191         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 192     else
 193         ctxt->name = NULL;
 194     ret = ctxt->nameTab[ctxt->nameNr];
 195     ctxt->nameTab[ctxt->nameNr] = NULL;
 196     return (ret);
 197 }
 198
 199 /**
 200  * htmlNodeInfoPush:
 201  * @ctxt:  an HTML parser context
 202  * @value:  the node info
 203  *
 204  * Pushes a new element name on top of the node info stack
 205  *
 206  * Returns 0 in case of error, the index in the stack otherwise
 207  */
 208 static int
 209 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 210 {
 211     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 212         if (ctxt->nodeInfoMax == 0)
 213                 ctxt->nodeInfoMax = 5;
 214         ctxt->nodeInfoMax *= 2;
 215         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 216                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 217                                     ctxt->nodeInfoMax *
 218                                     sizeof(ctxt->nodeInfoTab[0]));
 219         if (ctxt->nodeInfoTab == NULL) {
 220             htmlErrMemory(ctxt, NULL);
 221             return (0);
 222         }
 223     }
 224     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 225     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 226     return (ctxt->nodeInfoNr++);
 227 }
 228
 229 /**
 230  * htmlNodeInfoPop:
 231  * @ctxt:  an HTML parser context
 232  *
 233  * Pops the top element name from the node info stack
 234  *
 235  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 236  */
 237 static htmlParserNodeInfo *
 238 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 239 {
 240     if (ctxt->nodeInfoNr <= 0)
 241         return (NULL);
 242     ctxt->nodeInfoNr--;
 243     if (ctxt->nodeInfoNr < 0)
 244         return (NULL);
 245     if (ctxt->nodeInfoNr > 0)
 246         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 247     else
 248         ctxt->nodeInfo = NULL;
 249     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 250 }
 251
 252 /*
 253  * Macros for accessing the content. Those should be used only by the parser,
 254  * and not exported.
 255  *
 256  * Dirty macros, i.e. one need to make assumption on the context to use them
 257  *
 258  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 259  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 260  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 261  *           in UNICODE mode. This should be used internally by the parser
 262  *           only to compare to ASCII values otherwise it would break when
 263  *           running with UTF-8 encoding.
 264  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
 265  *           to compare on ASCII based substring.
 266  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 267  *           it should be used only to compare on ASCII based substring.
 268  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 269  *           strings without newlines within the parser.
 270  *
 271  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 272  *
 273  *   CURRENT Returns the current char value, with the full decoding of
 274  *           UTF-8 if we are using this mode. It returns an int.
 275  *   NEXT    Skip to the next character, this does the proper decoding
 276  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 277  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 278  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 279  */
 280
 281 #define UPPER (toupper(*ctxt->input->cur))
 282
 283 #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
 284
 285 #define NXT(val) ctxt->input->cur[(val)]
 286
 287 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 288
 289 #define CUR_PTR ctxt->input->cur
 290 #define BASE_PTR ctxt->input->base
 291
 292 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 293                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 294         xmlParserInputShrink(ctxt->input)
 295
 296 #define GROW if ((ctxt->progressive == 0) &&                            \
 297                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 298         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 299
 300 #define CURRENT ((int) (*ctxt->input->cur))
 301
 302 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 303
 304 /* Imported from XML */
 305
 306 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 307 #define CUR ((int) (*ctxt->input->cur))
 308 #define NEXT xmlNextChar(ctxt)
 309
 310 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 311
 312
 313 #define NEXTL(l) do {                                                   \
 314     if (*(ctxt->input->cur) == '\n') {                                  \
 315         ctxt->input->line++; ctxt->input->col = 1;                      \
 316     } else ctxt->input->col++;                                          \
 317     ctxt->token = 0; ctxt->input->cur += l;                             \
 318   } while (0)
 319
 320 /************
 321     \
 322     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 323     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 324  ************/
 325
 326 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 327 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 328
 329 #define COPY_BUF(l,b,i,v)                                               \
 330     if (l == 1) b[i++] = (xmlChar) v;                                   \
 331     else i += xmlCopyChar(l,&b[i],v)
 332
 333 /**
 334  * htmlFindEncoding:
 335  * @the HTML parser context
 336  *
 337  * Ty to find and encoding in the current data available in the input
 338  * buffer this is needed to try to switch to the proper encoding when
 339  * one face a character error.
 340  * That's an heuristic, since it's operating outside of parsing it could
 341  * try to use a meta which had been commented out, that's the reason it
 342  * should only be used in case of error, not as a default.
 343  *
 344  * Returns an encoding string or NULL if not found, the string need to
 345  *   be freed
 346  */
 347 static xmlChar *
 348 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 349     const xmlChar *start, *cur, *end;
 350
 351     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 352         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 353         (ctxt->input->buf->encoder != NULL))
 354         return(NULL);
 355     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 356         return(NULL);
 357
 358     start = ctxt->input->cur;
 359     end = ctxt->input->end;
 360     /* we also expect the input buffer to be zero terminated */
 361     if (*end != 0)
 362         return(NULL);
 363
 364     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 365     if (cur == NULL)
 366         return(NULL);
 367     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 368     if (cur == NULL)
 369         return(NULL);
 370     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 371     if (cur == NULL)
 372         return(NULL);
 373     cur += 8;
 374     start = cur;
 375     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 376            ((*cur >= 'a') && (*cur <= 'z')) ||
 377            ((*cur >= '0') && (*cur <= '9')) ||
 378            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 379            cur++;
 380     if (cur == start)
 381         return(NULL);
 382     return(xmlStrndup(start, cur - start));
 383 }
 384
 385 /**
 386  * htmlCurrentChar:
 387  * @ctxt:  the HTML parser context
 388  * @len:  pointer to the length of the char read
 389  *
 390  * The current char value, if using UTF-8 this may actually span multiple
 391  * bytes in the input buffer. Implement the end of line normalization:
 392  * 2.11 End-of-Line Handling
 393  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 394  * char, then the encoding converter is plugged in automatically.
 395  *
 396  * Returns the current char value and its length
 397  */
 398
 399 static int
 400 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 401     const unsigned char *cur;
 402     unsigned char c;
 403     unsigned int val;
 404
 405     if (ctxt->instate == XML_PARSER_EOF)
 406         return(0);
 407
 408     if (ctxt->token != 0) {
 409         *len = 0;
 410         return(ctxt->token);
 411     }
 412     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
 413         xmlChar * guess;
 414         xmlCharEncodingHandlerPtr handler;
 415
 416         /*
 417          * Assume it's a fixed length encoding (1) with
 418          * a compatible encoding for the ASCII set, since
 419          * HTML constructs only use < 128 chars
 420          */
 421         if ((int) *ctxt->input->cur < 0x80) {
 422             *len = 1;
 423             if ((*ctxt->input->cur == 0) &&
 424                 (ctxt->input->cur < ctxt->input->end)) {
 425                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 426                                 "Char 0x%X out of allowed range\n", 0);
 427                 return(' ');
 428             }
 429             return((int) *ctxt->input->cur);
 430         }
 431
 432         /*
 433          * Humm this is bad, do an automatic flow conversion
 434          */
 435         guess = htmlFindEncoding(ctxt);
 436         if (guess == NULL) {
 437             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 438         } else {
 439             if (ctxt->input->encoding != NULL)
 440                 xmlFree((xmlChar *) ctxt->input->encoding);
 441             ctxt->input->encoding = guess;
 442             handler = xmlFindCharEncodingHandler((const char *) guess);
 443             if (handler != NULL) {
 444                 /*
 445                  * Don't use UTF-8 encoder which isn't required and
 446                  * can produce invalid UTF-8.
 447                  */
 448                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
 449                     xmlSwitchToEncoding(ctxt, handler);
 450             } else {
 451                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 452                              "Unsupported encoding %s", guess, NULL);
 453             }
 454         }
 455         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 456     }
 457
 458     /*
 459      * We are supposed to handle UTF8, check it's valid
 460      * From rfc2044: encoding of the Unicode values on UTF-8:
 461      *
 462      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 463      * 0000 0000-0000 007F   0xxxxxxx
 464      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 465      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 466      *
 467      * Check for the 0x110000 limit too
 468      */
 469     cur = ctxt->input->cur;
 470     c = *cur;
 471     if (c & 0x80) {
 472         if ((c & 0x40) == 0)
 473             goto encoding_error;
 474         if (cur[1] == 0) {
 475             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 476             cur = ctxt->input->cur;
 477         }
 478         if ((cur[1] & 0xc0) != 0x80)
 479             goto encoding_error;
 480         if ((c & 0xe0) == 0xe0) {
 481
 482             if (cur[2] == 0) {
 483                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 484                 cur = ctxt->input->cur;
 485             }
 486             if ((cur[2] & 0xc0) != 0x80)
 487                 goto encoding_error;
 488             if ((c & 0xf0) == 0xf0) {
 489                 if (cur[3] == 0) {
 490                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 491                     cur = ctxt->input->cur;
 492                 }
 493                 if (((c & 0xf8) != 0xf0) ||
 494                     ((cur[3] & 0xc0) != 0x80))
 495                     goto encoding_error;
 496                 /* 4-byte code */
 497                 *len = 4;
 498                 val = (cur[0] & 0x7) << 18;
 499                 val |= (cur[1] & 0x3f) << 12;
 500                 val |= (cur[2] & 0x3f) << 6;
 501                 val |= cur[3] & 0x3f;
 502                 if (val < 0x10000)
 503                     goto encoding_error;
 504             } else {
 505               /* 3-byte code */
 506                 *len = 3;
 507                 val = (cur[0] & 0xf) << 12;
 508                 val |= (cur[1] & 0x3f) << 6;
 509                 val |= cur[2] & 0x3f;
 510                 if (val < 0x800)
 511                     goto encoding_error;
 512             }
 513         } else {
 514           /* 2-byte code */
 515             *len = 2;
 516             val = (cur[0] & 0x1f) << 6;
 517             val |= cur[1] & 0x3f;
 518             if (val < 0x80)
 519                 goto encoding_error;
 520         }
 521         if (!IS_CHAR(val)) {
 522             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 523                             "Char 0x%X out of allowed range\n", val);
 524         }
 525         return(val);
 526     } else {
 527         if ((*ctxt->input->cur == 0) &&
 528             (ctxt->input->cur < ctxt->input->end)) {
 529             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 530                             "Char 0x%X out of allowed range\n", 0);
 531             *len = 1;
 532             return(' ');
 533         }
 534         /* 1-byte code */
 535         *len = 1;
 536         return((int) *ctxt->input->cur);
 537     }
 538
 539 encoding_error:
 540     /*
 541      * If we detect an UTF8 error that probably mean that the
 542      * input encoding didn't get properly advertised in the
 543      * declaration header. Report the error and switch the encoding
 544      * to ISO-Latin-1 (if you don't like this policy, just declare the
 545      * encoding !)
 546      */
 547     {
 548         char buffer[150];
 549
 550         if (ctxt->input->end - ctxt->input->cur >= 4) {
 551             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 552                             ctxt->input->cur[0], ctxt->input->cur[1],
 553                             ctxt->input->cur[2], ctxt->input->cur[3]);
 554         } else {
 555             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 556         }
 557         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 558                      "Input is not proper UTF-8, indicate encoding !\n",
 559                      BAD_CAST buffer, NULL);
 560     }
 561
 562     /*
 563      * Don't switch encodings twice. Note that if there's an encoder, we
 564      * shouldn't receive invalid UTF-8 anyway.
 565      *
 566      * Note that if ctxt->input->buf == NULL, switching encodings is
 567      * impossible, see Gitlab issue #34.
 568      */
 569     if ((ctxt->input->buf != NULL) &&
 570         (ctxt->input->buf->encoder == NULL))
 571         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 572     *len = 1;
 573     return((int) *ctxt->input->cur);
 574 }
 575
 576 /**
 577  * htmlSkipBlankChars:
 578  * @ctxt:  the HTML parser context
 579  *
 580  * skip all blanks character found at that point in the input streams.
 581  *
 582  * Returns the number of space chars skipped
 583  */
 584
 585 static int
 586 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 587     int res = 0;
 588
 589     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 590         if ((*ctxt->input->cur == 0) &&
 591             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 592                 xmlPopInput(ctxt);
 593         } else {
 594             if (*(ctxt->input->cur) == '\n') {
 595                 ctxt->input->line++; ctxt->input->col = 1;
 596             } else ctxt->input->col++;
 597             ctxt->input->cur++;
 598             if (*ctxt->input->cur == 0)
 599                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 600         }
 601         if (res < INT_MAX)
 602             res++;
 603     }
 604     return(res);
 605 }
 606
 607
 608
 609 /************************************************************************
 610  *                                                                      *
 611  *      The list of HTML elements and their properties          *
 612  *                                                                      *
 613  ************************************************************************/
 614
 615 /*
 616  *  Start Tag: 1 means the start tag can be omitted
 617  *  End Tag:   1 means the end tag can be omitted
 618  *             2 means it's forbidden (empty elements)
 619  *             3 means the tag is stylistic and should be closed easily
 620  *  Depr:      this element is deprecated
 621  *  DTD:       1 means that this element is valid only in the Loose DTD
 622  *             2 means that this element is valid only in the Frameset DTD
 623  *
 624  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 625         , subElements , impliedsubelt , Attributes, userdata
 626  */
 627
 628 /* Definitions and a couple of vars for HTML Elements */
 629
 630 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 631 #define NB_FONTSTYLE 8
 632 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 633 #define NB_PHRASE 10
 634 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 635 #define NB_SPECIAL 16
 636 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 637 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 638 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 639 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 640 #define FORMCTRL "input", "select", "textarea", "label", "button"
 641 #define NB_FORMCTRL 5
 642 #define PCDATA
 643 #define NB_PCDATA 0
 644 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 645 #define NB_HEADING 6
 646 #define LIST "ul", "ol", "dir", "menu"
 647 #define NB_LIST 4
 648 #define MODIFIER
 649 #define NB_MODIFIER 0
 650 #define FLOW BLOCK,INLINE
 651 #define NB_FLOW NB_BLOCK + NB_INLINE
 652 #define EMPTY NULL
 653
 654
 655 static const char* const html_flow[] = { FLOW, NULL } ;
 656 static const char* const html_inline[] = { INLINE, NULL } ;
 657
 658 /* placeholders: elts with content but no subelements */
 659 static const char* const html_pcdata[] = { NULL } ;
 660 #define html_cdata html_pcdata
 661
 662
 663 /* ... and for HTML Attributes */
 664
 665 #define COREATTRS "id", "class", "style", "title"
 666 #define NB_COREATTRS 4
 667 #define I18N "lang", "dir"
 668 #define NB_I18N 2
 669 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 670 #define NB_EVENTS 9
 671 #define ATTRS COREATTRS,I18N,EVENTS
 672 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 673 #define CELLHALIGN "align", "char", "charoff"
 674 #define NB_CELLHALIGN 3
 675 #define CELLVALIGN "valign"
 676 #define NB_CELLVALIGN 1
 677
 678 static const char* const html_attrs[] = { ATTRS, NULL } ;
 679 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 680 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 681 static const char* const i18n_attrs[] = { I18N, NULL } ;
 682
 683
 684 /* Other declarations that should go inline ... */
 685 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 686         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 687         "tabindex", "onfocus", "onblur", NULL } ;
 688 static const char* const target_attr[] = { "target", NULL } ;
 689 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 690 static const char* const alt_attr[] = { "alt", NULL } ;
 691 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 692 static const char* const href_attrs[] = { "href", NULL } ;
 693 static const char* const clear_attrs[] = { "clear", NULL } ;
 694 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 695
 696 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 697 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 698                 "archive", "alt", "name", "height", "width", "align",
 699                 "hspace", "vspace", NULL } ;
 700 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 701         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 702 static const char* const basefont_attrs[] =
 703         { "id", "size", "color", "face", NULL } ;
 704 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 705 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 706 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 707 static const char* const body_depr[] = { "background", "bgcolor", "text",
 708         "link", "vlink", "alink", NULL } ;
 709 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 710         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 711
 712
 713 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 714 static const char* const col_elt[] = { "col", NULL } ;
 715 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 716 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 717 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 718 static const char* const compact_attr[] = { "compact", NULL } ;
 719 static const char* const label_attr[] = { "label", NULL } ;
 720 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 721 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 722 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 723 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 724 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 725 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 726 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 727 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 728 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 729 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 730 static const char* const version_attr[] = { "version", NULL } ;
 731 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 732 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 733 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 734 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
 735 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
 736 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
 737 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
 738 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
 739 static const char* const align_attr[] = { "align", NULL } ;
 740 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
 741 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 742 static const char* const name_attr[] = { "name", NULL } ;
 743 static const char* const action_attr[] = { "action", NULL } ;
 744 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
 745 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
 746 static const char* const content_attr[] = { "content", NULL } ;
 747 static const char* const type_attr[] = { "type", NULL } ;
 748 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
 749 static const char* const object_contents[] = { FLOW, "param", NULL } ;
 750 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
 751 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
 752 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
 753 static const char* const option_elt[] = { "option", NULL } ;
 754 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
 755 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
 756 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
 757 static const char* const width_attr[] = { "width", NULL } ;
 758 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
 759 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
 760 static const char* const language_attr[] = { "language", NULL } ;
 761 static const char* const select_content[] = { "optgroup", "option", NULL } ;
 762 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
 763 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
 764 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
 765 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
 766 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 767 static const char* const tr_elt[] = { "tr", NULL } ;
 768 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 769 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 770 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 771 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 772 static const char* const tr_contents[] = { "th", "td", NULL } ;
 773 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 774 static const char* const li_elt[] = { "li", NULL } ;
 775 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 776 static const char* const dir_attr[] = { "dir", NULL} ;
 777
 778 #define DECL (const char**)
 779
 780 static const htmlElemDesc
 781 html40ElementTable[] = {
 782 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 783         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 784 },
 785 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 786         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 787 },
 788 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 789         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 790 },
 791 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 792         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 793 },
 794 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 795         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 796 },
 797 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 798         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 799 },
 800 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 801         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 802 },
 803 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 804         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 805 },
 806 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 807         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 808 },
 809 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 810         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 811 },
 812 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 813         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 814 },
 815 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 816         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 817 },
 818 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 819         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 820 },
 821 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 822         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 823 },
 824 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 825         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 826 },
 827 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 828         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 829 },
 830 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 831         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 832 },
 833 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 834         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 835 },
 836 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 837         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 838 },
 839 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 840         EMPTY , NULL , DECL col_attrs , NULL, NULL
 841 },
 842 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 843         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 844 },
 845 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 846         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 847 },
 848 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 849         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 850 },
 851 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 852         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 853 },
 854 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 855         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 856 },
 857 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 858         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 859 },
 860 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 861         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
 862 },
 863 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 864         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 865 },
 866 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 867         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 868 },
 869 { "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
 870         EMPTY, NULL, DECL embed_attrs, NULL, NULL
 871 },
 872 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 873         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 874 },
 875 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 876         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 877 },
 878 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 879         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 880 },
 881 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 882         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 883 },
 884 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 885         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 886 },
 887 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 888         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 889 },
 890 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 891         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 892 },
 893 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 894         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 895 },
 896 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 897         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 898 },
 899 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 900         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 901 },
 902 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 903         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 904 },
 905 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 906         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 907 },
 908 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 909         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 910 },
 911 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 912         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 913 },
 914 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 915         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 916 },
 917 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 918         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 919 },
 920 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 921         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
 922 },
 923 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 924         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 925 },
 926 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 927         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 928 },
 929 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 930         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 931 },
 932 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 933         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 934 },
 935 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 936         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 937 },
 938 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 939         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 940 },
 941 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 942         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 943 },
 944 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 945         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 946 },
 947 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 948         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
 949 },
 950 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 951         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 952 },
 953 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 954         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 955 },
 956 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 957         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 958 },
 959 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 960         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 961 },
 962 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 963         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 964 },
 965 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 966         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 967 },
 968 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 969         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 970 },
 971 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 972         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 973 },
 974 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 975         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 976 },
 977 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 978         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
 979 },
 980 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 981         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 982 },
 983 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
 984         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
 985 },
 986 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
 987         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 988 },
 989 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
 990         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 991 },
 992 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
 993         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
 994 },
 995 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
 996         DECL select_content, NULL, DECL select_attrs, NULL, NULL
 997 },
 998 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
 999         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1002         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
1005         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1006 },
1007 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1008         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1009 },
1010 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
1011         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1012 },
1013 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
1014         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
1017         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "table",      0, 0, 0, 0, 0, 0, 0, "",
1020         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1021 },
1022 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
1023         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
1026         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1027 },
1028 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1029         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1030 },
1031 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
1032         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1033 },
1034 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
1035         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1036 },
1037 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
1038         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
1041         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1042 },
1043 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
1044         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1045 },
1046 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1047         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1048 },
1049 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
1050         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1051 },
1052 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
1053         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1054 },
1055 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1056         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1057 }
1058 };
1059
/*
 * One auto-close rule: when the start tag newTag is met while oldTag is
 * the currently open element, oldTag is closed implicitly.
 */
typedef struct {
    const char *oldTag;     /* name of the currently open element */
    const char *newTag;     /* start tag that implies closing oldTag */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The rules are kept sorted by (oldTag, newTag) in strcmp() order because
 * htmlCheckAutoClose() searches this array with bsearch().  Keep that
 * ordering when adding rules.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" }, { "head", "h4" },
    { "head", "h5" }, { "head", "h6" }, { "head", "hr" }, { "head", "i" },
    { "head", "iframe" }, { "head", "img" }, { "head", "kbd" },
    { "head", "li" }, { "head", "listing" }, { "head", "map" },
    { "head", "menu" }, { "head", "ol" }, { "head", "p" },
    { "head", "pre" }, { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" }, { "head", "ul" },
    { "head", "var" }, { "head", "xmp" },
    /* hr, i, legend, li, link */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" }, { "p", "div" },
    { "p", "dl" }, { "p", "dt" }, { "p", "fieldset" }, { "p", "form" },
    { "p", "frameset" }, { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" }, { "p", "head" },
    { "p", "hr" }, { "p", "li" }, { "p", "listing" }, { "p", "menu" },
    { "p", "ol" }, { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" }, { "p", "th" },
    { "p", "title" }, { "p", "tr" }, { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table sections, rows and cells */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt, u, ul */
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1321
/*
 * NULL-terminated list of the HTML elements that are not supposed to
 * hold CDATA content directly and inside which a p element gets implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1334
/*
 * Names of the HTML attributes whose content is of type %Script;.
 *
 * The array carries no NULL sentinel, so its length has to be derived
 * with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover",
    "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur", "onsubmit", "onreset",
    "onchange", "onselect"
};
1360
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Every element gets a priority; when a stray end tag shows up, it may
 * only close open elements whose priority is lower than or equal to
 * its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The entry with a NULL name terminates the table and, at the same time,
 * supplies the priority of every element that is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1388
1389 /************************************************************************
1390  *                                                                      *
1391  *      functions to handle HTML specific data                  *
1392  *                                                                      *
1393  ************************************************************************/
1394
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function will be made private. Call xmlInitParser to
 * initialize the library.
 *
 * Kept as an empty entry point: calling it has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
    return;
}
1406
1407 static int
1408 htmlCompareTags(const void *key, const void *member) {
1409     const xmlChar *tag = (const xmlChar *) key;
1410     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1411
1412     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1413 }
1414
1415 /**
1416  * htmlTagLookup:
1417  * @tag:  The tag name in lowercase
1418  *
1419  * Lookup the HTML tag in the ElementTable
1420  *
1421  * Returns the related htmlElemDescPtr or NULL if not found.
1422  */
1423 const htmlElemDesc *
1424 htmlTagLookup(const xmlChar *tag) {
1425     if (tag == NULL)
1426         return(NULL);
1427
1428     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1429                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1430                 sizeof(htmlElemDesc), htmlCompareTags));
1431 }
1432
1433 /**
1434  * htmlGetEndPriority:
1435  * @name: The name of the element to look up the priority for.
1436  *
1437  * Return value: The "endtag" priority.
1438  **/
1439 static int
1440 htmlGetEndPriority (const xmlChar *name) {
1441     int i = 0;
1442
1443     while ((htmlEndPriority[i].name != NULL) &&
1444            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1445         i++;
1446
1447     return(htmlEndPriority[i].priority);
1448 }
1449
1450
1451 static int
1452 htmlCompareStartClose(const void *vkey, const void *member) {
1453     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1454     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1455     int ret;
1456
1457     ret = strcmp(key->oldTag, entry->oldTag);
1458     if (ret == 0)
1459         ret = strcmp(key->newTag, entry->newTag);
1460
1461     return(ret);
1462 }
1463
1464 /**
1465  * htmlCheckAutoClose:
1466  * @newtag:  The new tag name
1467  * @oldtag:  The old tag name
1468  *
1469  * Checks whether the new tag is one of the registered valid tags for
1470  * closing old.
1471  *
1472  * Returns 0 if no, 1 if yes.
1473  */
1474 static int
1475 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1476 {
1477     htmlStartCloseEntry key;
1478     void *res;
1479
1480     key.oldTag = (const char *) oldtag;
1481     key.newTag = (const char *) newtag;
1482     res = bsearch(&key, htmlStartClose,
1483             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1484             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1485     return(res != NULL);
1486 }
1487
1488 /**
1489  * htmlAutoCloseOnClose:
1490  * @ctxt:  an HTML parser context
1491  * @newtag:  The new tag name
1492  * @force:  force the tag closure
1493  *
1494  * The HTML DTD allows an ending tag to implicitly close other tags.
1495  */
1496 static void
1497 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1498 {
1499     const htmlElemDesc *info;
1500     int i, priority;
1501
1502     priority = htmlGetEndPriority(newtag);
1503
1504     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1505
1506         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1507             break;
1508         /*
1509          * A misplaced endtag can only close elements with lower
1510          * or equal priority, so if we find an element with higher
1511          * priority before we find an element with
1512          * matching name, we just ignore this endtag
1513          */
1514         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1515             return;
1516     }
1517     if (i < 0)
1518         return;
1519
1520     while (!xmlStrEqual(newtag, ctxt->name)) {
1521         info = htmlTagLookup(ctxt->name);
1522         if ((info != NULL) && (info->endTag == 3)) {
1523             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1524                          "Opening and ending tag mismatch: %s and %s\n",
1525                          newtag, ctxt->name);
1526         }
1527         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1528             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1529         htmlnamePop(ctxt);
1530     }
1531 }
1532
1533 /**
1534  * htmlAutoCloseOnEnd:
1535  * @ctxt:  an HTML parser context
1536  *
1537  * Close all remaining tags at the end of the stream
1538  */
1539 static void
1540 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1541 {
1542     int i;
1543
1544     if (ctxt->nameNr == 0)
1545         return;
1546     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1547         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1548             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1549         htmlnamePop(ctxt);
1550     }
1551 }
1552
1553 /**
1554  * htmlAutoClose:
1555  * @ctxt:  an HTML parser context
1556  * @newtag:  The new tag name or NULL
1557  *
1558  * The HTML DTD allows a tag to implicitly close other tags.
1559  * The list is kept in htmlStartClose array. This function is
1560  * called when a new tag has been detected and generates the
1561  * appropriates closes if possible/needed.
1562  * If newtag is NULL this mean we are at the end of the resource
1563  * and we should check
1564  */
1565 static void
1566 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1567 {
1568     while ((newtag != NULL) && (ctxt->name != NULL) &&
1569            (htmlCheckAutoClose(newtag, ctxt->name))) {
1570         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1571             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1572         htmlnamePop(ctxt);
1573     }
1574     if (newtag == NULL) {
1575         htmlAutoCloseOnEnd(ctxt);
1576         return;
1577     }
1578     while ((newtag == NULL) && (ctxt->name != NULL) &&
1579            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1580             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1581             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584         htmlnamePop(ctxt);
1585     }
1586 }
1587
1588 /**
1589  * htmlAutoCloseTag:
1590  * @doc:  the HTML document
1591  * @name:  The tag name
1592  * @elem:  the HTML element
1593  *
1594  * The HTML DTD allows a tag to implicitly close other tags.
1595  * The list is kept in htmlStartClose array. This function checks
1596  * if the element or one of it's children would autoclose the
1597  * given tag.
1598  *
1599  * Returns 1 if autoclose, 0 otherwise
1600  */
1601 int
1602 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1603     htmlNodePtr child;
1604
1605     if (elem == NULL) return(1);
1606     if (xmlStrEqual(name, elem->name)) return(0);
1607     if (htmlCheckAutoClose(elem->name, name)) return(1);
1608     child = elem->children;
1609     while (child != NULL) {
1610         if (htmlAutoCloseTag(doc, name, child)) return(1);
1611         child = child->next;
1612     }
1613     return(0);
1614 }
1615
1616 /**
1617  * htmlIsAutoClosed:
1618  * @doc:  the HTML document
1619  * @elem:  the HTML element
1620  *
1621  * The HTML DTD allows a tag to implicitly close other tags.
1622  * The list is kept in htmlStartClose array. This function checks
1623  * if a tag is autoclosed by one of it's child
1624  *
1625  * Returns 1 if autoclosed, 0 otherwise
1626  */
1627 int
1628 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1629     htmlNodePtr child;
1630
1631     if (elem == NULL) return(1);
1632     child = elem->children;
1633     while (child != NULL) {
1634         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1635         child = child->next;
1636     }
1637     return(0);
1638 }
1639
1640 /**
1641  * htmlCheckImplied:
1642  * @ctxt:  an HTML parser context
1643  * @newtag:  The new tag name
1644  *
1645  * The HTML DTD allows a tag to exists only implicitly
1646  * called when a new tag has been detected and generates the
1647  * appropriates implicit tags if missing
1648  */
1649 static void
1650 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1651     int i;
1652
1653     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1654         return;
1655     if (!htmlOmittedDefaultValue)
1656         return;
1657     if (xmlStrEqual(newtag, BAD_CAST"html"))
1658         return;
1659     if (ctxt->nameNr <= 0) {
1660         htmlnamePush(ctxt, BAD_CAST"html");
1661         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1662             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1663     }
1664     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1665         return;
1666     if ((ctxt->nameNr <= 1) &&
1667         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1668          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1669          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1670          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1671          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1672          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1673         if (ctxt->html >= 3) {
1674             /* we already saw or generated an <head> before */
1675             return;
1676         }
1677         /*
1678          * dropped OBJECT ... i you put it first BODY will be
1679          * assumed !
1680          */
1681         htmlnamePush(ctxt, BAD_CAST"head");
1682         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1683             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1684     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1685                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1686                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1687         if (ctxt->html >= 10) {
1688             /* we already saw or generated a <body> before */
1689             return;
1690         }
1691         for (i = 0;i < ctxt->nameNr;i++) {
1692             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1693                 return;
1694             }
1695             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1696                 return;
1697             }
1698         }
1699
1700         htmlnamePush(ctxt, BAD_CAST"body");
1701         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1702             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1703     }
1704 }
1705
1706 /**
1707  * htmlCheckParagraph
1708  * @ctxt:  an HTML parser context
1709  *
1710  * Check whether a p element need to be implied before inserting
1711  * characters in the current element.
1712  *
1713  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1714  *         in case of error.
1715  */
1716
1717 static int
1718 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1719     const xmlChar *tag;
1720     int i;
1721
1722     if (ctxt == NULL)
1723         return(-1);
1724     tag = ctxt->name;
1725     if (tag == NULL) {
1726         htmlAutoClose(ctxt, BAD_CAST"p");
1727         htmlCheckImplied(ctxt, BAD_CAST"p");
1728         htmlnamePush(ctxt, BAD_CAST"p");
1729         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1730             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1731         return(1);
1732     }
1733     if (!htmlOmittedDefaultValue)
1734         return(0);
1735     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1736         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1737             htmlAutoClose(ctxt, BAD_CAST"p");
1738             htmlCheckImplied(ctxt, BAD_CAST"p");
1739             htmlnamePush(ctxt, BAD_CAST"p");
1740             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1741                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1742             return(1);
1743         }
1744     }
1745     return(0);
1746 }
1747
1748 /**
1749  * htmlIsScriptAttribute:
1750  * @name:  an attribute name
1751  *
1752  * Check if an attribute is of content type Script
1753  *
1754  * Returns 1 is the attribute is a script 0 otherwise
1755  */
1756 int
1757 htmlIsScriptAttribute(const xmlChar *name) {
1758     unsigned int i;
1759
1760     if (name == NULL)
1761       return(0);
1762     /*
1763      * all script attributes start with 'on'
1764      */
1765     if ((name[0] != 'o') || (name[1] != 'n'))
1766       return(0);
1767     for (i = 0;
1768          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1769          i++) {
1770         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1771             return(1);
1772     }
1773     return(0);
1774 }
1775
1776 /************************************************************************
1777  *                                                                      *
1778  *      The list of HTML predefined entities                    *
1779  *                                                                      *
1780  ************************************************************************/
1781
1782
1783 static const htmlEntityDesc  html40EntitiesTable[] = {
1784 /*
1785  * the 4 absolute ones, plus apostrophe.
1786  */
1787 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1788 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1789 { 39,   "apos", "single quote" },
1790 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1791 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1792
1793 /*
1794  * A bunch still in the 128-255 range
1795  * Replacing them depend really on the charset used.
1796  */
1797 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1798 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1799 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1800 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1801 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1802 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1803 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1804 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1805 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1806 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1807 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1808 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1809 { 172,  "not",  "not sign, U+00AC ISOnum" },
1810 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1811 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1812 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1813 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1814 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1815 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1816 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1817 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1818 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1819 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1820 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1821 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1822 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1823 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1824 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1825 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1826 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1827 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1828 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1829 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1830 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1831 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1832 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1833 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1834 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1835 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1836 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1837 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1838 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1839 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1840 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1841 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1842 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1843 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1844 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1845 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1846 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1847 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1848 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1849 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1850 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1851 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1852 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1853 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1854 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1855 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1856 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1857 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1858 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1859 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1860 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1861 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1862 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1863 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1864 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1865 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1866 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1867 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1868 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1869 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1870 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1871 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1872 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1873 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1874 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1875 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1876 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1877 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1878 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1879 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1880 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1881 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1882 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1883 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1884 { 247,  "divide","division sign, U+00F7 ISOnum" },
1885 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1886 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1887 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1888 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1889 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1890 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1891 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1892 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1893
1894 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1895 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1896 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1897 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1898 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1899
1900 /*
1901  * Anything below should really be kept as entities references
1902  */
1903 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1904
1905 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1906 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1907
1908 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1909 { 914,  "Beta", "greek capital letter beta, U+0392" },
1910 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1911 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1912 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1913 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1914 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1915 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1916 { 921,  "Iota", "greek capital letter iota, U+0399" },
1917 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1918 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1919 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1920 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1921 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1922 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1923 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1924 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1925 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1926 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1927 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1928 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1929 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1930 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1931 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1932
1933 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1934 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1935 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1936 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1937 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1938 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1939 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1940 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1941 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1942 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1943 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1944 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1945 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1946 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1947 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1948 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1949 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1950 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1951 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1952 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1953 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1954 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1955 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1956 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1957 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1958 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1959 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1960 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1961
1962 { 8194, "ensp", "en space, U+2002 ISOpub" },
1963 { 8195, "emsp", "em space, U+2003 ISOpub" },
1964 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1965 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1966 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1967 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1968 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1969 { 8211, "ndash","en dash, U+2013 ISOpub" },
1970 { 8212, "mdash","em dash, U+2014 ISOpub" },
1971 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1972 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1973 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1974 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1975 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1976 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1977 { 8224, "dagger","dagger, U+2020 ISOpub" },
1978 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1979
1980 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1981 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1982
1983 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1984
1985 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1986 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1987
1988 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1989 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1990
1991 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1992 { 8260, "frasl","fraction slash, U+2044 NEW" },
1993
1994 { 8364, "euro", "euro sign, U+20AC NEW" },
1995
1996 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1997 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1998 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1999 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
2000 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2001 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2002 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2003 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2004 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2005 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2006 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2007 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2008 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2009 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2010 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2011 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2012
2013 { 8704, "forall","for all, U+2200 ISOtech" },
2014 { 8706, "part", "partial differential, U+2202 ISOtech" },
2015 { 8707, "exist","there exists, U+2203 ISOtech" },
2016 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2017 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2018 { 8712, "isin", "element of, U+2208 ISOtech" },
2019 { 8713, "notin","not an element of, U+2209 ISOtech" },
2020 { 8715, "ni",   "contains as member, U+220B ISOtech" },
2021 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2022 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
2023 { 8722, "minus","minus sign, U+2212 ISOtech" },
2024 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2025 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2026 { 8733, "prop", "proportional to, U+221D ISOtech" },
2027 { 8734, "infin","infinity, U+221E ISOtech" },
2028 { 8736, "ang",  "angle, U+2220 ISOamso" },
2029 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
2030 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
2031 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
2032 { 8746, "cup",  "union = cup, U+222A ISOtech" },
2033 { 8747, "int",  "integral, U+222B ISOtech" },
2034 { 8756, "there4","therefore, U+2234 ISOtech" },
2035 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
2036 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2037 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2038 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
2039 { 8801, "equiv","identical to, U+2261 ISOtech" },
2040 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
2041 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
2042 { 8834, "sub",  "subset of, U+2282 ISOtech" },
2043 { 8835, "sup",  "superset of, U+2283 ISOtech" },
2044 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2045 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2046 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2047 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2048 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2049 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2050 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2051 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2052 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2053 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2054 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2055 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2056 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2057 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
2058
2059 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2060 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2061 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2062 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2063
2064 };
2065
2066 /************************************************************************
2067  *                                                                      *
2068  *              Commodity functions to handle entities                  *
2069  *                                                                      *
2070  ************************************************************************/
2071
/*
 * growBuffer:
 * Double the size of the xmlChar buffer @buffer. The current size is
 * read from and written back to the companion variable named
 * <buffer>_size, and a variable named ctxt must be in scope at the
 * point of use. If the reallocation fails, the error is reported with
 * htmlErrMemory(), the old buffer is freed and the *enclosing function*
 * returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
2086
2087 /**
2088  * htmlEntityLookup:
2089  * @name: the entity name
2090  *
2091  * Lookup the given entity in EntitiesTable
2092  *
2093  * TODO: the linear scan is really ugly, an hash table is really needed.
2094  *
2095  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2096  */
2097 const htmlEntityDesc *
2098 htmlEntityLookup(const xmlChar *name) {
2099     unsigned int i;
2100
2101     for (i = 0;i < (sizeof(html40EntitiesTable)/
2102                     sizeof(html40EntitiesTable[0]));i++) {
2103         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2104             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2105         }
2106     }
2107     return(NULL);
2108 }
2109
2110 /**
2111  * htmlEntityValueLookup:
2112  * @value: the entity's unicode value
2113  *
2114  * Lookup the given entity in EntitiesTable
2115  *
2116  * TODO: the linear scan is really ugly, an hash table is really needed.
2117  *
2118  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2119  */
2120 const htmlEntityDesc *
2121 htmlEntityValueLookup(unsigned int value) {
2122     unsigned int i;
2123
2124     for (i = 0;i < (sizeof(html40EntitiesTable)/
2125                     sizeof(html40EntitiesTable[0]));i++) {
2126         if (html40EntitiesTable[i].value >= value) {
2127             if (html40EntitiesTable[i].value > value)
2128                 break;
2129             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2130         }
2131     }
2132     return(NULL);
2133 }
2134
2135 /**
2136  * UTF8ToHtml:
2137  * @out:  a pointer to an array of bytes to store the result
2138  * @outlen:  the length of @out
2139  * @in:  a pointer to an array of UTF-8 chars
2140  * @inlen:  the length of @in
2141  *
2142  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2143  * plus HTML entities block of chars out.
2144  *
2145  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2146  * The value of @inlen after return is the number of octets consumed
2147  *     as the return value is positive, else unpredictable.
2148  * The value of @outlen after return is the number of octets consumed.
2149  */
2150 int
2151 UTF8ToHtml(unsigned char* out, int *outlen,
2152               const unsigned char* in, int *inlen) {
2153     const unsigned char* processed = in;
2154     const unsigned char* outend;
2155     const unsigned char* outstart = out;
2156     const unsigned char* instart = in;
2157     const unsigned char* inend;
2158     unsigned int c, d;
2159     int trailing;
2160
2161     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2162     if (in == NULL) {
2163         /*
2164          * initialization nothing to do
2165          */
2166         *outlen = 0;
2167         *inlen = 0;
2168         return(0);
2169     }
2170     inend = in + (*inlen);
2171     outend = out + (*outlen);
2172     while (in < inend) {
2173         d = *in++;
2174         if      (d < 0x80)  { c= d; trailing= 0; }
2175         else if (d < 0xC0) {
2176             /* trailing byte in leading position */
2177             *outlen = out - outstart;
2178             *inlen = processed - instart;
2179             return(-2);
2180         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2181         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2182         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2183         else {
2184             /* no chance for this in Ascii */
2185             *outlen = out - outstart;
2186             *inlen = processed - instart;
2187             return(-2);
2188         }
2189
2190         if (inend - in < trailing) {
2191             break;
2192         }
2193
2194         for ( ; trailing; trailing--) {
2195             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2196                 break;
2197             c <<= 6;
2198             c |= d & 0x3F;
2199         }
2200
2201         /* assertion: c is a single UTF-4 value */
2202         if (c < 0x80) {
2203             if (out + 1 >= outend)
2204                 break;
2205             *out++ = c;
2206         } else {
2207             int len;
2208             const htmlEntityDesc * ent;
2209             const char *cp;
2210             char nbuf[16];
2211
2212             /*
2213              * Try to lookup a predefined HTML entity for it
2214              */
2215
2216             ent = htmlEntityValueLookup(c);
2217             if (ent == NULL) {
2218               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2219               cp = nbuf;
2220             }
2221             else
2222               cp = ent->name;
2223             len = strlen(cp);
2224             if (out + 2 + len >= outend)
2225                 break;
2226             *out++ = '&';
2227             memcpy(out, cp, len);
2228             out += len;
2229             *out++ = ';';
2230         }
2231         processed = in;
2232     }
2233     *outlen = out - outstart;
2234     *inlen = processed - instart;
2235     return(0);
2236 }
2237
2238 /**
2239  * htmlEncodeEntities:
2240  * @out:  a pointer to an array of bytes to store the result
2241  * @outlen:  the length of @out
2242  * @in:  a pointer to an array of UTF-8 chars
2243  * @inlen:  the length of @in
2244  * @quoteChar: the quote character to escape (' or ") or zero.
2245  *
2246  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2247  * plus HTML entities block of chars out.
2248  *
2249  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2250  * The value of @inlen after return is the number of octets consumed
2251  *     as the return value is positive, else unpredictable.
2252  * The value of @outlen after return is the number of octets consumed.
2253  */
2254 int
2255 htmlEncodeEntities(unsigned char* out, int *outlen,
2256                    const unsigned char* in, int *inlen, int quoteChar) {
2257     const unsigned char* processed = in;
2258     const unsigned char* outend;
2259     const unsigned char* outstart = out;
2260     const unsigned char* instart = in;
2261     const unsigned char* inend;
2262     unsigned int c, d;
2263     int trailing;
2264
2265     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2266         return(-1);
2267     outend = out + (*outlen);
2268     inend = in + (*inlen);
2269     while (in < inend) {
2270         d = *in++;
2271         if      (d < 0x80)  { c= d; trailing= 0; }
2272         else if (d < 0xC0) {
2273             /* trailing byte in leading position */
2274             *outlen = out - outstart;
2275             *inlen = processed - instart;
2276             return(-2);
2277         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2278         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2279         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2280         else {
2281             /* no chance for this in Ascii */
2282             *outlen = out - outstart;
2283             *inlen = processed - instart;
2284             return(-2);
2285         }
2286
2287         if (inend - in < trailing)
2288             break;
2289
2290         while (trailing--) {
2291             if (((d= *in++) & 0xC0) != 0x80) {
2292                 *outlen = out - outstart;
2293                 *inlen = processed - instart;
2294                 return(-2);
2295             }
2296             c <<= 6;
2297             c |= d & 0x3F;
2298         }
2299
2300         /* assertion: c is a single UTF-4 value */
2301         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2302             (c != '&') && (c != '<') && (c != '>')) {
2303             if (out >= outend)
2304                 break;
2305             *out++ = c;
2306         } else {
2307             const htmlEntityDesc * ent;
2308             const char *cp;
2309             char nbuf[16];
2310             int len;
2311
2312             /*
2313              * Try to lookup a predefined HTML entity for it
2314              */
2315             ent = htmlEntityValueLookup(c);
2316             if (ent == NULL) {
2317                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2318                 cp = nbuf;
2319             }
2320             else
2321                 cp = ent->name;
2322             len = strlen(cp);
2323             if (out + 2 + len > outend)
2324                 break;
2325             *out++ = '&';
2326             memcpy(out, cp, len);
2327             out += len;
2328             *out++ = ';';
2329         }
2330         processed = in;
2331     }
2332     *outlen = out - outstart;
2333     *inlen = processed - instart;
2334     return(0);
2335 }
2336
2337 /************************************************************************
2338  *                                                                      *
2339  *              Commodity functions to handle streams                   *
2340  *                                                                      *
2341  ************************************************************************/
2342
2343 #ifdef LIBXML_PUSH_ENABLED
2344 /**
2345  * htmlNewInputStream:
2346  * @ctxt:  an HTML parser context
2347  *
2348  * Create a new input stream structure
2349  * Returns the new input stream or NULL
2350  */
2351 static htmlParserInputPtr
2352 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2353     htmlParserInputPtr input;
2354
2355     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2356     if (input == NULL) {
2357         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2358         return(NULL);
2359     }
2360     memset(input, 0, sizeof(htmlParserInput));
2361     input->filename = NULL;
2362     input->directory = NULL;
2363     input->base = NULL;
2364     input->cur = NULL;
2365     input->buf = NULL;
2366     input->line = 1;
2367     input->col = 1;
2368     input->buf = NULL;
2369     input->free = NULL;
2370     input->version = NULL;
2371     input->consumed = 0;
2372     input->length = 0;
2373     return(input);
2374 }
2375 #endif
2376
2377
2378 /************************************************************************
2379  *                                                                      *
2380  *              Commodity functions, cleanup needed ?                   *
2381  *                                                                      *
2382  ************************************************************************/
/*
 * Names of all the elements allowing PCDATA in the HTML 4.01 loose DTD,
 * one row per initial letter.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2397
2398 /**
2399  * areBlanks:
2400  * @ctxt:  an HTML parser context
2401  * @str:  a xmlChar *
2402  * @len:  the size of @str
2403  *
2404  * Is this a sequence of blank chars that one can ignore ?
2405  *
2406  * Returns 1 if ignorable 0 otherwise.
2407  */
2408
2409 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2410     unsigned int i;
2411     int j;
2412     xmlNodePtr lastChild;
2413     xmlDtdPtr dtd;
2414
2415     for (j = 0;j < len;j++)
2416         if (!(IS_BLANK_CH(str[j]))) return(0);
2417
2418     if (CUR == 0) return(1);
2419     if (CUR != '<') return(0);
2420     if (ctxt->name == NULL)
2421         return(1);
2422     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2423         return(1);
2424     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2425         return(1);
2426
2427     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2428     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2429         dtd = xmlGetIntSubset(ctxt->myDoc);
2430         if (dtd != NULL && dtd->ExternalID != NULL) {
2431             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2432                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2433                 return(1);
2434         }
2435     }
2436
2437     if (ctxt->node == NULL) return(0);
2438     lastChild = xmlGetLastChild(ctxt->node);
2439     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2440         lastChild = lastChild->prev;
2441     if (lastChild == NULL) {
2442         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2443             (ctxt->node->content != NULL)) return(0);
2444         /* keep ws in constructs like ...<b> </b>...
2445            for all tags "b" allowing PCDATA */
2446         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2447             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2448                 return(0);
2449             }
2450         }
2451     } else if (xmlNodeIsText(lastChild)) {
2452         return(0);
2453     } else {
2454         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2455            for all tags "p" allowing PCDATA */
2456         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2457             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2458                 return(0);
2459             }
2460         }
2461     }
2462     return(1);
2463 }
2464
2465 /**
2466  * htmlNewDocNoDtD:
2467  * @URI:  URI for the dtd, or NULL
2468  * @ExternalID:  the external ID of the DTD, or NULL
2469  *
2470  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2471  * are NULL
2472  *
2473  * Returns a new document, do not initialize the DTD if not provided
2474  */
2475 htmlDocPtr
2476 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2477     xmlDocPtr cur;
2478
2479     /*
2480      * Allocate a new document and fill the fields.
2481      */
2482     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2483     if (cur == NULL) {
2484         htmlErrMemory(NULL, "HTML document creation failed\n");
2485         return(NULL);
2486     }
2487     memset(cur, 0, sizeof(xmlDoc));
2488
2489     cur->type = XML_HTML_DOCUMENT_NODE;
2490     cur->version = NULL;
2491     cur->intSubset = NULL;
2492     cur->doc = cur;
2493     cur->name = NULL;
2494     cur->children = NULL;
2495     cur->extSubset = NULL;
2496     cur->oldNs = NULL;
2497     cur->encoding = NULL;
2498     cur->standalone = 1;
2499     cur->compression = 0;
2500     cur->ids = NULL;
2501     cur->refs = NULL;
2502     cur->_private = NULL;
2503     cur->charset = XML_CHAR_ENCODING_UTF8;
2504     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2505     if ((ExternalID != NULL) ||
2506         (URI != NULL))
2507         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2508     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2509         xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2510     return(cur);
2511 }
2512
2513 /**
2514  * htmlNewDoc:
2515  * @URI:  URI for the dtd, or NULL
2516  * @ExternalID:  the external ID of the DTD, or NULL
2517  *
2518  * Creates a new HTML document
2519  *
2520  * Returns a new document
2521  */
2522 htmlDocPtr
2523 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2524     if ((URI == NULL) && (ExternalID == NULL))
2525         return(htmlNewDocNoDtD(
2526                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2527                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2528
2529     return(htmlNewDocNoDtD(URI, ExternalID));
2530 }
2531
2532
2533 /************************************************************************
2534  *                                                                      *
2535  *                      The parser itself                               *
2536  *      Relates to http://www.w3.org/TR/html40                          *
2537  *                                                                      *
2538  ************************************************************************/
2539
2540 /************************************************************************
2541  *                                                                      *
2542  *                      The parser itself                               *
2543  *                                                                      *
2544  ************************************************************************/
2545
2546 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2547
2548 static void
2549 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2550     int c;
2551
2552     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2553                  "Incorrectly opened comment\n", NULL, NULL);
2554
2555     do {
2556         c = CUR;
2557         if (c == 0)
2558             break;
2559         NEXT;
2560     } while (c != '>');
2561 }
2562
2563 /**
2564  * htmlParseHTMLName:
2565  * @ctxt:  an HTML parser context
2566  *
2567  * parse an HTML tag or attribute name, note that we convert it to lowercase
2568  * since HTML names are not case-sensitive.
2569  *
2570  * Returns the Tag Name parsed or NULL
2571  */
2572
2573 static const xmlChar *
2574 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2575     int i = 0;
2576     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2577
2578     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2579         (CUR != ':') && (CUR != '.')) return(NULL);
2580
2581     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2582            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2583            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2584            (CUR == '.'))) {
2585         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2586         else loc[i] = CUR;
2587         i++;
2588
2589         NEXT;
2590     }
2591
2592     return(xmlDictLookup(ctxt->dict, loc, i));
2593 }
2594
2595
2596 /**
2597  * htmlParseHTMLName_nonInvasive:
2598  * @ctxt:  an HTML parser context
2599  *
2600  * parse an HTML tag or attribute name, note that we convert it to lowercase
2601  * since HTML names are not case-sensitive, this doesn't consume the data
2602  * from the stream, it's a look-ahead
2603  *
2604  * Returns the Tag Name parsed or NULL
2605  */
2606
2607 static const xmlChar *
2608 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2609     int i = 0;
2610     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2611
2612     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2613         (NXT(1) != ':')) return(NULL);
2614
2615     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2616            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2617            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2618         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2619         else loc[i] = NXT(1+i);
2620         i++;
2621     }
2622
2623     return(xmlDictLookup(ctxt->dict, loc, i));
2624 }
2625
2626
2627 /**
2628  * htmlParseName:
2629  * @ctxt:  an HTML parser context
2630  *
2631  * parse an HTML name, this routine is case sensitive.
2632  *
2633  * Returns the Name parsed or NULL
2634  */
2635
2636 static const xmlChar *
2637 htmlParseName(htmlParserCtxtPtr ctxt) {
2638     const xmlChar *in;
2639     const xmlChar *ret;
2640     int count = 0;
2641
2642     GROW;
2643
2644     /*
2645      * Accelerator for simple ASCII names
2646      */
2647     in = ctxt->input->cur;
2648     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2649         ((*in >= 0x41) && (*in <= 0x5A)) ||
2650         (*in == '_') || (*in == ':')) {
2651         in++;
2652         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2653                ((*in >= 0x41) && (*in <= 0x5A)) ||
2654                ((*in >= 0x30) && (*in <= 0x39)) ||
2655                (*in == '_') || (*in == '-') ||
2656                (*in == ':') || (*in == '.'))
2657             in++;
2658
2659         if (in == ctxt->input->end)
2660             return(NULL);
2661
2662         if ((*in > 0) && (*in < 0x80)) {
2663             count = in - ctxt->input->cur;
2664             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2665             ctxt->input->cur = in;
2666             ctxt->input->col += count;
2667             return(ret);
2668         }
2669     }
2670     return(htmlParseNameComplex(ctxt));
2671 }
2672
/*
 * htmlParseNameComplex:
 * @ctxt:  an HTML parser context
 *
 * General name parser, called by htmlParseName() when its ASCII
 * accelerator does not apply.  Chars are fetched one at a time with
 * CUR_CHAR and the number of input bytes covered by the name is
 * accumulated in len, so that the name can be looked up in ctxt->dict
 * from the bytes located just before the final input position.
 *
 * Returns the Name parsed, or NULL if the current char cannot start a
 * name or if the input buffer is found to be inconsistent.
 */
static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
    int len = 0, l;
    int c;
    int count = 0;
    /* remembered to detect a relocation of the input buffer below */
    const xmlChar *base = ctxt->input->base;

    /*
     * Handler for more complex cases
     */
    GROW;
    /* NOTE(review): l presumably receives the byte length of c - confirm
       against the CUR_CHAR definition */
    c = CUR_CHAR(l);
    /* the first char must be a letter, '_' or ':' */
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
        (!IS_LETTER(c) && (c != '_') &&
         (c != ':'))) {
        return(NULL);
    }

    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
           ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
            (c == '.') || (c == '-') ||
            (c == '_') || (c == ':') ||
            (IS_COMBINING(c)) ||
            (IS_EXTENDER(c)))) {
        /* GROW is only invoked once every 100 or so chars */
        if (count++ > 100) {
            count = 0;
            GROW;
        }
        /* account for the char just accepted, then step over it */
        len += l;
        NEXTL(l);
        c = CUR_CHAR(l);
        if (ctxt->input->base != base) {
            /*
             * We changed encoding from an unknown encoding
             * Input buffer changed location, so we better start again
             */
            return(htmlParseNameComplex(ctxt));
        }
    }

    /* the name is read back from cur - len, which must not lie before base */
    if (ctxt->input->cur - ctxt->input->base < len) {
        /* Sanity check */
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "unexpected change of input buffer", NULL, NULL);
        return (NULL);
    }

    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
}
2722
2723
2724 /**
2725  * htmlParseHTMLAttribute:
2726  * @ctxt:  an HTML parser context
2727  * @stop:  a char stop value
2728  *
2729  * parse an HTML attribute value till the stop (quote), if
2730  * stop is 0 then it stops at the first space
2731  *
2732  * Returns the attribute parsed or NULL
2733  */
2734
2735 static xmlChar *
2736 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2737     xmlChar *buffer = NULL;
2738     int buffer_size = 0;
2739     xmlChar *out = NULL;
2740     const xmlChar *name = NULL;
2741     const xmlChar *cur = NULL;
2742     const htmlEntityDesc * ent;
2743
2744     /*
2745      * allocate a translation buffer.
2746      */
2747     buffer_size = HTML_PARSER_BUFFER_SIZE;
2748     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2749     if (buffer == NULL) {
2750         htmlErrMemory(ctxt, "buffer allocation failed\n");
2751         return(NULL);
2752     }
2753     out = buffer;
2754
2755     /*
2756      * Ok loop until we reach one of the ending chars
2757      */
2758     while ((CUR != 0) && (CUR != stop)) {
2759         if ((stop == 0) && (CUR == '>')) break;
2760         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2761         if (CUR == '&') {
2762             if (NXT(1) == '#') {
2763                 unsigned int c;
2764                 int bits;
2765
2766                 c = htmlParseCharRef(ctxt);
2767                 if      (c <    0x80)
2768                         { *out++  = c;                bits= -6; }
2769                 else if (c <   0x800)
2770                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2771                 else if (c < 0x10000)
2772                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2773                 else
2774                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2775
2776                 for ( ; bits >= 0; bits-= 6) {
2777                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2778                 }
2779
2780                 if (out - buffer > buffer_size - 100) {
2781                         int indx = out - buffer;
2782
2783                         growBuffer(buffer);
2784                         out = &buffer[indx];
2785                 }
2786             } else {
2787                 ent = htmlParseEntityRef(ctxt, &name);
2788                 if (name == NULL) {
2789                     *out++ = '&';
2790                     if (out - buffer > buffer_size - 100) {
2791                         int indx = out - buffer;
2792
2793                         growBuffer(buffer);
2794                         out = &buffer[indx];
2795                     }
2796                 } else if (ent == NULL) {
2797                     *out++ = '&';
2798                     cur = name;
2799                     while (*cur != 0) {
2800                         if (out - buffer > buffer_size - 100) {
2801                             int indx = out - buffer;
2802
2803                             growBuffer(buffer);
2804                             out = &buffer[indx];
2805                         }
2806                         *out++ = *cur++;
2807                     }
2808                 } else {
2809                     unsigned int c;
2810                     int bits;
2811
2812                     if (out - buffer > buffer_size - 100) {
2813                         int indx = out - buffer;
2814
2815                         growBuffer(buffer);
2816                         out = &buffer[indx];
2817                     }
2818                     c = ent->value;
2819                     if      (c <    0x80)
2820                         { *out++  = c;                bits= -6; }
2821                     else if (c <   0x800)
2822                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2823                     else if (c < 0x10000)
2824                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2825                     else
2826                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2827
2828                     for ( ; bits >= 0; bits-= 6) {
2829                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2830                     }
2831                 }
2832             }
2833         } else {
2834             unsigned int c;
2835             int bits, l;
2836
2837             if (out - buffer > buffer_size - 100) {
2838                 int indx = out - buffer;
2839
2840                 growBuffer(buffer);
2841                 out = &buffer[indx];
2842             }
2843             c = CUR_CHAR(l);
2844             if      (c <    0x80)
2845                     { *out++  = c;                bits= -6; }
2846             else if (c <   0x800)
2847                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2848             else if (c < 0x10000)
2849                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2850             else
2851                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2852
2853             for ( ; bits >= 0; bits-= 6) {
2854                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2855             }
2856             NEXT;
2857         }
2858     }
2859     *out = 0;
2860     return(buffer);
2861 }
2862
2863 /**
2864  * htmlParseEntityRef:
2865  * @ctxt:  an HTML parser context
2866  * @str:  location to store the entity name
2867  *
2868  * parse an HTML ENTITY references
2869  *
2870  * [68] EntityRef ::= '&' Name ';'
2871  *
2872  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2873  *         if non-NULL *str will have to be freed by the caller.
2874  */
2875 const htmlEntityDesc *
2876 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2877     const xmlChar *name;
2878     const htmlEntityDesc * ent = NULL;
2879
2880     if (str != NULL) *str = NULL;
2881     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2882
2883     if (CUR == '&') {
2884         NEXT;
2885         name = htmlParseName(ctxt);
2886         if (name == NULL) {
2887             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2888                          "htmlParseEntityRef: no name\n", NULL, NULL);
2889         } else {
2890             GROW;
2891             if (CUR == ';') {
2892                 if (str != NULL)
2893                     *str = name;
2894
2895                 /*
2896                  * Lookup the entity in the table.
2897                  */
2898                 ent = htmlEntityLookup(name);
2899                 if (ent != NULL) /* OK that's ugly !!! */
2900                     NEXT;
2901             } else {
2902                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2903                              "htmlParseEntityRef: expecting ';'\n",
2904                              NULL, NULL);
2905                 if (str != NULL)
2906                     *str = name;
2907             }
2908         }
2909     }
2910     return(ent);
2911 }
2912
2913 /**
2914  * htmlParseAttValue:
2915  * @ctxt:  an HTML parser context
2916  *
2917  * parse a value for an attribute
2918  * Note: the parser won't do substitution of entities here, this
2919  * will be handled later in xmlStringGetNodeList, unless it was
2920  * asked for ctxt->replaceEntities != 0
2921  *
2922  * Returns the AttValue parsed or NULL.
2923  */
2924
2925 static xmlChar *
2926 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2927     xmlChar *ret = NULL;
2928
2929     if (CUR == '"') {
2930         NEXT;
2931         ret = htmlParseHTMLAttribute(ctxt, '"');
2932         if (CUR != '"') {
2933             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2934                          "AttValue: \" expected\n", NULL, NULL);
2935         } else
2936             NEXT;
2937     } else if (CUR == '\'') {
2938         NEXT;
2939         ret = htmlParseHTMLAttribute(ctxt, '\'');
2940         if (CUR != '\'') {
2941             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2942                          "AttValue: ' expected\n", NULL, NULL);
2943         } else
2944             NEXT;
2945     } else {
2946         /*
2947          * That's an HTMLism, the attribute value may not be quoted
2948          */
2949         ret = htmlParseHTMLAttribute(ctxt, 0);
2950         if (ret == NULL) {
2951             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2952                          "AttValue: no value found\n", NULL, NULL);
2953         }
2954     }
2955     return(ret);
2956 }
2957
2958 /**
2959  * htmlParseSystemLiteral:
2960  * @ctxt:  an HTML parser context
2961  *
2962  * parse an HTML Literal
2963  *
2964  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2965  *
2966  * Returns the SystemLiteral parsed or NULL
2967  */
2968
2969 static xmlChar *
2970 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2971     size_t len = 0, startPosition = 0;
2972     int err = 0;
2973     int quote;
2974     xmlChar *ret = NULL;
2975
2976     if ((CUR != '"') && (CUR != '\'')) {
2977         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2978                      "SystemLiteral \" or ' expected\n", NULL, NULL);
2979         return(NULL);
2980     }
2981     quote = CUR;
2982     NEXT;
2983
2984     if (CUR_PTR < BASE_PTR)
2985         return(ret);
2986     startPosition = CUR_PTR - BASE_PTR;
2987
2988     while ((CUR != 0) && (CUR != quote)) {
2989         /* TODO: Handle UTF-8 */
2990         if (!IS_CHAR_CH(CUR)) {
2991             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2992                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2993             err = 1;
2994         }
2995         NEXT;
2996         len++;
2997     }
2998     if (CUR != quote) {
2999         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3000                      "Unfinished SystemLiteral\n", NULL, NULL);
3001     } else {
3002         NEXT;
3003         if (err == 0)
3004             ret = xmlStrndup((BASE_PTR+startPosition), len);
3005     }
3006
3007     return(ret);
3008 }
3009
3010 /**
3011  * htmlParsePubidLiteral:
3012  * @ctxt:  an HTML parser context
3013  *
3014  * parse an HTML public literal
3015  *
3016  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3017  *
3018  * Returns the PubidLiteral parsed or NULL.
3019  */
3020
3021 static xmlChar *
3022 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3023     size_t len = 0, startPosition = 0;
3024     int err = 0;
3025     int quote;
3026     xmlChar *ret = NULL;
3027
3028     if ((CUR != '"') && (CUR != '\'')) {
3029         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3030                      "PubidLiteral \" or ' expected\n", NULL, NULL);
3031         return(NULL);
3032     }
3033     quote = CUR;
3034     NEXT;
3035
3036     /*
3037      * Name ::= (Letter | '_') (NameChar)*
3038      */
3039     if (CUR_PTR < BASE_PTR)
3040         return(ret);
3041     startPosition = CUR_PTR - BASE_PTR;
3042
3043     while ((CUR != 0) && (CUR != quote)) {
3044         if (!IS_PUBIDCHAR_CH(CUR)) {
3045             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3046                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3047             err = 1;
3048         }
3049         len++;
3050         NEXT;
3051     }
3052
3053     if (CUR != quote) {
3054         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3055                      "Unfinished PubidLiteral\n", NULL, NULL);
3056     } else {
3057         NEXT;
3058         if (err == 0)
3059             ret = xmlStrndup((BASE_PTR + startPosition), len);
3060     }
3061
3062     return(ret);
3063 }
3064
3065 /**
3066  * htmlParseScript:
3067  * @ctxt:  an HTML parser context
3068  *
3069  * parse the content of an HTML SCRIPT or STYLE element
3070  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3071  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3072  * http://www.w3.org/TR/html4/types.html#type-script
3073  * http://www.w3.org/TR/html4/types.html#h-6.15
3074  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3075  *
3076  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3077  * element and the value of intrinsic event attributes. User agents must
3078  * not evaluate script data as HTML markup but instead must pass it on as
3079  * data to a script engine.
3080  * NOTES:
3081  * - The content is passed like CDATA
3082  * - the attributes for style and scripting "onXXX" are also described
3083  *   as CDATA but SGML allows entities references in attributes so their
3084  *   processing is identical as other attributes
3085  */
3086 static void
3087 htmlParseScript(htmlParserCtxtPtr ctxt) {
3088     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3089     int nbchar = 0;
3090     int cur,l;
3091
3092     SHRINK;
3093     cur = CUR_CHAR(l);
3094     while (cur != 0) {
3095         if ((cur == '<') && (NXT(1) == '/')) {
3096             /*
3097              * One should break here, the specification is clear:
3098              * Authors should therefore escape "</" within the content.
3099              * Escape mechanisms are specific to each scripting or
3100              * style sheet language.
3101              *
3102              * In recovery mode, only break if end tag match the
3103              * current tag, effectively ignoring all tags inside the
3104              * script/style block and treating the entire block as
3105              * CDATA.
3106              */
3107             if (ctxt->recovery) {
3108                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3109                                    xmlStrlen(ctxt->name)) == 0)
3110                 {
3111                     break; /* while */
3112                 } else {
3113                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3114                                  "Element %s embeds close tag\n",
3115                                  ctxt->name, NULL);
3116                 }
3117             } else {
3118                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3119                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3120                 {
3121                     break; /* while */
3122                 }
3123             }
3124         }
3125         if (IS_CHAR(cur)) {
3126             COPY_BUF(l,buf,nbchar,cur);
3127         } else {
3128             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3129                             "Invalid char in CDATA 0x%X\n", cur);
3130         }
3131         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3132             buf[nbchar] = 0;
3133             if (ctxt->sax->cdataBlock!= NULL) {
3134                 /*
3135                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136                  */
3137                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138             } else if (ctxt->sax->characters != NULL) {
3139                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140             }
3141             nbchar = 0;
3142         }
3143         GROW;
3144         NEXTL(l);
3145         cur = CUR_CHAR(l);
3146     }
3147
3148     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3149         buf[nbchar] = 0;
3150         if (ctxt->sax->cdataBlock!= NULL) {
3151             /*
3152              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3153              */
3154             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3155         } else if (ctxt->sax->characters != NULL) {
3156             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157         }
3158     }
3159 }
3160
3161
3162 /**
3163  * htmlParseCharDataInternal:
3164  * @ctxt:  an HTML parser context
3165  * @readahead: optional read ahead character in ascii range
3166  *
3167  * parse a CharData section.
3168  * if we are within a CDATA section ']]>' marks an end of section.
3169  *
3170  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3171  */
3172
3173 static void
3174 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3175     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3176     int nbchar = 0;
3177     int cur, l;
3178     int chunk = 0;
3179
3180     if (readahead)
3181         buf[nbchar++] = readahead;
3182
3183     SHRINK;
3184     cur = CUR_CHAR(l);
3185     while (((cur != '<') || (ctxt->token == '<')) &&
3186            ((cur != '&') || (ctxt->token == '&')) &&
3187            (cur != 0)) {
3188         if (!(IS_CHAR(cur))) {
3189             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3190                         "Invalid char in CDATA 0x%X\n", cur);
3191         } else {
3192             COPY_BUF(l,buf,nbchar,cur);
3193         }
3194         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3195             buf[nbchar] = 0;
3196
3197             /*
3198              * Ok the segment is to be consumed as chars.
3199              */
3200             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3201                 if (areBlanks(ctxt, buf, nbchar)) {
3202                     if (ctxt->keepBlanks) {
3203                         if (ctxt->sax->characters != NULL)
3204                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3205                     } else {
3206                         if (ctxt->sax->ignorableWhitespace != NULL)
3207                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3208                                                            buf, nbchar);
3209                     }
3210                 } else {
3211                     htmlCheckParagraph(ctxt);
3212                     if (ctxt->sax->characters != NULL)
3213                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3214                 }
3215             }
3216             nbchar = 0;
3217         }
3218         NEXTL(l);
3219         chunk++;
3220         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3221             chunk = 0;
3222             SHRINK;
3223             GROW;
3224         }
3225         cur = CUR_CHAR(l);
3226         if (cur == 0) {
3227             SHRINK;
3228             GROW;
3229             cur = CUR_CHAR(l);
3230         }
3231     }
3232     if (nbchar != 0) {
3233         buf[nbchar] = 0;
3234
3235         /*
3236          * Ok the segment is to be consumed as chars.
3237          */
3238         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3239             if (areBlanks(ctxt, buf, nbchar)) {
3240                 if (ctxt->keepBlanks) {
3241                     if (ctxt->sax->characters != NULL)
3242                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3243                 } else {
3244                     if (ctxt->sax->ignorableWhitespace != NULL)
3245                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3246                                                        buf, nbchar);
3247                 }
3248             } else {
3249                 htmlCheckParagraph(ctxt);
3250                 if (ctxt->sax->characters != NULL)
3251                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3252             }
3253         }
3254     } else {
3255         /*
3256          * Loop detection
3257          */
3258         if (cur == 0)
3259             ctxt->instate = XML_PARSER_EOF;
3260     }
3261 }
3262
3263 /**
3264  * htmlParseCharData:
3265  * @ctxt:  an HTML parser context
3266  *
3267  * parse a CharData section.
3268  * if we are within a CDATA section ']]>' marks an end of section.
3269  *
3270  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3271  */
3272
3273 static void
3274 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3275     htmlParseCharDataInternal(ctxt, 0);
3276 }
3277
3278 /**
3279  * htmlParseExternalID:
3280  * @ctxt:  an HTML parser context
3281  * @publicID:  a xmlChar** receiving PubidLiteral
3282  *
3283  * Parse an External ID or a Public ID
3284  *
3285  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3286  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3287  *
3288  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3289  *
3290  * Returns the function returns SystemLiteral and in the second
3291  *                case publicID receives PubidLiteral, is strict is off
3292  *                it is possible to return NULL and have publicID set.
3293  */
3294
3295 static xmlChar *
3296 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3297     xmlChar *URI = NULL;
3298
3299     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3300          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3301          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3302         SKIP(6);
3303         if (!IS_BLANK_CH(CUR)) {
3304             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3305                          "Space required after 'SYSTEM'\n", NULL, NULL);
3306         }
3307         SKIP_BLANKS;
3308         URI = htmlParseSystemLiteral(ctxt);
3309         if (URI == NULL) {
3310             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3311                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3312         }
3313     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3314                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3315                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3316         SKIP(6);
3317         if (!IS_BLANK_CH(CUR)) {
3318             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3319                          "Space required after 'PUBLIC'\n", NULL, NULL);
3320         }
3321         SKIP_BLANKS;
3322         *publicID = htmlParsePubidLiteral(ctxt);
3323         if (*publicID == NULL) {
3324             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3325                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3326                          NULL, NULL);
3327         }
3328         SKIP_BLANKS;
3329         if ((CUR == '"') || (CUR == '\'')) {
3330             URI = htmlParseSystemLiteral(ctxt);
3331         }
3332     }
3333     return(URI);
3334 }
3335
3336 /**
3337  * xmlParsePI:
3338  * @ctxt:  an XML parser context
3339  *
3340  * parse an XML Processing Instruction.
3341  *
3342  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3343  */
3344 static void
3345 htmlParsePI(htmlParserCtxtPtr ctxt) {
3346     xmlChar *buf = NULL;
3347     int len = 0;
3348     int size = HTML_PARSER_BUFFER_SIZE;
3349     int cur, l;
3350     const xmlChar *target;
3351     xmlParserInputState state;
3352     int count = 0;
3353
3354     if ((RAW == '<') && (NXT(1) == '?')) {
3355         state = ctxt->instate;
3356         ctxt->instate = XML_PARSER_PI;
3357         /*
3358          * this is a Processing Instruction.
3359          */
3360         SKIP(2);
3361         SHRINK;
3362
3363         /*
3364          * Parse the target name and check for special support like
3365          * namespace.
3366          */
3367         target = htmlParseName(ctxt);
3368         if (target != NULL) {
3369             if (RAW == '>') {
3370                 SKIP(1);
3371
3372                 /*
3373                  * SAX: PI detected.
3374                  */
3375                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3376                     (ctxt->sax->processingInstruction != NULL))
3377                     ctxt->sax->processingInstruction(ctxt->userData,
3378                                                      target, NULL);
3379                 ctxt->instate = state;
3380                 return;
3381             }
3382             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3383             if (buf == NULL) {
3384                 htmlErrMemory(ctxt, NULL);
3385                 ctxt->instate = state;
3386                 return;
3387             }
3388             cur = CUR;
3389             if (!IS_BLANK(cur)) {
3390                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3391                           "ParsePI: PI %s space expected\n", target, NULL);
3392             }
3393             SKIP_BLANKS;
3394             cur = CUR_CHAR(l);
3395             while ((cur != 0) && (cur != '>')) {
3396                 if (len + 5 >= size) {
3397                     xmlChar *tmp;
3398
3399                     size *= 2;
3400                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3401                     if (tmp == NULL) {
3402                         htmlErrMemory(ctxt, NULL);
3403                         xmlFree(buf);
3404                         ctxt->instate = state;
3405                         return;
3406                     }
3407                     buf = tmp;
3408                 }
3409                 count++;
3410                 if (count > 50) {
3411                     GROW;
3412                     count = 0;
3413                 }
3414                 if (IS_CHAR(cur)) {
3415                     COPY_BUF(l,buf,len,cur);
3416                 } else {
3417                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3418                                     "Invalid char in processing instruction "
3419                                     "0x%X\n", cur);
3420                 }
3421                 NEXTL(l);
3422                 cur = CUR_CHAR(l);
3423                 if (cur == 0) {
3424                     SHRINK;
3425                     GROW;
3426                     cur = CUR_CHAR(l);
3427                 }
3428             }
3429             buf[len] = 0;
3430             if (cur != '>') {
3431                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3432                       "ParsePI: PI %s never end ...\n", target, NULL);
3433             } else {
3434                 SKIP(1);
3435
3436                 /*
3437                  * SAX: PI detected.
3438                  */
3439                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3440                     (ctxt->sax->processingInstruction != NULL))
3441                     ctxt->sax->processingInstruction(ctxt->userData,
3442                                                      target, buf);
3443             }
3444             xmlFree(buf);
3445         } else {
3446             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3447                          "PI is not started correctly", NULL, NULL);
3448         }
3449         ctxt->instate = state;
3450     }
3451 }
3452
3453 /**
3454  * htmlParseComment:
3455  * @ctxt:  an HTML parser context
3456  *
3457  * Parse an XML (SGML) comment <!-- .... -->
3458  *
3459  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3460  */
3461 static void
3462 htmlParseComment(htmlParserCtxtPtr ctxt) {
3463     xmlChar *buf = NULL;
3464     int len;
3465     int size = HTML_PARSER_BUFFER_SIZE;
3466     int q, ql;
3467     int r, rl;
3468     int cur, l;
3469     int next, nl;
3470     xmlParserInputState state;
3471
3472     /*
3473      * Check that there is a comment right here.
3474      */
3475     if ((RAW != '<') || (NXT(1) != '!') ||
3476         (NXT(2) != '-') || (NXT(3) != '-')) return;
3477
3478     state = ctxt->instate;
3479     ctxt->instate = XML_PARSER_COMMENT;
3480     SHRINK;
3481     SKIP(4);
3482     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3483     if (buf == NULL) {
3484         htmlErrMemory(ctxt, "buffer allocation failed\n");
3485         ctxt->instate = state;
3486         return;
3487     }
3488     len = 0;
3489     buf[len] = 0;
3490     q = CUR_CHAR(ql);
3491     if (q == 0)
3492         goto unfinished;
3493     if (q == '>') {
3494         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3495         cur = '>';
3496         goto finished;
3497     }
3498     NEXTL(ql);
3499     r = CUR_CHAR(rl);
3500     if (r == 0)
3501         goto unfinished;
3502     if (q == '-' && r == '>') {
3503         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3504         cur = '>';
3505         goto finished;
3506     }
3507     NEXTL(rl);
3508     cur = CUR_CHAR(l);
3509     while ((cur != 0) &&
3510            ((cur != '>') ||
3511             (r != '-') || (q != '-'))) {
3512         NEXTL(l);
3513         next = CUR_CHAR(nl);
3514         if (next == 0) {
3515             SHRINK;
3516             GROW;
3517             next = CUR_CHAR(nl);
3518         }
3519
3520         if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3521           htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3522                        "Comment incorrectly closed by '--!>'", NULL, NULL);
3523           cur = '>';
3524           break;
3525         }
3526
3527         if (len + 5 >= size) {
3528             xmlChar *tmp;
3529
3530             size *= 2;
3531             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3532             if (tmp == NULL) {
3533                 xmlFree(buf);
3534                 htmlErrMemory(ctxt, "growing buffer failed\n");
3535                 ctxt->instate = state;
3536                 return;
3537             }
3538             buf = tmp;
3539         }
3540         if (IS_CHAR(q)) {
3541             COPY_BUF(ql,buf,len,q);
3542         } else {
3543             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3544                             "Invalid char in comment 0x%X\n", q);
3545         }
3546
3547         q = r;
3548         ql = rl;
3549         r = cur;
3550         rl = l;
3551         cur = next;
3552         l = nl;
3553     }
3554 finished:
3555     buf[len] = 0;
3556     if (cur == '>') {
3557         NEXT;
3558         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3559             (!ctxt->disableSAX))
3560             ctxt->sax->comment(ctxt->userData, buf);
3561         xmlFree(buf);
3562         ctxt->instate = state;
3563         return;
3564     }
3565
3566 unfinished:
3567     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3568                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3569     xmlFree(buf);
3570 }
3571
3572 /**
3573  * htmlParseCharRef:
3574  * @ctxt:  an HTML parser context
3575  *
3576  * parse Reference declarations
3577  *
3578  * [66] CharRef ::= '&#' [0-9]+ ';' |
3579  *                  '&#x' [0-9a-fA-F]+ ';'
3580  *
3581  * Returns the value parsed (as an int)
3582  */
3583 int
3584 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3585     int val = 0;
3586
3587     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3588         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3589                      "htmlParseCharRef: context error\n",
3590                      NULL, NULL);
3591         return(0);
3592     }
3593     if ((CUR == '&') && (NXT(1) == '#') &&
3594         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3595         SKIP(3);
3596         while (CUR != ';') {
3597             if ((CUR >= '0') && (CUR <= '9')) {
3598                 if (val < 0x110000)
3599                     val = val * 16 + (CUR - '0');
3600             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3601                 if (val < 0x110000)
3602                     val = val * 16 + (CUR - 'a') + 10;
3603             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3604                 if (val < 0x110000)
3605                     val = val * 16 + (CUR - 'A') + 10;
3606             } else {
3607                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3608                              "htmlParseCharRef: missing semicolon\n",
3609                              NULL, NULL);
3610                 break;
3611             }
3612             NEXT;
3613         }
3614         if (CUR == ';')
3615             NEXT;
3616     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3617         SKIP(2);
3618         while (CUR != ';') {
3619             if ((CUR >= '0') && (CUR <= '9')) {
3620                 if (val < 0x110000)
3621                     val = val * 10 + (CUR - '0');
3622             } else {
3623                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3624                              "htmlParseCharRef: missing semicolon\n",
3625                              NULL, NULL);
3626                 break;
3627             }
3628             NEXT;
3629         }
3630         if (CUR == ';')
3631             NEXT;
3632     } else {
3633         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3634                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3635     }
3636     /*
3637      * Check the value IS_CHAR ...
3638      */
3639     if (IS_CHAR(val)) {
3640         return(val);
3641     } else if (val >= 0x110000) {
3642         htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3643                      "htmlParseCharRef: value too large\n", NULL, NULL);
3644     } else {
3645         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3646                         "htmlParseCharRef: invalid xmlChar value %d\n",
3647                         val);
3648     }
3649     return(0);
3650 }
3651
3652
3653 /**
3654  * htmlParseDocTypeDecl:
3655  * @ctxt:  an HTML parser context
3656  *
3657  * parse a DOCTYPE declaration
3658  *
3659  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3660  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3661  */
3662
3663 static void
3664 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3665     const xmlChar *name;
3666     xmlChar *ExternalID = NULL;
3667     xmlChar *URI = NULL;
3668
3669     /*
3670      * We know that '<!DOCTYPE' has been detected.
3671      */
3672     SKIP(9);
3673
3674     SKIP_BLANKS;
3675
3676     /*
3677      * Parse the DOCTYPE name.
3678      */
3679     name = htmlParseName(ctxt);
3680     if (name == NULL) {
3681         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3682                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3683                      NULL, NULL);
3684     }
3685     /*
3686      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3687      */
3688
3689     SKIP_BLANKS;
3690
3691     /*
3692      * Check for SystemID and ExternalID
3693      */
3694     URI = htmlParseExternalID(ctxt, &ExternalID);
3695     SKIP_BLANKS;
3696
3697     /*
3698      * We should be at the end of the DOCTYPE declaration.
3699      */
3700     if (CUR != '>') {
3701         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3702                      "DOCTYPE improperly terminated\n", NULL, NULL);
3703         /* Ignore bogus content */
3704         while ((CUR != 0) && (CUR != '>'))
3705             NEXT;
3706     }
3707     if (CUR == '>')
3708         NEXT;
3709
3710     /*
3711      * Create or update the document accordingly to the DOCTYPE
3712      */
3713     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3714         (!ctxt->disableSAX))
3715         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3716
3717     /*
3718      * Cleanup, since we don't use all those identifiers
3719      */
3720     if (URI != NULL) xmlFree(URI);
3721     if (ExternalID != NULL) xmlFree(ExternalID);
3722 }
3723
3724 /**
3725  * htmlParseAttribute:
3726  * @ctxt:  an HTML parser context
3727  * @value:  a xmlChar ** used to store the value of the attribute
3728  *
3729  * parse an attribute
3730  *
3731  * [41] Attribute ::= Name Eq AttValue
3732  *
3733  * [25] Eq ::= S? '=' S?
3734  *
3735  * With namespace:
3736  *
3737  * [NS 11] Attribute ::= QName Eq AttValue
3738  *
3739  * Also the case QName == xmlns:??? is handled independently as a namespace
3740  * definition.
3741  *
3742  * Returns the attribute name, and the value in *value.
3743  */
3744
3745 static const xmlChar *
3746 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3747     const xmlChar *name;
3748     xmlChar *val = NULL;
3749
3750     *value = NULL;
3751     name = htmlParseHTMLName(ctxt);
3752     if (name == NULL) {
3753         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3754                      "error parsing attribute name\n", NULL, NULL);
3755         return(NULL);
3756     }
3757
3758     /*
3759      * read the value
3760      */
3761     SKIP_BLANKS;
3762     if (CUR == '=') {
3763         NEXT;
3764         SKIP_BLANKS;
3765         val = htmlParseAttValue(ctxt);
3766     }
3767
3768     *value = val;
3769     return(name);
3770 }
3771
3772 /**
3773  * htmlCheckEncodingDirect:
3774  * @ctxt:  an HTML parser context
3775  * @attvalue: the attribute value
3776  *
3777  * Checks an attribute value to detect
3778  * the encoding
3779  * If a new encoding is detected the parser is switched to decode
3780  * it and pass UTF8
3781  */
3782 static void
3783 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3784
3785     if ((ctxt == NULL) || (encoding == NULL) ||
3786         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3787         return;
3788
3789     /* do not change encoding */
3790     if (ctxt->input->encoding != NULL)
3791         return;
3792
3793     if (encoding != NULL) {
3794         xmlCharEncoding enc;
3795         xmlCharEncodingHandlerPtr handler;
3796
3797         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3798
3799         if (ctxt->input->encoding != NULL)
3800             xmlFree((xmlChar *) ctxt->input->encoding);
3801         ctxt->input->encoding = xmlStrdup(encoding);
3802
3803         enc = xmlParseCharEncoding((const char *) encoding);
3804         /*
3805          * registered set of known encodings
3806          */
3807         if (enc != XML_CHAR_ENCODING_ERROR) {
3808             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3809                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3810                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3811                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3812                 (ctxt->input->buf != NULL) &&
3813                 (ctxt->input->buf->encoder == NULL)) {
3814                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3815                              "htmlCheckEncoding: wrong encoding meta\n",
3816                              NULL, NULL);
3817             } else {
3818                 xmlSwitchEncoding(ctxt, enc);
3819             }
3820             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3821         } else {
3822             /*
3823              * fallback for unknown encodings
3824              */
3825             handler = xmlFindCharEncodingHandler((const char *) encoding);
3826             if (handler != NULL) {
3827                 xmlSwitchToEncoding(ctxt, handler);
3828                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3829             } else {
3830                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3831                              "htmlCheckEncoding: unknown encoding %s\n",
3832                              encoding, NULL);
3833             }
3834         }
3835
3836         if ((ctxt->input->buf != NULL) &&
3837             (ctxt->input->buf->encoder != NULL) &&
3838             (ctxt->input->buf->raw != NULL) &&
3839             (ctxt->input->buf->buffer != NULL)) {
3840             int nbchars;
3841             int processed;
3842
3843             /*
3844              * convert as much as possible to the parser reading buffer.
3845              */
3846             processed = ctxt->input->cur - ctxt->input->base;
3847             xmlBufShrink(ctxt->input->buf->buffer, processed);
3848             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3849             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3850             if (nbchars < 0) {
3851                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3852                              "htmlCheckEncoding: encoder error\n",
3853                              NULL, NULL);
3854             }
3855         }
3856     }
3857 }
3858
3859 /**
3860  * htmlCheckEncoding:
3861  * @ctxt:  an HTML parser context
3862  * @attvalue: the attribute value
3863  *
3864  * Checks an http-equiv attribute from a Meta tag to detect
3865  * the encoding
3866  * If a new encoding is detected the parser is switched to decode
3867  * it and pass UTF8
3868  */
3869 static void
3870 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3871     const xmlChar *encoding;
3872
3873     if (!attvalue)
3874         return;
3875
3876     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3877     if (encoding != NULL) {
3878         encoding += 7;
3879     }
3880     /*
3881      * skip blank
3882      */
3883     if (encoding && IS_BLANK_CH(*encoding))
3884         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3885     if (encoding && *encoding == '=') {
3886         encoding ++;
3887         htmlCheckEncodingDirect(ctxt, encoding);
3888     }
3889 }
3890
3891 /**
3892  * htmlCheckMeta:
3893  * @ctxt:  an HTML parser context
3894  * @atts:  the attributes values
3895  *
3896  * Checks an attributes from a Meta tag
3897  */
3898 static void
3899 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3900     int i;
3901     const xmlChar *att, *value;
3902     int http = 0;
3903     const xmlChar *content = NULL;
3904
3905     if ((ctxt == NULL) || (atts == NULL))
3906         return;
3907
3908     i = 0;
3909     att = atts[i++];
3910     while (att != NULL) {
3911         value = atts[i++];
3912         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3913          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3914             http = 1;
3915         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3916             htmlCheckEncodingDirect(ctxt, value);
3917         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3918             content = value;
3919         att = atts[i++];
3920     }
3921     if ((http) && (content != NULL))
3922         htmlCheckEncoding(ctxt, content);
3923
3924 }
3925
3926 /**
3927  * htmlParseStartTag:
3928  * @ctxt:  an HTML parser context
3929  *
3930  * parse a start of tag either for rule element or
3931  * EmptyElement. In both case we don't parse the tag closing chars.
3932  *
3933  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3934  *
3935  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3936  *
3937  * With namespace:
3938  *
3939  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3940  *
3941  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3942  *
3943  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3944  */
3945
3946 static int
3947 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3948     const xmlChar *name;
3949     const xmlChar *attname;
3950     xmlChar *attvalue;
3951     const xmlChar **atts;
3952     int nbatts = 0;
3953     int maxatts;
3954     int meta = 0;
3955     int i;
3956     int discardtag = 0;
3957
3958     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3959         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3960                      "htmlParseStartTag: context error\n", NULL, NULL);
3961         return -1;
3962     }
3963     if (ctxt->instate == XML_PARSER_EOF)
3964         return(-1);
3965     if (CUR != '<') return -1;
3966     NEXT;
3967
3968     atts = ctxt->atts;
3969     maxatts = ctxt->maxatts;
3970
3971     GROW;
3972     name = htmlParseHTMLName(ctxt);
3973     if (name == NULL) {
3974         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3975                      "htmlParseStartTag: invalid element name\n",
3976                      NULL, NULL);
3977         /* Dump the bogus tag like browsers do */
3978         while ((CUR != 0) && (CUR != '>') &&
3979                (ctxt->instate != XML_PARSER_EOF))
3980             NEXT;
3981         return -1;
3982     }
3983     if (xmlStrEqual(name, BAD_CAST"meta"))
3984         meta = 1;
3985
3986     /*
3987      * Check for auto-closure of HTML elements.
3988      */
3989     htmlAutoClose(ctxt, name);
3990
3991     /*
3992      * Check for implied HTML elements.
3993      */
3994     htmlCheckImplied(ctxt, name);
3995
3996     /*
3997      * Avoid html at any level > 0, head at any level != 1
3998      * or any attempt to recurse body
3999      */
4000     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4001         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002                      "htmlParseStartTag: misplaced <html> tag\n",
4003                      name, NULL);
4004         discardtag = 1;
4005         ctxt->depth++;
4006     }
4007     if ((ctxt->nameNr != 1) &&
4008         (xmlStrEqual(name, BAD_CAST"head"))) {
4009         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4010                      "htmlParseStartTag: misplaced <head> tag\n",
4011                      name, NULL);
4012         discardtag = 1;
4013         ctxt->depth++;
4014     }
4015     if (xmlStrEqual(name, BAD_CAST"body")) {
4016         int indx;
4017         for (indx = 0;indx < ctxt->nameNr;indx++) {
4018             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4019                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4020                              "htmlParseStartTag: misplaced <body> tag\n",
4021                              name, NULL);
4022                 discardtag = 1;
4023                 ctxt->depth++;
4024             }
4025         }
4026     }
4027
4028     /*
4029      * Now parse the attributes, it ends up with the ending
4030      *
4031      * (S Attribute)* S?
4032      */
4033     SKIP_BLANKS;
4034     while ((CUR != 0) &&
4035            (CUR != '>') &&
4036            ((CUR != '/') || (NXT(1) != '>'))) {
4037         GROW;
4038         attname = htmlParseAttribute(ctxt, &attvalue);
4039         if (attname != NULL) {
4040
4041             /*
4042              * Well formedness requires at most one declaration of an attribute
4043              */
4044             for (i = 0; i < nbatts;i += 2) {
4045                 if (xmlStrEqual(atts[i], attname)) {
4046                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4047                                  "Attribute %s redefined\n", attname, NULL);
4048                     if (attvalue != NULL)
4049                         xmlFree(attvalue);
4050                     goto failed;
4051                 }
4052             }
4053
4054             /*
4055              * Add the pair to atts
4056              */
4057             if (atts == NULL) {
4058                 maxatts = 22; /* allow for 10 attrs by default */
4059                 atts = (const xmlChar **)
4060                        xmlMalloc(maxatts * sizeof(xmlChar *));
4061                 if (atts == NULL) {
4062                     htmlErrMemory(ctxt, NULL);
4063                     if (attvalue != NULL)
4064                         xmlFree(attvalue);
4065                     goto failed;
4066                 }
4067                 ctxt->atts = atts;
4068                 ctxt->maxatts = maxatts;
4069             } else if (nbatts + 4 > maxatts) {
4070                 const xmlChar **n;
4071
4072                 maxatts *= 2;
4073                 n = (const xmlChar **) xmlRealloc((void *) atts,
4074                                              maxatts * sizeof(const xmlChar *));
4075                 if (n == NULL) {
4076                     htmlErrMemory(ctxt, NULL);
4077                     if (attvalue != NULL)
4078                         xmlFree(attvalue);
4079                     goto failed;
4080                 }
4081                 atts = n;
4082                 ctxt->atts = atts;
4083                 ctxt->maxatts = maxatts;
4084             }
4085             atts[nbatts++] = attname;
4086             atts[nbatts++] = attvalue;
4087             atts[nbatts] = NULL;
4088             atts[nbatts + 1] = NULL;
4089         }
4090         else {
4091             if (attvalue != NULL)
4092                 xmlFree(attvalue);
4093             /* Dump the bogus attribute string up to the next blank or
4094              * the end of the tag. */
4095             while ((CUR != 0) &&
4096                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4097                    ((CUR != '/') || (NXT(1) != '>')))
4098                 NEXT;
4099         }
4100
4101 failed:
4102         SKIP_BLANKS;
4103     }
4104
4105     /*
4106      * Handle specific association to the META tag
4107      */
4108     if (meta && (nbatts != 0))
4109         htmlCheckMeta(ctxt, atts);
4110
4111     /*
4112      * SAX: Start of Element !
4113      */
4114     if (!discardtag) {
4115         htmlnamePush(ctxt, name);
4116         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4117             if (nbatts != 0)
4118                 ctxt->sax->startElement(ctxt->userData, name, atts);
4119             else
4120                 ctxt->sax->startElement(ctxt->userData, name, NULL);
4121         }
4122     }
4123
4124     if (atts != NULL) {
4125         for (i = 1;i < nbatts;i += 2) {
4126             if (atts[i] != NULL)
4127                 xmlFree((xmlChar *) atts[i]);
4128         }
4129     }
4130
4131     return(discardtag);
4132 }
4133
4134 /**
4135  * htmlParseEndTag:
4136  * @ctxt:  an HTML parser context
4137  *
4138  * parse an end of tag
4139  *
4140  * [42] ETag ::= '</' Name S? '>'
4141  *
4142  * With namespace
4143  *
4144  * [NS 9] ETag ::= '</' QName S? '>'
4145  *
4146  * Returns 1 if the current level should be closed.
4147  */
4148
4149 static int
4150 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4151 {
4152     const xmlChar *name;
4153     const xmlChar *oldname;
4154     int i, ret;
4155
4156     if ((CUR != '<') || (NXT(1) != '/')) {
4157         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4158                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
4159         return (0);
4160     }
4161     SKIP(2);
4162
4163     name = htmlParseHTMLName(ctxt);
4164     if (name == NULL)
4165         return (0);
4166     /*
4167      * We should definitely be at the ending "S? '>'" part
4168      */
4169     SKIP_BLANKS;
4170     if (CUR != '>') {
4171         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4172                      "End tag : expected '>'\n", NULL, NULL);
4173         /* Skip to next '>' */
4174         while ((CUR != 0) && (CUR != '>'))
4175             NEXT;
4176     }
4177     if (CUR == '>')
4178         NEXT;
4179
4180     /*
4181      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4182      * out now.
4183      */
4184     if ((ctxt->depth > 0) &&
4185         (xmlStrEqual(name, BAD_CAST "html") ||
4186          xmlStrEqual(name, BAD_CAST "body") ||
4187          xmlStrEqual(name, BAD_CAST "head"))) {
4188         ctxt->depth--;
4189         return (0);
4190     }
4191
4192     /*
4193      * If the name read is not one of the element in the parsing stack
4194      * then return, it's just an error.
4195      */
4196     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4197         if (xmlStrEqual(name, ctxt->nameTab[i]))
4198             break;
4199     }
4200     if (i < 0) {
4201         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4202                      "Unexpected end tag : %s\n", name, NULL);
4203         return (0);
4204     }
4205
4206
4207     /*
4208      * Check for auto-closure of HTML elements.
4209      */
4210
4211     htmlAutoCloseOnClose(ctxt, name);
4212
4213     /*
4214      * Well formedness constraints, opening and closing must match.
4215      * With the exception that the autoclose may have popped stuff out
4216      * of the stack.
4217      */
4218     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4219         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4220                      "Opening and ending tag mismatch: %s and %s\n",
4221                      name, ctxt->name);
4222     }
4223
4224     /*
4225      * SAX: End of Tag
4226      */
4227     oldname = ctxt->name;
4228     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4229         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4230             ctxt->sax->endElement(ctxt->userData, name);
4231         htmlNodeInfoPop(ctxt);
4232         htmlnamePop(ctxt);
4233         ret = 1;
4234     } else {
4235         ret = 0;
4236     }
4237
4238     return (ret);
4239 }
4240
4241
4242 /**
4243  * htmlParseReference:
4244  * @ctxt:  an HTML parser context
4245  *
4246  * parse and handle entity references in content,
4247  * this will end-up in a call to character() since this is either a
4248  * CharRef, or a predefined entity.
4249  */
4250 static void
4251 htmlParseReference(htmlParserCtxtPtr ctxt) {
4252     const htmlEntityDesc * ent;
4253     xmlChar out[6];
4254     const xmlChar *name;
4255     if (CUR != '&') return;
4256
4257     if (NXT(1) == '#') {
4258         unsigned int c;
4259         int bits, i = 0;
4260
4261         c = htmlParseCharRef(ctxt);
4262         if (c == 0)
4263             return;
4264
4265         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4266         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4267         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4268         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4269
4270         for ( ; bits >= 0; bits-= 6) {
4271             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4272         }
4273         out[i] = 0;
4274
4275         htmlCheckParagraph(ctxt);
4276         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4277             ctxt->sax->characters(ctxt->userData, out, i);
4278     } else {
4279         ent = htmlParseEntityRef(ctxt, &name);
4280         if (name == NULL) {
4281             htmlCheckParagraph(ctxt);
4282             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4284             return;
4285         }
4286         if ((ent == NULL) || !(ent->value > 0)) {
4287             htmlCheckParagraph(ctxt);
4288             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4289                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4291                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4292             }
4293         } else {
4294             unsigned int c;
4295             int bits, i = 0;
4296
4297             c = ent->value;
4298             if      (c <    0x80)
4299                     { out[i++]= c;                bits= -6; }
4300             else if (c <   0x800)
4301                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4302             else if (c < 0x10000)
4303                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4304             else
4305                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4306
4307             for ( ; bits >= 0; bits-= 6) {
4308                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4309             }
4310             out[i] = 0;
4311
4312             htmlCheckParagraph(ctxt);
4313             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4314                 ctxt->sax->characters(ctxt->userData, out, i);
4315         }
4316     }
4317 }
4318
4319 /**
4320  * htmlParseContent:
4321  * @ctxt:  an HTML parser context
4322  *
4323  * Parse a content: comment, sub-element, reference or text.
4324  * Kept for compatibility with old code
4325  */
4326
4327 static void
4328 htmlParseContent(htmlParserCtxtPtr ctxt) {
4329     xmlChar *currentNode;
4330     int depth;
4331     const xmlChar *name;
4332
4333     currentNode = xmlStrdup(ctxt->name);
4334     depth = ctxt->nameNr;
4335     while (1) {
4336         GROW;
4337
4338         if (ctxt->instate == XML_PARSER_EOF)
4339             break;
4340
4341         /*
4342          * Our tag or one of it's parent or children is ending.
4343          */
4344         if ((CUR == '<') && (NXT(1) == '/')) {
4345             if (htmlParseEndTag(ctxt) &&
4346                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4347                 if (currentNode != NULL)
4348                     xmlFree(currentNode);
4349                 return;
4350             }
4351             continue; /* while */
4352         }
4353
4354         else if ((CUR == '<') &&
4355                  ((IS_ASCII_LETTER(NXT(1))) ||
4356                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4357             name = htmlParseHTMLName_nonInvasive(ctxt);
4358             if (name == NULL) {
4359                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4360                          "htmlParseStartTag: invalid element name\n",
4361                          NULL, NULL);
4362                 /* Dump the bogus tag like browsers do */
4363                 while ((CUR != 0) && (CUR != '>'))
4364                     NEXT;
4365
4366                 if (currentNode != NULL)
4367                     xmlFree(currentNode);
4368                 return;
4369             }
4370
4371             if (ctxt->name != NULL) {
4372                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4373                     htmlAutoClose(ctxt, name);
4374                     continue;
4375                 }
4376             }
4377         }
4378
4379         /*
4380          * Has this node been popped out during parsing of
4381          * the next element
4382          */
4383         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4384             (!xmlStrEqual(currentNode, ctxt->name)))
4385              {
4386             if (currentNode != NULL) xmlFree(currentNode);
4387             return;
4388         }
4389
4390         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4391             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4392             /*
4393              * Handle SCRIPT/STYLE separately
4394              */
4395             htmlParseScript(ctxt);
4396         }
4397
4398         else if ((CUR == '<') && (NXT(1) == '!')) {
4399             /*
4400              * Sometimes DOCTYPE arrives in the middle of the document
4401              */
4402             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4403                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4404                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4405                 (UPP(8) == 'E')) {
4406                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4407                              "Misplaced DOCTYPE declaration\n",
4408                              BAD_CAST "DOCTYPE" , NULL);
4409                 htmlParseDocTypeDecl(ctxt);
4410             }
4411             /*
4412              * First case :  a comment
4413              */
4414             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4415                 htmlParseComment(ctxt);
4416             }
4417             else {
4418                 htmlSkipBogusComment(ctxt);
4419             }
4420         }
4421
4422         /*
4423          * Second case : a Processing Instruction.
4424          */
4425         else if ((CUR == '<') && (NXT(1) == '?')) {
4426             htmlParsePI(ctxt);
4427         }
4428
4429         /*
4430          * Third case :  a sub-element.
4431          */
4432         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4433             htmlParseElement(ctxt);
4434         }
4435         else if (CUR == '<') {
4436             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4437                 (ctxt->sax->characters != NULL))
4438                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4439             NEXT;
4440         }
4441
4442         /*
4443          * Fourth case : a reference. If if has not been resolved,
4444          *    parsing returns it's Name, create the node
4445          */
4446         else if (CUR == '&') {
4447             htmlParseReference(ctxt);
4448         }
4449
4450         /*
4451          * Fifth case : end of the resource
4452          */
4453         else if (CUR == 0) {
4454             htmlAutoCloseOnEnd(ctxt);
4455             break;
4456         }
4457
4458         /*
4459          * Last case, text. Note that References are handled directly.
4460          */
4461         else {
4462             htmlParseCharData(ctxt);
4463         }
4464         GROW;
4465     }
4466     if (currentNode != NULL) xmlFree(currentNode);
4467 }
4468
4469 /**
4470  * htmlParseElement:
4471  * @ctxt:  an HTML parser context
4472  *
4473  * parse an HTML element, this is highly recursive
4474  * this is kept for compatibility with previous code versions
4475  *
4476  * [39] element ::= EmptyElemTag | STag content ETag
4477  *
4478  * [41] Attribute ::= Name Eq AttValue
4479  */
4480
4481 void
4482 htmlParseElement(htmlParserCtxtPtr ctxt) {
4483     const xmlChar *name;
4484     xmlChar *currentNode = NULL;
4485     const htmlElemDesc * info;
4486     htmlParserNodeInfo node_info;
4487     int failed;
4488     int depth;
4489     const xmlChar *oldptr;
4490
4491     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4492         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4493                      "htmlParseElement: context error\n", NULL, NULL);
4494         return;
4495     }
4496
4497     if (ctxt->instate == XML_PARSER_EOF)
4498         return;
4499
4500     /* Capture start position */
4501     if (ctxt->record_info) {
4502         node_info.begin_pos = ctxt->input->consumed +
4503                           (CUR_PTR - ctxt->input->base);
4504         node_info.begin_line = ctxt->input->line;
4505     }
4506
4507     failed = htmlParseStartTag(ctxt);
4508     name = ctxt->name;
4509     if ((failed == -1) || (name == NULL)) {
4510         if (CUR == '>')
4511             NEXT;
4512         return;
4513     }
4514
4515     /*
4516      * Lookup the info for that element.
4517      */
4518     info = htmlTagLookup(name);
4519     if (info == NULL) {
4520         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4521                      "Tag %s invalid\n", name, NULL);
4522     }
4523
4524     /*
4525      * Check for an Empty Element labeled the XML/SGML way
4526      */
4527     if ((CUR == '/') && (NXT(1) == '>')) {
4528         SKIP(2);
4529         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4530             ctxt->sax->endElement(ctxt->userData, name);
4531         htmlnamePop(ctxt);
4532         return;
4533     }
4534
4535     if (CUR == '>') {
4536         NEXT;
4537     } else {
4538         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4539                      "Couldn't find end of Start Tag %s\n", name, NULL);
4540
4541         /*
4542          * end of parsing of this node.
4543          */
4544         if (xmlStrEqual(name, ctxt->name)) {
4545             nodePop(ctxt);
4546             htmlnamePop(ctxt);
4547         }
4548
4549         /*
4550          * Capture end position and add node
4551          */
4552         if (ctxt->record_info) {
4553            node_info.end_pos = ctxt->input->consumed +
4554                               (CUR_PTR - ctxt->input->base);
4555            node_info.end_line = ctxt->input->line;
4556            node_info.node = ctxt->node;
4557            xmlParserAddNodeInfo(ctxt, &node_info);
4558         }
4559         return;
4560     }
4561
4562     /*
4563      * Check for an Empty Element from DTD definition
4564      */
4565     if ((info != NULL) && (info->empty)) {
4566         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4567             ctxt->sax->endElement(ctxt->userData, name);
4568         htmlnamePop(ctxt);
4569         return;
4570     }
4571
4572     /*
4573      * Parse the content of the element:
4574      */
4575     currentNode = xmlStrdup(ctxt->name);
4576     depth = ctxt->nameNr;
4577     while (CUR != 0) {
4578         oldptr = ctxt->input->cur;
4579         htmlParseContent(ctxt);
4580         if (oldptr==ctxt->input->cur) break;
4581         if (ctxt->nameNr < depth) break;
4582     }
4583
4584     /*
4585      * Capture end position and add node
4586      */
4587     if ( currentNode != NULL && ctxt->record_info ) {
4588        node_info.end_pos = ctxt->input->consumed +
4589                           (CUR_PTR - ctxt->input->base);
4590        node_info.end_line = ctxt->input->line;
4591        node_info.node = ctxt->node;
4592        xmlParserAddNodeInfo(ctxt, &node_info);
4593     }
4594     if (CUR == 0) {
4595         htmlAutoCloseOnEnd(ctxt);
4596     }
4597
4598     if (currentNode != NULL)
4599         xmlFree(currentNode);
4600 }
4601
4602 static void
4603 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4604     /*
4605      * Capture end position and add node
4606      */
4607     if ( ctxt->node != NULL && ctxt->record_info ) {
4608        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4609                                 (CUR_PTR - ctxt->input->base);
4610        ctxt->nodeInfo->end_line = ctxt->input->line;
4611        ctxt->nodeInfo->node = ctxt->node;
4612        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4613        htmlNodeInfoPop(ctxt);
4614     }
4615     if (CUR == 0) {
4616        htmlAutoCloseOnEnd(ctxt);
4617     }
4618 }
4619
4620 /**
4621  * htmlParseElementInternal:
4622  * @ctxt:  an HTML parser context
4623  *
4624  * parse an HTML element, new version, non recursive
4625  *
4626  * [39] element ::= EmptyElemTag | STag content ETag
4627  *
4628  * [41] Attribute ::= Name Eq AttValue
4629  */
4630
4631 static void
4632 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4633     const xmlChar *name;
4634     const htmlElemDesc * info;
4635     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4636     int failed;
4637
4638     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4639         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4640                      "htmlParseElementInternal: context error\n", NULL, NULL);
4641         return;
4642     }
4643
4644     if (ctxt->instate == XML_PARSER_EOF)
4645         return;
4646
4647     /* Capture start position */
4648     if (ctxt->record_info) {
4649         node_info.begin_pos = ctxt->input->consumed +
4650                           (CUR_PTR - ctxt->input->base);
4651         node_info.begin_line = ctxt->input->line;
4652     }
4653
4654     failed = htmlParseStartTag(ctxt);
4655     name = ctxt->name;
4656     if ((failed == -1) || (name == NULL)) {
4657         if (CUR == '>')
4658             NEXT;
4659         return;
4660     }
4661
4662     /*
4663      * Lookup the info for that element.
4664      */
4665     info = htmlTagLookup(name);
4666     if (info == NULL) {
4667         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4668                      "Tag %s invalid\n", name, NULL);
4669     }
4670
4671     /*
4672      * Check for an Empty Element labeled the XML/SGML way
4673      */
4674     if ((CUR == '/') && (NXT(1) == '>')) {
4675         SKIP(2);
4676         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4677             ctxt->sax->endElement(ctxt->userData, name);
4678         htmlnamePop(ctxt);
4679         return;
4680     }
4681
4682     if (CUR == '>') {
4683         NEXT;
4684     } else {
4685         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4686                      "Couldn't find end of Start Tag %s\n", name, NULL);
4687
4688         /*
4689          * end of parsing of this node.
4690          */
4691         if (xmlStrEqual(name, ctxt->name)) {
4692             nodePop(ctxt);
4693             htmlnamePop(ctxt);
4694         }
4695
4696         if (ctxt->record_info)
4697             htmlNodeInfoPush(ctxt, &node_info);
4698         htmlParserFinishElementParsing(ctxt);
4699         return;
4700     }
4701
4702     /*
4703      * Check for an Empty Element from DTD definition
4704      */
4705     if ((info != NULL) && (info->empty)) {
4706         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4707             ctxt->sax->endElement(ctxt->userData, name);
4708         htmlnamePop(ctxt);
4709         return;
4710     }
4711
4712     if (ctxt->record_info)
4713         htmlNodeInfoPush(ctxt, &node_info);
4714 }
4715
4716 /**
4717  * htmlParseContentInternal:
4718  * @ctxt:  an HTML parser context
4719  *
4720  * Parse a content: comment, sub-element, reference or text.
4721  * New version for non recursive htmlParseElementInternal
4722  */
4723
4724 static void
4725 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4726     xmlChar *currentNode;
4727     int depth;
4728     const xmlChar *name;
4729
4730     currentNode = xmlStrdup(ctxt->name);
4731     depth = ctxt->nameNr;
4732     while (1) {
4733         GROW;
4734
4735         if (ctxt->instate == XML_PARSER_EOF)
4736             break;
4737
4738         /*
4739          * Our tag or one of it's parent or children is ending.
4740          */
4741         if ((CUR == '<') && (NXT(1) == '/')) {
4742             if (htmlParseEndTag(ctxt) &&
4743                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4744                 if (currentNode != NULL)
4745                     xmlFree(currentNode);
4746
4747                 currentNode = xmlStrdup(ctxt->name);
4748                 depth = ctxt->nameNr;
4749             }
4750             continue; /* while */
4751         }
4752
4753         else if ((CUR == '<') &&
4754                  ((IS_ASCII_LETTER(NXT(1))) ||
4755                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4756             name = htmlParseHTMLName_nonInvasive(ctxt);
4757             if (name == NULL) {
4758                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4759                          "htmlParseStartTag: invalid element name\n",
4760                          NULL, NULL);
4761                 /* Dump the bogus tag like browsers do */
4762                 while ((CUR == 0) && (CUR != '>'))
4763                     NEXT;
4764
4765                 htmlParserFinishElementParsing(ctxt);
4766                 if (currentNode != NULL)
4767                     xmlFree(currentNode);
4768
4769                 currentNode = xmlStrdup(ctxt->name);
4770                 depth = ctxt->nameNr;
4771                 continue;
4772             }
4773
4774             if (ctxt->name != NULL) {
4775                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4776                     htmlAutoClose(ctxt, name);
4777                     continue;
4778                 }
4779             }
4780         }
4781
4782         /*
4783          * Has this node been popped out during parsing of
4784          * the next element
4785          */
4786         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4787             (!xmlStrEqual(currentNode, ctxt->name)))
4788              {
4789             htmlParserFinishElementParsing(ctxt);
4790             if (currentNode != NULL) xmlFree(currentNode);
4791
4792             currentNode = xmlStrdup(ctxt->name);
4793             depth = ctxt->nameNr;
4794             continue;
4795         }
4796
4797         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4798             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4799             /*
4800              * Handle SCRIPT/STYLE separately
4801              */
4802             htmlParseScript(ctxt);
4803         }
4804
4805         else if ((CUR == '<') && (NXT(1) == '!')) {
4806             /*
4807              * Sometimes DOCTYPE arrives in the middle of the document
4808              */
4809             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4810                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4811                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4812                 (UPP(8) == 'E')) {
4813                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4814                              "Misplaced DOCTYPE declaration\n",
4815                              BAD_CAST "DOCTYPE" , NULL);
4816                 htmlParseDocTypeDecl(ctxt);
4817             }
4818             /*
4819              * First case :  a comment
4820              */
4821             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4822                 htmlParseComment(ctxt);
4823             }
4824             else {
4825                 htmlSkipBogusComment(ctxt);
4826             }
4827         }
4828
4829         /*
4830          * Second case : a Processing Instruction.
4831          */
4832         else if ((CUR == '<') && (NXT(1) == '?')) {
4833             htmlParsePI(ctxt);
4834         }
4835
4836         /*
4837          * Third case :  a sub-element.
4838          */
4839         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4840             htmlParseElementInternal(ctxt);
4841             if (currentNode != NULL) xmlFree(currentNode);
4842
4843             currentNode = xmlStrdup(ctxt->name);
4844             depth = ctxt->nameNr;
4845         }
4846         else if (CUR == '<') {
4847             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4848                 (ctxt->sax->characters != NULL))
4849                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4850             NEXT;
4851         }
4852
4853         /*
4854          * Fourth case : a reference. If if has not been resolved,
4855          *    parsing returns it's Name, create the node
4856          */
4857         else if (CUR == '&') {
4858             htmlParseReference(ctxt);
4859         }
4860
4861         /*
4862          * Fifth case : end of the resource
4863          */
4864         else if (CUR == 0) {
4865             htmlAutoCloseOnEnd(ctxt);
4866             break;
4867         }
4868
4869         /*
4870          * Last case, text. Note that References are handled directly.
4871          */
4872         else {
4873             htmlParseCharData(ctxt);
4874         }
4875         GROW;
4876     }
4877     if (currentNode != NULL) xmlFree(currentNode);
4878 }
4879
4880 /**
4881  * htmlParseContent:
4882  * @ctxt:  an HTML parser context
4883  *
4884  * Parse a content: comment, sub-element, reference or text.
4885  * This is the entry point when called from parser.c
4886  */
4887
4888 void
4889 __htmlParseContent(void *ctxt) {
4890     if (ctxt != NULL)
4891         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4892 }
4893
4894 /**
4895  * htmlParseDocument:
4896  * @ctxt:  an HTML parser context
4897  *
4898  * parse an HTML document (and build a tree if using the standard SAX
4899  * interface).
4900  *
4901  * Returns 0, -1 in case of error. the parser context is augmented
4902  *                as a result of the parsing.
4903  */
4904
4905 int
4906 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4907     xmlChar start[4];
4908     xmlCharEncoding enc;
4909     xmlDtdPtr dtd;
4910
4911     xmlInitParser();
4912
4913     htmlDefaultSAXHandlerInit();
4914
4915     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4916         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4917                      "htmlParseDocument: context error\n", NULL, NULL);
4918         return(XML_ERR_INTERNAL_ERROR);
4919     }
4920     ctxt->html = 1;
4921     ctxt->linenumbers = 1;
4922     GROW;
4923     /*
4924      * SAX: beginning of the document processing.
4925      */
4926     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4927         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4928
4929     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4930         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4931         /*
4932          * Get the 4 first bytes and decode the charset
4933          * if enc != XML_CHAR_ENCODING_NONE
4934          * plug some encoding conversion routines.
4935          */
4936         start[0] = RAW;
4937         start[1] = NXT(1);
4938         start[2] = NXT(2);
4939         start[3] = NXT(3);
4940         enc = xmlDetectCharEncoding(&start[0], 4);
4941         if (enc != XML_CHAR_ENCODING_NONE) {
4942             xmlSwitchEncoding(ctxt, enc);
4943         }
4944     }
4945
4946     /*
4947      * Wipe out everything which is before the first '<'
4948      */
4949     SKIP_BLANKS;
4950     if (CUR == 0) {
4951         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4952                      "Document is empty\n", NULL, NULL);
4953     }
4954
4955     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4956         ctxt->sax->startDocument(ctxt->userData);
4957
4958
4959     /*
4960      * Parse possible comments and PIs before any content
4961      */
4962     while (((CUR == '<') && (NXT(1) == '!') &&
4963             (NXT(2) == '-') && (NXT(3) == '-')) ||
4964            ((CUR == '<') && (NXT(1) == '?'))) {
4965         htmlParseComment(ctxt);
4966         htmlParsePI(ctxt);
4967         SKIP_BLANKS;
4968     }
4969
4970
4971     /*
4972      * Then possibly doc type declaration(s) and more Misc
4973      * (doctypedecl Misc*)?
4974      */
4975     if ((CUR == '<') && (NXT(1) == '!') &&
4976         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4977         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4978         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4979         (UPP(8) == 'E')) {
4980         htmlParseDocTypeDecl(ctxt);
4981     }
4982     SKIP_BLANKS;
4983
4984     /*
4985      * Parse possible comments and PIs before any content
4986      */
4987     while (((CUR == '<') && (NXT(1) == '!') &&
4988             (NXT(2) == '-') && (NXT(3) == '-')) ||
4989            ((CUR == '<') && (NXT(1) == '?'))) {
4990         htmlParseComment(ctxt);
4991         htmlParsePI(ctxt);
4992         SKIP_BLANKS;
4993     }
4994
4995     /*
4996      * Time to start parsing the tree itself
4997      */
4998     htmlParseContentInternal(ctxt);
4999
5000     /*
5001      * autoclose
5002      */
5003     if (CUR == 0)
5004         htmlAutoCloseOnEnd(ctxt);
5005
5006
5007     /*
5008      * SAX: end of the document processing.
5009      */
5010     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5011         ctxt->sax->endDocument(ctxt->userData);
5012
5013     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5014         dtd = xmlGetIntSubset(ctxt->myDoc);
5015         if (dtd == NULL)
5016             ctxt->myDoc->intSubset =
5017                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5018                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5019                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5020     }
5021     if (! ctxt->wellFormed) return(-1);
5022     return(0);
5023 }
5024
5025
5026 /************************************************************************
5027  *                                                                      *
5028  *                      Parser contexts handling                        *
5029  *                                                                      *
5030  ************************************************************************/
5031
5032 /**
5033  * htmlInitParserCtxt:
5034  * @ctxt:  an HTML parser context
5035  *
5036  * Initialize a parser context
5037  *
5038  * Returns 0 in case of success and -1 in case of error
5039  */
5040
5041 static int
5042 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5043 {
5044     htmlSAXHandler *sax;
5045
5046     if (ctxt == NULL) return(-1);
5047     memset(ctxt, 0, sizeof(htmlParserCtxt));
5048
5049     ctxt->dict = xmlDictCreate();
5050     if (ctxt->dict == NULL) {
5051         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5052         return(-1);
5053     }
5054     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5055     if (sax == NULL) {
5056         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057         return(-1);
5058     }
5059     else
5060         memset(sax, 0, sizeof(htmlSAXHandler));
5061
5062     /* Allocate the Input stack */
5063     ctxt->inputTab = (htmlParserInputPtr *)
5064                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5065     if (ctxt->inputTab == NULL) {
5066         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5067         ctxt->inputNr = 0;
5068         ctxt->inputMax = 0;
5069         ctxt->input = NULL;
5070         return(-1);
5071     }
5072     ctxt->inputNr = 0;
5073     ctxt->inputMax = 5;
5074     ctxt->input = NULL;
5075     ctxt->version = NULL;
5076     ctxt->encoding = NULL;
5077     ctxt->standalone = -1;
5078     ctxt->instate = XML_PARSER_START;
5079
5080     /* Allocate the Node stack */
5081     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5082     if (ctxt->nodeTab == NULL) {
5083         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5084         ctxt->nodeNr = 0;
5085         ctxt->nodeMax = 0;
5086         ctxt->node = NULL;
5087         ctxt->inputNr = 0;
5088         ctxt->inputMax = 0;
5089         ctxt->input = NULL;
5090         return(-1);
5091     }
5092     ctxt->nodeNr = 0;
5093     ctxt->nodeMax = 10;
5094     ctxt->node = NULL;
5095
5096     /* Allocate the Name stack */
5097     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5098     if (ctxt->nameTab == NULL) {
5099         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5100         ctxt->nameNr = 0;
5101         ctxt->nameMax = 0;
5102         ctxt->name = NULL;
5103         ctxt->nodeNr = 0;
5104         ctxt->nodeMax = 0;
5105         ctxt->node = NULL;
5106         ctxt->inputNr = 0;
5107         ctxt->inputMax = 0;
5108         ctxt->input = NULL;
5109         return(-1);
5110     }
5111     ctxt->nameNr = 0;
5112     ctxt->nameMax = 10;
5113     ctxt->name = NULL;
5114
5115     ctxt->nodeInfoTab = NULL;
5116     ctxt->nodeInfoNr  = 0;
5117     ctxt->nodeInfoMax = 0;
5118
5119     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5120     else {
5121         ctxt->sax = sax;
5122         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5123     }
5124     ctxt->userData = ctxt;
5125     ctxt->myDoc = NULL;
5126     ctxt->wellFormed = 1;
5127     ctxt->replaceEntities = 0;
5128     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5129     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5130     ctxt->html = 1;
5131     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5132     ctxt->vctxt.userData = ctxt;
5133     ctxt->vctxt.error = xmlParserValidityError;
5134     ctxt->vctxt.warning = xmlParserValidityWarning;
5135     ctxt->record_info = 0;
5136     ctxt->validate = 0;
5137     ctxt->checkIndex = 0;
5138     ctxt->catalogs = NULL;
5139     xmlInitNodeInfoSeq(&ctxt->node_seq);
5140     return(0);
5141 }
5142
5143 /**
5144  * htmlFreeParserCtxt:
5145  * @ctxt:  an HTML parser context
5146  *
5147  * Free all the memory used by a parser context. However the parsed
5148  * document in ctxt->myDoc is not freed.
5149  */
5150
5151 void
5152 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5153 {
5154     xmlFreeParserCtxt(ctxt);
5155 }
5156
5157 /**
5158  * htmlNewParserCtxt:
5159  *
5160  * Allocate and initialize a new parser context.
5161  *
5162  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5163  */
5164
5165 htmlParserCtxtPtr
5166 htmlNewParserCtxt(void)
5167 {
5168     xmlParserCtxtPtr ctxt;
5169
5170     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5171     if (ctxt == NULL) {
5172         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5173         return(NULL);
5174     }
5175     memset(ctxt, 0, sizeof(xmlParserCtxt));
5176     if (htmlInitParserCtxt(ctxt) < 0) {
5177         htmlFreeParserCtxt(ctxt);
5178         return(NULL);
5179     }
5180     return(ctxt);
5181 }
5182
5183 /**
5184  * htmlCreateMemoryParserCtxt:
5185  * @buffer:  a pointer to a char array
5186  * @size:  the size of the array
5187  *
5188  * Create a parser context for an HTML in-memory document.
5189  *
5190  * Returns the new parser context or NULL
5191  */
5192 htmlParserCtxtPtr
5193 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5194     xmlParserCtxtPtr ctxt;
5195     xmlParserInputPtr input;
5196     xmlParserInputBufferPtr buf;
5197
5198     if (buffer == NULL)
5199         return(NULL);
5200     if (size <= 0)
5201         return(NULL);
5202
5203     ctxt = htmlNewParserCtxt();
5204     if (ctxt == NULL)
5205         return(NULL);
5206
5207     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5208     if (buf == NULL) return(NULL);
5209
5210     input = xmlNewInputStream(ctxt);
5211     if (input == NULL) {
5212         xmlFreeParserInputBuffer(buf);
5213         xmlFreeParserCtxt(ctxt);
5214         return(NULL);
5215     }
5216
5217     input->filename = NULL;
5218     input->buf = buf;
5219     xmlBufResetInput(buf->buffer, input);
5220
5221     inputPush(ctxt, input);
5222     return(ctxt);
5223 }
5224
5225 /**
5226  * htmlCreateDocParserCtxt:
5227  * @cur:  a pointer to an array of xmlChar
5228  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5229  *
5230  * Create a parser context for an HTML document.
5231  *
5232  * TODO: check the need to add encoding handling there
5233  *
5234  * Returns the new parser context or NULL
5235  */
5236 static htmlParserCtxtPtr
5237 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5238     int len;
5239     htmlParserCtxtPtr ctxt;
5240
5241     if (cur == NULL)
5242         return(NULL);
5243     len = xmlStrlen(cur);
5244     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5245     if (ctxt == NULL)
5246         return(NULL);
5247
5248     if (encoding != NULL) {
5249         xmlCharEncoding enc;
5250         xmlCharEncodingHandlerPtr handler;
5251
5252         if (ctxt->input->encoding != NULL)
5253             xmlFree((xmlChar *) ctxt->input->encoding);
5254         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5255
5256         enc = xmlParseCharEncoding(encoding);
5257         /*
5258          * registered set of known encodings
5259          */
5260         if (enc != XML_CHAR_ENCODING_ERROR) {
5261             xmlSwitchEncoding(ctxt, enc);
5262             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5263                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5264                              "Unsupported encoding %s\n",
5265                              (const xmlChar *) encoding, NULL);
5266             }
5267         } else {
5268             /*
5269              * fallback for unknown encodings
5270              */
5271             handler = xmlFindCharEncodingHandler((const char *) encoding);
5272             if (handler != NULL) {
5273                 xmlSwitchToEncoding(ctxt, handler);
5274             } else {
5275                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5276                              "Unsupported encoding %s\n",
5277                              (const xmlChar *) encoding, NULL);
5278             }
5279         }
5280     }
5281     return(ctxt);
5282 }
5283
5284 #ifdef LIBXML_PUSH_ENABLED
5285 /************************************************************************
5286  *                                                                      *
5287  *      Progressive parsing interfaces                          *
5288  *                                                                      *
5289  ************************************************************************/
5290
5291 /**
5292  * htmlParseLookupSequence:
5293  * @ctxt:  an HTML parser context
5294  * @first:  the first char to lookup
5295  * @next:  the next char to lookup or zero
5296  * @third:  the next char to lookup or zero
5297  * @ignoreattrval: skip over attribute values
5298  *
5299  * Try to find if a sequence (first, next, third) or  just (first next) or
5300  * (first) is available in the input stream.
5301  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5302  * to avoid rescanning sequences of bytes, it DOES change the state of the
5303  * parser, do not use liberally.
5304  * This is basically similar to xmlParseLookupSequence()
5305  *
5306  * Returns the index to the current parsing point if the full sequence
5307  *      is available, -1 otherwise.
5308  */
5309 static int
5310 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5311                         xmlChar next, xmlChar third, int ignoreattrval)
5312 {
5313     int base, len;
5314     htmlParserInputPtr in;
5315     const xmlChar *buf;
5316     int invalue = 0;
5317     char valdellim = 0x0;
5318
5319     in = ctxt->input;
5320     if (in == NULL)
5321         return (-1);
5322
5323     base = in->cur - in->base;
5324     if (base < 0)
5325         return (-1);
5326
5327     if (ctxt->checkIndex > base) {
5328         base = ctxt->checkIndex;
5329         /* Abuse hasPErefs member to restore current state. */
5330         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5331     }
5332
5333     if (in->buf == NULL) {
5334         buf = in->base;
5335         len = in->length;
5336     } else {
5337         buf = xmlBufContent(in->buf->buffer);
5338         len = xmlBufUse(in->buf->buffer);
5339     }
5340
5341     /* take into account the sequence length */
5342     if (third)
5343         len -= 2;
5344     else if (next)
5345         len--;
5346     for (; base < len; base++) {
5347         if (ignoreattrval) {
5348             if (buf[base] == '"' || buf[base] == '\'') {
5349                 if (invalue) {
5350                     if (buf[base] == valdellim) {
5351                         invalue = 0;
5352                         continue;
5353                     }
5354                 } else {
5355                     valdellim = buf[base];
5356                     invalue = 1;
5357                     continue;
5358                 }
5359             } else if (invalue) {
5360                 continue;
5361             }
5362         }
5363         if (buf[base] == first) {
5364             if (third != 0) {
5365                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5366                     continue;
5367             } else if (next != 0) {
5368                 if (buf[base + 1] != next)
5369                     continue;
5370             }
5371             ctxt->checkIndex = 0;
5372 #ifdef DEBUG_PUSH
5373             if (next == 0)
5374                 xmlGenericError(xmlGenericErrorContext,
5375                                 "HPP: lookup '%c' found at %d\n",
5376                                 first, base);
5377             else if (third == 0)
5378                 xmlGenericError(xmlGenericErrorContext,
5379                                 "HPP: lookup '%c%c' found at %d\n",
5380                                 first, next, base);
5381             else
5382                 xmlGenericError(xmlGenericErrorContext,
5383                                 "HPP: lookup '%c%c%c' found at %d\n",
5384                                 first, next, third, base);
5385 #endif
5386             return (base - (in->cur - in->base));
5387         }
5388     }
5389     ctxt->checkIndex = base;
5390     /* Abuse hasPErefs member to track current state. */
5391     if (invalue)
5392         ctxt->hasPErefs |= 1;
5393     else
5394         ctxt->hasPErefs &= ~1;
5395 #ifdef DEBUG_PUSH
5396     if (next == 0)
5397         xmlGenericError(xmlGenericErrorContext,
5398                         "HPP: lookup '%c' failed\n", first);
5399     else if (third == 0)
5400         xmlGenericError(xmlGenericErrorContext,
5401                         "HPP: lookup '%c%c' failed\n", first, next);
5402     else
5403         xmlGenericError(xmlGenericErrorContext,
5404                         "HPP: lookup '%c%c%c' failed\n", first, next,
5405                         third);
5406 #endif
5407     return (-1);
5408 }
5409
5410 /**
5411  * htmlParseLookupCommentEnd:
5412  * @ctxt: an HTML parser context
5413  *
5414  * Try to find a comment end tag in the input stream
5415  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5416  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5417  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5418  * to avoid rescanning sequences of bytes, it DOES change the state of the
5419  * parser, do not use liberally.
5420  * This wraps to htmlParseLookupSequence()
5421  *
5422  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5423  */
5424 static int
5425 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5426 {
5427     int mark = 0;
5428     int cur = CUR_PTR - BASE_PTR;
5429
5430     while (mark >= 0) {
5431         mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5432         if ((mark < 0) ||
5433             (NXT(mark+2) == '>') ||
5434             ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5435             return mark;
5436         }
5437         ctxt->checkIndex = cur + mark + 1;
5438     }
5439     return mark;
5440 }
5441
5442
5443 /**
5444  * htmlParseTryOrFinish:
5445  * @ctxt:  an HTML parser context
5446  * @terminate:  last chunk indicator
5447  *
5448  * Try to progress on parsing
5449  *
5450  * Returns zero if no parsing was possible
5451  */
5452 static int
5453 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5454     int ret = 0;
5455     htmlParserInputPtr in;
5456     ptrdiff_t avail = 0;
5457     xmlChar cur, next;
5458
5459     htmlParserNodeInfo node_info;
5460
5461 #ifdef DEBUG_PUSH
5462     switch (ctxt->instate) {
5463         case XML_PARSER_EOF:
5464             xmlGenericError(xmlGenericErrorContext,
5465                     "HPP: try EOF\n"); break;
5466         case XML_PARSER_START:
5467             xmlGenericError(xmlGenericErrorContext,
5468                     "HPP: try START\n"); break;
5469         case XML_PARSER_MISC:
5470             xmlGenericError(xmlGenericErrorContext,
5471                     "HPP: try MISC\n");break;
5472         case XML_PARSER_COMMENT:
5473             xmlGenericError(xmlGenericErrorContext,
5474                     "HPP: try COMMENT\n");break;
5475         case XML_PARSER_PROLOG:
5476             xmlGenericError(xmlGenericErrorContext,
5477                     "HPP: try PROLOG\n");break;
5478         case XML_PARSER_START_TAG:
5479             xmlGenericError(xmlGenericErrorContext,
5480                     "HPP: try START_TAG\n");break;
5481         case XML_PARSER_CONTENT:
5482             xmlGenericError(xmlGenericErrorContext,
5483                     "HPP: try CONTENT\n");break;
5484         case XML_PARSER_CDATA_SECTION:
5485             xmlGenericError(xmlGenericErrorContext,
5486                     "HPP: try CDATA_SECTION\n");break;
5487         case XML_PARSER_END_TAG:
5488             xmlGenericError(xmlGenericErrorContext,
5489                     "HPP: try END_TAG\n");break;
5490         case XML_PARSER_ENTITY_DECL:
5491             xmlGenericError(xmlGenericErrorContext,
5492                     "HPP: try ENTITY_DECL\n");break;
5493         case XML_PARSER_ENTITY_VALUE:
5494             xmlGenericError(xmlGenericErrorContext,
5495                     "HPP: try ENTITY_VALUE\n");break;
5496         case XML_PARSER_ATTRIBUTE_VALUE:
5497             xmlGenericError(xmlGenericErrorContext,
5498                     "HPP: try ATTRIBUTE_VALUE\n");break;
5499         case XML_PARSER_DTD:
5500             xmlGenericError(xmlGenericErrorContext,
5501                     "HPP: try DTD\n");break;
5502         case XML_PARSER_EPILOG:
5503             xmlGenericError(xmlGenericErrorContext,
5504                     "HPP: try EPILOG\n");break;
5505         case XML_PARSER_PI:
5506             xmlGenericError(xmlGenericErrorContext,
5507                     "HPP: try PI\n");break;
5508         case XML_PARSER_SYSTEM_LITERAL:
5509             xmlGenericError(xmlGenericErrorContext,
5510                     "HPP: try SYSTEM_LITERAL\n");break;
5511     }
5512 #endif
5513
5514     while (1) {
5515
5516         in = ctxt->input;
5517         if (in == NULL) break;
5518         if (in->buf == NULL)
5519             avail = in->length - (in->cur - in->base);
5520         else
5521             avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5522                     (in->cur - in->base);
5523         if ((avail == 0) && (terminate)) {
5524             htmlAutoCloseOnEnd(ctxt);
5525             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5526                 /*
5527                  * SAX: end of the document processing.
5528                  */
5529                 ctxt->instate = XML_PARSER_EOF;
5530                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5531                     ctxt->sax->endDocument(ctxt->userData);
5532             }
5533         }
5534         if (avail < 1)
5535             goto done;
5536         /*
5537          * This is done to make progress and avoid an infinite loop
5538          * if a parsing attempt was aborted by hitting a NUL byte. After
5539          * changing htmlCurrentChar, this probably isn't necessary anymore.
5540          * We should consider removing this check.
5541          */
5542         cur = in->cur[0];
5543         if (cur == 0) {
5544             SKIP(1);
5545             continue;
5546         }
5547
5548         switch (ctxt->instate) {
5549             case XML_PARSER_EOF:
5550                 /*
5551                  * Document parsing is done !
5552                  */
5553                 goto done;
5554             case XML_PARSER_START:
5555                 /*
5556                  * Very first chars read from the document flow.
5557                  */
5558                 cur = in->cur[0];
5559                 if (IS_BLANK_CH(cur)) {
5560                     SKIP_BLANKS;
5561                     if (in->buf == NULL)
5562                         avail = in->length - (in->cur - in->base);
5563                     else
5564                         avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5565                                 (in->cur - in->base);
5566                 }
5567                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5568                     ctxt->sax->setDocumentLocator(ctxt->userData,
5569                                                   &xmlDefaultSAXLocator);
5570                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5571                     (!ctxt->disableSAX))
5572                     ctxt->sax->startDocument(ctxt->userData);
5573
5574                 cur = in->cur[0];
5575                 next = in->cur[1];
5576                 if ((cur == '<') && (next == '!') &&
5577                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5578                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5579                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5580                     (UPP(8) == 'E')) {
5581                     if ((!terminate) &&
5582                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5583                         goto done;
5584 #ifdef DEBUG_PUSH
5585                     xmlGenericError(xmlGenericErrorContext,
5586                             "HPP: Parsing internal subset\n");
5587 #endif
5588                     htmlParseDocTypeDecl(ctxt);
5589                     ctxt->instate = XML_PARSER_PROLOG;
5590 #ifdef DEBUG_PUSH
5591                     xmlGenericError(xmlGenericErrorContext,
5592                             "HPP: entering PROLOG\n");
5593 #endif
5594                 } else {
5595                     ctxt->instate = XML_PARSER_MISC;
5596 #ifdef DEBUG_PUSH
5597                     xmlGenericError(xmlGenericErrorContext,
5598                             "HPP: entering MISC\n");
5599 #endif
5600                 }
5601                 break;
5602             case XML_PARSER_MISC:
5603                 SKIP_BLANKS;
5604                 if (in->buf == NULL)
5605                     avail = in->length - (in->cur - in->base);
5606                 else
5607                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5608                             (in->cur - in->base);
5609                 /*
5610                  * no chars in buffer
5611                  */
5612                 if (avail < 1)
5613                     goto done;
5614                 /*
5615                  * not enough chars in buffer
5616                  */
5617                 if (avail < 2) {
5618                     if (!terminate)
5619                         goto done;
5620                     else
5621                         next = ' ';
5622                 } else {
5623                     next = in->cur[1];
5624                 }
5625                 cur = in->cur[0];
5626                 if ((cur == '<') && (next == '!') &&
5627                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5628                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5629                         goto done;
5630 #ifdef DEBUG_PUSH
5631                     xmlGenericError(xmlGenericErrorContext,
5632                             "HPP: Parsing Comment\n");
5633 #endif
5634                     htmlParseComment(ctxt);
5635                     ctxt->instate = XML_PARSER_MISC;
5636                 } else if ((cur == '<') && (next == '?')) {
5637                     if ((!terminate) &&
5638                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5639                         goto done;
5640 #ifdef DEBUG_PUSH
5641                     xmlGenericError(xmlGenericErrorContext,
5642                             "HPP: Parsing PI\n");
5643 #endif
5644                     htmlParsePI(ctxt);
5645                     ctxt->instate = XML_PARSER_MISC;
5646                 } else if ((cur == '<') && (next == '!') &&
5647                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5648                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5649                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5650                     (UPP(8) == 'E')) {
5651                     if ((!terminate) &&
5652                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5653                         goto done;
5654 #ifdef DEBUG_PUSH
5655                     xmlGenericError(xmlGenericErrorContext,
5656                             "HPP: Parsing internal subset\n");
5657 #endif
5658                     htmlParseDocTypeDecl(ctxt);
5659                     ctxt->instate = XML_PARSER_PROLOG;
5660 #ifdef DEBUG_PUSH
5661                     xmlGenericError(xmlGenericErrorContext,
5662                             "HPP: entering PROLOG\n");
5663 #endif
5664                 } else if ((cur == '<') && (next == '!') &&
5665                            (avail < 9)) {
5666                     goto done;
5667                 } else {
5668                     ctxt->instate = XML_PARSER_CONTENT;
5669 #ifdef DEBUG_PUSH
5670                     xmlGenericError(xmlGenericErrorContext,
5671                             "HPP: entering START_TAG\n");
5672 #endif
5673                 }
5674                 break;
5675             case XML_PARSER_PROLOG:
5676                 SKIP_BLANKS;
5677                 if (in->buf == NULL)
5678                     avail = in->length - (in->cur - in->base);
5679                 else
5680                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5681                             (in->cur - in->base);
5682                 if (avail < 2)
5683                     goto done;
5684                 cur = in->cur[0];
5685                 next = in->cur[1];
5686                 if ((cur == '<') && (next == '!') &&
5687                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5688                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5689                         goto done;
5690 #ifdef DEBUG_PUSH
5691                     xmlGenericError(xmlGenericErrorContext,
5692                             "HPP: Parsing Comment\n");
5693 #endif
5694                     htmlParseComment(ctxt);
5695                     ctxt->instate = XML_PARSER_PROLOG;
5696                 } else if ((cur == '<') && (next == '?')) {
5697                     if ((!terminate) &&
5698                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5699                         goto done;
5700 #ifdef DEBUG_PUSH
5701                     xmlGenericError(xmlGenericErrorContext,
5702                             "HPP: Parsing PI\n");
5703 #endif
5704                     htmlParsePI(ctxt);
5705                     ctxt->instate = XML_PARSER_PROLOG;
5706                 } else if ((cur == '<') && (next == '!') &&
5707                            (avail < 4)) {
5708                     goto done;
5709                 } else {
5710                     ctxt->instate = XML_PARSER_CONTENT;
5711 #ifdef DEBUG_PUSH
5712                     xmlGenericError(xmlGenericErrorContext,
5713                             "HPP: entering START_TAG\n");
5714 #endif
5715                 }
5716                 break;
5717             case XML_PARSER_EPILOG:
5718                 if (in->buf == NULL)
5719                     avail = in->length - (in->cur - in->base);
5720                 else
5721                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5722                             (in->cur - in->base);
5723                 if (avail < 1)
5724                     goto done;
5725                 cur = in->cur[0];
5726                 if (IS_BLANK_CH(cur)) {
5727                     htmlParseCharData(ctxt);
5728                     goto done;
5729                 }
5730                 if (avail < 2)
5731                     goto done;
5732                 next = in->cur[1];
5733                 if ((cur == '<') && (next == '!') &&
5734                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5735                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5736                         goto done;
5737 #ifdef DEBUG_PUSH
5738                     xmlGenericError(xmlGenericErrorContext,
5739                             "HPP: Parsing Comment\n");
5740 #endif
5741                     htmlParseComment(ctxt);
5742                     ctxt->instate = XML_PARSER_EPILOG;
5743                 } else if ((cur == '<') && (next == '?')) {
5744                     if ((!terminate) &&
5745                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5746                         goto done;
5747 #ifdef DEBUG_PUSH
5748                     xmlGenericError(xmlGenericErrorContext,
5749                             "HPP: Parsing PI\n");
5750 #endif
5751                     htmlParsePI(ctxt);
5752                     ctxt->instate = XML_PARSER_EPILOG;
5753                 } else if ((cur == '<') && (next == '!') &&
5754                            (avail < 4)) {
5755                     goto done;
5756                 } else {
5757                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5758                     ctxt->wellFormed = 0;
5759                     ctxt->instate = XML_PARSER_EOF;
5760 #ifdef DEBUG_PUSH
5761                     xmlGenericError(xmlGenericErrorContext,
5762                             "HPP: entering EOF\n");
5763 #endif
5764                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5765                         ctxt->sax->endDocument(ctxt->userData);
5766                     goto done;
5767                 }
5768                 break;
5769             case XML_PARSER_START_TAG: {
5770                 const xmlChar *name;
5771                 int failed;
5772                 const htmlElemDesc * info;
5773
5774                 /*
5775                  * no chars in buffer
5776                  */
5777                 if (avail < 1)
5778                     goto done;
5779                 /*
5780                  * not enough chars in buffer
5781                  */
5782                 if (avail < 2) {
5783                     if (!terminate)
5784                         goto done;
5785                     else
5786                         next = ' ';
5787                 } else {
5788                     next = in->cur[1];
5789                 }
5790                 cur = in->cur[0];
5791                 if (cur != '<') {
5792                     ctxt->instate = XML_PARSER_CONTENT;
5793 #ifdef DEBUG_PUSH
5794                     xmlGenericError(xmlGenericErrorContext,
5795                             "HPP: entering CONTENT\n");
5796 #endif
5797                     break;
5798                 }
5799                 if (next == '/') {
5800                     ctxt->instate = XML_PARSER_END_TAG;
5801                     ctxt->checkIndex = 0;
5802 #ifdef DEBUG_PUSH
5803                     xmlGenericError(xmlGenericErrorContext,
5804                             "HPP: entering END_TAG\n");
5805 #endif
5806                     break;
5807                 }
5808                 if ((!terminate) &&
5809                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5810                     goto done;
5811
5812                 /* Capture start position */
5813                 if (ctxt->record_info) {
5814                      node_info.begin_pos = ctxt->input->consumed +
5815                                         (CUR_PTR - ctxt->input->base);
5816                      node_info.begin_line = ctxt->input->line;
5817                 }
5818
5819
5820                 failed = htmlParseStartTag(ctxt);
5821                 name = ctxt->name;
5822                 if ((failed == -1) ||
5823                     (name == NULL)) {
5824                     if (CUR == '>')
5825                         NEXT;
5826                     break;
5827                 }
5828
5829                 /*
5830                  * Lookup the info for that element.
5831                  */
5832                 info = htmlTagLookup(name);
5833                 if (info == NULL) {
5834                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5835                                  "Tag %s invalid\n", name, NULL);
5836                 }
5837
5838                 /*
5839                  * Check for an Empty Element labeled the XML/SGML way
5840                  */
5841                 if ((CUR == '/') && (NXT(1) == '>')) {
5842                     SKIP(2);
5843                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5844                         ctxt->sax->endElement(ctxt->userData, name);
5845                     htmlnamePop(ctxt);
5846                     ctxt->instate = XML_PARSER_CONTENT;
5847 #ifdef DEBUG_PUSH
5848                     xmlGenericError(xmlGenericErrorContext,
5849                             "HPP: entering CONTENT\n");
5850 #endif
5851                     break;
5852                 }
5853
5854                 if (CUR == '>') {
5855                     NEXT;
5856                 } else {
5857                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5858                                  "Couldn't find end of Start Tag %s\n",
5859                                  name, NULL);
5860
5861                     /*
5862                      * end of parsing of this node.
5863                      */
5864                     if (xmlStrEqual(name, ctxt->name)) {
5865                         nodePop(ctxt);
5866                         htmlnamePop(ctxt);
5867                     }
5868
5869                     if (ctxt->record_info)
5870                         htmlNodeInfoPush(ctxt, &node_info);
5871
5872                     ctxt->instate = XML_PARSER_CONTENT;
5873 #ifdef DEBUG_PUSH
5874                     xmlGenericError(xmlGenericErrorContext,
5875                             "HPP: entering CONTENT\n");
5876 #endif
5877                     break;
5878                 }
5879
5880                 /*
5881                  * Check for an Empty Element from DTD definition
5882                  */
5883                 if ((info != NULL) && (info->empty)) {
5884                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5885                         ctxt->sax->endElement(ctxt->userData, name);
5886                     htmlnamePop(ctxt);
5887                 }
5888
5889                 if (ctxt->record_info)
5890                     htmlNodeInfoPush(ctxt, &node_info);
5891
5892                 ctxt->instate = XML_PARSER_CONTENT;
5893 #ifdef DEBUG_PUSH
5894                 xmlGenericError(xmlGenericErrorContext,
5895                         "HPP: entering CONTENT\n");
5896 #endif
5897                 break;
5898             }
5899             case XML_PARSER_CONTENT: {
5900                 xmlChar chr[2] = { 0, 0 };
5901
5902                 /*
5903                  * Handle preparsed entities and charRef
5904                  */
5905                 if (ctxt->token != 0) {
5906                     chr[0] = (xmlChar) ctxt->token;
5907                     htmlCheckParagraph(ctxt);
5908                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5909                         ctxt->sax->characters(ctxt->userData, chr, 1);
5910                     ctxt->token = 0;
5911                     ctxt->checkIndex = 0;
5912                 }
5913                 if ((avail == 1) && (terminate)) {
5914                     cur = in->cur[0];
5915                     if ((cur != '<') && (cur != '&')) {
5916                         if (ctxt->sax != NULL) {
5917                             chr[0] = cur;
5918                             if (IS_BLANK_CH(cur)) {
5919                                 if (ctxt->keepBlanks) {
5920                                     if (ctxt->sax->characters != NULL)
5921                                         ctxt->sax->characters(
5922                                                 ctxt->userData, chr, 1);
5923                                 } else {
5924                                     if (ctxt->sax->ignorableWhitespace != NULL)
5925                                         ctxt->sax->ignorableWhitespace(
5926                                                 ctxt->userData, chr, 1);
5927                                 }
5928                             } else {
5929                                 htmlCheckParagraph(ctxt);
5930                                 if (ctxt->sax->characters != NULL)
5931                                     ctxt->sax->characters(
5932                                             ctxt->userData, chr, 1);
5933                             }
5934                         }
5935                         ctxt->token = 0;
5936                         ctxt->checkIndex = 0;
5937                         in->cur++;
5938                         break;
5939                     }
5940                 }
5941                 if (avail < 2)
5942                     goto done;
5943                 cur = in->cur[0];
5944                 next = in->cur[1];
5945                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5946                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5947                     /*
5948                      * Handle SCRIPT/STYLE separately
5949                      */
5950                     if (!terminate) {
5951                         int idx;
5952                         xmlChar val;
5953
5954                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5955                         if (idx < 0)
5956                             goto done;
5957                         val = in->cur[idx + 2];
5958                         if (val == 0) /* bad cut of input */
5959                             goto done;
5960                     }
5961                     htmlParseScript(ctxt);
5962                     if ((cur == '<') && (next == '/')) {
5963                         ctxt->instate = XML_PARSER_END_TAG;
5964                         ctxt->checkIndex = 0;
5965 #ifdef DEBUG_PUSH
5966                         xmlGenericError(xmlGenericErrorContext,
5967                                 "HPP: entering END_TAG\n");
5968 #endif
5969                         break;
5970                     }
5971                 } else if ((cur == '<') && (next == '!')) {
5972                     /*
5973                      * Sometimes DOCTYPE arrives in the middle of the document
5974                      */
5975                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5976                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5977                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5978                         (UPP(8) == 'E')) {
5979                         if ((!terminate) &&
5980                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5981                             goto done;
5982                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5983                                      "Misplaced DOCTYPE declaration\n",
5984                                      BAD_CAST "DOCTYPE" , NULL);
5985                         htmlParseDocTypeDecl(ctxt);
5986                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5987                         if ((!terminate) &&
5988                             (htmlParseLookupCommentEnd(ctxt) < 0))
5989                             goto done;
5990 #ifdef DEBUG_PUSH
5991                         xmlGenericError(xmlGenericErrorContext,
5992                                 "HPP: Parsing Comment\n");
5993 #endif
5994                         htmlParseComment(ctxt);
5995                         ctxt->instate = XML_PARSER_CONTENT;
5996                     } else {
5997                         if ((!terminate) &&
5998                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5999                             goto done;
6000                         htmlSkipBogusComment(ctxt);
6001                     }
6002                 } else if ((cur == '<') && (next == '?')) {
6003                     if ((!terminate) &&
6004                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6005                         goto done;
6006 #ifdef DEBUG_PUSH
6007                     xmlGenericError(xmlGenericErrorContext,
6008                             "HPP: Parsing PI\n");
6009 #endif
6010                     htmlParsePI(ctxt);
6011                     ctxt->instate = XML_PARSER_CONTENT;
6012                 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
6013                     goto done;
6014                 } else if ((cur == '<') && (next == '/')) {
6015                     ctxt->instate = XML_PARSER_END_TAG;
6016                     ctxt->checkIndex = 0;
6017 #ifdef DEBUG_PUSH
6018                     xmlGenericError(xmlGenericErrorContext,
6019                             "HPP: entering END_TAG\n");
6020 #endif
6021                     break;
6022                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6023                     if ((!terminate) && (next == 0))
6024                         goto done;
6025                     ctxt->instate = XML_PARSER_START_TAG;
6026                     ctxt->checkIndex = 0;
6027 #ifdef DEBUG_PUSH
6028                     xmlGenericError(xmlGenericErrorContext,
6029                             "HPP: entering START_TAG\n");
6030 #endif
6031                     break;
6032                 } else if (cur == '<') {
6033                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6034                         (ctxt->sax->characters != NULL))
6035                         ctxt->sax->characters(ctxt->userData,
6036                                               BAD_CAST "<", 1);
6037                     NEXT;
6038                 } else {
6039                     /*
6040                      * check that the text sequence is complete
6041                      * before handing out the data to the parser
6042                      * to avoid problems with erroneous end of
6043                      * data detection.
6044                      */
6045                     if ((!terminate) &&
6046                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6047                         goto done;
6048                     ctxt->checkIndex = 0;
6049 #ifdef DEBUG_PUSH
6050                     xmlGenericError(xmlGenericErrorContext,
6051                             "HPP: Parsing char data\n");
6052 #endif
6053                     while ((ctxt->instate != XML_PARSER_EOF) &&
6054                            (cur != '<') && (in->cur < in->end)) {
6055                         if (cur == '&') {
6056                             htmlParseReference(ctxt);
6057                         } else {
6058                             htmlParseCharData(ctxt);
6059                         }
6060                         cur = in->cur[0];
6061                     }
6062                 }
6063
6064                 break;
6065             }
6066             case XML_PARSER_END_TAG:
6067                 if (avail < 2)
6068                     goto done;
6069                 if ((!terminate) &&
6070                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6071                     goto done;
6072                 htmlParseEndTag(ctxt);
6073                 if (ctxt->nameNr == 0) {
6074                     ctxt->instate = XML_PARSER_EPILOG;
6075                 } else {
6076                     ctxt->instate = XML_PARSER_CONTENT;
6077                 }
6078                 ctxt->checkIndex = 0;
6079 #ifdef DEBUG_PUSH
6080                 xmlGenericError(xmlGenericErrorContext,
6081                         "HPP: entering CONTENT\n");
6082 #endif
6083                 break;
6084             case XML_PARSER_CDATA_SECTION:
6085                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6086                         "HPP: internal error, state == CDATA\n",
6087                              NULL, NULL);
6088                 ctxt->instate = XML_PARSER_CONTENT;
6089                 ctxt->checkIndex = 0;
6090 #ifdef DEBUG_PUSH
6091                 xmlGenericError(xmlGenericErrorContext,
6092                         "HPP: entering CONTENT\n");
6093 #endif
6094                 break;
6095             case XML_PARSER_DTD:
6096                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6097                         "HPP: internal error, state == DTD\n",
6098                              NULL, NULL);
6099                 ctxt->instate = XML_PARSER_CONTENT;
6100                 ctxt->checkIndex = 0;
6101 #ifdef DEBUG_PUSH
6102                 xmlGenericError(xmlGenericErrorContext,
6103                         "HPP: entering CONTENT\n");
6104 #endif
6105                 break;
6106             case XML_PARSER_COMMENT:
6107                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6108                         "HPP: internal error, state == COMMENT\n",
6109                              NULL, NULL);
6110                 ctxt->instate = XML_PARSER_CONTENT;
6111                 ctxt->checkIndex = 0;
6112 #ifdef DEBUG_PUSH
6113                 xmlGenericError(xmlGenericErrorContext,
6114                         "HPP: entering CONTENT\n");
6115 #endif
6116                 break;
6117             case XML_PARSER_PI:
6118                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6119                         "HPP: internal error, state == PI\n",
6120                              NULL, NULL);
6121                 ctxt->instate = XML_PARSER_CONTENT;
6122                 ctxt->checkIndex = 0;
6123 #ifdef DEBUG_PUSH
6124                 xmlGenericError(xmlGenericErrorContext,
6125                         "HPP: entering CONTENT\n");
6126 #endif
6127                 break;
6128             case XML_PARSER_ENTITY_DECL:
6129                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6130                         "HPP: internal error, state == ENTITY_DECL\n",
6131                              NULL, NULL);
6132                 ctxt->instate = XML_PARSER_CONTENT;
6133                 ctxt->checkIndex = 0;
6134 #ifdef DEBUG_PUSH
6135                 xmlGenericError(xmlGenericErrorContext,
6136                         "HPP: entering CONTENT\n");
6137 #endif
6138                 break;
6139             case XML_PARSER_ENTITY_VALUE:
6140                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6141                         "HPP: internal error, state == ENTITY_VALUE\n",
6142                              NULL, NULL);
6143                 ctxt->instate = XML_PARSER_CONTENT;
6144                 ctxt->checkIndex = 0;
6145 #ifdef DEBUG_PUSH
6146                 xmlGenericError(xmlGenericErrorContext,
6147                         "HPP: entering DTD\n");
6148 #endif
6149                 break;
6150             case XML_PARSER_ATTRIBUTE_VALUE:
6151                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6152                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6153                              NULL, NULL);
6154                 ctxt->instate = XML_PARSER_START_TAG;
6155                 ctxt->checkIndex = 0;
6156 #ifdef DEBUG_PUSH
6157                 xmlGenericError(xmlGenericErrorContext,
6158                         "HPP: entering START_TAG\n");
6159 #endif
6160                 break;
6161             case XML_PARSER_SYSTEM_LITERAL:
6162                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6163                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6164                              NULL, NULL);
6165                 ctxt->instate = XML_PARSER_CONTENT;
6166                 ctxt->checkIndex = 0;
6167 #ifdef DEBUG_PUSH
6168                 xmlGenericError(xmlGenericErrorContext,
6169                         "HPP: entering CONTENT\n");
6170 #endif
6171                 break;
6172             case XML_PARSER_IGNORE:
6173                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6174                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6175                              NULL, NULL);
6176                 ctxt->instate = XML_PARSER_CONTENT;
6177                 ctxt->checkIndex = 0;
6178 #ifdef DEBUG_PUSH
6179                 xmlGenericError(xmlGenericErrorContext,
6180                         "HPP: entering CONTENT\n");
6181 #endif
6182                 break;
6183             case XML_PARSER_PUBLIC_LITERAL:
6184                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6185                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6186                              NULL, NULL);
6187                 ctxt->instate = XML_PARSER_CONTENT;
6188                 ctxt->checkIndex = 0;
6189 #ifdef DEBUG_PUSH
6190                 xmlGenericError(xmlGenericErrorContext,
6191                         "HPP: entering CONTENT\n");
6192 #endif
6193                 break;
6194
6195         }
6196     }
6197 done:
6198     if ((avail == 0) && (terminate)) {
6199         htmlAutoCloseOnEnd(ctxt);
6200         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6201             /*
6202              * SAX: end of the document processing.
6203              */
6204             ctxt->instate = XML_PARSER_EOF;
6205             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6206                 ctxt->sax->endDocument(ctxt->userData);
6207         }
6208     }
6209     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6210         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6211          (ctxt->instate == XML_PARSER_EPILOG))) {
6212         xmlDtdPtr dtd;
6213         dtd = xmlGetIntSubset(ctxt->myDoc);
6214         if (dtd == NULL)
6215             ctxt->myDoc->intSubset =
6216                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6217                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6218                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6219     }
6220 #ifdef DEBUG_PUSH
6221     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6222 #endif
6223     return(ret);
6224 }
6225
6226 /**
6227  * htmlParseChunk:
6228  * @ctxt:  an HTML parser context
6229  * @chunk:  an char array
6230  * @size:  the size in byte of the chunk
6231  * @terminate:  last chunk indicator
6232  *
6233  * Parse a Chunk of memory
6234  *
6235  * Returns zero if no error, the xmlParserErrors otherwise.
6236  */
6237 int
6238 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6239               int terminate) {
6240     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6241         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6242                      "htmlParseChunk: context error\n", NULL, NULL);
6243         return(XML_ERR_INTERNAL_ERROR);
6244     }
6245     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6246         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6247         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6248         size_t cur = ctxt->input->cur - ctxt->input->base;
6249         int res;
6250
6251         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6252         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6253         if (res < 0) {
6254             ctxt->errNo = XML_PARSER_EOF;
6255             ctxt->disableSAX = 1;
6256             return (XML_PARSER_EOF);
6257         }
6258 #ifdef DEBUG_PUSH
6259         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6260 #endif
6261
6262 #if 0
6263         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6264             htmlParseTryOrFinish(ctxt, terminate);
6265 #endif
6266     } else if (ctxt->instate != XML_PARSER_EOF) {
6267         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6268             xmlParserInputBufferPtr in = ctxt->input->buf;
6269             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6270                     (in->raw != NULL)) {
6271                 int nbchars;
6272                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6273                 size_t current = ctxt->input->cur - ctxt->input->base;
6274
6275                 nbchars = xmlCharEncInput(in, terminate);
6276                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6277                 if (nbchars < 0) {
6278                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6279                                  "encoder error\n", NULL, NULL);
6280                     return(XML_ERR_INVALID_ENCODING);
6281                 }
6282             }
6283         }
6284     }
6285     htmlParseTryOrFinish(ctxt, terminate);
6286     if (terminate) {
6287         if ((ctxt->instate != XML_PARSER_EOF) &&
6288             (ctxt->instate != XML_PARSER_EPILOG) &&
6289             (ctxt->instate != XML_PARSER_MISC)) {
6290             ctxt->errNo = XML_ERR_DOCUMENT_END;
6291             ctxt->wellFormed = 0;
6292         }
6293         if (ctxt->instate != XML_PARSER_EOF) {
6294             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6295                 ctxt->sax->endDocument(ctxt->userData);
6296         }
6297         ctxt->instate = XML_PARSER_EOF;
6298     }
6299     return((xmlParserErrors) ctxt->errNo);
6300 }
6301
6302 /************************************************************************
6303  *                                                                      *
6304  *                      User entry points                               *
6305  *                                                                      *
6306  ************************************************************************/
6307
6308 /**
6309  * htmlCreatePushParserCtxt:
6310  * @sax:  a SAX handler
6311  * @user_data:  The user data returned on SAX callbacks
6312  * @chunk:  a pointer to an array of chars
6313  * @size:  number of chars in the array
6314  * @filename:  an optional file name or URI
6315  * @enc:  an optional encoding
6316  *
6317  * Create a parser context for using the HTML parser in push mode
6318  * The value of @filename is used for fetching external entities
6319  * and error/warning reports.
6320  *
6321  * Returns the new parser context or NULL
6322  */
6323 htmlParserCtxtPtr
6324 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6325                          const char *chunk, int size, const char *filename,
6326                          xmlCharEncoding enc) {
6327     htmlParserCtxtPtr ctxt;
6328     htmlParserInputPtr inputStream;
6329     xmlParserInputBufferPtr buf;
6330
6331     xmlInitParser();
6332
6333     buf = xmlAllocParserInputBuffer(enc);
6334     if (buf == NULL) return(NULL);
6335
6336     ctxt = htmlNewParserCtxt();
6337     if (ctxt == NULL) {
6338         xmlFreeParserInputBuffer(buf);
6339         return(NULL);
6340     }
6341     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6342         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6343     if (sax != NULL) {
6344         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6345             xmlFree(ctxt->sax);
6346         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6347         if (ctxt->sax == NULL) {
6348             xmlFree(buf);
6349             xmlFree(ctxt);
6350             return(NULL);
6351         }
6352         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6353         if (user_data != NULL)
6354             ctxt->userData = user_data;
6355     }
6356     if (filename == NULL) {
6357         ctxt->directory = NULL;
6358     } else {
6359         ctxt->directory = xmlParserGetDirectory(filename);
6360     }
6361
6362     inputStream = htmlNewInputStream(ctxt);
6363     if (inputStream == NULL) {
6364         xmlFreeParserCtxt(ctxt);
6365         xmlFree(buf);
6366         return(NULL);
6367     }
6368
6369     if (filename == NULL)
6370         inputStream->filename = NULL;
6371     else
6372         inputStream->filename = (char *)
6373             xmlCanonicPath((const xmlChar *) filename);
6374     inputStream->buf = buf;
6375     xmlBufResetInput(buf->buffer, inputStream);
6376
6377     inputPush(ctxt, inputStream);
6378
6379     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6380         (ctxt->input->buf != NULL))  {
6381         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6382         size_t cur = ctxt->input->cur - ctxt->input->base;
6383
6384         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6385
6386         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6387 #ifdef DEBUG_PUSH
6388         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6389 #endif
6390     }
6391     ctxt->progressive = 1;
6392
6393     return(ctxt);
6394 }
6395 #endif /* LIBXML_PUSH_ENABLED */
6396
6397 /**
6398  * htmlSAXParseDoc:
6399  * @cur:  a pointer to an array of xmlChar
6400  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6401  * @sax:  the SAX handler block
6402  * @userData: if using SAX, this pointer will be provided on callbacks.
6403  *
6404  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6405  * to handle parse events. If sax is NULL, fallback to the default DOM
6406  * behavior and return a tree.
6407  *
6408  * Returns the resulting document tree unless SAX is NULL or the document is
6409  *     not well formed.
6410  */
6411
6412 htmlDocPtr
6413 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6414                 htmlSAXHandlerPtr sax, void *userData) {
6415     htmlDocPtr ret;
6416     htmlParserCtxtPtr ctxt;
6417
6418     xmlInitParser();
6419
6420     if (cur == NULL) return(NULL);
6421
6422
6423     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6424     if (ctxt == NULL) return(NULL);
6425     if (sax != NULL) {
6426         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6427         ctxt->sax = sax;
6428         ctxt->userData = userData;
6429     }
6430
6431     htmlParseDocument(ctxt);
6432     ret = ctxt->myDoc;
6433     if (sax != NULL) {
6434         ctxt->sax = NULL;
6435         ctxt->userData = NULL;
6436     }
6437     htmlFreeParserCtxt(ctxt);
6438
6439     return(ret);
6440 }
6441
6442 /**
6443  * htmlParseDoc:
6444  * @cur:  a pointer to an array of xmlChar
6445  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6446  *
6447  * parse an HTML in-memory document and build a tree.
6448  *
6449  * Returns the resulting document tree
6450  */
6451
6452 htmlDocPtr
6453 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6454     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6455 }
6456
6457
6458 /**
6459  * htmlCreateFileParserCtxt:
6460  * @filename:  the filename
6461  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6462  *
6463  * Create a parser context for a file content.
6464  * Automatic support for ZLIB/Compress compressed document is provided
6465  * by default if found at compile-time.
6466  *
6467  * Returns the new parser context or NULL
6468  */
6469 htmlParserCtxtPtr
6470 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6471 {
6472     htmlParserCtxtPtr ctxt;
6473     htmlParserInputPtr inputStream;
6474     char *canonicFilename;
6475     /* htmlCharEncoding enc; */
6476     xmlChar *content, *content_line = (xmlChar *) "charset=";
6477
6478     if (filename == NULL)
6479         return(NULL);
6480
6481     ctxt = htmlNewParserCtxt();
6482     if (ctxt == NULL) {
6483         return(NULL);
6484     }
6485     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6486     if (canonicFilename == NULL) {
6487 #ifdef LIBXML_SAX1_ENABLED
6488         if (xmlDefaultSAXHandler.error != NULL) {
6489             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6490         }
6491 #endif
6492         xmlFreeParserCtxt(ctxt);
6493         return(NULL);
6494     }
6495
6496     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6497     xmlFree(canonicFilename);
6498     if (inputStream == NULL) {
6499         xmlFreeParserCtxt(ctxt);
6500         return(NULL);
6501     }
6502
6503     inputPush(ctxt, inputStream);
6504
6505     /* set encoding */
6506     if (encoding) {
6507         size_t l = strlen(encoding);
6508
6509         if (l < 1000) {
6510             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6511             if (content) {
6512                 strcpy ((char *)content, (char *)content_line);
6513                 strcat ((char *)content, (char *)encoding);
6514                 htmlCheckEncoding (ctxt, content);
6515                 xmlFree (content);
6516             }
6517         }
6518     }
6519
6520     return(ctxt);
6521 }
6522
6523 /**
6524  * htmlSAXParseFile:
6525  * @filename:  the filename
6526  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6527  * @sax:  the SAX handler block
6528  * @userData: if using SAX, this pointer will be provided on callbacks.
6529  *
6530  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6531  * compressed document is provided by default if found at compile-time.
6532  * It use the given SAX function block to handle the parsing callback.
6533  * If sax is NULL, fallback to the default DOM tree building routines.
6534  *
6535  * Returns the resulting document tree unless SAX is NULL or the document is
6536  *     not well formed.
6537  */
6538
6539 htmlDocPtr
6540 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6541                  void *userData) {
6542     htmlDocPtr ret;
6543     htmlParserCtxtPtr ctxt;
6544     htmlSAXHandlerPtr oldsax = NULL;
6545
6546     xmlInitParser();
6547
6548     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6549     if (ctxt == NULL) return(NULL);
6550     if (sax != NULL) {
6551         oldsax = ctxt->sax;
6552         ctxt->sax = sax;
6553         ctxt->userData = userData;
6554     }
6555
6556     htmlParseDocument(ctxt);
6557
6558     ret = ctxt->myDoc;
6559     if (sax != NULL) {
6560         ctxt->sax = oldsax;
6561         ctxt->userData = NULL;
6562     }
6563     htmlFreeParserCtxt(ctxt);
6564
6565     return(ret);
6566 }
6567
6568 /**
6569  * htmlParseFile:
6570  * @filename:  the filename
6571  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6572  *
6573  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6574  * compressed document is provided by default if found at compile-time.
6575  *
6576  * Returns the resulting document tree
6577  */
6578
6579 htmlDocPtr
6580 htmlParseFile(const char *filename, const char *encoding) {
6581     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6582 }
6583
6584 /**
6585  * htmlHandleOmittedElem:
6586  * @val:  int 0 or 1
6587  *
6588  * Set and return the previous value for handling HTML omitted tags.
6589  *
6590  * Returns the last value for 0 for no handling, 1 for auto insertion.
6591  */
6592
6593 int
6594 htmlHandleOmittedElem(int val) {
6595     int old = htmlOmittedDefaultValue;
6596
6597     htmlOmittedDefaultValue = val;
6598     return(old);
6599 }
6600
6601 /**
6602  * htmlElementAllowedHere:
6603  * @parent: HTML parent element
6604  * @elt: HTML element
6605  *
6606  * Checks whether an HTML element may be a direct child of a parent element.
6607  * Note - doesn't check for deprecated elements
6608  *
6609  * Returns 1 if allowed; 0 otherwise.
6610  */
6611 int
6612 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6613   const char** p ;
6614
6615   if ( ! elt || ! parent || ! parent->subelts )
6616         return 0 ;
6617
6618   for ( p = parent->subelts; *p; ++p )
6619     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6620       return 1 ;
6621
6622   return 0 ;
6623 }
6624 /**
6625  * htmlElementStatusHere:
6626  * @parent: HTML parent element
6627  * @elt: HTML element
6628  *
6629  * Checks whether an HTML element may be a direct child of a parent element.
6630  * and if so whether it is valid or deprecated.
6631  *
6632  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6633  */
6634 htmlStatus
6635 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6636   if ( ! parent || ! elt )
6637     return HTML_INVALID ;
6638   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6639     return HTML_INVALID ;
6640
6641   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6642 }
6643 /**
6644  * htmlAttrAllowed:
6645  * @elt: HTML element
6646  * @attr: HTML attribute
6647  * @legacy: whether to allow deprecated attributes
6648  *
6649  * Checks whether an attribute is valid for an element
6650  * Has full knowledge of Required and Deprecated attributes
6651  *
6652  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6653  */
6654 htmlStatus
6655 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6656   const char** p ;
6657
6658   if ( !elt || ! attr )
6659         return HTML_INVALID ;
6660
6661   if ( elt->attrs_req )
6662     for ( p = elt->attrs_req; *p; ++p)
6663       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6664         return HTML_REQUIRED ;
6665
6666   if ( elt->attrs_opt )
6667     for ( p = elt->attrs_opt; *p; ++p)
6668       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6669         return HTML_VALID ;
6670
6671   if ( legacy && elt->attrs_depr )
6672     for ( p = elt->attrs_depr; *p; ++p)
6673       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6674         return HTML_DEPRECATED ;
6675
6676   return HTML_INVALID ;
6677 }
6678 /**
6679  * htmlNodeStatus:
6680  * @node: an htmlNodePtr in a tree
6681  * @legacy: whether to allow deprecated elements (YES is faster here
6682  *      for Element nodes)
6683  *
6684  * Checks whether the tree node is valid.  Experimental (the author
6685  *     only uses the HTML enhancements in a SAX parser)
6686  *
6687  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6688  *      legacy allowed) or htmlElementStatusHere (otherwise).
6689  *      for Attribute nodes, a return from htmlAttrAllowed
6690  *      for other nodes, HTML_NA (no checks performed)
6691  */
6692 htmlStatus
6693 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6694   if ( ! node )
6695     return HTML_INVALID ;
6696
6697   switch ( node->type ) {
6698     case XML_ELEMENT_NODE:
6699       return legacy
6700         ? ( htmlElementAllowedHere (
6701                 htmlTagLookup(node->parent->name) , node->name
6702                 ) ? HTML_VALID : HTML_INVALID )
6703         : htmlElementStatusHere(
6704                 htmlTagLookup(node->parent->name) ,
6705                 htmlTagLookup(node->name) )
6706         ;
6707     case XML_ATTRIBUTE_NODE:
6708       return htmlAttrAllowed(
6709         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6710     default: return HTML_NA ;
6711   }
6712 }
6713 /************************************************************************
6714  *                                                                      *
6715  *      New set (2.6.0) of simpler and more flexible APIs               *
6716  *                                                                      *
6717  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope (a variable named "dict" must be visible where the
 * macro is expanded; it may be NULL, in which case @str is always freed).
 * Nothing happens when @str is NULL.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: it must be followed by a semicolon and is safe to
 * use as the unbraced branch of an if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
6729
6730 /**
6731  * htmlCtxtReset:
6732  * @ctxt: an HTML parser context
6733  *
6734  * Reset a parser context
6735  */
6736 void
6737 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6738 {
6739     xmlParserInputPtr input;
6740     xmlDictPtr dict;
6741
6742     if (ctxt == NULL)
6743         return;
6744
6745     xmlInitParser();
6746     dict = ctxt->dict;
6747
6748     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6749         xmlFreeInputStream(input);
6750     }
6751     ctxt->inputNr = 0;
6752     ctxt->input = NULL;
6753
6754     ctxt->spaceNr = 0;
6755     if (ctxt->spaceTab != NULL) {
6756         ctxt->spaceTab[0] = -1;
6757         ctxt->space = &ctxt->spaceTab[0];
6758     } else {
6759         ctxt->space = NULL;
6760     }
6761
6762
6763     ctxt->nodeNr = 0;
6764     ctxt->node = NULL;
6765
6766     ctxt->nameNr = 0;
6767     ctxt->name = NULL;
6768
6769     ctxt->nsNr = 0;
6770
6771     DICT_FREE(ctxt->version);
6772     ctxt->version = NULL;
6773     DICT_FREE(ctxt->encoding);
6774     ctxt->encoding = NULL;
6775     DICT_FREE(ctxt->directory);
6776     ctxt->directory = NULL;
6777     DICT_FREE(ctxt->extSubURI);
6778     ctxt->extSubURI = NULL;
6779     DICT_FREE(ctxt->extSubSystem);
6780     ctxt->extSubSystem = NULL;
6781     if (ctxt->myDoc != NULL)
6782         xmlFreeDoc(ctxt->myDoc);
6783     ctxt->myDoc = NULL;
6784
6785     ctxt->standalone = -1;
6786     ctxt->hasExternalSubset = 0;
6787     ctxt->hasPErefs = 0;
6788     ctxt->html = 1;
6789     ctxt->external = 0;
6790     ctxt->instate = XML_PARSER_START;
6791     ctxt->token = 0;
6792
6793     ctxt->wellFormed = 1;
6794     ctxt->nsWellFormed = 1;
6795     ctxt->disableSAX = 0;
6796     ctxt->valid = 1;
6797     ctxt->vctxt.userData = ctxt;
6798     ctxt->vctxt.error = xmlParserValidityError;
6799     ctxt->vctxt.warning = xmlParserValidityWarning;
6800     ctxt->record_info = 0;
6801     ctxt->checkIndex = 0;
6802     ctxt->inSubset = 0;
6803     ctxt->errNo = XML_ERR_OK;
6804     ctxt->depth = 0;
6805     ctxt->charset = XML_CHAR_ENCODING_NONE;
6806     ctxt->catalogs = NULL;
6807     xmlInitNodeInfoSeq(&ctxt->node_seq);
6808
6809     if (ctxt->attsDefault != NULL) {
6810         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6811         ctxt->attsDefault = NULL;
6812     }
6813     if (ctxt->attsSpecial != NULL) {
6814         xmlHashFree(ctxt->attsSpecial, NULL);
6815         ctxt->attsSpecial = NULL;
6816     }
6817 }
6818
6819 /**
6820  * htmlCtxtUseOptions:
6821  * @ctxt: an HTML parser context
6822  * @options:  a combination of htmlParserOption(s)
6823  *
6824  * Applies the options to the parser context
6825  *
6826  * Returns 0 in case of success, the set of unknown or unimplemented options
6827  *         in case of error.
6828  */
6829 int
6830 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6831 {
6832     if (ctxt == NULL)
6833         return(-1);
6834
6835     if (options & HTML_PARSE_NOWARNING) {
6836         ctxt->sax->warning = NULL;
6837         ctxt->vctxt.warning = NULL;
6838         options -= XML_PARSE_NOWARNING;
6839         ctxt->options |= XML_PARSE_NOWARNING;
6840     }
6841     if (options & HTML_PARSE_NOERROR) {
6842         ctxt->sax->error = NULL;
6843         ctxt->vctxt.error = NULL;
6844         ctxt->sax->fatalError = NULL;
6845         options -= XML_PARSE_NOERROR;
6846         ctxt->options |= XML_PARSE_NOERROR;
6847     }
6848     if (options & HTML_PARSE_PEDANTIC) {
6849         ctxt->pedantic = 1;
6850         options -= XML_PARSE_PEDANTIC;
6851         ctxt->options |= XML_PARSE_PEDANTIC;
6852     } else
6853         ctxt->pedantic = 0;
6854     if (options & XML_PARSE_NOBLANKS) {
6855         ctxt->keepBlanks = 0;
6856         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6857         options -= XML_PARSE_NOBLANKS;
6858         ctxt->options |= XML_PARSE_NOBLANKS;
6859     } else
6860         ctxt->keepBlanks = 1;
6861     if (options & HTML_PARSE_RECOVER) {
6862         ctxt->recovery = 1;
6863         options -= HTML_PARSE_RECOVER;
6864     } else
6865         ctxt->recovery = 0;
6866     if (options & HTML_PARSE_COMPACT) {
6867         ctxt->options |= HTML_PARSE_COMPACT;
6868         options -= HTML_PARSE_COMPACT;
6869     }
6870     if (options & XML_PARSE_HUGE) {
6871         ctxt->options |= XML_PARSE_HUGE;
6872         options -= XML_PARSE_HUGE;
6873     }
6874     if (options & HTML_PARSE_NODEFDTD) {
6875         ctxt->options |= HTML_PARSE_NODEFDTD;
6876         options -= HTML_PARSE_NODEFDTD;
6877     }
6878     if (options & HTML_PARSE_IGNORE_ENC) {
6879         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6880         options -= HTML_PARSE_IGNORE_ENC;
6881     }
6882     if (options & HTML_PARSE_NOIMPLIED) {
6883         ctxt->options |= HTML_PARSE_NOIMPLIED;
6884         options -= HTML_PARSE_NOIMPLIED;
6885     }
6886     ctxt->dictNames = 0;
6887     return (options);
6888 }
6889
6890 /**
6891  * htmlDoRead:
6892  * @ctxt:  an HTML parser context
6893  * @URL:  the base URL to use for the document
6894  * @encoding:  the document encoding, or NULL
6895  * @options:  a combination of htmlParserOption(s)
6896  * @reuse:  keep the context for reuse
6897  *
6898  * Common front-end for the htmlRead functions
6899  *
6900  * Returns the resulting document tree or NULL
6901  */
6902 static htmlDocPtr
6903 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6904           int options, int reuse)
6905 {
6906     htmlDocPtr ret;
6907
6908     htmlCtxtUseOptions(ctxt, options);
6909     ctxt->html = 1;
6910     if (encoding != NULL) {
6911         xmlCharEncodingHandlerPtr hdlr;
6912
6913         hdlr = xmlFindCharEncodingHandler(encoding);
6914         if (hdlr != NULL) {
6915             xmlSwitchToEncoding(ctxt, hdlr);
6916             if (ctxt->input->encoding != NULL)
6917               xmlFree((xmlChar *) ctxt->input->encoding);
6918             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6919         }
6920     }
6921     if ((URL != NULL) && (ctxt->input != NULL) &&
6922         (ctxt->input->filename == NULL))
6923         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6924     htmlParseDocument(ctxt);
6925     ret = ctxt->myDoc;
6926     ctxt->myDoc = NULL;
6927     if (!reuse) {
6928         if ((ctxt->dictNames) &&
6929             (ret != NULL) &&
6930             (ret->dict == ctxt->dict))
6931             ctxt->dict = NULL;
6932         xmlFreeParserCtxt(ctxt);
6933     }
6934     return (ret);
6935 }
6936
6937 /**
6938  * htmlReadDoc:
6939  * @cur:  a pointer to a zero terminated string
6940  * @URL:  the base URL to use for the document
6941  * @encoding:  the document encoding, or NULL
6942  * @options:  a combination of htmlParserOption(s)
6943  *
6944  * parse an XML in-memory document and build a tree.
6945  *
6946  * Returns the resulting document tree
6947  */
6948 htmlDocPtr
6949 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6950 {
6951     htmlParserCtxtPtr ctxt;
6952
6953     if (cur == NULL)
6954         return (NULL);
6955
6956     xmlInitParser();
6957     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6958     if (ctxt == NULL)
6959         return (NULL);
6960     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6961 }
6962
6963 /**
6964  * htmlReadFile:
6965  * @filename:  a file or URL
6966  * @encoding:  the document encoding, or NULL
6967  * @options:  a combination of htmlParserOption(s)
6968  *
6969  * parse an XML file from the filesystem or the network.
6970  *
6971  * Returns the resulting document tree
6972  */
6973 htmlDocPtr
6974 htmlReadFile(const char *filename, const char *encoding, int options)
6975 {
6976     htmlParserCtxtPtr ctxt;
6977
6978     xmlInitParser();
6979     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6980     if (ctxt == NULL)
6981         return (NULL);
6982     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6983 }
6984
6985 /**
6986  * htmlReadMemory:
6987  * @buffer:  a pointer to a char array
6988  * @size:  the size of the array
6989  * @URL:  the base URL to use for the document
6990  * @encoding:  the document encoding, or NULL
6991  * @options:  a combination of htmlParserOption(s)
6992  *
6993  * parse an XML in-memory document and build a tree.
6994  *
6995  * Returns the resulting document tree
6996  */
6997 htmlDocPtr
6998 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6999 {
7000     htmlParserCtxtPtr ctxt;
7001
7002     xmlInitParser();
7003     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
7004     if (ctxt == NULL)
7005         return (NULL);
7006     htmlDefaultSAXHandlerInit();
7007     if (ctxt->sax != NULL)
7008         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
7009     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7010 }
7011
7012 /**
7013  * htmlReadFd:
7014  * @fd:  an open file descriptor
7015  * @URL:  the base URL to use for the document
7016  * @encoding:  the document encoding, or NULL
7017  * @options:  a combination of htmlParserOption(s)
7018  *
7019  * parse an HTML from a file descriptor and build a tree.
7020  * NOTE that the file descriptor will not be closed when the
7021  *      reader is closed or reset.
7022  *
7023  * Returns the resulting document tree
7024  */
7025 htmlDocPtr
7026 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7027 {
7028     htmlParserCtxtPtr ctxt;
7029     xmlParserInputBufferPtr input;
7030     htmlParserInputPtr stream;
7031
7032     if (fd < 0)
7033         return (NULL);
7034
7035     xmlInitParser();
7036     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7037     if (input == NULL)
7038         return (NULL);
7039     input->closecallback = NULL;
7040     ctxt = htmlNewParserCtxt();
7041     if (ctxt == NULL) {
7042         xmlFreeParserInputBuffer(input);
7043         return (NULL);
7044     }
7045     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7046     if (stream == NULL) {
7047         xmlFreeParserInputBuffer(input);
7048         htmlFreeParserCtxt(ctxt);
7049         return (NULL);
7050     }
7051     inputPush(ctxt, stream);
7052     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7053 }
7054
7055 /**
7056  * htmlReadIO:
7057  * @ioread:  an I/O read function
7058  * @ioclose:  an I/O close function
7059  * @ioctx:  an I/O handler
7060  * @URL:  the base URL to use for the document
7061  * @encoding:  the document encoding, or NULL
7062  * @options:  a combination of htmlParserOption(s)
7063  *
7064  * parse an HTML document from I/O functions and source and build a tree.
7065  *
7066  * Returns the resulting document tree
7067  */
7068 htmlDocPtr
7069 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7070           void *ioctx, const char *URL, const char *encoding, int options)
7071 {
7072     htmlParserCtxtPtr ctxt;
7073     xmlParserInputBufferPtr input;
7074     xmlParserInputPtr stream;
7075
7076     if (ioread == NULL)
7077         return (NULL);
7078     xmlInitParser();
7079
7080     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7081                                          XML_CHAR_ENCODING_NONE);
7082     if (input == NULL) {
7083         if (ioclose != NULL)
7084             ioclose(ioctx);
7085         return (NULL);
7086     }
7087     ctxt = htmlNewParserCtxt();
7088     if (ctxt == NULL) {
7089         xmlFreeParserInputBuffer(input);
7090         return (NULL);
7091     }
7092     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7093     if (stream == NULL) {
7094         xmlFreeParserInputBuffer(input);
7095         xmlFreeParserCtxt(ctxt);
7096         return (NULL);
7097     }
7098     inputPush(ctxt, stream);
7099     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7100 }
7101
7102 /**
7103  * htmlCtxtReadDoc:
7104  * @ctxt:  an HTML parser context
7105  * @cur:  a pointer to a zero terminated string
7106  * @URL:  the base URL to use for the document
7107  * @encoding:  the document encoding, or NULL
7108  * @options:  a combination of htmlParserOption(s)
7109  *
7110  * parse an XML in-memory document and build a tree.
7111  * This reuses the existing @ctxt parser context
7112  *
7113  * Returns the resulting document tree
7114  */
7115 htmlDocPtr
7116 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7117                const char *URL, const char *encoding, int options)
7118 {
7119     xmlParserInputPtr stream;
7120
7121     if (cur == NULL)
7122         return (NULL);
7123     if (ctxt == NULL)
7124         return (NULL);
7125     xmlInitParser();
7126
7127     htmlCtxtReset(ctxt);
7128
7129     stream = xmlNewStringInputStream(ctxt, cur);
7130     if (stream == NULL) {
7131         return (NULL);
7132     }
7133     inputPush(ctxt, stream);
7134     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7135 }
7136
7137 /**
7138  * htmlCtxtReadFile:
7139  * @ctxt:  an HTML parser context
7140  * @filename:  a file or URL
7141  * @encoding:  the document encoding, or NULL
7142  * @options:  a combination of htmlParserOption(s)
7143  *
7144  * parse an XML file from the filesystem or the network.
7145  * This reuses the existing @ctxt parser context
7146  *
7147  * Returns the resulting document tree
7148  */
7149 htmlDocPtr
7150 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7151                 const char *encoding, int options)
7152 {
7153     xmlParserInputPtr stream;
7154
7155     if (filename == NULL)
7156         return (NULL);
7157     if (ctxt == NULL)
7158         return (NULL);
7159     xmlInitParser();
7160
7161     htmlCtxtReset(ctxt);
7162
7163     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7164     if (stream == NULL) {
7165         return (NULL);
7166     }
7167     inputPush(ctxt, stream);
7168     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7169 }
7170
7171 /**
7172  * htmlCtxtReadMemory:
7173  * @ctxt:  an HTML parser context
7174  * @buffer:  a pointer to a char array
7175  * @size:  the size of the array
7176  * @URL:  the base URL to use for the document
7177  * @encoding:  the document encoding, or NULL
7178  * @options:  a combination of htmlParserOption(s)
7179  *
7180  * parse an XML in-memory document and build a tree.
7181  * This reuses the existing @ctxt parser context
7182  *
7183  * Returns the resulting document tree
7184  */
7185 htmlDocPtr
7186 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7187                   const char *URL, const char *encoding, int options)
7188 {
7189     xmlParserInputBufferPtr input;
7190     xmlParserInputPtr stream;
7191
7192     if (ctxt == NULL)
7193         return (NULL);
7194     if (buffer == NULL)
7195         return (NULL);
7196     xmlInitParser();
7197
7198     htmlCtxtReset(ctxt);
7199
7200     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7201     if (input == NULL) {
7202         return(NULL);
7203     }
7204
7205     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7206     if (stream == NULL) {
7207         xmlFreeParserInputBuffer(input);
7208         return(NULL);
7209     }
7210
7211     inputPush(ctxt, stream);
7212     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7213 }
7214
7215 /**
7216  * htmlCtxtReadFd:
7217  * @ctxt:  an HTML parser context
7218  * @fd:  an open file descriptor
7219  * @URL:  the base URL to use for the document
7220  * @encoding:  the document encoding, or NULL
7221  * @options:  a combination of htmlParserOption(s)
7222  *
7223  * parse an XML from a file descriptor and build a tree.
7224  * This reuses the existing @ctxt parser context
7225  *
7226  * Returns the resulting document tree
7227  */
7228 htmlDocPtr
7229 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7230               const char *URL, const char *encoding, int options)
7231 {
7232     xmlParserInputBufferPtr input;
7233     xmlParserInputPtr stream;
7234
7235     if (fd < 0)
7236         return (NULL);
7237     if (ctxt == NULL)
7238         return (NULL);
7239     xmlInitParser();
7240
7241     htmlCtxtReset(ctxt);
7242
7243
7244     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7245     if (input == NULL)
7246         return (NULL);
7247     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7248     if (stream == NULL) {
7249         xmlFreeParserInputBuffer(input);
7250         return (NULL);
7251     }
7252     inputPush(ctxt, stream);
7253     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7254 }
7255
7256 /**
7257  * htmlCtxtReadIO:
7258  * @ctxt:  an HTML parser context
7259  * @ioread:  an I/O read function
7260  * @ioclose:  an I/O close function
7261  * @ioctx:  an I/O handler
7262  * @URL:  the base URL to use for the document
7263  * @encoding:  the document encoding, or NULL
7264  * @options:  a combination of htmlParserOption(s)
7265  *
7266  * parse an HTML document from I/O functions and source and build a tree.
7267  * This reuses the existing @ctxt parser context
7268  *
7269  * Returns the resulting document tree
7270  */
7271 htmlDocPtr
7272 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7273               xmlInputCloseCallback ioclose, void *ioctx,
7274               const char *URL,
7275               const char *encoding, int options)
7276 {
7277     xmlParserInputBufferPtr input;
7278     xmlParserInputPtr stream;
7279
7280     if (ioread == NULL)
7281         return (NULL);
7282     if (ctxt == NULL)
7283         return (NULL);
7284     xmlInitParser();
7285
7286     htmlCtxtReset(ctxt);
7287
7288     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7289                                          XML_CHAR_ENCODING_NONE);
7290     if (input == NULL) {
7291         if (ioclose != NULL)
7292             ioclose(ioctx);
7293         return (NULL);
7294     }
7295     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7296     if (stream == NULL) {
7297         xmlFreeParserInputBuffer(input);
7298         return (NULL);
7299     }
7300     inputPush(ctxt, stream);
7301     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7302 }
7303
7304 #endif /* LIBXML_HTML_ENABLED */