libs/xml2/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #include <ctype.h>
  15 #include <stdlib.h>
  16
  17 #include <libxml/xmlmemory.h>
  18 #include <libxml/tree.h>
  19 #include <libxml/parser.h>
  20 #include <libxml/parserInternals.h>
  21 #include <libxml/xmlerror.h>
  22 #include <libxml/HTMLparser.h>
  23 #include <libxml/HTMLtree.h>
  24 #include <libxml/entities.h>
  25 #include <libxml/encoding.h>
  26 #include <libxml/valid.h>
  27 #include <libxml/xmlIO.h>
  28 #include <libxml/globals.h>
  29 #include <libxml/uri.h>
  30
  31 #include "buf.h"
  32 #include "enc.h"
  33
  34 #define HTML_MAX_NAMELEN 1000
  35 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  36 #define HTML_PARSER_BUFFER_SIZE 100
  37
  38 /* #define DEBUG */
  39 /* #define DEBUG_PUSH */
  40
  41 static int htmlOmittedDefaultValue = 1;
  42
  43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  44                              xmlChar end, xmlChar  end2, xmlChar end3);
  45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  46
  47 /************************************************************************
  48  *                                                                      *
  49  *              Some factorized error routines                          *
  50  *                                                                      *
  51  ************************************************************************/
  52
  53 /**
  54  * htmlErrMemory:
  55  * @ctxt:  an HTML parser context
  56  * @extra:  extra information
  57  *
  58  * Handle a redefinition of attribute error
  59  */
  60 static void
  61 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  62 {
  63     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  64         (ctxt->instate == XML_PARSER_EOF))
  65         return;
  66     if (ctxt != NULL) {
  67         ctxt->errNo = XML_ERR_NO_MEMORY;
  68         ctxt->instate = XML_PARSER_EOF;
  69         ctxt->disableSAX = 1;
  70     }
  71     if (extra)
  72         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  73                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  74                         NULL, NULL, 0, 0,
  75                         "Memory allocation failed : %s\n", extra);
  76     else
  77         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  78                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  79                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  80 }
  81
  82 /**
  83  * htmlParseErr:
  84  * @ctxt:  an HTML parser context
  85  * @error:  the error number
  86  * @msg:  the error message
  87  * @str1:  string infor
  88  * @str2:  string infor
  89  *
  90  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  91  */
  92 static void LIBXML_ATTR_FORMAT(3,0)
  93 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  94              const char *msg, const xmlChar *str1, const xmlChar *str2)
  95 {
  96     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  97         (ctxt->instate == XML_PARSER_EOF))
  98         return;
  99     if (ctxt != NULL)
 100         ctxt->errNo = error;
 101     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 102                     XML_ERR_ERROR, NULL, 0,
 103                     (const char *) str1, (const char *) str2,
 104                     NULL, 0, 0,
 105                     msg, str1, str2);
 106     if (ctxt != NULL)
 107         ctxt->wellFormed = 0;
 108 }
 109
 110 /**
 111  * htmlParseErrInt:
 112  * @ctxt:  an HTML parser context
 113  * @error:  the error number
 114  * @msg:  the error message
 115  * @val:  integer info
 116  *
 117  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 118  */
 119 static void LIBXML_ATTR_FORMAT(3,0)
 120 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 121              const char *msg, int val)
 122 {
 123     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 124         (ctxt->instate == XML_PARSER_EOF))
 125         return;
 126     if (ctxt != NULL)
 127         ctxt->errNo = error;
 128     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 129                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 130                     NULL, val, 0, msg, val);
 131     if (ctxt != NULL)
 132         ctxt->wellFormed = 0;
 133 }
 134
 135 /************************************************************************
 136  *                                                                      *
 137  *      Parser stacks related functions and macros              *
 138  *                                                                      *
 139  ************************************************************************/
 140
 141 /**
 142  * htmlnamePush:
 143  * @ctxt:  an HTML parser context
 144  * @value:  the element name
 145  *
 146  * Pushes a new element name on top of the name stack
 147  *
 148  * Returns 0 in case of error, the index in the stack otherwise
 149  */
 150 static int
 151 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 152 {
 153     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 154         ctxt->html = 3;
 155     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 156         ctxt->html = 10;
 157     if (ctxt->nameNr >= ctxt->nameMax) {
 158         ctxt->nameMax *= 2;
 159         ctxt->nameTab = (const xmlChar * *)
 160                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 161                                     ctxt->nameMax *
 162                                     sizeof(ctxt->nameTab[0]));
 163         if (ctxt->nameTab == NULL) {
 164             htmlErrMemory(ctxt, NULL);
 165             return (0);
 166         }
 167     }
 168     ctxt->nameTab[ctxt->nameNr] = value;
 169     ctxt->name = value;
 170     return (ctxt->nameNr++);
 171 }
 172 /**
 173  * htmlnamePop:
 174  * @ctxt: an HTML parser context
 175  *
 176  * Pops the top element name from the name stack
 177  *
 178  * Returns the name just removed
 179  */
 180 static const xmlChar *
 181 htmlnamePop(htmlParserCtxtPtr ctxt)
 182 {
 183     const xmlChar *ret;
 184
 185     if (ctxt->nameNr <= 0)
 186         return (NULL);
 187     ctxt->nameNr--;
 188     if (ctxt->nameNr < 0)
 189         return (NULL);
 190     if (ctxt->nameNr > 0)
 191         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 192     else
 193         ctxt->name = NULL;
 194     ret = ctxt->nameTab[ctxt->nameNr];
 195     ctxt->nameTab[ctxt->nameNr] = NULL;
 196     return (ret);
 197 }
 198
 199 /**
 200  * htmlNodeInfoPush:
 201  * @ctxt:  an HTML parser context
 202  * @value:  the node info
 203  *
 204  * Pushes a new element name on top of the node info stack
 205  *
 206  * Returns 0 in case of error, the index in the stack otherwise
 207  */
 208 static int
 209 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 210 {
 211     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 212         if (ctxt->nodeInfoMax == 0)
 213                 ctxt->nodeInfoMax = 5;
 214         ctxt->nodeInfoMax *= 2;
 215         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 216                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 217                                     ctxt->nodeInfoMax *
 218                                     sizeof(ctxt->nodeInfoTab[0]));
 219         if (ctxt->nodeInfoTab == NULL) {
 220             htmlErrMemory(ctxt, NULL);
 221             return (0);
 222         }
 223     }
 224     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 225     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 226     return (ctxt->nodeInfoNr++);
 227 }
 228
 229 /**
 230  * htmlNodeInfoPop:
 231  * @ctxt:  an HTML parser context
 232  *
 233  * Pops the top element name from the node info stack
 234  *
 235  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 236  */
 237 static htmlParserNodeInfo *
 238 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 239 {
 240     if (ctxt->nodeInfoNr <= 0)
 241         return (NULL);
 242     ctxt->nodeInfoNr--;
 243     if (ctxt->nodeInfoNr < 0)
 244         return (NULL);
 245     if (ctxt->nodeInfoNr > 0)
 246         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 247     else
 248         ctxt->nodeInfo = NULL;
 249     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 250 }
 251
 252 /*
 253  * Macros for accessing the content. Those should be used only by the parser,
 254  * and not exported.
 255  *
 256  * Dirty macros, i.e. one need to make assumption on the context to use them
 257  *
 258  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 259  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 260  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 261  *           in UNICODE mode. This should be used internally by the parser
 262  *           only to compare to ASCII values otherwise it would break when
 263  *           running with UTF-8 encoding.
 264  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
 265  *           to compare on ASCII based substring.
 266  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 267  *           it should be used only to compare on ASCII based substring.
 268  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 269  *           strings without newlines within the parser.
 270  *
 271  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 272  *
 273  *   CURRENT Returns the current char value, with the full decoding of
 274  *           UTF-8 if we are using this mode. It returns an int.
 275  *   NEXT    Skip to the next character, this does the proper decoding
 276  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 277  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 278  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 279  */
 280
 281 #define UPPER (toupper(*ctxt->input->cur))
 282
 283 #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
 284
 285 #define NXT(val) ctxt->input->cur[(val)]
 286
 287 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 288
 289 #define CUR_PTR ctxt->input->cur
 290 #define BASE_PTR ctxt->input->base
 291
 292 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 293                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 294         xmlParserInputShrink(ctxt->input)
 295
 296 #define GROW if ((ctxt->progressive == 0) &&                            \
 297                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 298         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 299
 300 #define CURRENT ((int) (*ctxt->input->cur))
 301
 302 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 303
 304 /* Imported from XML */
 305
 306 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 307 #define CUR ((int) (*ctxt->input->cur))
 308 #define NEXT xmlNextChar(ctxt)
 309
 310 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 311
 312
 313 #define NEXTL(l) do {                                                   \
 314     if (*(ctxt->input->cur) == '\n') {                                  \
 315         ctxt->input->line++; ctxt->input->col = 1;                      \
 316     } else ctxt->input->col++;                                          \
 317     ctxt->token = 0; ctxt->input->cur += l;                             \
 318   } while (0)
 319
 320 /************
 321     \
 322     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 323     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 324  ************/
 325
 326 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 327 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 328
 329 #define COPY_BUF(l,b,i,v)                                               \
 330     if (l == 1) b[i++] = (xmlChar) v;                                   \
 331     else i += xmlCopyChar(l,&b[i],v)
 332
 333 /**
 334  * htmlFindEncoding:
 335  * @the HTML parser context
 336  *
 337  * Ty to find and encoding in the current data available in the input
 338  * buffer this is needed to try to switch to the proper encoding when
 339  * one face a character error.
 340  * That's an heuristic, since it's operating outside of parsing it could
 341  * try to use a meta which had been commented out, that's the reason it
 342  * should only be used in case of error, not as a default.
 343  *
 344  * Returns an encoding string or NULL if not found, the string need to
 345  *   be freed
 346  */
 347 static xmlChar *
 348 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 349     const xmlChar *start, *cur, *end;
 350
 351     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 352         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 353         (ctxt->input->buf->encoder != NULL))
 354         return(NULL);
 355     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 356         return(NULL);
 357
 358     start = ctxt->input->cur;
 359     end = ctxt->input->end;
 360     /* we also expect the input buffer to be zero terminated */
 361     if (*end != 0)
 362         return(NULL);
 363
 364     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 365     if (cur == NULL)
 366         return(NULL);
 367     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 368     if (cur == NULL)
 369         return(NULL);
 370     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 371     if (cur == NULL)
 372         return(NULL);
 373     cur += 8;
 374     start = cur;
 375     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 376            ((*cur >= 'a') && (*cur <= 'z')) ||
 377            ((*cur >= '0') && (*cur <= '9')) ||
 378            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 379            cur++;
 380     if (cur == start)
 381         return(NULL);
 382     return(xmlStrndup(start, cur - start));
 383 }
 384
 385 /**
 386  * htmlCurrentChar:
 387  * @ctxt:  the HTML parser context
 388  * @len:  pointer to the length of the char read
 389  *
 390  * The current char value, if using UTF-8 this may actually span multiple
 391  * bytes in the input buffer. Implement the end of line normalization:
 392  * 2.11 End-of-Line Handling
 393  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 394  * char, then the encoding converter is plugged in automatically.
 395  *
 396  * Returns the current char value and its length
 397  */
 398
 399 static int
 400 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 401     const unsigned char *cur;
 402     unsigned char c;
 403     unsigned int val;
 404
 405     if (ctxt->instate == XML_PARSER_EOF)
 406         return(0);
 407
 408     if (ctxt->token != 0) {
 409         *len = 0;
 410         return(ctxt->token);
 411     }
 412     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
 413         xmlChar * guess;
 414         xmlCharEncodingHandlerPtr handler;
 415
 416         /*
 417          * Assume it's a fixed length encoding (1) with
 418          * a compatible encoding for the ASCII set, since
 419          * HTML constructs only use < 128 chars
 420          */
 421         if ((int) *ctxt->input->cur < 0x80) {
 422             *len = 1;
 423             if ((*ctxt->input->cur == 0) &&
 424                 (ctxt->input->cur < ctxt->input->end)) {
 425                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 426                                 "Char 0x%X out of allowed range\n", 0);
 427                 return(' ');
 428             }
 429             return((int) *ctxt->input->cur);
 430         }
 431
 432         /*
 433          * Humm this is bad, do an automatic flow conversion
 434          */
 435         guess = htmlFindEncoding(ctxt);
 436         if (guess == NULL) {
 437             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 438         } else {
 439             if (ctxt->input->encoding != NULL)
 440                 xmlFree((xmlChar *) ctxt->input->encoding);
 441             ctxt->input->encoding = guess;
 442             handler = xmlFindCharEncodingHandler((const char *) guess);
 443             if (handler != NULL) {
 444                 /*
 445                  * Don't use UTF-8 encoder which isn't required and
 446                  * can produce invalid UTF-8.
 447                  */
 448                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
 449                     xmlSwitchToEncoding(ctxt, handler);
 450             } else {
 451                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 452                              "Unsupported encoding %s", guess, NULL);
 453             }
 454         }
 455         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 456     }
 457
 458     /*
 459      * We are supposed to handle UTF8, check it's valid
 460      * From rfc2044: encoding of the Unicode values on UTF-8:
 461      *
 462      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 463      * 0000 0000-0000 007F   0xxxxxxx
 464      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 465      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 466      *
 467      * Check for the 0x110000 limit too
 468      */
 469     cur = ctxt->input->cur;
 470     c = *cur;
 471     if (c & 0x80) {
 472         if ((c & 0x40) == 0)
 473             goto encoding_error;
 474         if (cur[1] == 0) {
 475             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 476             cur = ctxt->input->cur;
 477         }
 478         if ((cur[1] & 0xc0) != 0x80)
 479             goto encoding_error;
 480         if ((c & 0xe0) == 0xe0) {
 481
 482             if (cur[2] == 0) {
 483                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 484                 cur = ctxt->input->cur;
 485             }
 486             if ((cur[2] & 0xc0) != 0x80)
 487                 goto encoding_error;
 488             if ((c & 0xf0) == 0xf0) {
 489                 if (cur[3] == 0) {
 490                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 491                     cur = ctxt->input->cur;
 492                 }
 493                 if (((c & 0xf8) != 0xf0) ||
 494                     ((cur[3] & 0xc0) != 0x80))
 495                     goto encoding_error;
 496                 /* 4-byte code */
 497                 *len = 4;
 498                 val = (cur[0] & 0x7) << 18;
 499                 val |= (cur[1] & 0x3f) << 12;
 500                 val |= (cur[2] & 0x3f) << 6;
 501                 val |= cur[3] & 0x3f;
 502                 if (val < 0x10000)
 503                     goto encoding_error;
 504             } else {
 505               /* 3-byte code */
 506                 *len = 3;
 507                 val = (cur[0] & 0xf) << 12;
 508                 val |= (cur[1] & 0x3f) << 6;
 509                 val |= cur[2] & 0x3f;
 510                 if (val < 0x800)
 511                     goto encoding_error;
 512             }
 513         } else {
 514           /* 2-byte code */
 515             *len = 2;
 516             val = (cur[0] & 0x1f) << 6;
 517             val |= cur[1] & 0x3f;
 518             if (val < 0x80)
 519                 goto encoding_error;
 520         }
 521         if (!IS_CHAR(val)) {
 522             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 523                             "Char 0x%X out of allowed range\n", val);
 524         }
 525         return(val);
 526     } else {
 527         if ((*ctxt->input->cur == 0) &&
 528             (ctxt->input->cur < ctxt->input->end)) {
 529             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 530                             "Char 0x%X out of allowed range\n", 0);
 531             *len = 1;
 532             return(' ');
 533         }
 534         /* 1-byte code */
 535         *len = 1;
 536         return((int) *ctxt->input->cur);
 537     }
 538
 539 encoding_error:
 540     /*
 541      * If we detect an UTF8 error that probably mean that the
 542      * input encoding didn't get properly advertised in the
 543      * declaration header. Report the error and switch the encoding
 544      * to ISO-Latin-1 (if you don't like this policy, just declare the
 545      * encoding !)
 546      */
 547     {
 548         char buffer[150];
 549
 550         if (ctxt->input->end - ctxt->input->cur >= 4) {
 551             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 552                             ctxt->input->cur[0], ctxt->input->cur[1],
 553                             ctxt->input->cur[2], ctxt->input->cur[3]);
 554         } else {
 555             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 556         }
 557         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 558                      "Input is not proper UTF-8, indicate encoding !\n",
 559                      BAD_CAST buffer, NULL);
 560     }
 561
 562     /*
 563      * Don't switch encodings twice. Note that if there's an encoder, we
 564      * shouldn't receive invalid UTF-8 anyway.
 565      *
 566      * Note that if ctxt->input->buf == NULL, switching encodings is
 567      * impossible, see Gitlab issue #34.
 568      */
 569     if ((ctxt->input->buf != NULL) &&
 570         (ctxt->input->buf->encoder == NULL))
 571         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 572     *len = 1;
 573     return((int) *ctxt->input->cur);
 574 }
 575
 576 /**
 577  * htmlSkipBlankChars:
 578  * @ctxt:  the HTML parser context
 579  *
 580  * skip all blanks character found at that point in the input streams.
 581  *
 582  * Returns the number of space chars skipped
 583  */
 584
 585 static int
 586 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 587     int res = 0;
 588
 589     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 590         if ((*ctxt->input->cur == 0) &&
 591             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 592                 xmlPopInput(ctxt);
 593         } else {
 594             if (*(ctxt->input->cur) == '\n') {
 595                 ctxt->input->line++; ctxt->input->col = 1;
 596             } else ctxt->input->col++;
 597             ctxt->input->cur++;
 598             if (*ctxt->input->cur == 0)
 599                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 600         }
 601         if (res < INT_MAX)
 602             res++;
 603     }
 604     return(res);
 605 }
 606
 607
 608
 609 /************************************************************************
 610  *                                                                      *
 611  *      The list of HTML elements and their properties          *
 612  *                                                                      *
 613  ************************************************************************/
 614
 615 /*
 616  *  Start Tag: 1 means the start tag can be omitted
 617  *  End Tag:   1 means the end tag can be omitted
 618  *             2 means it's forbidden (empty elements)
 619  *             3 means the tag is stylistic and should be closed easily
 620  *  Depr:      this element is deprecated
 621  *  DTD:       1 means that this element is valid only in the Loose DTD
 622  *             2 means that this element is valid only in the Frameset DTD
 623  *
 624  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 625         , subElements , impliedsubelt , Attributes, userdata
 626  */
 627
 628 /* Definitions and a couple of vars for HTML Elements */
 629
 630 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 631 #define NB_FONTSTYLE 8
 632 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 633 #define NB_PHRASE 10
 634 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 635 #define NB_SPECIAL 16
 636 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 637 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 638 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 639 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 640 #define FORMCTRL "input", "select", "textarea", "label", "button"
 641 #define NB_FORMCTRL 5
 642 #define PCDATA
 643 #define NB_PCDATA 0
 644 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 645 #define NB_HEADING 6
 646 #define LIST "ul", "ol", "dir", "menu"
 647 #define NB_LIST 4
 648 #define MODIFIER
 649 #define NB_MODIFIER 0
 650 #define FLOW BLOCK,INLINE
 651 #define NB_FLOW NB_BLOCK + NB_INLINE
 652 #define EMPTY NULL
 653
 654
 655 static const char* const html_flow[] = { FLOW, NULL } ;
 656 static const char* const html_inline[] = { INLINE, NULL } ;
 657
 658 /* placeholders: elts with content but no subelements */
 659 static const char* const html_pcdata[] = { NULL } ;
 660 #define html_cdata html_pcdata
 661
 662
 663 /* ... and for HTML Attributes */
 664
 665 #define COREATTRS "id", "class", "style", "title"
 666 #define NB_COREATTRS 4
 667 #define I18N "lang", "dir"
 668 #define NB_I18N 2
 669 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 670 #define NB_EVENTS 9
 671 #define ATTRS COREATTRS,I18N,EVENTS
 672 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 673 #define CELLHALIGN "align", "char", "charoff"
 674 #define NB_CELLHALIGN 3
 675 #define CELLVALIGN "valign"
 676 #define NB_CELLVALIGN 1
 677
 678 static const char* const html_attrs[] = { ATTRS, NULL } ;
 679 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 680 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 681 static const char* const i18n_attrs[] = { I18N, NULL } ;
 682
 683
 684 /* Other declarations that should go inline ... */
 685 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 686         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 687         "tabindex", "onfocus", "onblur", NULL } ;
 688 static const char* const target_attr[] = { "target", NULL } ;
 689 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 690 static const char* const alt_attr[] = { "alt", NULL } ;
 691 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 692 static const char* const href_attrs[] = { "href", NULL } ;
 693 static const char* const clear_attrs[] = { "clear", NULL } ;
 694 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 695
 696 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 697 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 698                 "archive", "alt", "name", "height", "width", "align",
 699                 "hspace", "vspace", NULL } ;
 700 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 701         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 702 static const char* const basefont_attrs[] =
 703         { "id", "size", "color", "face", NULL } ;
 704 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 705 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 706 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 707 static const char* const body_depr[] = { "background", "bgcolor", "text",
 708         "link", "vlink", "alink", NULL } ;
 709 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 710         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 711
 712
 713 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 714 static const char* const col_elt[] = { "col", NULL } ;
 715 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 716 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 717 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 718 static const char* const compact_attr[] = { "compact", NULL } ;
 719 static const char* const label_attr[] = { "label", NULL } ;
 720 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 721 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 722 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 723 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 724 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 725 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 726 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 727 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 728 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 729 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 730 static const char* const version_attr[] = { "version", NULL } ;
 731 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 732 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 733 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 734 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
 735 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
 736 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
 737 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
 738 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
 739 static const char* const align_attr[] = { "align", NULL } ;
 740 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
 741 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 742 static const char* const name_attr[] = { "name", NULL } ;
 743 static const char* const action_attr[] = { "action", NULL } ;
 744 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
 745 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
 746 static const char* const content_attr[] = { "content", NULL } ;
 747 static const char* const type_attr[] = { "type", NULL } ;
 748 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
 749 static const char* const object_contents[] = { FLOW, "param", NULL } ;
 750 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
 751 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
 752 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
 753 static const char* const option_elt[] = { "option", NULL } ;
 754 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
 755 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
 756 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
 757 static const char* const width_attr[] = { "width", NULL } ;
 758 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
 759 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
 760 static const char* const language_attr[] = { "language", NULL } ;
 761 static const char* const select_content[] = { "optgroup", "option", NULL } ;
 762 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
 763 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
 764 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
 765 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
 766 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 767 static const char* const tr_elt[] = { "tr", NULL } ;
 768 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 769 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 770 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 771 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 772 static const char* const tr_contents[] = { "th", "td", NULL } ;
 773 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 774 static const char* const li_elt[] = { "li", NULL } ;
 775 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 776 static const char* const dir_attr[] = { "dir", NULL} ;
 777
     /*
      * Every table above is a NULL-terminated list of element or attribute
      * names. The unquoted upper-case names (ATTRS, COREATTRS, I18N, BLOCK,
      * FLOW, PHRASE, MODIFIER, CELLHALIGN, CELLVALIGN) are defined outside
      * this part of the file -- presumably macros expanding to further
      * comma-separated string literals; confirm at their definition.
      *
      * DECL casts one of these "const char* const []" tables to
      * "const char**" for use in the html40ElementTable initializers below.
      */
 778 #define DECL (const char**)
 779
 780 static const htmlElemDesc
 781 html40ElementTable[] = {
     /*
      * One descriptor per known HTML element. htmlTagLookup() searches this
      * table with bsearch() and a case-insensitive comparison on the name
      * (see htmlCompareTags()), so the rows must stay sorted by element
      * name.
      *
      * NOTE(review): the initializers are positional and follow the field
      * order of htmlElemDesc, which is declared in <libxml/HTMLparser.h>
      * and not visible here -- presumably name, startTag, endTag,
      * saveEndTag, empty, depr, dtd, isinline, desc, subelts,
      * defaultsubelt, attrs_opt, attrs_depr, attrs_req. Confirm against
      * the header before editing a row.
      */
 782 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 783         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 784 },
 785 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 786         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 787 },
 788 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 789         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 790 },
 791 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 792         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 793 },
 794 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 795         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 796 },
 797 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 798         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 799 },
 800 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 801         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 802 },
 803 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 804         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 805 },
 806 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 807         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 808 },
 809 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 810         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 811 },
 812 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 813         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 814 },
 815 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 816         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 817 },
 818 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 819         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 820 },
 821 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 822         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 823 },
 824 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 825         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 826 },
 827 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 828         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 829 },
 830 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 831         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 832 },
 833 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 834         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 835 },
 836 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 837         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 838 },
 839 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 840         EMPTY , NULL , DECL col_attrs , NULL, NULL
 841 },
 842 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 843         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 844 },
 845 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 846         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 847 },
 848 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 849         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 850 },
 851 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 852         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 853 },
 854 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 855         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 856 },
 857 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 858         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 859 },
 860 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 861         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
 862 },
 863 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 864         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 865 },
 866 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 867         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 868 },
 869 { "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
 870         EMPTY, NULL, DECL embed_attrs, NULL, NULL
 871 },
 872 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 873         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 874 },
 875 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 876         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 877 },
 878 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 879         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 880 },
 881 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 882         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 883 },
 884 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 885         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 886 },
 887 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 888         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 889 },
 890 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 891         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 892 },
 893 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 894         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 895 },
 896 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 897         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 898 },
 899 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 900         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 901 },
 902 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 903         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 904 },
 905 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 906         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 907 },
 908 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 909         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 910 },
 911 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 912         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 913 },
 914 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 915         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 916 },
 917 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 918         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 919 },
 920 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 921         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
 922 },
 923 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 924         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 925 },
 926 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 927         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 928 },
 929 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 930         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 931 },
 932 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 933         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 934 },
 935 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 936         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 937 },
 938 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 939         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 940 },
 941 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 942         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 943 },
 944 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 945         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 946 },
 947 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 948         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
 949 },
 950 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 951         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 952 },
 953 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 954         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 955 },
 956 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 957         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 958 },
 959 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 960         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 961 },
 962 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 963         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 964 },
 965 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 966         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 967 },
 968 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 969         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 970 },
 971 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 972         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 973 },
 974 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 975         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 976 },
 977 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 978         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
 979 },
 980 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 981         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 982 },
 983 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
 984         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
 985 },
 986 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
 987         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 988 },
 989 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
 990         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 991 },
 992 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
 993         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
 994 },
 995 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
 996         DECL select_content, NULL, DECL select_attrs, NULL, NULL
 997 },
 998 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
 999         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1002         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
1005         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1006 },
1007 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1008         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1009 },
1010 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
1011         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1012 },
1013 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
1014         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
1017         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "table",      0, 0, 0, 0, 0, 0, 0, "",
1020         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1021 },
1022 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
1023         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
1026         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1027 },
1028 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1029         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1030 },
1031 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
1032         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1033 },
1034 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
1035         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1036 },
1037 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
1038         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
1041         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1042 },
1043 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
1044         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1045 },
1046 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1047         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1048 },
1049 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
1050         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1051 },
1052 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
1053         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1054 },
1055 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1056         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1057 }
1058 };
1059
/*
 * One (open element, start tag) pair of the htmlStartClose table: seeing
 * the start tag newTag implies the end of an open oldTag element.
 */
typedef struct {
    const char *oldTag;     /* name of the element that gets closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;
1064
1065 /*
1066  * start tags that imply the end of current element
      *
      * Each entry is { oldTag, newTag }: a start tag named newTag closes an
      * open element named oldTag. htmlCheckAutoClose() looks pairs up with
      * bsearch() and htmlCompareStartClose(), which orders entries by
      * strcmp() on oldTag and then on newTag, so the table must be kept
      * sorted in exactly that order when entries are added.
1067  */
1068 static const htmlStartCloseEntry htmlStartClose[] = {
1069     { "a", "a" },
1070     { "a", "fieldset" },
1071     { "a", "table" },
1072     { "a", "td" },
1073     { "a", "th" },
1074     { "address", "dd" },
1075     { "address", "dl" },
1076     { "address", "dt" },
1077     { "address", "form" },
1078     { "address", "li" },
1079     { "address", "ul" },
1080     { "b", "center" },
1081     { "b", "p" },
1082     { "b", "td" },
1083     { "b", "th" },
1084     { "big", "p" },
1085     { "caption", "col" },
1086     { "caption", "colgroup" },
1087     { "caption", "tbody" },
1088     { "caption", "tfoot" },
1089     { "caption", "thead" },
1090     { "caption", "tr" },
1091     { "col", "col" },
1092     { "col", "colgroup" },
1093     { "col", "tbody" },
1094     { "col", "tfoot" },
1095     { "col", "thead" },
1096     { "col", "tr" },
1097     { "colgroup", "colgroup" },
1098     { "colgroup", "tbody" },
1099     { "colgroup", "tfoot" },
1100     { "colgroup", "thead" },
1101     { "colgroup", "tr" },
1102     { "dd", "dt" },
1103     { "dir", "dd" },
1104     { "dir", "dl" },
1105     { "dir", "dt" },
1106     { "dir", "form" },
1107     { "dir", "ul" },
1108     { "dl", "form" },
1109     { "dl", "li" },
1110     { "dt", "dd" },
1111     { "dt", "dl" },
1112     { "font", "center" },
1113     { "font", "td" },
1114     { "font", "th" },
1115     { "form", "form" },
1116     { "h1", "fieldset" },
1117     { "h1", "form" },
1118     { "h1", "li" },
1119     { "h1", "p" },
1120     { "h1", "table" },
1121     { "h2", "fieldset" },
1122     { "h2", "form" },
1123     { "h2", "li" },
1124     { "h2", "p" },
1125     { "h2", "table" },
1126     { "h3", "fieldset" },
1127     { "h3", "form" },
1128     { "h3", "li" },
1129     { "h3", "p" },
1130     { "h3", "table" },
1131     { "h4", "fieldset" },
1132     { "h4", "form" },
1133     { "h4", "li" },
1134     { "h4", "p" },
1135     { "h4", "table" },
1136     { "h5", "fieldset" },
1137     { "h5", "form" },
1138     { "h5", "li" },
1139     { "h5", "p" },
1140     { "h5", "table" },
1141     { "h6", "fieldset" },
1142     { "h6", "form" },
1143     { "h6", "li" },
1144     { "h6", "p" },
1145     { "h6", "table" },
1146     { "head", "a" },
1147     { "head", "abbr" },
1148     { "head", "acronym" },
1149     { "head", "address" },
1150     { "head", "b" },
1151     { "head", "bdo" },
1152     { "head", "big" },
1153     { "head", "blockquote" },
1154     { "head", "body" },
1155     { "head", "br" },
1156     { "head", "center" },
1157     { "head", "cite" },
1158     { "head", "code" },
1159     { "head", "dd" },
1160     { "head", "dfn" },
1161     { "head", "dir" },
1162     { "head", "div" },
1163     { "head", "dl" },
1164     { "head", "dt" },
1165     { "head", "em" },
1166     { "head", "fieldset" },
1167     { "head", "font" },
1168     { "head", "form" },
1169     { "head", "frameset" },
1170     { "head", "h1" },
1171     { "head", "h2" },
1172     { "head", "h3" },
1173     { "head", "h4" },
1174     { "head", "h5" },
1175     { "head", "h6" },
1176     { "head", "hr" },
1177     { "head", "i" },
1178     { "head", "iframe" },
1179     { "head", "img" },
1180     { "head", "kbd" },
1181     { "head", "li" },
1182     { "head", "listing" },
1183     { "head", "map" },
1184     { "head", "menu" },
1185     { "head", "ol" },
1186     { "head", "p" },
1187     { "head", "pre" },
1188     { "head", "q" },
1189     { "head", "s" },
1190     { "head", "samp" },
1191     { "head", "small" },
1192     { "head", "span" },
1193     { "head", "strike" },
1194     { "head", "strong" },
1195     { "head", "sub" },
1196     { "head", "sup" },
1197     { "head", "table" },
1198     { "head", "tt" },
1199     { "head", "u" },
1200     { "head", "ul" },
1201     { "head", "var" },
1202     { "head", "xmp" },
1203     { "hr", "form" },
1204     { "i", "center" },
1205     { "i", "p" },
1206     { "i", "td" },
1207     { "i", "th" },
1208     { "legend", "fieldset" },
1209     { "li", "li" },
1210     { "link", "body" },
1211     { "link", "frameset" },
1212     { "listing", "dd" },
1213     { "listing", "dl" },
1214     { "listing", "dt" },
1215     { "listing", "fieldset" },
1216     { "listing", "form" },
1217     { "listing", "li" },
1218     { "listing", "table" },
1219     { "listing", "ul" },
1220     { "menu", "dd" },
1221     { "menu", "dl" },
1222     { "menu", "dt" },
1223     { "menu", "form" },
1224     { "menu", "ul" },
1225     { "ol", "form" },
1226     { "ol", "ul" },
1227     { "option", "optgroup" },
1228     { "option", "option" },
1229     { "p", "address" },
1230     { "p", "blockquote" },
1231     { "p", "body" },
1232     { "p", "caption" },
1233     { "p", "center" },
1234     { "p", "col" },
1235     { "p", "colgroup" },
1236     { "p", "dd" },
1237     { "p", "dir" },
1238     { "p", "div" },
1239     { "p", "dl" },
1240     { "p", "dt" },
1241     { "p", "fieldset" },
1242     { "p", "form" },
1243     { "p", "frameset" },
1244     { "p", "h1" },
1245     { "p", "h2" },
1246     { "p", "h3" },
1247     { "p", "h4" },
1248     { "p", "h5" },
1249     { "p", "h6" },
1250     { "p", "head" },
1251     { "p", "hr" },
1252     { "p", "li" },
1253     { "p", "listing" },
1254     { "p", "menu" },
1255     { "p", "ol" },
1256     { "p", "p" },
1257     { "p", "pre" },
1258     { "p", "table" },
1259     { "p", "tbody" },
1260     { "p", "td" },
1261     { "p", "tfoot" },
1262     { "p", "th" },
1263     { "p", "title" },
1264     { "p", "tr" },
1265     { "p", "ul" },
1266     { "p", "xmp" },
1267     { "pre", "dd" },
1268     { "pre", "dl" },
1269     { "pre", "dt" },
1270     { "pre", "fieldset" },
1271     { "pre", "form" },
1272     { "pre", "li" },
1273     { "pre", "table" },
1274     { "pre", "ul" },
1275     { "s", "p" },
1276     { "script", "noscript" },
1277     { "small", "p" },
1278     { "span", "td" },
1279     { "span", "th" },
1280     { "strike", "p" },
1281     { "style", "body" },
1282     { "style", "frameset" },
1283     { "tbody", "tbody" },
1284     { "tbody", "tfoot" },
1285     { "td", "tbody" },
1286     { "td", "td" },
1287     { "td", "tfoot" },
1288     { "td", "th" },
1289     { "td", "tr" },
1290     { "tfoot", "tbody" },
1291     { "th", "tbody" },
1292     { "th", "td" },
1293     { "th", "tfoot" },
1294     { "th", "th" },
1295     { "th", "tr" },
1296     { "thead", "tbody" },
1297     { "thead", "tfoot" },
1298     { "title", "body" },
1299     { "title", "frameset" },
1300     { "tr", "tbody" },
1301     { "tr", "tfoot" },
1302     { "tr", "tr" },
1303     { "tt", "p" },
1304     { "u", "p" },
1305     { "u", "td" },
1306     { "u", "th" },
1307     { "ul", "address" },
1308     { "ul", "form" },
1309     { "ul", "menu" },
1310     { "ul", "ol" },
1311     { "ul", "pre" },
1312     { "xmp", "dd" },
1313     { "xmp", "dl" },
1314     { "xmp", "dt" },
1315     { "xmp", "fieldset" },
1316     { "xmp", "form" },
1317     { "xmp", "li" },
1318     { "xmp", "table" },
1319     { "xmp", "ul" }
1320 };
1321
/*
 * NULL-terminated list of the HTML elements that are not supposed to
 * hold CDATA content directly and for which a p element gets implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1334
/*
 * Names of the HTML attributes whose content is of type %Script;.
 * The array has no NULL terminator; its length comes from sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1360
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Every element gets a priority; when an unexpected end tag shows up it
 * may only close open elements whose priority is lower than or equal to
 * its own.
 */

typedef struct {
    const char *name;       /* element name, NULL in the final default entry */
    int priority;           /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }        /* Default priority, must remain the last entry */
};
1388
1389 /************************************************************************
1390  *                                                                      *
1391  *      functions to handle HTML specific data                  *
1392  *                                                                      *
1393  ************************************************************************/
1394
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function will be made private. Call xmlInitParser to
 * initialize the library.
 *
 * Kept only so that existing callers still link; it does nothing.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1406
1407 static int
1408 htmlCompareTags(const void *key, const void *member) {
1409     const xmlChar *tag = (const xmlChar *) key;
1410     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1411
1412     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1413 }
1414
1415 /**
1416  * htmlTagLookup:
1417  * @tag:  The tag name in lowercase
1418  *
1419  * Lookup the HTML tag in the ElementTable
1420  *
1421  * Returns the related htmlElemDescPtr or NULL if not found.
1422  */
1423 const htmlElemDesc *
1424 htmlTagLookup(const xmlChar *tag) {
1425     if (tag == NULL)
1426         return(NULL);
1427
1428     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1429                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1430                 sizeof(htmlElemDesc), htmlCompareTags));
1431 }
1432
1433 /**
1434  * htmlGetEndPriority:
1435  * @name: The name of the element to look up the priority for.
1436  *
1437  * Return value: The "endtag" priority.
1438  **/
1439 static int
1440 htmlGetEndPriority (const xmlChar *name) {
1441     int i = 0;
1442
1443     while ((htmlEndPriority[i].name != NULL) &&
1444            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1445         i++;
1446
1447     return(htmlEndPriority[i].priority);
1448 }
1449
1450
1451 static int
1452 htmlCompareStartClose(const void *vkey, const void *member) {
1453     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1454     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1455     int ret;
1456
1457     ret = strcmp(key->oldTag, entry->oldTag);
1458     if (ret == 0)
1459         ret = strcmp(key->newTag, entry->newTag);
1460
1461     return(ret);
1462 }
1463
1464 /**
1465  * htmlCheckAutoClose:
1466  * @newtag:  The new tag name
1467  * @oldtag:  The old tag name
1468  *
1469  * Checks whether the new tag is one of the registered valid tags for
1470  * closing old.
1471  *
1472  * Returns 0 if no, 1 if yes.
1473  */
1474 static int
1475 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1476 {
1477     htmlStartCloseEntry key;
1478     void *res;
1479
1480     key.oldTag = (const char *) oldtag;
1481     key.newTag = (const char *) newtag;
1482     res = bsearch(&key, htmlStartClose,
1483             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1484             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1485     return(res != NULL);
1486 }
1487
1488 /**
1489  * htmlAutoCloseOnClose:
1490  * @ctxt:  an HTML parser context
1491  * @newtag:  The new tag name
1492  * @force:  force the tag closure
1493  *
1494  * The HTML DTD allows an ending tag to implicitly close other tags.
1495  */
1496 static void
1497 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1498 {
1499     const htmlElemDesc *info;
1500     int i, priority;
1501
1502     priority = htmlGetEndPriority(newtag);
1503
1504     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1505
1506         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1507             break;
1508         /*
1509          * A misplaced endtag can only close elements with lower
1510          * or equal priority, so if we find an element with higher
1511          * priority before we find an element with
1512          * matching name, we just ignore this endtag
1513          */
1514         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1515             return;
1516     }
1517     if (i < 0)
1518         return;
1519
1520     while (!xmlStrEqual(newtag, ctxt->name)) {
1521         info = htmlTagLookup(ctxt->name);
1522         if ((info != NULL) && (info->endTag == 3)) {
1523             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1524                          "Opening and ending tag mismatch: %s and %s\n",
1525                          newtag, ctxt->name);
1526         }
1527         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1528             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1529         htmlnamePop(ctxt);
1530     }
1531 }
1532
1533 /**
1534  * htmlAutoCloseOnEnd:
1535  * @ctxt:  an HTML parser context
1536  *
1537  * Close all remaining tags at the end of the stream
1538  */
1539 static void
1540 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1541 {
1542     int i;
1543
1544     if (ctxt->nameNr == 0)
1545         return;
1546     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1547         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1548             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1549         htmlnamePop(ctxt);
1550     }
1551 }
1552
1553 /**
1554  * htmlAutoClose:
1555  * @ctxt:  an HTML parser context
1556  * @newtag:  The new tag name or NULL
1557  *
1558  * The HTML DTD allows a tag to implicitly close other tags.
1559  * The list is kept in htmlStartClose array. This function is
1560  * called when a new tag has been detected and generates the
1561  * appropriates closes if possible/needed.
1562  * If newtag is NULL this mean we are at the end of the resource
1563  * and we should check
1564  */
1565 static void
1566 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1567 {
1568     while ((newtag != NULL) && (ctxt->name != NULL) &&
1569            (htmlCheckAutoClose(newtag, ctxt->name))) {
1570         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1571             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1572         htmlnamePop(ctxt);
1573     }
1574     if (newtag == NULL) {
1575         htmlAutoCloseOnEnd(ctxt);
1576         return;
1577     }
1578     while ((newtag == NULL) && (ctxt->name != NULL) &&
1579            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1580             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1581             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584         htmlnamePop(ctxt);
1585     }
1586 }
1587
1588 /**
1589  * htmlAutoCloseTag:
1590  * @doc:  the HTML document
1591  * @name:  The tag name
1592  * @elem:  the HTML element
1593  *
1594  * The HTML DTD allows a tag to implicitly close other tags.
1595  * The list is kept in htmlStartClose array. This function checks
1596  * if the element or one of it's children would autoclose the
1597  * given tag.
1598  *
1599  * Returns 1 if autoclose, 0 otherwise
1600  */
1601 int
1602 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1603     htmlNodePtr child;
1604
1605     if (elem == NULL) return(1);
1606     if (xmlStrEqual(name, elem->name)) return(0);
1607     if (htmlCheckAutoClose(elem->name, name)) return(1);
1608     child = elem->children;
1609     while (child != NULL) {
1610         if (htmlAutoCloseTag(doc, name, child)) return(1);
1611         child = child->next;
1612     }
1613     return(0);
1614 }
1615
1616 /**
1617  * htmlIsAutoClosed:
1618  * @doc:  the HTML document
1619  * @elem:  the HTML element
1620  *
1621  * The HTML DTD allows a tag to implicitly close other tags.
1622  * The list is kept in htmlStartClose array. This function checks
1623  * if a tag is autoclosed by one of it's child
1624  *
1625  * Returns 1 if autoclosed, 0 otherwise
1626  */
1627 int
1628 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1629     htmlNodePtr child;
1630
1631     if (elem == NULL) return(1);
1632     child = elem->children;
1633     while (child != NULL) {
1634         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1635         child = child->next;
1636     }
1637     return(0);
1638 }
1639
1640 /**
1641  * htmlCheckImplied:
1642  * @ctxt:  an HTML parser context
1643  * @newtag:  The new tag name
1644  *
1645  * The HTML DTD allows a tag to exists only implicitly
1646  * called when a new tag has been detected and generates the
1647  * appropriates implicit tags if missing
1648  */
1649 static void
1650 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1651     int i;
1652
1653     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1654         return;
1655     if (!htmlOmittedDefaultValue)
1656         return;
1657     if (xmlStrEqual(newtag, BAD_CAST"html"))
1658         return;
1659     if (ctxt->nameNr <= 0) {
1660         htmlnamePush(ctxt, BAD_CAST"html");
1661         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1662             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1663     }
1664     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1665         return;
1666     if ((ctxt->nameNr <= 1) &&
1667         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1668          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1669          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1670          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1671          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1672          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1673         if (ctxt->html >= 3) {
1674             /* we already saw or generated an <head> before */
1675             return;
1676         }
1677         /*
1678          * dropped OBJECT ... i you put it first BODY will be
1679          * assumed !
1680          */
1681         htmlnamePush(ctxt, BAD_CAST"head");
1682         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1683             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1684     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1685                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1686                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1687         if (ctxt->html >= 10) {
1688             /* we already saw or generated a <body> before */
1689             return;
1690         }
1691         for (i = 0;i < ctxt->nameNr;i++) {
1692             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1693                 return;
1694             }
1695             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1696                 return;
1697             }
1698         }
1699
1700         htmlnamePush(ctxt, BAD_CAST"body");
1701         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1702             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1703     }
1704 }
1705
1706 /**
1707  * htmlCheckParagraph
1708  * @ctxt:  an HTML parser context
1709  *
1710  * Check whether a p element need to be implied before inserting
1711  * characters in the current element.
1712  *
1713  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1714  *         in case of error.
1715  */
1716
1717 static int
1718 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1719     const xmlChar *tag;
1720     int i;
1721
1722     if (ctxt == NULL)
1723         return(-1);
1724     tag = ctxt->name;
1725     if (tag == NULL) {
1726         htmlAutoClose(ctxt, BAD_CAST"p");
1727         htmlCheckImplied(ctxt, BAD_CAST"p");
1728         htmlnamePush(ctxt, BAD_CAST"p");
1729         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1730             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1731         return(1);
1732     }
1733     if (!htmlOmittedDefaultValue)
1734         return(0);
1735     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1736         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1737             htmlAutoClose(ctxt, BAD_CAST"p");
1738             htmlCheckImplied(ctxt, BAD_CAST"p");
1739             htmlnamePush(ctxt, BAD_CAST"p");
1740             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1741                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1742             return(1);
1743         }
1744     }
1745     return(0);
1746 }
1747
1748 /**
1749  * htmlIsScriptAttribute:
1750  * @name:  an attribute name
1751  *
1752  * Check if an attribute is of content type Script
1753  *
1754  * Returns 1 is the attribute is a script 0 otherwise
1755  */
1756 int
1757 htmlIsScriptAttribute(const xmlChar *name) {
1758     unsigned int i;
1759
1760     if (name == NULL)
1761       return(0);
1762     /*
1763      * all script attributes start with 'on'
1764      */
1765     if ((name[0] != 'o') || (name[1] != 'n'))
1766       return(0);
1767     for (i = 0;
1768          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1769          i++) {
1770         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1771             return(1);
1772     }
1773     return(0);
1774 }
1775
1776 /************************************************************************
1777  *                                                                      *
1778  *      The list of HTML predefined entities                    *
1779  *                                                                      *
1780  ************************************************************************/
1781
1782
/*
 * The predefined HTML 4.0 character entities, one
 * { Unicode value, entity name, description string } initializer each.
 *
 * The entries are listed in strictly ascending order of their value and
 * htmlEntityValueLookup() depends on that ordering: any entry added
 * here must be inserted at its sorted position.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * The 4 basic ones (quot, amp, lt, gt), plus apostrophe.
 */
{ 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38,   "amp",  "ampersand, U+0026 ISOnum" },
{ 39,   "apos", "single quote" },
{ 60,   "lt",   "less-than sign, U+003C ISOnum" },
{ 62,   "gt",   "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 160-255 range.
 * Whether to replace them really depends on the charset used.
 */
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,  "not",  "not sign, U+00AC ISOnum" },
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,  "divide","division sign, U+00F7 ISOnum" },
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,  "tilde","small tilde, U+02DC ISOdia" },

{ 913,  "Alpha","greek capital letter alpha, U+0391" },
{ 914,  "Beta", "greek capital letter beta, U+0392" },
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,  "Iota", "greek capital letter iota, U+0399" },
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,  "Mu",   "greek capital letter mu, U+039C" },
{ 925,  "Nu",   "greek capital letter nu, U+039D" },
{ 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
{ 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
{ 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
{ 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
{ 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni",   "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang",  "angle, U+2220 ISOamso" },
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or",   "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
{ 8747, "int",  "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne",   "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
2065
2066 /************************************************************************
2067  *                                                                      *
2068  *              Commodity functions to handle entities                  *
2069  *                                                                      *
2070  ************************************************************************/
2071
/*
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> to that many xmlChar.
 *
 * The expansion site must provide a variable named ctxt (handed to
 * htmlErrMemory()) and a size variable named <buffer>_size, and must be
 * located in a function returning a pointer: when the reallocation
 * fails the macro reports the error, frees the old buffer and executes
 * return(NULL) in the enclosing function.
 *
 * NOTE(review): the expansion is a bare { } block, not do { } while (0),
 * so "growBuffer(x);" cannot be the unbraced body of an if that is
 * followed by an else.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                          \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
2086
2087 /**
2088  * htmlEntityLookup:
2089  * @name: the entity name
2090  *
2091  * Lookup the given entity in EntitiesTable
2092  *
2093  * TODO: the linear scan is really ugly, an hash table is really needed.
2094  *
2095  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2096  */
2097 const htmlEntityDesc *
2098 htmlEntityLookup(const xmlChar *name) {
2099     unsigned int i;
2100
2101     for (i = 0;i < (sizeof(html40EntitiesTable)/
2102                     sizeof(html40EntitiesTable[0]));i++) {
2103         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2104             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2105         }
2106     }
2107     return(NULL);
2108 }
2109
2110 /**
2111  * htmlEntityValueLookup:
2112  * @value: the entity's unicode value
2113  *
2114  * Lookup the given entity in EntitiesTable
2115  *
2116  * TODO: the linear scan is really ugly, an hash table is really needed.
2117  *
2118  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2119  */
2120 const htmlEntityDesc *
2121 htmlEntityValueLookup(unsigned int value) {
2122     unsigned int i;
2123
2124     for (i = 0;i < (sizeof(html40EntitiesTable)/
2125                     sizeof(html40EntitiesTable[0]));i++) {
2126         if (html40EntitiesTable[i].value >= value) {
2127             if (html40EntitiesTable[i].value > value)
2128                 break;
2129             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2130         }
2131     }
2132     return(NULL);
2133 }
2134
2135 /**
2136  * UTF8ToHtml:
2137  * @out:  a pointer to an array of bytes to store the result
2138  * @outlen:  the length of @out
2139  * @in:  a pointer to an array of UTF-8 chars
2140  * @inlen:  the length of @in
2141  *
2142  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2143  * plus HTML entities block of chars out.
2144  *
2145  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2146  * The value of @inlen after return is the number of octets consumed
2147  *     as the return value is positive, else unpredictable.
2148  * The value of @outlen after return is the number of octets consumed.
2149  */
2150 int
2151 UTF8ToHtml(unsigned char* out, int *outlen,
2152               const unsigned char* in, int *inlen) {
2153     const unsigned char* processed = in;
2154     const unsigned char* outend;
2155     const unsigned char* outstart = out;
2156     const unsigned char* instart = in;
2157     const unsigned char* inend;
2158     unsigned int c, d;
2159     int trailing;
2160
2161     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2162     if (in == NULL) {
2163         /*
2164          * initialization nothing to do
2165          */
2166         *outlen = 0;
2167         *inlen = 0;
2168         return(0);
2169     }
2170     inend = in + (*inlen);
2171     outend = out + (*outlen);
2172     while (in < inend) {
2173         d = *in++;
2174         if      (d < 0x80)  { c= d; trailing= 0; }
2175         else if (d < 0xC0) {
2176             /* trailing byte in leading position */
2177             *outlen = out - outstart;
2178             *inlen = processed - instart;
2179             return(-2);
2180         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2181         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2182         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2183         else {
2184             /* no chance for this in Ascii */
2185             *outlen = out - outstart;
2186             *inlen = processed - instart;
2187             return(-2);
2188         }
2189
2190         if (inend - in < trailing) {
2191             break;
2192         }
2193
2194         for ( ; trailing; trailing--) {
2195             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2196                 break;
2197             c <<= 6;
2198             c |= d & 0x3F;
2199         }
2200
2201         /* assertion: c is a single UTF-4 value */
2202         if (c < 0x80) {
2203             if (out + 1 >= outend)
2204                 break;
2205             *out++ = c;
2206         } else {
2207             int len;
2208             const htmlEntityDesc * ent;
2209             const char *cp;
2210             char nbuf[16];
2211
2212             /*
2213              * Try to lookup a predefined HTML entity for it
2214              */
2215
2216             ent = htmlEntityValueLookup(c);
2217             if (ent == NULL) {
2218               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2219               cp = nbuf;
2220             }
2221             else
2222               cp = ent->name;
2223             len = strlen(cp);
2224             if (out + 2 + len >= outend)
2225                 break;
2226             *out++ = '&';
2227             memcpy(out, cp, len);
2228             out += len;
2229             *out++ = ';';
2230         }
2231         processed = in;
2232     }
2233     *outlen = out - outstart;
2234     *inlen = processed - instart;
2235     return(0);
2236 }
2237
2238 /**
2239  * htmlEncodeEntities:
2240  * @out:  a pointer to an array of bytes to store the result
2241  * @outlen:  the length of @out
2242  * @in:  a pointer to an array of UTF-8 chars
2243  * @inlen:  the length of @in
2244  * @quoteChar: the quote character to escape (' or ") or zero.
2245  *
2246  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2247  * plus HTML entities block of chars out.
2248  *
2249  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2250  * The value of @inlen after return is the number of octets consumed
2251  *     as the return value is positive, else unpredictable.
2252  * The value of @outlen after return is the number of octets consumed.
2253  */
2254 int
2255 htmlEncodeEntities(unsigned char* out, int *outlen,
2256                    const unsigned char* in, int *inlen, int quoteChar) {
2257     const unsigned char* processed = in;
2258     const unsigned char* outend;
2259     const unsigned char* outstart = out;
2260     const unsigned char* instart = in;
2261     const unsigned char* inend;
2262     unsigned int c, d;
2263     int trailing;
2264
2265     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2266         return(-1);
2267     outend = out + (*outlen);
2268     inend = in + (*inlen);
2269     while (in < inend) {
2270         d = *in++;
2271         if      (d < 0x80)  { c= d; trailing= 0; }
2272         else if (d < 0xC0) {
2273             /* trailing byte in leading position */
2274             *outlen = out - outstart;
2275             *inlen = processed - instart;
2276             return(-2);
2277         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2278         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2279         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2280         else {
2281             /* no chance for this in Ascii */
2282             *outlen = out - outstart;
2283             *inlen = processed - instart;
2284             return(-2);
2285         }
2286
2287         if (inend - in < trailing)
2288             break;
2289
2290         while (trailing--) {
2291             if (((d= *in++) & 0xC0) != 0x80) {
2292                 *outlen = out - outstart;
2293                 *inlen = processed - instart;
2294                 return(-2);
2295             }
2296             c <<= 6;
2297             c |= d & 0x3F;
2298         }
2299
2300         /* assertion: c is a single UTF-4 value */
2301         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2302             (c != '&') && (c != '<') && (c != '>')) {
2303             if (out >= outend)
2304                 break;
2305             *out++ = c;
2306         } else {
2307             const htmlEntityDesc * ent;
2308             const char *cp;
2309             char nbuf[16];
2310             int len;
2311
2312             /*
2313              * Try to lookup a predefined HTML entity for it
2314              */
2315             ent = htmlEntityValueLookup(c);
2316             if (ent == NULL) {
2317                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2318                 cp = nbuf;
2319             }
2320             else
2321                 cp = ent->name;
2322             len = strlen(cp);
2323             if (out + 2 + len > outend)
2324                 break;
2325             *out++ = '&';
2326             memcpy(out, cp, len);
2327             out += len;
2328             *out++ = ';';
2329         }
2330         processed = in;
2331     }
2332     *outlen = out - outstart;
2333     *inlen = processed - instart;
2334     return(0);
2335 }
2336
2337 /************************************************************************
2338  *                                                                      *
2339  *              Commodity functions to handle streams                   *
2340  *                                                                      *
2341  ************************************************************************/
2342
2343 #ifdef LIBXML_PUSH_ENABLED
2344 /**
2345  * htmlNewInputStream:
2346  * @ctxt:  an HTML parser context
2347  *
2348  * Create a new input stream structure
2349  * Returns the new input stream or NULL
2350  */
2351 static htmlParserInputPtr
2352 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2353     htmlParserInputPtr input;
2354
2355     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2356     if (input == NULL) {
2357         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2358         return(NULL);
2359     }
2360     memset(input, 0, sizeof(htmlParserInput));
2361     input->filename = NULL;
2362     input->directory = NULL;
2363     input->base = NULL;
2364     input->cur = NULL;
2365     input->buf = NULL;
2366     input->line = 1;
2367     input->col = 1;
2368     input->buf = NULL;
2369     input->free = NULL;
2370     input->version = NULL;
2371     input->consumed = 0;
2372     input->length = 0;
2373     return(input);
2374 }
2375 #endif
2376
2377
2378 /************************************************************************
2379  *                                                                      *
2380  *              Commodity functions, cleanup needed ?                   *
2381  *                                                                      *
2382  ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD; areBlanks()
 * scans this table to decide whether a run of blanks must be kept.
 * The table is read-only, so both the strings and the array of
 * pointers are declared const.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2397
2398 /**
2399  * areBlanks:
2400  * @ctxt:  an HTML parser context
2401  * @str:  a xmlChar *
2402  * @len:  the size of @str
2403  *
2404  * Is this a sequence of blank chars that one can ignore ?
2405  *
2406  * Returns 1 if ignorable 0 otherwise.
2407  */
2408
2409 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2410     unsigned int i;
2411     int j;
2412     xmlNodePtr lastChild;
2413     xmlDtdPtr dtd;
2414
2415     for (j = 0;j < len;j++)
2416         if (!(IS_BLANK_CH(str[j]))) return(0);
2417
2418     if (CUR == 0) return(1);
2419     if (CUR != '<') return(0);
2420     if (ctxt->name == NULL)
2421         return(1);
2422     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2423         return(1);
2424     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2425         return(1);
2426
2427     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2428     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2429         dtd = xmlGetIntSubset(ctxt->myDoc);
2430         if (dtd != NULL && dtd->ExternalID != NULL) {
2431             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2432                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2433                 return(1);
2434         }
2435     }
2436
2437     if (ctxt->node == NULL) return(0);
2438     lastChild = xmlGetLastChild(ctxt->node);
2439     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2440         lastChild = lastChild->prev;
2441     if (lastChild == NULL) {
2442         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2443             (ctxt->node->content != NULL)) return(0);
2444         /* keep ws in constructs like ...<b> </b>...
2445            for all tags "b" allowing PCDATA */
2446         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2447             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2448                 return(0);
2449             }
2450         }
2451     } else if (xmlNodeIsText(lastChild)) {
2452         return(0);
2453     } else {
2454         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2455            for all tags "p" allowing PCDATA */
2456         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2457             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2458                 return(0);
2459             }
2460         }
2461     }
2462     return(1);
2463 }
2464
2465 /**
2466  * htmlNewDocNoDtD:
2467  * @URI:  URI for the dtd, or NULL
2468  * @ExternalID:  the external ID of the DTD, or NULL
2469  *
2470  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2471  * are NULL
2472  *
2473  * Returns a new document, do not initialize the DTD if not provided
2474  */
2475 htmlDocPtr
2476 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2477     xmlDocPtr cur;
2478
2479     /*
2480      * Allocate a new document and fill the fields.
2481      */
2482     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2483     if (cur == NULL) {
2484         htmlErrMemory(NULL, "HTML document creation failed\n");
2485         return(NULL);
2486     }
2487     memset(cur, 0, sizeof(xmlDoc));
2488
2489     cur->type = XML_HTML_DOCUMENT_NODE;
2490     cur->version = NULL;
2491     cur->intSubset = NULL;
2492     cur->doc = cur;
2493     cur->name = NULL;
2494     cur->children = NULL;
2495     cur->extSubset = NULL;
2496     cur->oldNs = NULL;
2497     cur->encoding = NULL;
2498     cur->standalone = 1;
2499     cur->compression = 0;
2500     cur->ids = NULL;
2501     cur->refs = NULL;
2502     cur->_private = NULL;
2503     cur->charset = XML_CHAR_ENCODING_UTF8;
2504     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2505     if ((ExternalID != NULL) ||
2506         (URI != NULL))
2507         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2508     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2509         xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2510     return(cur);
2511 }
2512
2513 /**
2514  * htmlNewDoc:
2515  * @URI:  URI for the dtd, or NULL
2516  * @ExternalID:  the external ID of the DTD, or NULL
2517  *
2518  * Creates a new HTML document
2519  *
2520  * Returns a new document
2521  */
2522 htmlDocPtr
2523 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2524     if ((URI == NULL) && (ExternalID == NULL))
2525         return(htmlNewDocNoDtD(
2526                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2527                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2528
2529     return(htmlNewDocNoDtD(URI, ExternalID));
2530 }
2531
2532
2533 /************************************************************************
2534  *                                                                      *
2535  *                      The parser itself                               *
2536  *      Relates to http://www.w3.org/TR/html40                          *
2537  *                                                                      *
2538  ************************************************************************/
2539
2540 /************************************************************************
2541  *                                                                      *
2542  *                      The parser itself                               *
2543  *                                                                      *
2544  ************************************************************************/
2545
2546 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2547
2548 static void
2549 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2550     int c;
2551
2552     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2553                  "Incorrectly opened comment\n", NULL, NULL);
2554
2555     do {
2556         c = CUR;
2557         if (c == 0)
2558             break;
2559         NEXT;
2560     } while (c != '>');
2561 }
2562
2563 /**
2564  * htmlParseHTMLName:
2565  * @ctxt:  an HTML parser context
2566  *
2567  * parse an HTML tag or attribute name, note that we convert it to lowercase
2568  * since HTML names are not case-sensitive.
2569  *
2570  * Returns the Tag Name parsed or NULL
2571  */
2572
2573 static const xmlChar *
2574 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2575     int i = 0;
2576     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2577
2578     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2579         (CUR != ':') && (CUR != '.')) return(NULL);
2580
2581     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2582            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2583            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2584            (CUR == '.'))) {
2585         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2586         else loc[i] = CUR;
2587         i++;
2588
2589         NEXT;
2590     }
2591
2592     return(xmlDictLookup(ctxt->dict, loc, i));
2593 }
2594
2595
2596 /**
2597  * htmlParseHTMLName_nonInvasive:
2598  * @ctxt:  an HTML parser context
2599  *
2600  * parse an HTML tag or attribute name, note that we convert it to lowercase
2601  * since HTML names are not case-sensitive, this doesn't consume the data
2602  * from the stream, it's a look-ahead
2603  *
2604  * Returns the Tag Name parsed or NULL
2605  */
2606
2607 static const xmlChar *
2608 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2609     int i = 0;
2610     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2611
2612     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2613         (NXT(1) != ':')) return(NULL);
2614
2615     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2616            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2617            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2618         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2619         else loc[i] = NXT(1+i);
2620         i++;
2621     }
2622
2623     return(xmlDictLookup(ctxt->dict, loc, i));
2624 }
2625
2626
2627 /**
2628  * htmlParseName:
2629  * @ctxt:  an HTML parser context
2630  *
2631  * parse an HTML name, this routine is case sensitive.
2632  *
2633  * Returns the Name parsed or NULL
2634  */
2635
2636 static const xmlChar *
2637 htmlParseName(htmlParserCtxtPtr ctxt) {
2638     const xmlChar *in;
2639     const xmlChar *ret;
2640     int count = 0;
2641
2642     GROW;
2643
2644     /*
2645      * Accelerator for simple ASCII names
2646      */
2647     in = ctxt->input->cur;
2648     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2649         ((*in >= 0x41) && (*in <= 0x5A)) ||
2650         (*in == '_') || (*in == ':')) {
2651         in++;
2652         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2653                ((*in >= 0x41) && (*in <= 0x5A)) ||
2654                ((*in >= 0x30) && (*in <= 0x39)) ||
2655                (*in == '_') || (*in == '-') ||
2656                (*in == ':') || (*in == '.'))
2657             in++;
2658
2659         if (in == ctxt->input->end)
2660             return(NULL);
2661
2662         if ((*in > 0) && (*in < 0x80)) {
2663             count = in - ctxt->input->cur;
2664             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2665             ctxt->input->cur = in;
2666             ctxt->input->col += count;
2667             return(ret);
2668         }
2669     }
2670     return(htmlParseNameComplex(ctxt));
2671 }
2672
2673 static const xmlChar *
2674 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2675     int len = 0, l;
2676     int c;
2677     int count = 0;
2678     const xmlChar *base = ctxt->input->base;
2679
2680     /*
2681      * Handler for more complex cases
2682      */
2683     GROW;
2684     c = CUR_CHAR(l);
2685     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2686         (!IS_LETTER(c) && (c != '_') &&
2687          (c != ':'))) {
2688         return(NULL);
2689     }
2690
2691     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2692            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2693             (c == '.') || (c == '-') ||
2694             (c == '_') || (c == ':') ||
2695             (IS_COMBINING(c)) ||
2696             (IS_EXTENDER(c)))) {
2697         if (count++ > 100) {
2698             count = 0;
2699             GROW;
2700         }
2701         len += l;
2702         NEXTL(l);
2703         c = CUR_CHAR(l);
2704         if (ctxt->input->base != base) {
2705             /*
2706              * We changed encoding from an unknown encoding
2707              * Input buffer changed location, so we better start again
2708              */
2709             return(htmlParseNameComplex(ctxt));
2710         }
2711     }
2712
2713     if (ctxt->input->cur - ctxt->input->base < len) {
2714         /* Sanity check */
2715         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2716                      "unexpected change of input buffer", NULL, NULL);
2717         return (NULL);
2718     }
2719
2720     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2721 }
2722
2723
2724 /**
2725  * htmlParseHTMLAttribute:
2726  * @ctxt:  an HTML parser context
2727  * @stop:  a char stop value
2728  *
2729  * parse an HTML attribute value till the stop (quote), if
2730  * stop is 0 then it stops at the first space
2731  *
2732  * Returns the attribute parsed or NULL
2733  */
2734
2735 static xmlChar *
2736 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2737     xmlChar *buffer = NULL;
2738     int buffer_size = 0;
2739     xmlChar *out = NULL;
2740     const xmlChar *name = NULL;
2741     const xmlChar *cur = NULL;
2742     const htmlEntityDesc * ent;
2743
2744     /*
2745      * allocate a translation buffer.
2746      */
2747     buffer_size = HTML_PARSER_BUFFER_SIZE;
2748     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2749     if (buffer == NULL) {
2750         htmlErrMemory(ctxt, "buffer allocation failed\n");
2751         return(NULL);
2752     }
2753     out = buffer;
2754
2755     /*
2756      * Ok loop until we reach one of the ending chars
2757      */
2758     while ((CUR != 0) && (CUR != stop)) {
2759         if ((stop == 0) && (CUR == '>')) break;
2760         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2761         if (CUR == '&') {
2762             if (NXT(1) == '#') {
2763                 unsigned int c;
2764                 int bits;
2765
2766                 c = htmlParseCharRef(ctxt);
2767                 if      (c <    0x80)
2768                         { *out++  = c;                bits= -6; }
2769                 else if (c <   0x800)
2770                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2771                 else if (c < 0x10000)
2772                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2773                 else
2774                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2775
2776                 for ( ; bits >= 0; bits-= 6) {
2777                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2778                 }
2779
2780                 if (out - buffer > buffer_size - 100) {
2781                         int indx = out - buffer;
2782
2783                         growBuffer(buffer);
2784                         out = &buffer[indx];
2785                 }
2786             } else {
2787                 ent = htmlParseEntityRef(ctxt, &name);
2788                 if (name == NULL) {
2789                     *out++ = '&';
2790                     if (out - buffer > buffer_size - 100) {
2791                         int indx = out - buffer;
2792
2793                         growBuffer(buffer);
2794                         out = &buffer[indx];
2795                     }
2796                 } else if (ent == NULL) {
2797                     *out++ = '&';
2798                     cur = name;
2799                     while (*cur != 0) {
2800                         if (out - buffer > buffer_size - 100) {
2801                             int indx = out - buffer;
2802
2803                             growBuffer(buffer);
2804                             out = &buffer[indx];
2805                         }
2806                         *out++ = *cur++;
2807                     }
2808                 } else {
2809                     unsigned int c;
2810                     int bits;
2811
2812                     if (out - buffer > buffer_size - 100) {
2813                         int indx = out - buffer;
2814
2815                         growBuffer(buffer);
2816                         out = &buffer[indx];
2817                     }
2818                     c = ent->value;
2819                     if      (c <    0x80)
2820                         { *out++  = c;                bits= -6; }
2821                     else if (c <   0x800)
2822                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2823                     else if (c < 0x10000)
2824                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2825                     else
2826                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2827
2828                     for ( ; bits >= 0; bits-= 6) {
2829                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2830                     }
2831                 }
2832             }
2833         } else {
2834             unsigned int c;
2835             int bits, l;
2836
2837             if (out - buffer > buffer_size - 100) {
2838                 int indx = out - buffer;
2839
2840                 growBuffer(buffer);
2841                 out = &buffer[indx];
2842             }
2843             c = CUR_CHAR(l);
2844             if      (c <    0x80)
2845                     { *out++  = c;                bits= -6; }
2846             else if (c <   0x800)
2847                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2848             else if (c < 0x10000)
2849                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2850             else
2851                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2852
2853             for ( ; bits >= 0; bits-= 6) {
2854                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2855             }
2856             NEXT;
2857         }
2858     }
2859     *out = 0;
2860     return(buffer);
2861 }
2862
2863 /**
2864  * htmlParseEntityRef:
2865  * @ctxt:  an HTML parser context
2866  * @str:  location to store the entity name
2867  *
2868  * parse an HTML ENTITY references
2869  *
2870  * [68] EntityRef ::= '&' Name ';'
2871  *
2872  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2873  *         if non-NULL *str will have to be freed by the caller.
2874  */
2875 const htmlEntityDesc *
2876 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2877     const xmlChar *name;
2878     const htmlEntityDesc * ent = NULL;
2879
2880     if (str != NULL) *str = NULL;
2881     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2882
2883     if (CUR == '&') {
2884         NEXT;
2885         name = htmlParseName(ctxt);
2886         if (name == NULL) {
2887             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2888                          "htmlParseEntityRef: no name\n", NULL, NULL);
2889         } else {
2890             GROW;
2891             if (CUR == ';') {
2892                 if (str != NULL)
2893                     *str = name;
2894
2895                 /*
2896                  * Lookup the entity in the table.
2897                  */
2898                 ent = htmlEntityLookup(name);
2899                 if (ent != NULL) /* OK that's ugly !!! */
2900                     NEXT;
2901             } else {
2902                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2903                              "htmlParseEntityRef: expecting ';'\n",
2904                              NULL, NULL);
2905                 if (str != NULL)
2906                     *str = name;
2907             }
2908         }
2909     }
2910     return(ent);
2911 }
2912
2913 /**
2914  * htmlParseAttValue:
2915  * @ctxt:  an HTML parser context
2916  *
2917  * parse a value for an attribute
2918  * Note: the parser won't do substitution of entities here, this
2919  * will be handled later in xmlStringGetNodeList, unless it was
2920  * asked for ctxt->replaceEntities != 0
2921  *
2922  * Returns the AttValue parsed or NULL.
2923  */
2924
2925 static xmlChar *
2926 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2927     xmlChar *ret = NULL;
2928
2929     if (CUR == '"') {
2930         NEXT;
2931         ret = htmlParseHTMLAttribute(ctxt, '"');
2932         if (CUR != '"') {
2933             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2934                          "AttValue: \" expected\n", NULL, NULL);
2935         } else
2936             NEXT;
2937     } else if (CUR == '\'') {
2938         NEXT;
2939         ret = htmlParseHTMLAttribute(ctxt, '\'');
2940         if (CUR != '\'') {
2941             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2942                          "AttValue: ' expected\n", NULL, NULL);
2943         } else
2944             NEXT;
2945     } else {
2946         /*
2947          * That's an HTMLism, the attribute value may not be quoted
2948          */
2949         ret = htmlParseHTMLAttribute(ctxt, 0);
2950         if (ret == NULL) {
2951             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2952                          "AttValue: no value found\n", NULL, NULL);
2953         }
2954     }
2955     return(ret);
2956 }
2957
2958 /**
2959  * htmlParseSystemLiteral:
2960  * @ctxt:  an HTML parser context
2961  *
2962  * parse an HTML Literal
2963  *
2964  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2965  *
2966  * Returns the SystemLiteral parsed or NULL
2967  */
2968
2969 static xmlChar *
2970 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2971     size_t len = 0, startPosition = 0;
2972     int err = 0;
2973     int quote;
2974     xmlChar *ret = NULL;
2975
2976     if ((CUR != '"') && (CUR != '\'')) {
2977         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2978                      "SystemLiteral \" or ' expected\n", NULL, NULL);
2979         return(NULL);
2980     }
2981     quote = CUR;
2982     NEXT;
2983
2984     if (CUR_PTR < BASE_PTR)
2985         return(ret);
2986     startPosition = CUR_PTR - BASE_PTR;
2987
2988     while ((CUR != 0) && (CUR != quote)) {
2989         /* TODO: Handle UTF-8 */
2990         if (!IS_CHAR_CH(CUR)) {
2991             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2992                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2993             err = 1;
2994         }
2995         NEXT;
2996         len++;
2997     }
2998     if (CUR != quote) {
2999         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3000                      "Unfinished SystemLiteral\n", NULL, NULL);
3001     } else {
3002         NEXT;
3003         if (err == 0)
3004             ret = xmlStrndup((BASE_PTR+startPosition), len);
3005     }
3006
3007     return(ret);
3008 }
3009
3010 /**
3011  * htmlParsePubidLiteral:
3012  * @ctxt:  an HTML parser context
3013  *
3014  * parse an HTML public literal
3015  *
3016  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3017  *
3018  * Returns the PubidLiteral parsed or NULL.
3019  */
3020
3021 static xmlChar *
3022 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3023     size_t len = 0, startPosition = 0;
3024     int err = 0;
3025     int quote;
3026     xmlChar *ret = NULL;
3027
3028     if ((CUR != '"') && (CUR != '\'')) {
3029         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3030                      "PubidLiteral \" or ' expected\n", NULL, NULL);
3031         return(NULL);
3032     }
3033     quote = CUR;
3034     NEXT;
3035
3036     /*
3037      * Name ::= (Letter | '_') (NameChar)*
3038      */
3039     if (CUR_PTR < BASE_PTR)
3040         return(ret);
3041     startPosition = CUR_PTR - BASE_PTR;
3042
3043     while ((CUR != 0) && (CUR != quote)) {
3044         if (!IS_PUBIDCHAR_CH(CUR)) {
3045             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3046                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3047             err = 1;
3048         }
3049         len++;
3050         NEXT;
3051     }
3052
3053     if (CUR != quote) {
3054         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3055                      "Unfinished PubidLiteral\n", NULL, NULL);
3056     } else {
3057         NEXT;
3058         if (err == 0)
3059             ret = xmlStrndup((BASE_PTR + startPosition), len);
3060     }
3061
3062     return(ret);
3063 }
3064
3065 /**
3066  * htmlParseScript:
3067  * @ctxt:  an HTML parser context
3068  *
3069  * parse the content of an HTML SCRIPT or STYLE element
3070  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3071  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3072  * http://www.w3.org/TR/html4/types.html#type-script
3073  * http://www.w3.org/TR/html4/types.html#h-6.15
3074  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3075  *
3076  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3077  * element and the value of intrinsic event attributes. User agents must
3078  * not evaluate script data as HTML markup but instead must pass it on as
3079  * data to a script engine.
3080  * NOTES:
3081  * - The content is passed like CDATA
3082  * - the attributes for style and scripting "onXXX" are also described
3083  *   as CDATA but SGML allows entities references in attributes so their
3084  *   processing is identical as other attributes
3085  */
3086 static void
3087 htmlParseScript(htmlParserCtxtPtr ctxt) {
3088     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3089     int nbchar = 0;
3090     int cur,l;
3091
3092     SHRINK;
3093     cur = CUR_CHAR(l);
3094     while (cur != 0) {
3095         if ((cur == '<') && (NXT(1) == '/')) {
3096             /*
3097              * One should break here, the specification is clear:
3098              * Authors should therefore escape "</" within the content.
3099              * Escape mechanisms are specific to each scripting or
3100              * style sheet language.
3101              *
3102              * In recovery mode, only break if end tag match the
3103              * current tag, effectively ignoring all tags inside the
3104              * script/style block and treating the entire block as
3105              * CDATA.
3106              */
3107             if (ctxt->recovery) {
3108                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3109                                    xmlStrlen(ctxt->name)) == 0)
3110                 {
3111                     break; /* while */
3112                 } else {
3113                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3114                                  "Element %s embeds close tag\n",
3115                                  ctxt->name, NULL);
3116                 }
3117             } else {
3118                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3119                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3120                 {
3121                     break; /* while */
3122                 }
3123             }
3124         }
3125         if (IS_CHAR(cur)) {
3126             COPY_BUF(l,buf,nbchar,cur);
3127         } else {
3128             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3129                             "Invalid char in CDATA 0x%X\n", cur);
3130         }
3131         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3132             buf[nbchar] = 0;
3133             if (ctxt->sax->cdataBlock!= NULL) {
3134                 /*
3135                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136                  */
3137                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138             } else if (ctxt->sax->characters != NULL) {
3139                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140             }
3141             nbchar = 0;
3142         }
3143         GROW;
3144         NEXTL(l);
3145         cur = CUR_CHAR(l);
3146     }
3147
3148     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3149         buf[nbchar] = 0;
3150         if (ctxt->sax->cdataBlock!= NULL) {
3151             /*
3152              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3153              */
3154             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3155         } else if (ctxt->sax->characters != NULL) {
3156             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157         }
3158     }
3159 }
3160
3161
3162 /**
3163  * htmlParseCharDataInternal:
3164  * @ctxt:  an HTML parser context
3165  * @readahead: optional read ahead character in ascii range
3166  *
3167  * parse a CharData section.
3168  * if we are within a CDATA section ']]>' marks an end of section.
3169  *
3170  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3171  */
3172
3173 static void
3174 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3175     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3176     int nbchar = 0;
3177     int cur, l;
3178     int chunk = 0;
3179
3180     if (readahead)
3181         buf[nbchar++] = readahead;
3182
3183     SHRINK;
3184     cur = CUR_CHAR(l);
3185     while (((cur != '<') || (ctxt->token == '<')) &&
3186            ((cur != '&') || (ctxt->token == '&')) &&
3187            (cur != 0)) {
3188         if (!(IS_CHAR(cur))) {
3189             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3190                         "Invalid char in CDATA 0x%X\n", cur);
3191         } else {
3192             COPY_BUF(l,buf,nbchar,cur);
3193         }
3194         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3195             buf[nbchar] = 0;
3196
3197             /*
3198              * Ok the segment is to be consumed as chars.
3199              */
3200             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3201                 if (areBlanks(ctxt, buf, nbchar)) {
3202                     if (ctxt->keepBlanks) {
3203                         if (ctxt->sax->characters != NULL)
3204                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3205                     } else {
3206                         if (ctxt->sax->ignorableWhitespace != NULL)
3207                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3208                                                            buf, nbchar);
3209                     }
3210                 } else {
3211                     htmlCheckParagraph(ctxt);
3212                     if (ctxt->sax->characters != NULL)
3213                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3214                 }
3215             }
3216             nbchar = 0;
3217         }
3218         NEXTL(l);
3219         chunk++;
3220         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3221             chunk = 0;
3222             SHRINK;
3223             GROW;
3224         }
3225         cur = CUR_CHAR(l);
3226         if (cur == 0) {
3227             SHRINK;
3228             GROW;
3229             cur = CUR_CHAR(l);
3230         }
3231     }
3232     if (nbchar != 0) {
3233         buf[nbchar] = 0;
3234
3235         /*
3236          * Ok the segment is to be consumed as chars.
3237          */
3238         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3239             if (areBlanks(ctxt, buf, nbchar)) {
3240                 if (ctxt->keepBlanks) {
3241                     if (ctxt->sax->characters != NULL)
3242                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3243                 } else {
3244                     if (ctxt->sax->ignorableWhitespace != NULL)
3245                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3246                                                        buf, nbchar);
3247                 }
3248             } else {
3249                 htmlCheckParagraph(ctxt);
3250                 if (ctxt->sax->characters != NULL)
3251                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3252             }
3253         }
3254     } else {
3255         /*
3256          * Loop detection
3257          */
3258         if (cur == 0)
3259             ctxt->instate = XML_PARSER_EOF;
3260     }
3261 }
3262
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * parse a CharData section.
 *
 * Thin wrapper: calls htmlParseCharDataInternal() with a read ahead
 * value of 0, i.e. without any character pre-loaded in front of the
 * data read from the input.
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 */

static void
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    htmlParseCharDataInternal(ctxt, 0);
}
3277
3278 /**
3279  * htmlParseExternalID:
3280  * @ctxt:  an HTML parser context
3281  * @publicID:  a xmlChar** receiving PubidLiteral
3282  *
3283  * Parse an External ID or a Public ID
3284  *
3285  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3286  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3287  *
3288  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3289  *
3290  * Returns the function returns SystemLiteral and in the second
3291  *                case publicID receives PubidLiteral, is strict is off
3292  *                it is possible to return NULL and have publicID set.
3293  */
3294
3295 static xmlChar *
3296 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3297     xmlChar *URI = NULL;
3298
3299     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3300          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3301          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3302         SKIP(6);
3303         if (!IS_BLANK_CH(CUR)) {
3304             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3305                          "Space required after 'SYSTEM'\n", NULL, NULL);
3306         }
3307         SKIP_BLANKS;
3308         URI = htmlParseSystemLiteral(ctxt);
3309         if (URI == NULL) {
3310             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3311                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3312         }
3313     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3314                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3315                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3316         SKIP(6);
3317         if (!IS_BLANK_CH(CUR)) {
3318             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3319                          "Space required after 'PUBLIC'\n", NULL, NULL);
3320         }
3321         SKIP_BLANKS;
3322         *publicID = htmlParsePubidLiteral(ctxt);
3323         if (*publicID == NULL) {
3324             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3325                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3326                          NULL, NULL);
3327         }
3328         SKIP_BLANKS;
3329         if ((CUR == '"') || (CUR == '\'')) {
3330             URI = htmlParseSystemLiteral(ctxt);
3331         }
3332     }
3333     return(URI);
3334 }
3335
3336 /**
3337  * xmlParsePI:
3338  * @ctxt:  an XML parser context
3339  *
3340  * parse an XML Processing Instruction.
3341  *
3342  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3343  */
3344 static void
3345 htmlParsePI(htmlParserCtxtPtr ctxt) {
3346     xmlChar *buf = NULL;
3347     int len = 0;
3348     int size = HTML_PARSER_BUFFER_SIZE;
3349     int cur, l;
3350     const xmlChar *target;
3351     xmlParserInputState state;
3352     int count = 0;
3353
3354     if ((RAW == '<') && (NXT(1) == '?')) {
3355         state = ctxt->instate;
3356         ctxt->instate = XML_PARSER_PI;
3357         /*
3358          * this is a Processing Instruction.
3359          */
3360         SKIP(2);
3361         SHRINK;
3362
3363         /*
3364          * Parse the target name and check for special support like
3365          * namespace.
3366          */
3367         target = htmlParseName(ctxt);
3368         if (target != NULL) {
3369             if (RAW == '>') {
3370                 SKIP(1);
3371
3372                 /*
3373                  * SAX: PI detected.
3374                  */
3375                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3376                     (ctxt->sax->processingInstruction != NULL))
3377                     ctxt->sax->processingInstruction(ctxt->userData,
3378                                                      target, NULL);
3379                 ctxt->instate = state;
3380                 return;
3381             }
3382             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3383             if (buf == NULL) {
3384                 htmlErrMemory(ctxt, NULL);
3385                 ctxt->instate = state;
3386                 return;
3387             }
3388             cur = CUR;
3389             if (!IS_BLANK(cur)) {
3390                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3391                           "ParsePI: PI %s space expected\n", target, NULL);
3392             }
3393             SKIP_BLANKS;
3394             cur = CUR_CHAR(l);
3395             while ((cur != 0) && (cur != '>')) {
3396                 if (len + 5 >= size) {
3397                     xmlChar *tmp;
3398
3399                     size *= 2;
3400                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3401                     if (tmp == NULL) {
3402                         htmlErrMemory(ctxt, NULL);
3403                         xmlFree(buf);
3404                         ctxt->instate = state;
3405                         return;
3406                     }
3407                     buf = tmp;
3408                 }
3409                 count++;
3410                 if (count > 50) {
3411                     GROW;
3412                     count = 0;
3413                 }
3414                 if (IS_CHAR(cur)) {
3415                     COPY_BUF(l,buf,len,cur);
3416                 } else {
3417                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3418                                     "Invalid char in processing instruction "
3419                                     "0x%X\n", cur);
3420                 }
3421                 NEXTL(l);
3422                 cur = CUR_CHAR(l);
3423                 if (cur == 0) {
3424                     SHRINK;
3425                     GROW;
3426                     cur = CUR_CHAR(l);
3427                 }
3428             }
3429             buf[len] = 0;
3430             if (cur != '>') {
3431                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3432                       "ParsePI: PI %s never end ...\n", target, NULL);
3433             } else {
3434                 SKIP(1);
3435
3436                 /*
3437                  * SAX: PI detected.
3438                  */
3439                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3440                     (ctxt->sax->processingInstruction != NULL))
3441                     ctxt->sax->processingInstruction(ctxt->userData,
3442                                                      target, buf);
3443             }
3444             xmlFree(buf);
3445         } else {
3446             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3447                          "PI is not started correctly", NULL, NULL);
3448         }
3449         ctxt->instate = state;
3450     }
3451 }
3452
3453 /**
3454  * htmlParseComment:
3455  * @ctxt:  an HTML parser context
3456  *
3457  * Parse an XML (SGML) comment <!-- .... -->
3458  *
3459  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3460  */
3461 static void
3462 htmlParseComment(htmlParserCtxtPtr ctxt) {
3463     xmlChar *buf = NULL;
3464     int len;
3465     int size = HTML_PARSER_BUFFER_SIZE;
3466     int q, ql;
3467     int r, rl;
3468     int cur, l;
3469     int next, nl;
3470     xmlParserInputState state;
3471
3472     /*
3473      * Check that there is a comment right here.
3474      */
3475     if ((RAW != '<') || (NXT(1) != '!') ||
3476         (NXT(2) != '-') || (NXT(3) != '-')) return;
3477
3478     state = ctxt->instate;
3479     ctxt->instate = XML_PARSER_COMMENT;
3480     SHRINK;
3481     SKIP(4);
3482     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3483     if (buf == NULL) {
3484         htmlErrMemory(ctxt, "buffer allocation failed\n");
3485         ctxt->instate = state;
3486         return;
3487     }
3488     len = 0;
3489     buf[len] = 0;
3490     q = CUR_CHAR(ql);
3491     if (q == 0)
3492         goto unfinished;
3493     if (q == '>') {
3494         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3495         cur = '>';
3496         goto finished;
3497     }
3498     NEXTL(ql);
3499     r = CUR_CHAR(rl);
3500     if (r == 0)
3501         goto unfinished;
3502     if (q == '-' && r == '>') {
3503         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3504         cur = '>';
3505         goto finished;
3506     }
3507     NEXTL(rl);
3508     cur = CUR_CHAR(l);
3509     while ((cur != 0) &&
3510            ((cur != '>') ||
3511             (r != '-') || (q != '-'))) {
3512         NEXTL(l);
3513         next = CUR_CHAR(nl);
3514         if (next == 0) {
3515             SHRINK;
3516             GROW;
3517             next = CUR_CHAR(nl);
3518         }
3519
3520         if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3521           htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3522                        "Comment incorrectly closed by '--!>'", NULL, NULL);
3523           cur = '>';
3524           break;
3525         }
3526
3527         if (len + 5 >= size) {
3528             xmlChar *tmp;
3529
3530             size *= 2;
3531             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3532             if (tmp == NULL) {
3533                 xmlFree(buf);
3534                 htmlErrMemory(ctxt, "growing buffer failed\n");
3535                 ctxt->instate = state;
3536                 return;
3537             }
3538             buf = tmp;
3539         }
3540         if (IS_CHAR(q)) {
3541             COPY_BUF(ql,buf,len,q);
3542         } else {
3543             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3544                             "Invalid char in comment 0x%X\n", q);
3545         }
3546
3547         q = r;
3548         ql = rl;
3549         r = cur;
3550         rl = l;
3551         cur = next;
3552         l = nl;
3553     }
3554 finished:
3555     buf[len] = 0;
3556     if (cur == '>') {
3557         NEXT;
3558         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3559             (!ctxt->disableSAX))
3560             ctxt->sax->comment(ctxt->userData, buf);
3561         xmlFree(buf);
3562         ctxt->instate = state;
3563         return;
3564     }
3565
3566 unfinished:
3567     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3568                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3569     xmlFree(buf);
3570 }
3571
3572 /**
3573  * htmlParseCharRef:
3574  * @ctxt:  an HTML parser context
3575  *
3576  * parse Reference declarations
3577  *
3578  * [66] CharRef ::= '&#' [0-9]+ ';' |
3579  *                  '&#x' [0-9a-fA-F]+ ';'
3580  *
3581  * Returns the value parsed (as an int)
3582  */
3583 int
3584 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3585     int val = 0;
3586
3587     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3588         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3589                      "htmlParseCharRef: context error\n",
3590                      NULL, NULL);
3591         return(0);
3592     }
3593     if ((CUR == '&') && (NXT(1) == '#') &&
3594         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3595         SKIP(3);
3596         while (CUR != ';') {
3597             if ((CUR >= '0') && (CUR <= '9')) {
3598                 if (val < 0x110000)
3599                     val = val * 16 + (CUR - '0');
3600             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3601                 if (val < 0x110000)
3602                     val = val * 16 + (CUR - 'a') + 10;
3603             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3604                 if (val < 0x110000)
3605                     val = val * 16 + (CUR - 'A') + 10;
3606             } else {
3607                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3608                              "htmlParseCharRef: missing semicolon\n",
3609                              NULL, NULL);
3610                 break;
3611             }
3612             NEXT;
3613         }
3614         if (CUR == ';')
3615             NEXT;
3616     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3617         SKIP(2);
3618         while (CUR != ';') {
3619             if ((CUR >= '0') && (CUR <= '9')) {
3620                 if (val < 0x110000)
3621                     val = val * 10 + (CUR - '0');
3622             } else {
3623                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3624                              "htmlParseCharRef: missing semicolon\n",
3625                              NULL, NULL);
3626                 break;
3627             }
3628             NEXT;
3629         }
3630         if (CUR == ';')
3631             NEXT;
3632     } else {
3633         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3634                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3635     }
3636     /*
3637      * Check the value IS_CHAR ...
3638      */
3639     if (IS_CHAR(val)) {
3640         return(val);
3641     } else if (val >= 0x110000) {
3642         htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3643                      "htmlParseCharRef: value too large\n", NULL, NULL);
3644     } else {
3645         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3646                         "htmlParseCharRef: invalid xmlChar value %d\n",
3647                         val);
3648     }
3649     return(0);
3650 }
3651
3652
3653 /**
3654  * htmlParseDocTypeDecl:
3655  * @ctxt:  an HTML parser context
3656  *
3657  * parse a DOCTYPE declaration
3658  *
3659  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3660  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3661  */
3662
3663 static void
3664 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3665     const xmlChar *name;
3666     xmlChar *ExternalID = NULL;
3667     xmlChar *URI = NULL;
3668
3669     /*
3670      * We know that '<!DOCTYPE' has been detected.
3671      */
3672     SKIP(9);
3673
3674     SKIP_BLANKS;
3675
3676     /*
3677      * Parse the DOCTYPE name.
3678      */
3679     name = htmlParseName(ctxt);
3680     if (name == NULL) {
3681         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3682                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3683                      NULL, NULL);
3684     }
3685     /*
3686      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3687      */
3688
3689     SKIP_BLANKS;
3690
3691     /*
3692      * Check for SystemID and ExternalID
3693      */
3694     URI = htmlParseExternalID(ctxt, &ExternalID);
3695     SKIP_BLANKS;
3696
3697     /*
3698      * We should be at the end of the DOCTYPE declaration.
3699      */
3700     if (CUR != '>') {
3701         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3702                      "DOCTYPE improperly terminated\n", NULL, NULL);
3703         /* Ignore bogus content */
3704         while ((CUR != 0) && (CUR != '>'))
3705             NEXT;
3706     }
3707     if (CUR == '>')
3708         NEXT;
3709
3710     /*
3711      * Create or update the document accordingly to the DOCTYPE
3712      */
3713     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3714         (!ctxt->disableSAX))
3715         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3716
3717     /*
3718      * Cleanup, since we don't use all those identifiers
3719      */
3720     if (URI != NULL) xmlFree(URI);
3721     if (ExternalID != NULL) xmlFree(ExternalID);
3722 }
3723
3724 /**
3725  * htmlParseAttribute:
3726  * @ctxt:  an HTML parser context
3727  * @value:  a xmlChar ** used to store the value of the attribute
3728  *
3729  * parse an attribute
3730  *
3731  * [41] Attribute ::= Name Eq AttValue
3732  *
3733  * [25] Eq ::= S? '=' S?
3734  *
3735  * With namespace:
3736  *
3737  * [NS 11] Attribute ::= QName Eq AttValue
3738  *
3739  * Also the case QName == xmlns:??? is handled independently as a namespace
3740  * definition.
3741  *
3742  * Returns the attribute name, and the value in *value.
3743  */
3744
3745 static const xmlChar *
3746 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3747     const xmlChar *name;
3748     xmlChar *val = NULL;
3749
3750     *value = NULL;
3751     name = htmlParseHTMLName(ctxt);
3752     if (name == NULL) {
3753         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3754                      "error parsing attribute name\n", NULL, NULL);
3755         return(NULL);
3756     }
3757
3758     /*
3759      * read the value
3760      */
3761     SKIP_BLANKS;
3762     if (CUR == '=') {
3763         NEXT;
3764         SKIP_BLANKS;
3765         val = htmlParseAttValue(ctxt);
3766     }
3767
3768     *value = val;
3769     return(name);
3770 }
3771
3772 /**
3773  * htmlCheckEncodingDirect:
3774  * @ctxt:  an HTML parser context
3775  * @attvalue: the attribute value
3776  *
3777  * Checks an attribute value to detect
3778  * the encoding
3779  * If a new encoding is detected the parser is switched to decode
3780  * it and pass UTF8
3781  */
3782 static void
3783 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3784
3785     if ((ctxt == NULL) || (encoding == NULL) ||
3786         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3787         return;
3788
3789     /* do not change encoding */
3790     if (ctxt->input->encoding != NULL)
3791         return;
3792
3793     if (encoding != NULL) {
3794         xmlCharEncoding enc;
3795         xmlCharEncodingHandlerPtr handler;
3796
3797         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3798
3799         if (ctxt->input->encoding != NULL)
3800             xmlFree((xmlChar *) ctxt->input->encoding);
3801         ctxt->input->encoding = xmlStrdup(encoding);
3802
3803         enc = xmlParseCharEncoding((const char *) encoding);
3804         /*
3805          * registered set of known encodings
3806          */
3807         if (enc != XML_CHAR_ENCODING_ERROR) {
3808             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3809                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3810                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3811                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3812                 (ctxt->input->buf != NULL) &&
3813                 (ctxt->input->buf->encoder == NULL)) {
3814                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3815                              "htmlCheckEncoding: wrong encoding meta\n",
3816                              NULL, NULL);
3817             } else {
3818                 xmlSwitchEncoding(ctxt, enc);
3819             }
3820             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3821         } else {
3822             /*
3823              * fallback for unknown encodings
3824              */
3825             handler = xmlFindCharEncodingHandler((const char *) encoding);
3826             if (handler != NULL) {
3827                 xmlSwitchToEncoding(ctxt, handler);
3828                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3829             } else {
3830                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3831                              "htmlCheckEncoding: unknown encoding %s\n",
3832                              encoding, NULL);
3833             }
3834         }
3835
3836         if ((ctxt->input->buf != NULL) &&
3837             (ctxt->input->buf->encoder != NULL) &&
3838             (ctxt->input->buf->raw != NULL) &&
3839             (ctxt->input->buf->buffer != NULL)) {
3840             int nbchars;
3841             int processed;
3842
3843             /*
3844              * convert as much as possible to the parser reading buffer.
3845              */
3846             processed = ctxt->input->cur - ctxt->input->base;
3847             xmlBufShrink(ctxt->input->buf->buffer, processed);
3848             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3849             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3850             if (nbchars < 0) {
3851                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3852                              "htmlCheckEncoding: encoder error\n",
3853                              NULL, NULL);
3854             }
3855         }
3856     }
3857 }
3858
3859 /**
3860  * htmlCheckEncoding:
3861  * @ctxt:  an HTML parser context
3862  * @attvalue: the attribute value
3863  *
3864  * Checks an http-equiv attribute from a Meta tag to detect
3865  * the encoding
3866  * If a new encoding is detected the parser is switched to decode
3867  * it and pass UTF8
3868  */
3869 static void
3870 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3871     const xmlChar *encoding;
3872
3873     if (!attvalue)
3874         return;
3875
3876     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3877     if (encoding != NULL) {
3878         encoding += 7;
3879     }
3880     /*
3881      * skip blank
3882      */
3883     if (encoding && IS_BLANK_CH(*encoding))
3884         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3885     if (encoding && *encoding == '=') {
3886         encoding ++;
3887         htmlCheckEncodingDirect(ctxt, encoding);
3888     }
3889 }
3890
3891 /**
3892  * htmlCheckMeta:
3893  * @ctxt:  an HTML parser context
3894  * @atts:  the attributes values
3895  *
3896  * Checks an attributes from a Meta tag
3897  */
3898 static void
3899 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3900     int i;
3901     const xmlChar *att, *value;
3902     int http = 0;
3903     const xmlChar *content = NULL;
3904
3905     if ((ctxt == NULL) || (atts == NULL))
3906         return;
3907
3908     i = 0;
3909     att = atts[i++];
3910     while (att != NULL) {
3911         value = atts[i++];
3912         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3913          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3914             http = 1;
3915         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3916             htmlCheckEncodingDirect(ctxt, value);
3917         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3918             content = value;
3919         att = atts[i++];
3920     }
3921     if ((http) && (content != NULL))
3922         htmlCheckEncoding(ctxt, content);
3923
3924 }
3925
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * parse a start of tag either for rule element or
 * EmptyElement. In both case we don't parse the tag closing chars.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * The steps are, in order: read the element name, run the auto-close
 * and implied-element checks, decide whether the tag is a misplaced
 * html/head/body to be discarded, collect the attributes, look for an
 * encoding declaration if the element is a meta, and finally push the
 * name and fire the SAX startElement callback unless the tag is
 * discarded. The attributes of a discarded tag are still parsed so that
 * the input ends up at the end of the tag.
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;
    int nbatts = 0;
    int maxatts;
    int meta = 0;
    int i;
    int discardtag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "htmlParseStartTag: context error\n", NULL, NULL);
        return -1;
    }
    if (ctxt->instate == XML_PARSER_EOF)
        return(-1);
    if (CUR != '<') return -1;
    /* consume the '<' */
    NEXT;

    /*
     * The attribute array is kept on the context and reused from one
     * start tag to the next; it is only allocated or grown on demand.
     */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                     "htmlParseStartTag: invalid element name\n",
                     NULL, NULL);
        /*
         * Dump the bogus tag like browsers do. The scan stops on the
         * '>' without consuming it.
         */
        while ((CUR != 0) && (CUR != '>') &&
               (ctxt->instate != XML_PARSER_EOF))
            NEXT;
        return -1;
    }
    /* Remember a meta element: its attributes may declare the encoding */
    if (xmlStrEqual(name, BAD_CAST"meta"))
        meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     *
     * Each discarded tag also bumps ctxt->depth.
     * NOTE(review): presumably so that the matching end tag can be
     * ignored as well - the consumer is outside this function, confirm.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <html> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
        (xmlStrEqual(name, BAD_CAST"head"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <head> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
        int indx;
        /* A body is misplaced if one is already open in the name stack */
        for (indx = 0;indx < ctxt->nameNr;indx++) {
            if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "htmlParseStartTag: misplaced <body> tag\n",
                             name, NULL);
                discardtag = 1;
                ctxt->depth++;
            }
        }
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     *
     * The loop stops on the end of input, on '>' or on '/>', none of
     * which is consumed here.
     */
    SKIP_BLANKS;
    while ((CUR != 0) &&
           (CUR != '>') &&
           ((CUR != '/') || (NXT(1) != '>'))) {
        GROW;
        attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

            /*
             * Well formedness requires at most one declaration of an attribute
             *
             * Names sit at the even indexes of atts. A duplicate is
             * reported and dropped: the first declaration is kept.
             */
            for (i = 0; i < nbatts;i += 2) {
                if (xmlStrEqual(atts[i], attname)) {
                    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
                                 "Attribute %s redefined\n", attname, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
            }

            /*
             * Add the pair to atts
             */
            if (atts == NULL) {
                maxatts = 22; /* allow for 10 attrs by default */
                atts = (const xmlChar **)
                       xmlMalloc(maxatts * sizeof(xmlChar *));
                if (atts == NULL) {
                    htmlErrMemory(ctxt, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            } else if (nbatts + 4 > maxatts) {
                /*
                 * Four slots are needed: the name, the value and the
                 * two NULL entries written after them.
                 */
                const xmlChar **n;

                maxatts *= 2;
                n = (const xmlChar **) xmlRealloc((void *) atts,
                                             maxatts * sizeof(const xmlChar *));
                if (n == NULL) {
                    htmlErrMemory(ctxt, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                atts = n;
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            }
            atts[nbatts++] = attname;
            atts[nbatts++] = attvalue;
            /* keep the array NULL terminated after every addition */
            atts[nbatts] = NULL;
            atts[nbatts + 1] = NULL;
        }
        else {
            if (attvalue != NULL)
                xmlFree(attvalue);
            /* Dump the bogus attribute string up to the next blank or
             * the end of the tag. */
            while ((CUR != 0) &&
                   !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
                   ((CUR != '/') || (NXT(1) != '>')))
                NEXT;
        }

        /*
         * Common tail of the loop body: the error paths above jump here
         * to skip the current attribute and carry on with the next one.
         */
failed:
        SKIP_BLANKS;
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
        htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     *
     * A tag without attributes is reported with a NULL array rather
     * than with the (possibly stale) cached one.
     */
    if (!discardtag) {
        htmlnamePush(ctxt, name);
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
            if (nbatts != 0)
                ctxt->sax->startElement(ctxt->userData, name, atts);
            else
                ctxt->sax->startElement(ctxt->userData, name, NULL);
        }
    }

    /*
     * Release the attribute values, held at the odd indexes. The names
     * at the even indexes are not freed here, and the array itself
     * stays on the context.
     * NOTE(review): names are presumably owned elsewhere (dictionary
     * strings) - confirm against htmlParseHTMLName.
     */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
            if (atts[i] != NULL)
                xmlFree((xmlChar *) atts[i]);
        }
    }

    return(discardtag);
}
4133
4134 /**
4135  * htmlParseEndTag:
4136  * @ctxt:  an HTML parser context
4137  *
4138  * parse an end of tag
4139  *
4140  * [42] ETag ::= '</' Name S? '>'
4141  *
4142  * With namespace
4143  *
4144  * [NS 9] ETag ::= '</' QName S? '>'
4145  *
4146  * Returns 1 if the current level should be closed.
4147  */
4148
4149 static int
4150 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4151 {
4152     const xmlChar *name;
4153     const xmlChar *oldname;
4154     int i, ret;
4155
4156     if ((CUR != '<') || (NXT(1) != '/')) {
4157         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4158                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
4159         return (0);
4160     }
4161     SKIP(2);
4162
4163     name = htmlParseHTMLName(ctxt);
4164     if (name == NULL)
4165         return (0);
4166     /*
4167      * We should definitely be at the ending "S? '>'" part
4168      */
4169     SKIP_BLANKS;
4170     if (CUR != '>') {
4171         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4172                      "End tag : expected '>'\n", NULL, NULL);
4173         /* Skip to next '>' */
4174         while ((CUR != 0) && (CUR != '>'))
4175             NEXT;
4176     }
4177     if (CUR == '>')
4178         NEXT;
4179
4180     /*
4181      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4182      * out now.
4183      */
4184     if ((ctxt->depth > 0) &&
4185         (xmlStrEqual(name, BAD_CAST "html") ||
4186          xmlStrEqual(name, BAD_CAST "body") ||
4187          xmlStrEqual(name, BAD_CAST "head"))) {
4188         ctxt->depth--;
4189         return (0);
4190     }
4191
4192     /*
4193      * If the name read is not one of the element in the parsing stack
4194      * then return, it's just an error.
4195      */
4196     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4197         if (xmlStrEqual(name, ctxt->nameTab[i]))
4198             break;
4199     }
4200     if (i < 0) {
4201         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4202                      "Unexpected end tag : %s\n", name, NULL);
4203         return (0);
4204     }
4205
4206
4207     /*
4208      * Check for auto-closure of HTML elements.
4209      */
4210
4211     htmlAutoCloseOnClose(ctxt, name);
4212
4213     /*
4214      * Well formedness constraints, opening and closing must match.
4215      * With the exception that the autoclose may have popped stuff out
4216      * of the stack.
4217      */
4218     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4219         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4220                      "Opening and ending tag mismatch: %s and %s\n",
4221                      name, ctxt->name);
4222     }
4223
4224     /*
4225      * SAX: End of Tag
4226      */
4227     oldname = ctxt->name;
4228     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4229         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4230             ctxt->sax->endElement(ctxt->userData, name);
4231         htmlNodeInfoPop(ctxt);
4232         htmlnamePop(ctxt);
4233         ret = 1;
4234     } else {
4235         ret = 0;
4236     }
4237
4238     return (ret);
4239 }
4240
4241
4242 /**
4243  * htmlParseReference:
4244  * @ctxt:  an HTML parser context
4245  *
4246  * parse and handle entity references in content,
4247  * this will end-up in a call to character() since this is either a
4248  * CharRef, or a predefined entity.
4249  */
4250 static void
4251 htmlParseReference(htmlParserCtxtPtr ctxt) {
4252     const htmlEntityDesc * ent;
4253     xmlChar out[6];
4254     const xmlChar *name;
4255     if (CUR != '&') return;
4256
4257     if (NXT(1) == '#') {
4258         unsigned int c;
4259         int bits, i = 0;
4260
4261         c = htmlParseCharRef(ctxt);
4262         if (c == 0)
4263             return;
4264
4265         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4266         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4267         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4268         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4269
4270         for ( ; bits >= 0; bits-= 6) {
4271             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4272         }
4273         out[i] = 0;
4274
4275         htmlCheckParagraph(ctxt);
4276         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4277             ctxt->sax->characters(ctxt->userData, out, i);
4278     } else {
4279         ent = htmlParseEntityRef(ctxt, &name);
4280         if (name == NULL) {
4281             htmlCheckParagraph(ctxt);
4282             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4284             return;
4285         }
4286         if ((ent == NULL) || !(ent->value > 0)) {
4287             htmlCheckParagraph(ctxt);
4288             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4289                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4291                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4292             }
4293         } else {
4294             unsigned int c;
4295             int bits, i = 0;
4296
4297             c = ent->value;
4298             if      (c <    0x80)
4299                     { out[i++]= c;                bits= -6; }
4300             else if (c <   0x800)
4301                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4302             else if (c < 0x10000)
4303                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4304             else
4305                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4306
4307             for ( ; bits >= 0; bits-= 6) {
4308                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4309             }
4310             out[i] = 0;
4311
4312             htmlCheckParagraph(ctxt);
4313             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4314                 ctxt->sax->characters(ctxt->userData, out, i);
4315         }
4316     }
4317 }
4318
4319 /**
4320  * htmlParseContent:
4321  * @ctxt:  an HTML parser context
4322  *
4323  * Parse a content: comment, sub-element, reference or text.
4324  * Kept for compatibility with old code
4325  */
4326
4327 static void
4328 htmlParseContent(htmlParserCtxtPtr ctxt) {
4329     xmlChar *currentNode;
4330     int depth;
4331     const xmlChar *name;
4332
4333     currentNode = xmlStrdup(ctxt->name);
4334     depth = ctxt->nameNr;
4335     while (1) {
4336         GROW;
4337
4338         if (ctxt->instate == XML_PARSER_EOF)
4339             break;
4340
4341         /*
4342          * Our tag or one of it's parent or children is ending.
4343          */
4344         if ((CUR == '<') && (NXT(1) == '/')) {
4345             if (htmlParseEndTag(ctxt) &&
4346                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4347                 if (currentNode != NULL)
4348                     xmlFree(currentNode);
4349                 return;
4350             }
4351             continue; /* while */
4352         }
4353
4354         else if ((CUR == '<') &&
4355                  ((IS_ASCII_LETTER(NXT(1))) ||
4356                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4357             name = htmlParseHTMLName_nonInvasive(ctxt);
4358             if (name == NULL) {
4359                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4360                          "htmlParseStartTag: invalid element name\n",
4361                          NULL, NULL);
4362                 /* Dump the bogus tag like browsers do */
4363                 while ((CUR != 0) && (CUR != '>'))
4364                     NEXT;
4365
4366                 if (currentNode != NULL)
4367                     xmlFree(currentNode);
4368                 return;
4369             }
4370
4371             if (ctxt->name != NULL) {
4372                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4373                     htmlAutoClose(ctxt, name);
4374                     continue;
4375                 }
4376             }
4377         }
4378
4379         /*
4380          * Has this node been popped out during parsing of
4381          * the next element
4382          */
4383         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4384             (!xmlStrEqual(currentNode, ctxt->name)))
4385              {
4386             if (currentNode != NULL) xmlFree(currentNode);
4387             return;
4388         }
4389
4390         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4391             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4392             /*
4393              * Handle SCRIPT/STYLE separately
4394              */
4395             htmlParseScript(ctxt);
4396         }
4397
4398         else if ((CUR == '<') && (NXT(1) == '!')) {
4399             /*
4400              * Sometimes DOCTYPE arrives in the middle of the document
4401              */
4402             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4403                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4404                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4405                 (UPP(8) == 'E')) {
4406                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4407                              "Misplaced DOCTYPE declaration\n",
4408                              BAD_CAST "DOCTYPE" , NULL);
4409                 htmlParseDocTypeDecl(ctxt);
4410             }
4411             /*
4412              * First case :  a comment
4413              */
4414             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4415                 htmlParseComment(ctxt);
4416             }
4417             else {
4418                 htmlSkipBogusComment(ctxt);
4419             }
4420         }
4421
4422         /*
4423          * Second case : a Processing Instruction.
4424          */
4425         else if ((CUR == '<') && (NXT(1) == '?')) {
4426             htmlParsePI(ctxt);
4427         }
4428
4429         /*
4430          * Third case :  a sub-element.
4431          */
4432         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4433             htmlParseElement(ctxt);
4434         }
4435         else if (CUR == '<') {
4436             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4437                 (ctxt->sax->characters != NULL))
4438                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4439             NEXT;
4440         }
4441
4442         /*
4443          * Fourth case : a reference. If if has not been resolved,
4444          *    parsing returns it's Name, create the node
4445          */
4446         else if (CUR == '&') {
4447             htmlParseReference(ctxt);
4448         }
4449
4450         /*
4451          * Fifth case : end of the resource
4452          */
4453         else if (CUR == 0) {
4454             htmlAutoCloseOnEnd(ctxt);
4455             break;
4456         }
4457
4458         /*
4459          * Last case, text. Note that References are handled directly.
4460          */
4461         else {
4462             htmlParseCharData(ctxt);
4463         }
4464         GROW;
4465     }
4466     if (currentNode != NULL) xmlFree(currentNode);
4467 }
4468
4469 /**
4470  * htmlParseElement:
4471  * @ctxt:  an HTML parser context
4472  *
4473  * parse an HTML element, this is highly recursive
4474  * this is kept for compatibility with previous code versions
4475  *
4476  * [39] element ::= EmptyElemTag | STag content ETag
4477  *
4478  * [41] Attribute ::= Name Eq AttValue
4479  */
4480
4481 void
4482 htmlParseElement(htmlParserCtxtPtr ctxt) {
4483     const xmlChar *name;
4484     xmlChar *currentNode = NULL;
4485     const htmlElemDesc * info;
4486     htmlParserNodeInfo node_info;
4487     int failed;
4488     int depth;
4489     const xmlChar *oldptr;
4490
4491     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4492         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4493                      "htmlParseElement: context error\n", NULL, NULL);
4494         return;
4495     }
4496
4497     if (ctxt->instate == XML_PARSER_EOF)
4498         return;
4499
4500     /* Capture start position */
4501     if (ctxt->record_info) {
4502         node_info.begin_pos = ctxt->input->consumed +
4503                           (CUR_PTR - ctxt->input->base);
4504         node_info.begin_line = ctxt->input->line;
4505     }
4506
4507     failed = htmlParseStartTag(ctxt);
4508     name = ctxt->name;
4509     if ((failed == -1) || (name == NULL)) {
4510         if (CUR == '>')
4511             NEXT;
4512         return;
4513     }
4514
4515     /*
4516      * Lookup the info for that element.
4517      */
4518     info = htmlTagLookup(name);
4519     if (info == NULL) {
4520         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4521                      "Tag %s invalid\n", name, NULL);
4522     }
4523
4524     /*
4525      * Check for an Empty Element labeled the XML/SGML way
4526      */
4527     if ((CUR == '/') && (NXT(1) == '>')) {
4528         SKIP(2);
4529         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4530             ctxt->sax->endElement(ctxt->userData, name);
4531         htmlnamePop(ctxt);
4532         return;
4533     }
4534
4535     if (CUR == '>') {
4536         NEXT;
4537     } else {
4538         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4539                      "Couldn't find end of Start Tag %s\n", name, NULL);
4540
4541         /*
4542          * end of parsing of this node.
4543          */
4544         if (xmlStrEqual(name, ctxt->name)) {
4545             nodePop(ctxt);
4546             htmlnamePop(ctxt);
4547         }
4548
4549         /*
4550          * Capture end position and add node
4551          */
4552         if (ctxt->record_info) {
4553            node_info.end_pos = ctxt->input->consumed +
4554                               (CUR_PTR - ctxt->input->base);
4555            node_info.end_line = ctxt->input->line;
4556            node_info.node = ctxt->node;
4557            xmlParserAddNodeInfo(ctxt, &node_info);
4558         }
4559         return;
4560     }
4561
4562     /*
4563      * Check for an Empty Element from DTD definition
4564      */
4565     if ((info != NULL) && (info->empty)) {
4566         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4567             ctxt->sax->endElement(ctxt->userData, name);
4568         htmlnamePop(ctxt);
4569         return;
4570     }
4571
4572     /*
4573      * Parse the content of the element:
4574      */
4575     currentNode = xmlStrdup(ctxt->name);
4576     depth = ctxt->nameNr;
4577     while (CUR != 0) {
4578         oldptr = ctxt->input->cur;
4579         htmlParseContent(ctxt);
4580         if (oldptr==ctxt->input->cur) break;
4581         if (ctxt->nameNr < depth) break;
4582     }
4583
4584     /*
4585      * Capture end position and add node
4586      */
4587     if ( currentNode != NULL && ctxt->record_info ) {
4588        node_info.end_pos = ctxt->input->consumed +
4589                           (CUR_PTR - ctxt->input->base);
4590        node_info.end_line = ctxt->input->line;
4591        node_info.node = ctxt->node;
4592        xmlParserAddNodeInfo(ctxt, &node_info);
4593     }
4594     if (CUR == 0) {
4595         htmlAutoCloseOnEnd(ctxt);
4596     }
4597
4598     if (currentNode != NULL)
4599         xmlFree(currentNode);
4600 }
4601
4602 static void
4603 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4604     /*
4605      * Capture end position and add node
4606      */
4607     if ( ctxt->node != NULL && ctxt->record_info ) {
4608        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4609                                 (CUR_PTR - ctxt->input->base);
4610        ctxt->nodeInfo->end_line = ctxt->input->line;
4611        ctxt->nodeInfo->node = ctxt->node;
4612        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4613        htmlNodeInfoPop(ctxt);
4614     }
4615     if (CUR == 0) {
4616        htmlAutoCloseOnEnd(ctxt);
4617     }
4618 }
4619
4620 /**
4621  * htmlParseElementInternal:
4622  * @ctxt:  an HTML parser context
4623  *
4624  * parse an HTML element, new version, non recursive
4625  *
4626  * [39] element ::= EmptyElemTag | STag content ETag
4627  *
4628  * [41] Attribute ::= Name Eq AttValue
4629  */
4630
4631 static void
4632 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4633     const xmlChar *name;
4634     const htmlElemDesc * info;
4635     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4636     int failed;
4637
4638     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4639         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4640                      "htmlParseElementInternal: context error\n", NULL, NULL);
4641         return;
4642     }
4643
4644     if (ctxt->instate == XML_PARSER_EOF)
4645         return;
4646
4647     /* Capture start position */
4648     if (ctxt->record_info) {
4649         node_info.begin_pos = ctxt->input->consumed +
4650                           (CUR_PTR - ctxt->input->base);
4651         node_info.begin_line = ctxt->input->line;
4652     }
4653
4654     failed = htmlParseStartTag(ctxt);
4655     name = ctxt->name;
4656     if ((failed == -1) || (name == NULL)) {
4657         if (CUR == '>')
4658             NEXT;
4659         return;
4660     }
4661
4662     /*
4663      * Lookup the info for that element.
4664      */
4665     info = htmlTagLookup(name);
4666     if (info == NULL) {
4667         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4668                      "Tag %s invalid\n", name, NULL);
4669     }
4670
4671     /*
4672      * Check for an Empty Element labeled the XML/SGML way
4673      */
4674     if ((CUR == '/') && (NXT(1) == '>')) {
4675         SKIP(2);
4676         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4677             ctxt->sax->endElement(ctxt->userData, name);
4678         htmlnamePop(ctxt);
4679         return;
4680     }
4681
4682     if (CUR == '>') {
4683         NEXT;
4684     } else {
4685         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4686                      "Couldn't find end of Start Tag %s\n", name, NULL);
4687
4688         /*
4689          * end of parsing of this node.
4690          */
4691         if (xmlStrEqual(name, ctxt->name)) {
4692             nodePop(ctxt);
4693             htmlnamePop(ctxt);
4694         }
4695
4696         if (ctxt->record_info)
4697             htmlNodeInfoPush(ctxt, &node_info);
4698         htmlParserFinishElementParsing(ctxt);
4699         return;
4700     }
4701
4702     /*
4703      * Check for an Empty Element from DTD definition
4704      */
4705     if ((info != NULL) && (info->empty)) {
4706         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4707             ctxt->sax->endElement(ctxt->userData, name);
4708         htmlnamePop(ctxt);
4709         return;
4710     }
4711
4712     if (ctxt->record_info)
4713         htmlNodeInfoPush(ctxt, &node_info);
4714 }
4715
4716 /**
4717  * htmlParseContentInternal:
4718  * @ctxt:  an HTML parser context
4719  *
4720  * Parse a content: comment, sub-element, reference or text.
4721  * New version for non recursive htmlParseElementInternal
4722  */
4723
4724 static void
4725 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4726     xmlChar *currentNode;
4727     int depth;
4728     const xmlChar *name;
4729
4730     currentNode = xmlStrdup(ctxt->name);
4731     depth = ctxt->nameNr;
4732     while (1) {
4733         GROW;
4734
4735         if (ctxt->instate == XML_PARSER_EOF)
4736             break;
4737
4738         /*
4739          * Our tag or one of it's parent or children is ending.
4740          */
4741         if ((CUR == '<') && (NXT(1) == '/')) {
4742             if (htmlParseEndTag(ctxt) &&
4743                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4744                 if (currentNode != NULL)
4745                     xmlFree(currentNode);
4746
4747                 currentNode = xmlStrdup(ctxt->name);
4748                 depth = ctxt->nameNr;
4749             }
4750             continue; /* while */
4751         }
4752
4753         else if ((CUR == '<') &&
4754                  ((IS_ASCII_LETTER(NXT(1))) ||
4755                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4756             name = htmlParseHTMLName_nonInvasive(ctxt);
4757             if (name == NULL) {
4758                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4759                          "htmlParseStartTag: invalid element name\n",
4760                          NULL, NULL);
4761                 /* Dump the bogus tag like browsers do */
4762                 while ((CUR == 0) && (CUR != '>'))
4763                     NEXT;
4764
4765                 htmlParserFinishElementParsing(ctxt);
4766                 if (currentNode != NULL)
4767                     xmlFree(currentNode);
4768
4769                 currentNode = xmlStrdup(ctxt->name);
4770                 depth = ctxt->nameNr;
4771                 continue;
4772             }
4773
4774             if (ctxt->name != NULL) {
4775                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4776                     htmlAutoClose(ctxt, name);
4777                     continue;
4778                 }
4779             }
4780         }
4781
4782         /*
4783          * Has this node been popped out during parsing of
4784          * the next element
4785          */
4786         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4787             (!xmlStrEqual(currentNode, ctxt->name)))
4788              {
4789             htmlParserFinishElementParsing(ctxt);
4790             if (currentNode != NULL) xmlFree(currentNode);
4791
4792             currentNode = xmlStrdup(ctxt->name);
4793             depth = ctxt->nameNr;
4794             continue;
4795         }
4796
4797         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4798             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4799             /*
4800              * Handle SCRIPT/STYLE separately
4801              */
4802             htmlParseScript(ctxt);
4803         }
4804
4805         else if ((CUR == '<') && (NXT(1) == '!')) {
4806             /*
4807              * Sometimes DOCTYPE arrives in the middle of the document
4808              */
4809             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4810                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4811                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4812                 (UPP(8) == 'E')) {
4813                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4814                              "Misplaced DOCTYPE declaration\n",
4815                              BAD_CAST "DOCTYPE" , NULL);
4816                 htmlParseDocTypeDecl(ctxt);
4817             }
4818             /*
4819              * First case :  a comment
4820              */
4821             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4822                 htmlParseComment(ctxt);
4823             }
4824             else {
4825                 htmlSkipBogusComment(ctxt);
4826             }
4827         }
4828
4829         /*
4830          * Second case : a Processing Instruction.
4831          */
4832         else if ((CUR == '<') && (NXT(1) == '?')) {
4833             htmlParsePI(ctxt);
4834         }
4835
4836         /*
4837          * Third case :  a sub-element.
4838          */
4839         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4840             htmlParseElementInternal(ctxt);
4841             if (currentNode != NULL) xmlFree(currentNode);
4842
4843             currentNode = xmlStrdup(ctxt->name);
4844             depth = ctxt->nameNr;
4845         }
4846         else if (CUR == '<') {
4847             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4848                 (ctxt->sax->characters != NULL))
4849                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4850             NEXT;
4851         }
4852
4853         /*
4854          * Fourth case : a reference. If if has not been resolved,
4855          *    parsing returns it's Name, create the node
4856          */
4857         else if (CUR == '&') {
4858             htmlParseReference(ctxt);
4859         }
4860
4861         /*
4862          * Fifth case : end of the resource
4863          */
4864         else if (CUR == 0) {
4865             htmlAutoCloseOnEnd(ctxt);
4866             break;
4867         }
4868
4869         /*
4870          * Last case, text. Note that References are handled directly.
4871          */
4872         else {
4873             htmlParseCharData(ctxt);
4874         }
4875         GROW;
4876     }
4877     if (currentNode != NULL) xmlFree(currentNode);
4878 }
4879
4880 /**
4881  * htmlParseContent:
4882  * @ctxt:  an HTML parser context
4883  *
4884  * Parse a content: comment, sub-element, reference or text.
4885  * This is the entry point when called from parser.c
4886  */
4887
4888 void
4889 __htmlParseContent(void *ctxt) {
4890     if (ctxt != NULL)
4891         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4892 }
4893
4894 /**
4895  * htmlParseDocument:
4896  * @ctxt:  an HTML parser context
4897  *
4898  * parse an HTML document (and build a tree if using the standard SAX
4899  * interface).
4900  *
4901  * Returns 0, -1 in case of error. the parser context is augmented
4902  *                as a result of the parsing.
4903  */
4904
4905 int
4906 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4907     xmlChar start[4];
4908     xmlCharEncoding enc;
4909     xmlDtdPtr dtd;
4910
4911     xmlInitParser();
4912
4913     htmlDefaultSAXHandlerInit();
4914
4915     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4916         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4917                      "htmlParseDocument: context error\n", NULL, NULL);
4918         return(XML_ERR_INTERNAL_ERROR);
4919     }
4920     ctxt->html = 1;
4921     ctxt->linenumbers = 1;
4922     GROW;
4923     /*
4924      * SAX: beginning of the document processing.
4925      */
4926     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4927         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4928
4929     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4930         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4931         /*
4932          * Get the 4 first bytes and decode the charset
4933          * if enc != XML_CHAR_ENCODING_NONE
4934          * plug some encoding conversion routines.
4935          */
4936         start[0] = RAW;
4937         start[1] = NXT(1);
4938         start[2] = NXT(2);
4939         start[3] = NXT(3);
4940         enc = xmlDetectCharEncoding(&start[0], 4);
4941         if (enc != XML_CHAR_ENCODING_NONE) {
4942             xmlSwitchEncoding(ctxt, enc);
4943         }
4944     }
4945
4946     /*
4947      * Wipe out everything which is before the first '<'
4948      */
4949     SKIP_BLANKS;
4950     if (CUR == 0) {
4951         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4952                      "Document is empty\n", NULL, NULL);
4953     }
4954
4955     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4956         ctxt->sax->startDocument(ctxt->userData);
4957
4958
4959     /*
4960      * Parse possible comments and PIs before any content
4961      */
4962     while (((CUR == '<') && (NXT(1) == '!') &&
4963             (NXT(2) == '-') && (NXT(3) == '-')) ||
4964            ((CUR == '<') && (NXT(1) == '?'))) {
4965         htmlParseComment(ctxt);
4966         htmlParsePI(ctxt);
4967         SKIP_BLANKS;
4968     }
4969
4970
4971     /*
4972      * Then possibly doc type declaration(s) and more Misc
4973      * (doctypedecl Misc*)?
4974      */
4975     if ((CUR == '<') && (NXT(1) == '!') &&
4976         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4977         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4978         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4979         (UPP(8) == 'E')) {
4980         htmlParseDocTypeDecl(ctxt);
4981     }
4982     SKIP_BLANKS;
4983
4984     /*
4985      * Parse possible comments and PIs before any content
4986      */
4987     while (((CUR == '<') && (NXT(1) == '!') &&
4988             (NXT(2) == '-') && (NXT(3) == '-')) ||
4989            ((CUR == '<') && (NXT(1) == '?'))) {
4990         htmlParseComment(ctxt);
4991         htmlParsePI(ctxt);
4992         SKIP_BLANKS;
4993     }
4994
4995     /*
4996      * Time to start parsing the tree itself
4997      */
4998     htmlParseContentInternal(ctxt);
4999
5000     /*
5001      * autoclose
5002      */
5003     if (CUR == 0)
5004         htmlAutoCloseOnEnd(ctxt);
5005
5006
5007     /*
5008      * SAX: end of the document processing.
5009      */
5010     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5011         ctxt->sax->endDocument(ctxt->userData);
5012
5013     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5014         dtd = xmlGetIntSubset(ctxt->myDoc);
5015         if (dtd == NULL)
5016             ctxt->myDoc->intSubset =
5017                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5018                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5019                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5020     }
5021     if (! ctxt->wellFormed) return(-1);
5022     return(0);
5023 }
5024
5025
5026 /************************************************************************
5027  *                                                                      *
5028  *                      Parser contexts handling                        *
5029  *                                                                      *
5030  ************************************************************************/
5031
5032 /**
5033  * htmlInitParserCtxt:
5034  * @ctxt:  an HTML parser context
5035  *
5036  * Initialize a parser context
5037  *
5038  * Returns 0 in case of success and -1 in case of error
5039  */
5040
5041 static int
5042 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5043 {
5044     htmlSAXHandler *sax;
5045
5046     if (ctxt == NULL) return(-1);
5047     memset(ctxt, 0, sizeof(htmlParserCtxt));
5048
5049     ctxt->dict = xmlDictCreate();
5050     if (ctxt->dict == NULL) {
5051         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5052         return(-1);
5053     }
5054     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5055     if (sax == NULL) {
5056         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057         return(-1);
5058     }
5059     memset(sax, 0, sizeof(htmlSAXHandler));
5060
5061     /* Allocate the Input stack */
5062     ctxt->inputTab = (htmlParserInputPtr *)
5063                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5064     if (ctxt->inputTab == NULL) {
5065         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5066         ctxt->inputNr = 0;
5067         ctxt->inputMax = 0;
5068         ctxt->input = NULL;
5069         return(-1);
5070     }
5071     ctxt->inputNr = 0;
5072     ctxt->inputMax = 5;
5073     ctxt->input = NULL;
5074     ctxt->version = NULL;
5075     ctxt->encoding = NULL;
5076     ctxt->standalone = -1;
5077     ctxt->instate = XML_PARSER_START;
5078
5079     /* Allocate the Node stack */
5080     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5081     if (ctxt->nodeTab == NULL) {
5082         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5083         ctxt->nodeNr = 0;
5084         ctxt->nodeMax = 0;
5085         ctxt->node = NULL;
5086         ctxt->inputNr = 0;
5087         ctxt->inputMax = 0;
5088         ctxt->input = NULL;
5089         return(-1);
5090     }
5091     ctxt->nodeNr = 0;
5092     ctxt->nodeMax = 10;
5093     ctxt->node = NULL;
5094
5095     /* Allocate the Name stack */
5096     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5097     if (ctxt->nameTab == NULL) {
5098         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5099         ctxt->nameNr = 0;
5100         ctxt->nameMax = 0;
5101         ctxt->name = NULL;
5102         ctxt->nodeNr = 0;
5103         ctxt->nodeMax = 0;
5104         ctxt->node = NULL;
5105         ctxt->inputNr = 0;
5106         ctxt->inputMax = 0;
5107         ctxt->input = NULL;
5108         return(-1);
5109     }
5110     ctxt->nameNr = 0;
5111     ctxt->nameMax = 10;
5112     ctxt->name = NULL;
5113
5114     ctxt->nodeInfoTab = NULL;
5115     ctxt->nodeInfoNr  = 0;
5116     ctxt->nodeInfoMax = 0;
5117
5118     ctxt->sax = sax;
5119     xmlSAX2InitHtmlDefaultSAXHandler(sax);
5120
5121     ctxt->userData = ctxt;
5122     ctxt->myDoc = NULL;
5123     ctxt->wellFormed = 1;
5124     ctxt->replaceEntities = 0;
5125     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5126     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5127     ctxt->html = 1;
5128     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5129     ctxt->vctxt.userData = ctxt;
5130     ctxt->vctxt.error = xmlParserValidityError;
5131     ctxt->vctxt.warning = xmlParserValidityWarning;
5132     ctxt->record_info = 0;
5133     ctxt->validate = 0;
5134     ctxt->checkIndex = 0;
5135     ctxt->catalogs = NULL;
5136     xmlInitNodeInfoSeq(&ctxt->node_seq);
5137     return(0);
5138 }
5139
5140 /**
5141  * htmlFreeParserCtxt:
5142  * @ctxt:  an HTML parser context
5143  *
5144  * Free all the memory used by a parser context. However the parsed
5145  * document in ctxt->myDoc is not freed.
5146  */
5147
5148 void
5149 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5150 {
5151     xmlFreeParserCtxt(ctxt);
5152 }
5153
5154 /**
5155  * htmlNewParserCtxt:
5156  *
5157  * Allocate and initialize a new parser context.
5158  *
5159  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5160  */
5161
5162 htmlParserCtxtPtr
5163 htmlNewParserCtxt(void)
5164 {
5165     xmlParserCtxtPtr ctxt;
5166
5167     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5168     if (ctxt == NULL) {
5169         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5170         return(NULL);
5171     }
5172     memset(ctxt, 0, sizeof(xmlParserCtxt));
5173     if (htmlInitParserCtxt(ctxt) < 0) {
5174         htmlFreeParserCtxt(ctxt);
5175         return(NULL);
5176     }
5177     return(ctxt);
5178 }
5179
5180 /**
5181  * htmlCreateMemoryParserCtxt:
5182  * @buffer:  a pointer to a char array
5183  * @size:  the size of the array
5184  *
5185  * Create a parser context for an HTML in-memory document.
5186  *
5187  * Returns the new parser context or NULL
5188  */
5189 htmlParserCtxtPtr
5190 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5191     xmlParserCtxtPtr ctxt;
5192     xmlParserInputPtr input;
5193     xmlParserInputBufferPtr buf;
5194
5195     if (buffer == NULL)
5196         return(NULL);
5197     if (size <= 0)
5198         return(NULL);
5199
5200     ctxt = htmlNewParserCtxt();
5201     if (ctxt == NULL)
5202         return(NULL);
5203
5204     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5205     if (buf == NULL) return(NULL);
5206
5207     input = xmlNewInputStream(ctxt);
5208     if (input == NULL) {
5209         xmlFreeParserInputBuffer(buf);
5210         xmlFreeParserCtxt(ctxt);
5211         return(NULL);
5212     }
5213
5214     input->filename = NULL;
5215     input->buf = buf;
5216     xmlBufResetInput(buf->buffer, input);
5217
5218     inputPush(ctxt, input);
5219     return(ctxt);
5220 }
5221
5222 /**
5223  * htmlCreateDocParserCtxt:
5224  * @cur:  a pointer to an array of xmlChar
5225  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5226  *
5227  * Create a parser context for an HTML document.
5228  *
5229  * TODO: check the need to add encoding handling there
5230  *
5231  * Returns the new parser context or NULL
5232  */
5233 static htmlParserCtxtPtr
5234 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5235     int len;
5236     htmlParserCtxtPtr ctxt;
5237
5238     if (cur == NULL)
5239         return(NULL);
5240     len = xmlStrlen(cur);
5241     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5242     if (ctxt == NULL)
5243         return(NULL);
5244
5245     if (encoding != NULL) {
5246         xmlCharEncoding enc;
5247         xmlCharEncodingHandlerPtr handler;
5248
5249         if (ctxt->input->encoding != NULL)
5250             xmlFree((xmlChar *) ctxt->input->encoding);
5251         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5252
5253         enc = xmlParseCharEncoding(encoding);
5254         /*
5255          * registered set of known encodings
5256          */
5257         if (enc != XML_CHAR_ENCODING_ERROR) {
5258             xmlSwitchEncoding(ctxt, enc);
5259             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5260                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5261                              "Unsupported encoding %s\n",
5262                              (const xmlChar *) encoding, NULL);
5263             }
5264         } else {
5265             /*
5266              * fallback for unknown encodings
5267              */
5268             handler = xmlFindCharEncodingHandler((const char *) encoding);
5269             if (handler != NULL) {
5270                 xmlSwitchToEncoding(ctxt, handler);
5271             } else {
5272                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5273                              "Unsupported encoding %s\n",
5274                              (const xmlChar *) encoding, NULL);
5275             }
5276         }
5277     }
5278     return(ctxt);
5279 }
5280
5281 #ifdef LIBXML_PUSH_ENABLED
5282 /************************************************************************
5283  *                                                                      *
5284  *      Progressive parsing interfaces                          *
5285  *                                                                      *
5286  ************************************************************************/
5287
5288 /**
5289  * htmlParseLookupSequence:
5290  * @ctxt:  an HTML parser context
5291  * @first:  the first char to lookup
5292  * @next:  the next char to lookup or zero
5293  * @third:  the next char to lookup or zero
5294  * @ignoreattrval: skip over attribute values
5295  *
5296  * Try to find if a sequence (first, next, third) or  just (first next) or
5297  * (first) is available in the input stream.
5298  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5299  * to avoid rescanning sequences of bytes, it DOES change the state of the
5300  * parser, do not use liberally.
5301  * This is basically similar to xmlParseLookupSequence()
5302  *
5303  * Returns the index to the current parsing point if the full sequence
5304  *      is available, -1 otherwise.
5305  */
5306 static int
5307 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5308                         xmlChar next, xmlChar third, int ignoreattrval)
5309 {
5310     int base, len;
5311     htmlParserInputPtr in;
5312     const xmlChar *buf;
5313     int invalue = 0;
5314     char valdellim = 0x0;
5315
5316     in = ctxt->input;
5317     if (in == NULL)
5318         return (-1);
5319
5320     base = in->cur - in->base;
5321     if (base < 0)
5322         return (-1);
5323
5324     if (ctxt->checkIndex > base) {
5325         base = ctxt->checkIndex;
5326         /* Abuse hasPErefs member to restore current state. */
5327         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5328     }
5329
5330     if (in->buf == NULL) {
5331         buf = in->base;
5332         len = in->length;
5333     } else {
5334         buf = xmlBufContent(in->buf->buffer);
5335         len = xmlBufUse(in->buf->buffer);
5336     }
5337
5338     /* take into account the sequence length */
5339     if (third)
5340         len -= 2;
5341     else if (next)
5342         len--;
5343     for (; base < len; base++) {
5344         if (ignoreattrval) {
5345             if (buf[base] == '"' || buf[base] == '\'') {
5346                 if (invalue) {
5347                     if (buf[base] == valdellim) {
5348                         invalue = 0;
5349                         continue;
5350                     }
5351                 } else {
5352                     valdellim = buf[base];
5353                     invalue = 1;
5354                     continue;
5355                 }
5356             } else if (invalue) {
5357                 continue;
5358             }
5359         }
5360         if (buf[base] == first) {
5361             if (third != 0) {
5362                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5363                     continue;
5364             } else if (next != 0) {
5365                 if (buf[base + 1] != next)
5366                     continue;
5367             }
5368             ctxt->checkIndex = 0;
5369 #ifdef DEBUG_PUSH
5370             if (next == 0)
5371                 xmlGenericError(xmlGenericErrorContext,
5372                                 "HPP: lookup '%c' found at %d\n",
5373                                 first, base);
5374             else if (third == 0)
5375                 xmlGenericError(xmlGenericErrorContext,
5376                                 "HPP: lookup '%c%c' found at %d\n",
5377                                 first, next, base);
5378             else
5379                 xmlGenericError(xmlGenericErrorContext,
5380                                 "HPP: lookup '%c%c%c' found at %d\n",
5381                                 first, next, third, base);
5382 #endif
5383             return (base - (in->cur - in->base));
5384         }
5385     }
5386     ctxt->checkIndex = base;
5387     /* Abuse hasPErefs member to track current state. */
5388     if (invalue)
5389         ctxt->hasPErefs |= 1;
5390     else
5391         ctxt->hasPErefs &= ~1;
5392 #ifdef DEBUG_PUSH
5393     if (next == 0)
5394         xmlGenericError(xmlGenericErrorContext,
5395                         "HPP: lookup '%c' failed\n", first);
5396     else if (third == 0)
5397         xmlGenericError(xmlGenericErrorContext,
5398                         "HPP: lookup '%c%c' failed\n", first, next);
5399     else
5400         xmlGenericError(xmlGenericErrorContext,
5401                         "HPP: lookup '%c%c%c' failed\n", first, next,
5402                         third);
5403 #endif
5404     return (-1);
5405 }
5406
5407 /**
5408  * htmlParseLookupCommentEnd:
5409  * @ctxt: an HTML parser context
5410  *
5411  * Try to find a comment end tag in the input stream
5412  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5413  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5414  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5415  * to avoid rescanning sequences of bytes, it DOES change the state of the
5416  * parser, do not use liberally.
5417  * This wraps to htmlParseLookupSequence()
5418  *
5419  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5420  */
5421 static int
5422 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5423 {
5424     int mark = 0;
5425     int cur = CUR_PTR - BASE_PTR;
5426
5427     while (mark >= 0) {
5428         mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5429         if ((mark < 0) ||
5430             (NXT(mark+2) == '>') ||
5431             ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5432             return mark;
5433         }
5434         ctxt->checkIndex = cur + mark + 1;
5435     }
5436     return mark;
5437 }
5438
5439
5440 /**
5441  * htmlParseTryOrFinish:
5442  * @ctxt:  an HTML parser context
5443  * @terminate:  last chunk indicator
5444  *
5445  * Try to progress on parsing
5446  *
5447  * Returns zero if no parsing was possible
5448  */
5449 static int
5450 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5451     int ret = 0;
5452     htmlParserInputPtr in;
5453     ptrdiff_t avail = 0;
5454     xmlChar cur, next;
5455
5456     htmlParserNodeInfo node_info;
5457
5458 #ifdef DEBUG_PUSH
5459     switch (ctxt->instate) {
5460         case XML_PARSER_EOF:
5461             xmlGenericError(xmlGenericErrorContext,
5462                     "HPP: try EOF\n"); break;
5463         case XML_PARSER_START:
5464             xmlGenericError(xmlGenericErrorContext,
5465                     "HPP: try START\n"); break;
5466         case XML_PARSER_MISC:
5467             xmlGenericError(xmlGenericErrorContext,
5468                     "HPP: try MISC\n");break;
5469         case XML_PARSER_COMMENT:
5470             xmlGenericError(xmlGenericErrorContext,
5471                     "HPP: try COMMENT\n");break;
5472         case XML_PARSER_PROLOG:
5473             xmlGenericError(xmlGenericErrorContext,
5474                     "HPP: try PROLOG\n");break;
5475         case XML_PARSER_START_TAG:
5476             xmlGenericError(xmlGenericErrorContext,
5477                     "HPP: try START_TAG\n");break;
5478         case XML_PARSER_CONTENT:
5479             xmlGenericError(xmlGenericErrorContext,
5480                     "HPP: try CONTENT\n");break;
5481         case XML_PARSER_CDATA_SECTION:
5482             xmlGenericError(xmlGenericErrorContext,
5483                     "HPP: try CDATA_SECTION\n");break;
5484         case XML_PARSER_END_TAG:
5485             xmlGenericError(xmlGenericErrorContext,
5486                     "HPP: try END_TAG\n");break;
5487         case XML_PARSER_ENTITY_DECL:
5488             xmlGenericError(xmlGenericErrorContext,
5489                     "HPP: try ENTITY_DECL\n");break;
5490         case XML_PARSER_ENTITY_VALUE:
5491             xmlGenericError(xmlGenericErrorContext,
5492                     "HPP: try ENTITY_VALUE\n");break;
5493         case XML_PARSER_ATTRIBUTE_VALUE:
5494             xmlGenericError(xmlGenericErrorContext,
5495                     "HPP: try ATTRIBUTE_VALUE\n");break;
5496         case XML_PARSER_DTD:
5497             xmlGenericError(xmlGenericErrorContext,
5498                     "HPP: try DTD\n");break;
5499         case XML_PARSER_EPILOG:
5500             xmlGenericError(xmlGenericErrorContext,
5501                     "HPP: try EPILOG\n");break;
5502         case XML_PARSER_PI:
5503             xmlGenericError(xmlGenericErrorContext,
5504                     "HPP: try PI\n");break;
5505         case XML_PARSER_SYSTEM_LITERAL:
5506             xmlGenericError(xmlGenericErrorContext,
5507                     "HPP: try SYSTEM_LITERAL\n");break;
5508     }
5509 #endif
5510
5511     while (1) {
5512
5513         in = ctxt->input;
5514         if (in == NULL) break;
5515         if (in->buf == NULL)
5516             avail = in->length - (in->cur - in->base);
5517         else
5518             avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5519                     (in->cur - in->base);
5520         if ((avail == 0) && (terminate)) {
5521             htmlAutoCloseOnEnd(ctxt);
5522             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5523                 /*
5524                  * SAX: end of the document processing.
5525                  */
5526                 ctxt->instate = XML_PARSER_EOF;
5527                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5528                     ctxt->sax->endDocument(ctxt->userData);
5529             }
5530         }
5531         if (avail < 1)
5532             goto done;
5533         /*
5534          * This is done to make progress and avoid an infinite loop
5535          * if a parsing attempt was aborted by hitting a NUL byte. After
5536          * changing htmlCurrentChar, this probably isn't necessary anymore.
5537          * We should consider removing this check.
5538          */
5539         cur = in->cur[0];
5540         if (cur == 0) {
5541             SKIP(1);
5542             continue;
5543         }
5544
5545         switch (ctxt->instate) {
5546             case XML_PARSER_EOF:
5547                 /*
5548                  * Document parsing is done !
5549                  */
5550                 goto done;
5551             case XML_PARSER_START:
5552                 /*
5553                  * Very first chars read from the document flow.
5554                  */
5555                 cur = in->cur[0];
5556                 if (IS_BLANK_CH(cur)) {
5557                     SKIP_BLANKS;
5558                     if (in->buf == NULL)
5559                         avail = in->length - (in->cur - in->base);
5560                     else
5561                         avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5562                                 (in->cur - in->base);
5563                 }
5564                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5565                     ctxt->sax->setDocumentLocator(ctxt->userData,
5566                                                   &xmlDefaultSAXLocator);
5567                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5568                     (!ctxt->disableSAX))
5569                     ctxt->sax->startDocument(ctxt->userData);
5570
5571                 cur = in->cur[0];
5572                 next = in->cur[1];
5573                 if ((cur == '<') && (next == '!') &&
5574                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5575                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5576                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5577                     (UPP(8) == 'E')) {
5578                     if ((!terminate) &&
5579                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5580                         goto done;
5581 #ifdef DEBUG_PUSH
5582                     xmlGenericError(xmlGenericErrorContext,
5583                             "HPP: Parsing internal subset\n");
5584 #endif
5585                     htmlParseDocTypeDecl(ctxt);
5586                     ctxt->instate = XML_PARSER_PROLOG;
5587 #ifdef DEBUG_PUSH
5588                     xmlGenericError(xmlGenericErrorContext,
5589                             "HPP: entering PROLOG\n");
5590 #endif
5591                 } else {
5592                     ctxt->instate = XML_PARSER_MISC;
5593 #ifdef DEBUG_PUSH
5594                     xmlGenericError(xmlGenericErrorContext,
5595                             "HPP: entering MISC\n");
5596 #endif
5597                 }
5598                 break;
5599             case XML_PARSER_MISC:
5600                 SKIP_BLANKS;
5601                 if (in->buf == NULL)
5602                     avail = in->length - (in->cur - in->base);
5603                 else
5604                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5605                             (in->cur - in->base);
5606                 /*
5607                  * no chars in buffer
5608                  */
5609                 if (avail < 1)
5610                     goto done;
5611                 /*
5612                  * not enough chars in buffer
5613                  */
5614                 if (avail < 2) {
5615                     if (!terminate)
5616                         goto done;
5617                     else
5618                         next = ' ';
5619                 } else {
5620                     next = in->cur[1];
5621                 }
5622                 cur = in->cur[0];
5623                 if ((cur == '<') && (next == '!') &&
5624                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5625                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5626                         goto done;
5627 #ifdef DEBUG_PUSH
5628                     xmlGenericError(xmlGenericErrorContext,
5629                             "HPP: Parsing Comment\n");
5630 #endif
5631                     htmlParseComment(ctxt);
5632                     ctxt->instate = XML_PARSER_MISC;
5633                 } else if ((cur == '<') && (next == '?')) {
5634                     if ((!terminate) &&
5635                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5636                         goto done;
5637 #ifdef DEBUG_PUSH
5638                     xmlGenericError(xmlGenericErrorContext,
5639                             "HPP: Parsing PI\n");
5640 #endif
5641                     htmlParsePI(ctxt);
5642                     ctxt->instate = XML_PARSER_MISC;
5643                 } else if ((cur == '<') && (next == '!') &&
5644                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5645                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5646                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5647                     (UPP(8) == 'E')) {
5648                     if ((!terminate) &&
5649                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5650                         goto done;
5651 #ifdef DEBUG_PUSH
5652                     xmlGenericError(xmlGenericErrorContext,
5653                             "HPP: Parsing internal subset\n");
5654 #endif
5655                     htmlParseDocTypeDecl(ctxt);
5656                     ctxt->instate = XML_PARSER_PROLOG;
5657 #ifdef DEBUG_PUSH
5658                     xmlGenericError(xmlGenericErrorContext,
5659                             "HPP: entering PROLOG\n");
5660 #endif
5661                 } else if ((cur == '<') && (next == '!') &&
5662                            (avail < 9)) {
5663                     goto done;
5664                 } else {
5665                     ctxt->instate = XML_PARSER_CONTENT;
5666 #ifdef DEBUG_PUSH
5667                     xmlGenericError(xmlGenericErrorContext,
5668                             "HPP: entering START_TAG\n");
5669 #endif
5670                 }
5671                 break;
5672             case XML_PARSER_PROLOG:
5673                 SKIP_BLANKS;
5674                 if (in->buf == NULL)
5675                     avail = in->length - (in->cur - in->base);
5676                 else
5677                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5678                             (in->cur - in->base);
5679                 if (avail < 2)
5680                     goto done;
5681                 cur = in->cur[0];
5682                 next = in->cur[1];
5683                 if ((cur == '<') && (next == '!') &&
5684                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5685                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5686                         goto done;
5687 #ifdef DEBUG_PUSH
5688                     xmlGenericError(xmlGenericErrorContext,
5689                             "HPP: Parsing Comment\n");
5690 #endif
5691                     htmlParseComment(ctxt);
5692                     ctxt->instate = XML_PARSER_PROLOG;
5693                 } else if ((cur == '<') && (next == '?')) {
5694                     if ((!terminate) &&
5695                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5696                         goto done;
5697 #ifdef DEBUG_PUSH
5698                     xmlGenericError(xmlGenericErrorContext,
5699                             "HPP: Parsing PI\n");
5700 #endif
5701                     htmlParsePI(ctxt);
5702                     ctxt->instate = XML_PARSER_PROLOG;
5703                 } else if ((cur == '<') && (next == '!') &&
5704                            (avail < 4)) {
5705                     goto done;
5706                 } else {
5707                     ctxt->instate = XML_PARSER_CONTENT;
5708 #ifdef DEBUG_PUSH
5709                     xmlGenericError(xmlGenericErrorContext,
5710                             "HPP: entering START_TAG\n");
5711 #endif
5712                 }
5713                 break;
5714             case XML_PARSER_EPILOG:
5715                 if (in->buf == NULL)
5716                     avail = in->length - (in->cur - in->base);
5717                 else
5718                     avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5719                             (in->cur - in->base);
5720                 if (avail < 1)
5721                     goto done;
5722                 cur = in->cur[0];
5723                 if (IS_BLANK_CH(cur)) {
5724                     htmlParseCharData(ctxt);
5725                     goto done;
5726                 }
5727                 if (avail < 2)
5728                     goto done;
5729                 next = in->cur[1];
5730                 if ((cur == '<') && (next == '!') &&
5731                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5732                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5733                         goto done;
5734 #ifdef DEBUG_PUSH
5735                     xmlGenericError(xmlGenericErrorContext,
5736                             "HPP: Parsing Comment\n");
5737 #endif
5738                     htmlParseComment(ctxt);
5739                     ctxt->instate = XML_PARSER_EPILOG;
5740                 } else if ((cur == '<') && (next == '?')) {
5741                     if ((!terminate) &&
5742                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5743                         goto done;
5744 #ifdef DEBUG_PUSH
5745                     xmlGenericError(xmlGenericErrorContext,
5746                             "HPP: Parsing PI\n");
5747 #endif
5748                     htmlParsePI(ctxt);
5749                     ctxt->instate = XML_PARSER_EPILOG;
5750                 } else if ((cur == '<') && (next == '!') &&
5751                            (avail < 4)) {
5752                     goto done;
5753                 } else {
5754                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5755                     ctxt->wellFormed = 0;
5756                     ctxt->instate = XML_PARSER_EOF;
5757 #ifdef DEBUG_PUSH
5758                     xmlGenericError(xmlGenericErrorContext,
5759                             "HPP: entering EOF\n");
5760 #endif
5761                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5762                         ctxt->sax->endDocument(ctxt->userData);
5763                     goto done;
5764                 }
5765                 break;
5766             case XML_PARSER_START_TAG: {
5767                 const xmlChar *name;
5768                 int failed;
5769                 const htmlElemDesc * info;
5770
5771                 /*
5772                  * no chars in buffer
5773                  */
5774                 if (avail < 1)
5775                     goto done;
5776                 /*
5777                  * not enough chars in buffer
5778                  */
5779                 if (avail < 2) {
5780                     if (!terminate)
5781                         goto done;
5782                     else
5783                         next = ' ';
5784                 } else {
5785                     next = in->cur[1];
5786                 }
5787                 cur = in->cur[0];
5788                 if (cur != '<') {
5789                     ctxt->instate = XML_PARSER_CONTENT;
5790 #ifdef DEBUG_PUSH
5791                     xmlGenericError(xmlGenericErrorContext,
5792                             "HPP: entering CONTENT\n");
5793 #endif
5794                     break;
5795                 }
5796                 if (next == '/') {
5797                     ctxt->instate = XML_PARSER_END_TAG;
5798                     ctxt->checkIndex = 0;
5799 #ifdef DEBUG_PUSH
5800                     xmlGenericError(xmlGenericErrorContext,
5801                             "HPP: entering END_TAG\n");
5802 #endif
5803                     break;
5804                 }
5805                 if ((!terminate) &&
5806                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5807                     goto done;
5808
5809                 /* Capture start position */
5810                 if (ctxt->record_info) {
5811                      node_info.begin_pos = ctxt->input->consumed +
5812                                         (CUR_PTR - ctxt->input->base);
5813                      node_info.begin_line = ctxt->input->line;
5814                 }
5815
5816
5817                 failed = htmlParseStartTag(ctxt);
5818                 name = ctxt->name;
5819                 if ((failed == -1) ||
5820                     (name == NULL)) {
5821                     if (CUR == '>')
5822                         NEXT;
5823                     break;
5824                 }
5825
5826                 /*
5827                  * Lookup the info for that element.
5828                  */
5829                 info = htmlTagLookup(name);
5830                 if (info == NULL) {
5831                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5832                                  "Tag %s invalid\n", name, NULL);
5833                 }
5834
5835                 /*
5836                  * Check for an Empty Element labeled the XML/SGML way
5837                  */
5838                 if ((CUR == '/') && (NXT(1) == '>')) {
5839                     SKIP(2);
5840                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5841                         ctxt->sax->endElement(ctxt->userData, name);
5842                     htmlnamePop(ctxt);
5843                     ctxt->instate = XML_PARSER_CONTENT;
5844 #ifdef DEBUG_PUSH
5845                     xmlGenericError(xmlGenericErrorContext,
5846                             "HPP: entering CONTENT\n");
5847 #endif
5848                     break;
5849                 }
5850
5851                 if (CUR == '>') {
5852                     NEXT;
5853                 } else {
5854                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5855                                  "Couldn't find end of Start Tag %s\n",
5856                                  name, NULL);
5857
5858                     /*
5859                      * end of parsing of this node.
5860                      */
5861                     if (xmlStrEqual(name, ctxt->name)) {
5862                         nodePop(ctxt);
5863                         htmlnamePop(ctxt);
5864                     }
5865
5866                     if (ctxt->record_info)
5867                         htmlNodeInfoPush(ctxt, &node_info);
5868
5869                     ctxt->instate = XML_PARSER_CONTENT;
5870 #ifdef DEBUG_PUSH
5871                     xmlGenericError(xmlGenericErrorContext,
5872                             "HPP: entering CONTENT\n");
5873 #endif
5874                     break;
5875                 }
5876
5877                 /*
5878                  * Check for an Empty Element from DTD definition
5879                  */
5880                 if ((info != NULL) && (info->empty)) {
5881                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5882                         ctxt->sax->endElement(ctxt->userData, name);
5883                     htmlnamePop(ctxt);
5884                 }
5885
5886                 if (ctxt->record_info)
5887                     htmlNodeInfoPush(ctxt, &node_info);
5888
5889                 ctxt->instate = XML_PARSER_CONTENT;
5890 #ifdef DEBUG_PUSH
5891                 xmlGenericError(xmlGenericErrorContext,
5892                         "HPP: entering CONTENT\n");
5893 #endif
5894                 break;
5895             }
5896             case XML_PARSER_CONTENT: {
5897                 xmlChar chr[2] = { 0, 0 };
5898
5899                 /*
5900                  * Handle preparsed entities and charRef
5901                  */
5902                 if (ctxt->token != 0) {
5903                     chr[0] = (xmlChar) ctxt->token;
5904                     htmlCheckParagraph(ctxt);
5905                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5906                         ctxt->sax->characters(ctxt->userData, chr, 1);
5907                     ctxt->token = 0;
5908                     ctxt->checkIndex = 0;
5909                 }
5910                 if ((avail == 1) && (terminate)) {
5911                     cur = in->cur[0];
5912                     if ((cur != '<') && (cur != '&')) {
5913                         if (ctxt->sax != NULL) {
5914                             chr[0] = cur;
5915                             if (IS_BLANK_CH(cur)) {
5916                                 if (ctxt->keepBlanks) {
5917                                     if (ctxt->sax->characters != NULL)
5918                                         ctxt->sax->characters(
5919                                                 ctxt->userData, chr, 1);
5920                                 } else {
5921                                     if (ctxt->sax->ignorableWhitespace != NULL)
5922                                         ctxt->sax->ignorableWhitespace(
5923                                                 ctxt->userData, chr, 1);
5924                                 }
5925                             } else {
5926                                 htmlCheckParagraph(ctxt);
5927                                 if (ctxt->sax->characters != NULL)
5928                                     ctxt->sax->characters(
5929                                             ctxt->userData, chr, 1);
5930                             }
5931                         }
5932                         ctxt->token = 0;
5933                         ctxt->checkIndex = 0;
5934                         in->cur++;
5935                         break;
5936                     }
5937                 }
5938                 if (avail < 2)
5939                     goto done;
5940                 cur = in->cur[0];
5941                 next = in->cur[1];
5942                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5943                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5944                     /*
5945                      * Handle SCRIPT/STYLE separately
5946                      */
5947                     if (!terminate) {
5948                         int idx;
5949                         xmlChar val;
5950
5951                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5952                         if (idx < 0)
5953                             goto done;
5954                         val = in->cur[idx + 2];
5955                         if (val == 0) /* bad cut of input */
5956                             goto done;
5957                     }
5958                     htmlParseScript(ctxt);
5959                     if ((cur == '<') && (next == '/')) {
5960                         ctxt->instate = XML_PARSER_END_TAG;
5961                         ctxt->checkIndex = 0;
5962 #ifdef DEBUG_PUSH
5963                         xmlGenericError(xmlGenericErrorContext,
5964                                 "HPP: entering END_TAG\n");
5965 #endif
5966                         break;
5967                     }
5968                 } else if ((cur == '<') && (next == '!')) {
5969                     /*
5970                      * Sometimes DOCTYPE arrives in the middle of the document
5971                      */
5972                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5973                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5974                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5975                         (UPP(8) == 'E')) {
5976                         if ((!terminate) &&
5977                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5978                             goto done;
5979                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5980                                      "Misplaced DOCTYPE declaration\n",
5981                                      BAD_CAST "DOCTYPE" , NULL);
5982                         htmlParseDocTypeDecl(ctxt);
5983                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5984                         if ((!terminate) &&
5985                             (htmlParseLookupCommentEnd(ctxt) < 0))
5986                             goto done;
5987 #ifdef DEBUG_PUSH
5988                         xmlGenericError(xmlGenericErrorContext,
5989                                 "HPP: Parsing Comment\n");
5990 #endif
5991                         htmlParseComment(ctxt);
5992                         ctxt->instate = XML_PARSER_CONTENT;
5993                     } else {
5994                         if ((!terminate) &&
5995                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5996                             goto done;
5997                         htmlSkipBogusComment(ctxt);
5998                     }
5999                 } else if ((cur == '<') && (next == '?')) {
6000                     if ((!terminate) &&
6001                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6002                         goto done;
6003 #ifdef DEBUG_PUSH
6004                     xmlGenericError(xmlGenericErrorContext,
6005                             "HPP: Parsing PI\n");
6006 #endif
6007                     htmlParsePI(ctxt);
6008                     ctxt->instate = XML_PARSER_CONTENT;
6009                 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
6010                     goto done;
6011                 } else if ((cur == '<') && (next == '/')) {
6012                     ctxt->instate = XML_PARSER_END_TAG;
6013                     ctxt->checkIndex = 0;
6014 #ifdef DEBUG_PUSH
6015                     xmlGenericError(xmlGenericErrorContext,
6016                             "HPP: entering END_TAG\n");
6017 #endif
6018                     break;
6019                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6020                     if ((!terminate) && (next == 0))
6021                         goto done;
6022                     ctxt->instate = XML_PARSER_START_TAG;
6023                     ctxt->checkIndex = 0;
6024 #ifdef DEBUG_PUSH
6025                     xmlGenericError(xmlGenericErrorContext,
6026                             "HPP: entering START_TAG\n");
6027 #endif
6028                     break;
6029                 } else if (cur == '<') {
6030                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6031                         (ctxt->sax->characters != NULL))
6032                         ctxt->sax->characters(ctxt->userData,
6033                                               BAD_CAST "<", 1);
6034                     NEXT;
6035                 } else {
6036                     /*
6037                      * check that the text sequence is complete
6038                      * before handing out the data to the parser
6039                      * to avoid problems with erroneous end of
6040                      * data detection.
6041                      */
6042                     if ((!terminate) &&
6043                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6044                         goto done;
6045                     ctxt->checkIndex = 0;
6046 #ifdef DEBUG_PUSH
6047                     xmlGenericError(xmlGenericErrorContext,
6048                             "HPP: Parsing char data\n");
6049 #endif
6050                     while ((ctxt->instate != XML_PARSER_EOF) &&
6051                            (cur != '<') && (in->cur < in->end)) {
6052                         if (cur == '&') {
6053                             htmlParseReference(ctxt);
6054                         } else {
6055                             htmlParseCharData(ctxt);
6056                         }
6057                         cur = in->cur[0];
6058                     }
6059                 }
6060
6061                 break;
6062             }
6063             case XML_PARSER_END_TAG:
6064                 if (avail < 2)
6065                     goto done;
6066                 if ((!terminate) &&
6067                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6068                     goto done;
6069                 htmlParseEndTag(ctxt);
6070                 if (ctxt->nameNr == 0) {
6071                     ctxt->instate = XML_PARSER_EPILOG;
6072                 } else {
6073                     ctxt->instate = XML_PARSER_CONTENT;
6074                 }
6075                 ctxt->checkIndex = 0;
6076 #ifdef DEBUG_PUSH
6077                 xmlGenericError(xmlGenericErrorContext,
6078                         "HPP: entering CONTENT\n");
6079 #endif
6080                 break;
6081             case XML_PARSER_CDATA_SECTION:
6082                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6083                         "HPP: internal error, state == CDATA\n",
6084                              NULL, NULL);
6085                 ctxt->instate = XML_PARSER_CONTENT;
6086                 ctxt->checkIndex = 0;
6087 #ifdef DEBUG_PUSH
6088                 xmlGenericError(xmlGenericErrorContext,
6089                         "HPP: entering CONTENT\n");
6090 #endif
6091                 break;
6092             case XML_PARSER_DTD:
6093                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6094                         "HPP: internal error, state == DTD\n",
6095                              NULL, NULL);
6096                 ctxt->instate = XML_PARSER_CONTENT;
6097                 ctxt->checkIndex = 0;
6098 #ifdef DEBUG_PUSH
6099                 xmlGenericError(xmlGenericErrorContext,
6100                         "HPP: entering CONTENT\n");
6101 #endif
6102                 break;
6103             case XML_PARSER_COMMENT:
6104                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105                         "HPP: internal error, state == COMMENT\n",
6106                              NULL, NULL);
6107                 ctxt->instate = XML_PARSER_CONTENT;
6108                 ctxt->checkIndex = 0;
6109 #ifdef DEBUG_PUSH
6110                 xmlGenericError(xmlGenericErrorContext,
6111                         "HPP: entering CONTENT\n");
6112 #endif
6113                 break;
6114             case XML_PARSER_PI:
6115                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6116                         "HPP: internal error, state == PI\n",
6117                              NULL, NULL);
6118                 ctxt->instate = XML_PARSER_CONTENT;
6119                 ctxt->checkIndex = 0;
6120 #ifdef DEBUG_PUSH
6121                 xmlGenericError(xmlGenericErrorContext,
6122                         "HPP: entering CONTENT\n");
6123 #endif
6124                 break;
6125             case XML_PARSER_ENTITY_DECL:
6126                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6127                         "HPP: internal error, state == ENTITY_DECL\n",
6128                              NULL, NULL);
6129                 ctxt->instate = XML_PARSER_CONTENT;
6130                 ctxt->checkIndex = 0;
6131 #ifdef DEBUG_PUSH
6132                 xmlGenericError(xmlGenericErrorContext,
6133                         "HPP: entering CONTENT\n");
6134 #endif
6135                 break;
6136             case XML_PARSER_ENTITY_VALUE:
6137                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6138                         "HPP: internal error, state == ENTITY_VALUE\n",
6139                              NULL, NULL);
6140                 ctxt->instate = XML_PARSER_CONTENT;
6141                 ctxt->checkIndex = 0;
6142 #ifdef DEBUG_PUSH
6143                 xmlGenericError(xmlGenericErrorContext,
6144                         "HPP: entering DTD\n");
6145 #endif
6146                 break;
6147             case XML_PARSER_ATTRIBUTE_VALUE:
6148                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6149                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6150                              NULL, NULL);
6151                 ctxt->instate = XML_PARSER_START_TAG;
6152                 ctxt->checkIndex = 0;
6153 #ifdef DEBUG_PUSH
6154                 xmlGenericError(xmlGenericErrorContext,
6155                         "HPP: entering START_TAG\n");
6156 #endif
6157                 break;
6158             case XML_PARSER_SYSTEM_LITERAL:
6159                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6160                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6161                              NULL, NULL);
6162                 ctxt->instate = XML_PARSER_CONTENT;
6163                 ctxt->checkIndex = 0;
6164 #ifdef DEBUG_PUSH
6165                 xmlGenericError(xmlGenericErrorContext,
6166                         "HPP: entering CONTENT\n");
6167 #endif
6168                 break;
6169             case XML_PARSER_IGNORE:
6170                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6171                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6172                              NULL, NULL);
6173                 ctxt->instate = XML_PARSER_CONTENT;
6174                 ctxt->checkIndex = 0;
6175 #ifdef DEBUG_PUSH
6176                 xmlGenericError(xmlGenericErrorContext,
6177                         "HPP: entering CONTENT\n");
6178 #endif
6179                 break;
6180             case XML_PARSER_PUBLIC_LITERAL:
6181                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6182                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6183                              NULL, NULL);
6184                 ctxt->instate = XML_PARSER_CONTENT;
6185                 ctxt->checkIndex = 0;
6186 #ifdef DEBUG_PUSH
6187                 xmlGenericError(xmlGenericErrorContext,
6188                         "HPP: entering CONTENT\n");
6189 #endif
6190                 break;
6191
6192         }
6193     }
6194 done:
6195     if ((avail == 0) && (terminate)) {
6196         htmlAutoCloseOnEnd(ctxt);
6197         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6198             /*
6199              * SAX: end of the document processing.
6200              */
6201             ctxt->instate = XML_PARSER_EOF;
6202             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6203                 ctxt->sax->endDocument(ctxt->userData);
6204         }
6205     }
6206     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6207         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6208          (ctxt->instate == XML_PARSER_EPILOG))) {
6209         xmlDtdPtr dtd;
6210         dtd = xmlGetIntSubset(ctxt->myDoc);
6211         if (dtd == NULL)
6212             ctxt->myDoc->intSubset =
6213                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6214                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6215                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6216     }
6217 #ifdef DEBUG_PUSH
6218     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6219 #endif
6220     return(ret);
6221 }
6222
6223 /**
6224  * htmlParseChunk:
6225  * @ctxt:  an HTML parser context
6226  * @chunk:  an char array
6227  * @size:  the size in byte of the chunk
6228  * @terminate:  last chunk indicator
6229  *
6230  * Parse a Chunk of memory
6231  *
6232  * Returns zero if no error, the xmlParserErrors otherwise.
6233  */
6234 int
6235 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6236               int terminate) {
6237     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6238         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6239                      "htmlParseChunk: context error\n", NULL, NULL);
6240         return(XML_ERR_INTERNAL_ERROR);
6241     }
6242     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6243         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6244         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245         size_t cur = ctxt->input->cur - ctxt->input->base;
6246         int res;
6247
6248         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6249         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6250         if (res < 0) {
6251             ctxt->errNo = XML_PARSER_EOF;
6252             ctxt->disableSAX = 1;
6253             return (XML_PARSER_EOF);
6254         }
6255 #ifdef DEBUG_PUSH
6256         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6257 #endif
6258
6259 #if 0
6260         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6261             htmlParseTryOrFinish(ctxt, terminate);
6262 #endif
6263     } else if (ctxt->instate != XML_PARSER_EOF) {
6264         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6265             xmlParserInputBufferPtr in = ctxt->input->buf;
6266             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6267                     (in->raw != NULL)) {
6268                 int nbchars;
6269                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6270                 size_t current = ctxt->input->cur - ctxt->input->base;
6271
6272                 nbchars = xmlCharEncInput(in, terminate);
6273                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6274                 if (nbchars < 0) {
6275                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6276                                  "encoder error\n", NULL, NULL);
6277                     return(XML_ERR_INVALID_ENCODING);
6278                 }
6279             }
6280         }
6281     }
6282     htmlParseTryOrFinish(ctxt, terminate);
6283     if (terminate) {
6284         if ((ctxt->instate != XML_PARSER_EOF) &&
6285             (ctxt->instate != XML_PARSER_EPILOG) &&
6286             (ctxt->instate != XML_PARSER_MISC)) {
6287             ctxt->errNo = XML_ERR_DOCUMENT_END;
6288             ctxt->wellFormed = 0;
6289         }
6290         if (ctxt->instate != XML_PARSER_EOF) {
6291             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6292                 ctxt->sax->endDocument(ctxt->userData);
6293         }
6294         ctxt->instate = XML_PARSER_EOF;
6295     }
6296     return((xmlParserErrors) ctxt->errNo);
6297 }
6298
6299 /************************************************************************
6300  *                                                                      *
6301  *                      User entry points                               *
6302  *                                                                      *
6303  ************************************************************************/
6304
6305 /**
6306  * htmlCreatePushParserCtxt:
6307  * @sax:  a SAX handler
6308  * @user_data:  The user data returned on SAX callbacks
6309  * @chunk:  a pointer to an array of chars
6310  * @size:  number of chars in the array
6311  * @filename:  an optional file name or URI
6312  * @enc:  an optional encoding
6313  *
6314  * Create a parser context for using the HTML parser in push mode
6315  * The value of @filename is used for fetching external entities
6316  * and error/warning reports.
6317  *
6318  * Returns the new parser context or NULL
6319  */
6320 htmlParserCtxtPtr
6321 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6322                          const char *chunk, int size, const char *filename,
6323                          xmlCharEncoding enc) {
6324     htmlParserCtxtPtr ctxt;
6325     htmlParserInputPtr inputStream;
6326     xmlParserInputBufferPtr buf;
6327
6328     xmlInitParser();
6329
6330     buf = xmlAllocParserInputBuffer(enc);
6331     if (buf == NULL) return(NULL);
6332
6333     ctxt = htmlNewParserCtxt();
6334     if (ctxt == NULL) {
6335         xmlFreeParserInputBuffer(buf);
6336         return(NULL);
6337     }
6338     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6339         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6340     if (sax != NULL) {
6341         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6342             xmlFree(ctxt->sax);
6343         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6344         if (ctxt->sax == NULL) {
6345             xmlFree(buf);
6346             xmlFree(ctxt);
6347             return(NULL);
6348         }
6349         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6350         if (user_data != NULL)
6351             ctxt->userData = user_data;
6352     }
6353     if (filename == NULL) {
6354         ctxt->directory = NULL;
6355     } else {
6356         ctxt->directory = xmlParserGetDirectory(filename);
6357     }
6358
6359     inputStream = htmlNewInputStream(ctxt);
6360     if (inputStream == NULL) {
6361         xmlFreeParserCtxt(ctxt);
6362         xmlFree(buf);
6363         return(NULL);
6364     }
6365
6366     if (filename == NULL)
6367         inputStream->filename = NULL;
6368     else
6369         inputStream->filename = (char *)
6370             xmlCanonicPath((const xmlChar *) filename);
6371     inputStream->buf = buf;
6372     xmlBufResetInput(buf->buffer, inputStream);
6373
6374     inputPush(ctxt, inputStream);
6375
6376     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6377         (ctxt->input->buf != NULL))  {
6378         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6379         size_t cur = ctxt->input->cur - ctxt->input->base;
6380
6381         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6382
6383         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6384 #ifdef DEBUG_PUSH
6385         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6386 #endif
6387     }
6388     ctxt->progressive = 1;
6389
6390     return(ctxt);
6391 }
6392 #endif /* LIBXML_PUSH_ENABLED */
6393
6394 /**
6395  * htmlSAXParseDoc:
6396  * @cur:  a pointer to an array of xmlChar
6397  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6398  * @sax:  the SAX handler block
6399  * @userData: if using SAX, this pointer will be provided on callbacks.
6400  *
6401  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6402  * to handle parse events. If sax is NULL, fallback to the default DOM
6403  * behavior and return a tree.
6404  *
6405  * Returns the resulting document tree unless SAX is NULL or the document is
6406  *     not well formed.
6407  */
6408
6409 htmlDocPtr
6410 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6411                 htmlSAXHandlerPtr sax, void *userData) {
6412     htmlDocPtr ret;
6413     htmlParserCtxtPtr ctxt;
6414
6415     xmlInitParser();
6416
6417     if (cur == NULL) return(NULL);
6418
6419
6420     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6421     if (ctxt == NULL) return(NULL);
6422     if (sax != NULL) {
6423         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6424         ctxt->sax = sax;
6425         ctxt->userData = userData;
6426     }
6427
6428     htmlParseDocument(ctxt);
6429     ret = ctxt->myDoc;
6430     if (sax != NULL) {
6431         ctxt->sax = NULL;
6432         ctxt->userData = NULL;
6433     }
6434     htmlFreeParserCtxt(ctxt);
6435
6436     return(ret);
6437 }
6438
6439 /**
6440  * htmlParseDoc:
6441  * @cur:  a pointer to an array of xmlChar
6442  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6443  *
6444  * parse an HTML in-memory document and build a tree.
6445  *
6446  * Returns the resulting document tree
6447  */
6448
6449 htmlDocPtr
6450 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6451     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6452 }
6453
6454
6455 /**
6456  * htmlCreateFileParserCtxt:
6457  * @filename:  the filename
6458  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6459  *
6460  * Create a parser context for a file content.
6461  * Automatic support for ZLIB/Compress compressed document is provided
6462  * by default if found at compile-time.
6463  *
6464  * Returns the new parser context or NULL
6465  */
6466 htmlParserCtxtPtr
6467 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6468 {
6469     htmlParserCtxtPtr ctxt;
6470     htmlParserInputPtr inputStream;
6471     char *canonicFilename;
6472     /* htmlCharEncoding enc; */
6473     xmlChar *content, *content_line = (xmlChar *) "charset=";
6474
6475     if (filename == NULL)
6476         return(NULL);
6477
6478     ctxt = htmlNewParserCtxt();
6479     if (ctxt == NULL) {
6480         return(NULL);
6481     }
6482     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6483     if (canonicFilename == NULL) {
6484 #ifdef LIBXML_SAX1_ENABLED
6485         if (xmlDefaultSAXHandler.error != NULL) {
6486             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6487         }
6488 #endif
6489         xmlFreeParserCtxt(ctxt);
6490         return(NULL);
6491     }
6492
6493     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6494     xmlFree(canonicFilename);
6495     if (inputStream == NULL) {
6496         xmlFreeParserCtxt(ctxt);
6497         return(NULL);
6498     }
6499
6500     inputPush(ctxt, inputStream);
6501
6502     /* set encoding */
6503     if (encoding) {
6504         size_t l = strlen(encoding);
6505
6506         if (l < 1000) {
6507             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6508             if (content) {
6509                 strcpy ((char *)content, (char *)content_line);
6510                 strcat ((char *)content, (char *)encoding);
6511                 htmlCheckEncoding (ctxt, content);
6512                 xmlFree (content);
6513             }
6514         }
6515     }
6516
6517     return(ctxt);
6518 }
6519
6520 /**
6521  * htmlSAXParseFile:
6522  * @filename:  the filename
6523  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6524  * @sax:  the SAX handler block
6525  * @userData: if using SAX, this pointer will be provided on callbacks.
6526  *
6527  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6528  * compressed document is provided by default if found at compile-time.
6529  * It use the given SAX function block to handle the parsing callback.
6530  * If sax is NULL, fallback to the default DOM tree building routines.
6531  *
6532  * Returns the resulting document tree unless SAX is NULL or the document is
6533  *     not well formed.
6534  */
6535
6536 htmlDocPtr
6537 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6538                  void *userData) {
6539     htmlDocPtr ret;
6540     htmlParserCtxtPtr ctxt;
6541     htmlSAXHandlerPtr oldsax = NULL;
6542
6543     xmlInitParser();
6544
6545     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6546     if (ctxt == NULL) return(NULL);
6547     if (sax != NULL) {
6548         oldsax = ctxt->sax;
6549         ctxt->sax = sax;
6550         ctxt->userData = userData;
6551     }
6552
6553     htmlParseDocument(ctxt);
6554
6555     ret = ctxt->myDoc;
6556     if (sax != NULL) {
6557         ctxt->sax = oldsax;
6558         ctxt->userData = NULL;
6559     }
6560     htmlFreeParserCtxt(ctxt);
6561
6562     return(ret);
6563 }
6564
6565 /**
6566  * htmlParseFile:
6567  * @filename:  the filename
6568  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6569  *
6570  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6571  * compressed document is provided by default if found at compile-time.
6572  *
6573  * Returns the resulting document tree
6574  */
6575
6576 htmlDocPtr
6577 htmlParseFile(const char *filename, const char *encoding) {
6578     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6579 }
6580
6581 /**
6582  * htmlHandleOmittedElem:
6583  * @val:  int 0 or 1
6584  *
6585  * Set and return the previous value for handling HTML omitted tags.
6586  *
6587  * Returns the last value for 0 for no handling, 1 for auto insertion.
6588  */
6589
6590 int
6591 htmlHandleOmittedElem(int val) {
6592     int old = htmlOmittedDefaultValue;
6593
6594     htmlOmittedDefaultValue = val;
6595     return(old);
6596 }
6597
6598 /**
6599  * htmlElementAllowedHere:
6600  * @parent: HTML parent element
6601  * @elt: HTML element
6602  *
6603  * Checks whether an HTML element may be a direct child of a parent element.
6604  * Note - doesn't check for deprecated elements
6605  *
6606  * Returns 1 if allowed; 0 otherwise.
6607  */
6608 int
6609 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6610   const char** p ;
6611
6612   if ( ! elt || ! parent || ! parent->subelts )
6613         return 0 ;
6614
6615   for ( p = parent->subelts; *p; ++p )
6616     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6617       return 1 ;
6618
6619   return 0 ;
6620 }
6621 /**
6622  * htmlElementStatusHere:
6623  * @parent: HTML parent element
6624  * @elt: HTML element
6625  *
6626  * Checks whether an HTML element may be a direct child of a parent element.
6627  * and if so whether it is valid or deprecated.
6628  *
6629  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6630  */
6631 htmlStatus
6632 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6633   if ( ! parent || ! elt )
6634     return HTML_INVALID ;
6635   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6636     return HTML_INVALID ;
6637
6638   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6639 }
6640 /**
6641  * htmlAttrAllowed:
6642  * @elt: HTML element
6643  * @attr: HTML attribute
6644  * @legacy: whether to allow deprecated attributes
6645  *
6646  * Checks whether an attribute is valid for an element
6647  * Has full knowledge of Required and Deprecated attributes
6648  *
6649  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6650  */
6651 htmlStatus
6652 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6653   const char** p ;
6654
6655   if ( !elt || ! attr )
6656         return HTML_INVALID ;
6657
6658   if ( elt->attrs_req )
6659     for ( p = elt->attrs_req; *p; ++p)
6660       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6661         return HTML_REQUIRED ;
6662
6663   if ( elt->attrs_opt )
6664     for ( p = elt->attrs_opt; *p; ++p)
6665       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6666         return HTML_VALID ;
6667
6668   if ( legacy && elt->attrs_depr )
6669     for ( p = elt->attrs_depr; *p; ++p)
6670       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6671         return HTML_DEPRECATED ;
6672
6673   return HTML_INVALID ;
6674 }
6675 /**
6676  * htmlNodeStatus:
6677  * @node: an htmlNodePtr in a tree
6678  * @legacy: whether to allow deprecated elements (YES is faster here
6679  *      for Element nodes)
6680  *
6681  * Checks whether the tree node is valid.  Experimental (the author
6682  *     only uses the HTML enhancements in a SAX parser)
6683  *
6684  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6685  *      legacy allowed) or htmlElementStatusHere (otherwise).
6686  *      for Attribute nodes, a return from htmlAttrAllowed
6687  *      for other nodes, HTML_NA (no checks performed)
6688  */
6689 htmlStatus
6690 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6691   if ( ! node )
6692     return HTML_INVALID ;
6693
6694   switch ( node->type ) {
6695     case XML_ELEMENT_NODE:
6696       return legacy
6697         ? ( htmlElementAllowedHere (
6698                 htmlTagLookup(node->parent->name) , node->name
6699                 ) ? HTML_VALID : HTML_INVALID )
6700         : htmlElementStatusHere(
6701                 htmlTagLookup(node->parent->name) ,
6702                 htmlTagLookup(node->name) )
6703         ;
6704     case XML_ATTRIBUTE_NODE:
6705       return htmlAttrAllowed(
6706         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6707     default: return HTML_NA ;
6708   }
6709 }
6710 /************************************************************************
6711  *                                                                      *
6712  *      New set (2.6.0) of simpler and more flexible APIs               *
6713  *                                                                      *
6714  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must therefore be visible
 * where the macro is expanded; when it is NULL every non-NULL string
 * is freed.
 *
 * The expansion is a single do { } while (0) statement, so the macro
 * can be used safely as the body of an unbraced if/else; the caller
 * supplies the terminating semicolon. @str is evaluated more than
 * once and must not have side effects.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6726
6727 /**
6728  * htmlCtxtReset:
6729  * @ctxt: an HTML parser context
6730  *
6731  * Reset a parser context
6732  */
6733 void
6734 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6735 {
6736     xmlParserInputPtr input;
6737     xmlDictPtr dict;
6738
6739     if (ctxt == NULL)
6740         return;
6741
6742     xmlInitParser();
6743     dict = ctxt->dict;
6744
6745     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6746         xmlFreeInputStream(input);
6747     }
6748     ctxt->inputNr = 0;
6749     ctxt->input = NULL;
6750
6751     ctxt->spaceNr = 0;
6752     if (ctxt->spaceTab != NULL) {
6753         ctxt->spaceTab[0] = -1;
6754         ctxt->space = &ctxt->spaceTab[0];
6755     } else {
6756         ctxt->space = NULL;
6757     }
6758
6759
6760     ctxt->nodeNr = 0;
6761     ctxt->node = NULL;
6762
6763     ctxt->nameNr = 0;
6764     ctxt->name = NULL;
6765
6766     ctxt->nsNr = 0;
6767
6768     DICT_FREE(ctxt->version);
6769     ctxt->version = NULL;
6770     DICT_FREE(ctxt->encoding);
6771     ctxt->encoding = NULL;
6772     DICT_FREE(ctxt->directory);
6773     ctxt->directory = NULL;
6774     DICT_FREE(ctxt->extSubURI);
6775     ctxt->extSubURI = NULL;
6776     DICT_FREE(ctxt->extSubSystem);
6777     ctxt->extSubSystem = NULL;
6778     if (ctxt->myDoc != NULL)
6779         xmlFreeDoc(ctxt->myDoc);
6780     ctxt->myDoc = NULL;
6781
6782     ctxt->standalone = -1;
6783     ctxt->hasExternalSubset = 0;
6784     ctxt->hasPErefs = 0;
6785     ctxt->html = 1;
6786     ctxt->external = 0;
6787     ctxt->instate = XML_PARSER_START;
6788     ctxt->token = 0;
6789
6790     ctxt->wellFormed = 1;
6791     ctxt->nsWellFormed = 1;
6792     ctxt->disableSAX = 0;
6793     ctxt->valid = 1;
6794     ctxt->vctxt.userData = ctxt;
6795     ctxt->vctxt.error = xmlParserValidityError;
6796     ctxt->vctxt.warning = xmlParserValidityWarning;
6797     ctxt->record_info = 0;
6798     ctxt->checkIndex = 0;
6799     ctxt->inSubset = 0;
6800     ctxt->errNo = XML_ERR_OK;
6801     ctxt->depth = 0;
6802     ctxt->charset = XML_CHAR_ENCODING_NONE;
6803     ctxt->catalogs = NULL;
6804     xmlInitNodeInfoSeq(&ctxt->node_seq);
6805
6806     if (ctxt->attsDefault != NULL) {
6807         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6808         ctxt->attsDefault = NULL;
6809     }
6810     if (ctxt->attsSpecial != NULL) {
6811         xmlHashFree(ctxt->attsSpecial, NULL);
6812         ctxt->attsSpecial = NULL;
6813     }
6814 }
6815
6816 /**
6817  * htmlCtxtUseOptions:
6818  * @ctxt: an HTML parser context
6819  * @options:  a combination of htmlParserOption(s)
6820  *
6821  * Applies the options to the parser context
6822  *
6823  * Returns 0 in case of success, the set of unknown or unimplemented options
6824  *         in case of error.
6825  */
6826 int
6827 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6828 {
6829     if (ctxt == NULL)
6830         return(-1);
6831
6832     if (options & HTML_PARSE_NOWARNING) {
6833         ctxt->sax->warning = NULL;
6834         ctxt->vctxt.warning = NULL;
6835         options -= XML_PARSE_NOWARNING;
6836         ctxt->options |= XML_PARSE_NOWARNING;
6837     }
6838     if (options & HTML_PARSE_NOERROR) {
6839         ctxt->sax->error = NULL;
6840         ctxt->vctxt.error = NULL;
6841         ctxt->sax->fatalError = NULL;
6842         options -= XML_PARSE_NOERROR;
6843         ctxt->options |= XML_PARSE_NOERROR;
6844     }
6845     if (options & HTML_PARSE_PEDANTIC) {
6846         ctxt->pedantic = 1;
6847         options -= XML_PARSE_PEDANTIC;
6848         ctxt->options |= XML_PARSE_PEDANTIC;
6849     } else
6850         ctxt->pedantic = 0;
6851     if (options & XML_PARSE_NOBLANKS) {
6852         ctxt->keepBlanks = 0;
6853         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6854         options -= XML_PARSE_NOBLANKS;
6855         ctxt->options |= XML_PARSE_NOBLANKS;
6856     } else
6857         ctxt->keepBlanks = 1;
6858     if (options & HTML_PARSE_RECOVER) {
6859         ctxt->recovery = 1;
6860         options -= HTML_PARSE_RECOVER;
6861     } else
6862         ctxt->recovery = 0;
6863     if (options & HTML_PARSE_COMPACT) {
6864         ctxt->options |= HTML_PARSE_COMPACT;
6865         options -= HTML_PARSE_COMPACT;
6866     }
6867     if (options & XML_PARSE_HUGE) {
6868         ctxt->options |= XML_PARSE_HUGE;
6869         options -= XML_PARSE_HUGE;
6870     }
6871     if (options & HTML_PARSE_NODEFDTD) {
6872         ctxt->options |= HTML_PARSE_NODEFDTD;
6873         options -= HTML_PARSE_NODEFDTD;
6874     }
6875     if (options & HTML_PARSE_IGNORE_ENC) {
6876         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6877         options -= HTML_PARSE_IGNORE_ENC;
6878     }
6879     if (options & HTML_PARSE_NOIMPLIED) {
6880         ctxt->options |= HTML_PARSE_NOIMPLIED;
6881         options -= HTML_PARSE_NOIMPLIED;
6882     }
6883     ctxt->dictNames = 0;
6884     return (options);
6885 }
6886
6887 /**
6888  * htmlDoRead:
6889  * @ctxt:  an HTML parser context
6890  * @URL:  the base URL to use for the document
6891  * @encoding:  the document encoding, or NULL
6892  * @options:  a combination of htmlParserOption(s)
6893  * @reuse:  keep the context for reuse
6894  *
6895  * Common front-end for the htmlRead functions
6896  *
6897  * Returns the resulting document tree or NULL
6898  */
6899 static htmlDocPtr
6900 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6901           int options, int reuse)
6902 {
6903     htmlDocPtr ret;
6904
6905     htmlCtxtUseOptions(ctxt, options);
6906     ctxt->html = 1;
6907     if (encoding != NULL) {
6908         xmlCharEncodingHandlerPtr hdlr;
6909
6910         hdlr = xmlFindCharEncodingHandler(encoding);
6911         if (hdlr != NULL) {
6912             xmlSwitchToEncoding(ctxt, hdlr);
6913             if (ctxt->input->encoding != NULL)
6914               xmlFree((xmlChar *) ctxt->input->encoding);
6915             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6916         }
6917     }
6918     if ((URL != NULL) && (ctxt->input != NULL) &&
6919         (ctxt->input->filename == NULL))
6920         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6921     htmlParseDocument(ctxt);
6922     ret = ctxt->myDoc;
6923     ctxt->myDoc = NULL;
6924     if (!reuse) {
6925         if ((ctxt->dictNames) &&
6926             (ret != NULL) &&
6927             (ret->dict == ctxt->dict))
6928             ctxt->dict = NULL;
6929         xmlFreeParserCtxt(ctxt);
6930     }
6931     return (ret);
6932 }
6933
6934 /**
6935  * htmlReadDoc:
6936  * @cur:  a pointer to a zero terminated string
6937  * @URL:  the base URL to use for the document
6938  * @encoding:  the document encoding, or NULL
6939  * @options:  a combination of htmlParserOption(s)
6940  *
6941  * parse an XML in-memory document and build a tree.
6942  *
6943  * Returns the resulting document tree
6944  */
6945 htmlDocPtr
6946 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6947 {
6948     htmlParserCtxtPtr ctxt;
6949
6950     if (cur == NULL)
6951         return (NULL);
6952
6953     xmlInitParser();
6954     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6955     if (ctxt == NULL)
6956         return (NULL);
6957     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6958 }
6959
6960 /**
6961  * htmlReadFile:
6962  * @filename:  a file or URL
6963  * @encoding:  the document encoding, or NULL
6964  * @options:  a combination of htmlParserOption(s)
6965  *
6966  * parse an XML file from the filesystem or the network.
6967  *
6968  * Returns the resulting document tree
6969  */
6970 htmlDocPtr
6971 htmlReadFile(const char *filename, const char *encoding, int options)
6972 {
6973     htmlParserCtxtPtr ctxt;
6974
6975     xmlInitParser();
6976     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6977     if (ctxt == NULL)
6978         return (NULL);
6979     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6980 }
6981
6982 /**
6983  * htmlReadMemory:
6984  * @buffer:  a pointer to a char array
6985  * @size:  the size of the array
6986  * @URL:  the base URL to use for the document
6987  * @encoding:  the document encoding, or NULL
6988  * @options:  a combination of htmlParserOption(s)
6989  *
6990  * parse an XML in-memory document and build a tree.
6991  *
6992  * Returns the resulting document tree
6993  */
6994 htmlDocPtr
6995 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6996 {
6997     htmlParserCtxtPtr ctxt;
6998
6999     xmlInitParser();
7000     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
7001     if (ctxt == NULL)
7002         return (NULL);
7003     htmlDefaultSAXHandlerInit();
7004     if (ctxt->sax != NULL)
7005         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
7006     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7007 }
7008
7009 /**
7010  * htmlReadFd:
7011  * @fd:  an open file descriptor
7012  * @URL:  the base URL to use for the document
7013  * @encoding:  the document encoding, or NULL
7014  * @options:  a combination of htmlParserOption(s)
7015  *
7016  * parse an HTML from a file descriptor and build a tree.
7017  * NOTE that the file descriptor will not be closed when the
7018  *      reader is closed or reset.
7019  *
7020  * Returns the resulting document tree
7021  */
7022 htmlDocPtr
7023 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7024 {
7025     htmlParserCtxtPtr ctxt;
7026     xmlParserInputBufferPtr input;
7027     htmlParserInputPtr stream;
7028
7029     if (fd < 0)
7030         return (NULL);
7031
7032     xmlInitParser();
7033     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7034     if (input == NULL)
7035         return (NULL);
7036     input->closecallback = NULL;
7037     ctxt = htmlNewParserCtxt();
7038     if (ctxt == NULL) {
7039         xmlFreeParserInputBuffer(input);
7040         return (NULL);
7041     }
7042     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7043     if (stream == NULL) {
7044         xmlFreeParserInputBuffer(input);
7045         htmlFreeParserCtxt(ctxt);
7046         return (NULL);
7047     }
7048     inputPush(ctxt, stream);
7049     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7050 }
7051
7052 /**
7053  * htmlReadIO:
7054  * @ioread:  an I/O read function
7055  * @ioclose:  an I/O close function
7056  * @ioctx:  an I/O handler
7057  * @URL:  the base URL to use for the document
7058  * @encoding:  the document encoding, or NULL
7059  * @options:  a combination of htmlParserOption(s)
7060  *
7061  * parse an HTML document from I/O functions and source and build a tree.
7062  *
7063  * Returns the resulting document tree
7064  */
7065 htmlDocPtr
7066 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7067           void *ioctx, const char *URL, const char *encoding, int options)
7068 {
7069     htmlParserCtxtPtr ctxt;
7070     xmlParserInputBufferPtr input;
7071     xmlParserInputPtr stream;
7072
7073     if (ioread == NULL)
7074         return (NULL);
7075     xmlInitParser();
7076
7077     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7078                                          XML_CHAR_ENCODING_NONE);
7079     if (input == NULL) {
7080         if (ioclose != NULL)
7081             ioclose(ioctx);
7082         return (NULL);
7083     }
7084     ctxt = htmlNewParserCtxt();
7085     if (ctxt == NULL) {
7086         xmlFreeParserInputBuffer(input);
7087         return (NULL);
7088     }
7089     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7090     if (stream == NULL) {
7091         xmlFreeParserInputBuffer(input);
7092         xmlFreeParserCtxt(ctxt);
7093         return (NULL);
7094     }
7095     inputPush(ctxt, stream);
7096     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7097 }
7098
7099 /**
7100  * htmlCtxtReadDoc:
7101  * @ctxt:  an HTML parser context
7102  * @cur:  a pointer to a zero terminated string
7103  * @URL:  the base URL to use for the document
7104  * @encoding:  the document encoding, or NULL
7105  * @options:  a combination of htmlParserOption(s)
7106  *
7107  * parse an XML in-memory document and build a tree.
7108  * This reuses the existing @ctxt parser context
7109  *
7110  * Returns the resulting document tree
7111  */
7112 htmlDocPtr
7113 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7114                const char *URL, const char *encoding, int options)
7115 {
7116     if (cur == NULL)
7117         return (NULL);
7118     return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7119                                encoding, options));
7120 }
7121
7122 /**
7123  * htmlCtxtReadFile:
7124  * @ctxt:  an HTML parser context
7125  * @filename:  a file or URL
7126  * @encoding:  the document encoding, or NULL
7127  * @options:  a combination of htmlParserOption(s)
7128  *
7129  * parse an XML file from the filesystem or the network.
7130  * This reuses the existing @ctxt parser context
7131  *
7132  * Returns the resulting document tree
7133  */
7134 htmlDocPtr
7135 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7136                 const char *encoding, int options)
7137 {
7138     xmlParserInputPtr stream;
7139
7140     if (filename == NULL)
7141         return (NULL);
7142     if (ctxt == NULL)
7143         return (NULL);
7144     xmlInitParser();
7145
7146     htmlCtxtReset(ctxt);
7147
7148     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7149     if (stream == NULL) {
7150         return (NULL);
7151     }
7152     inputPush(ctxt, stream);
7153     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7154 }
7155
7156 /**
7157  * htmlCtxtReadMemory:
7158  * @ctxt:  an HTML parser context
7159  * @buffer:  a pointer to a char array
7160  * @size:  the size of the array
7161  * @URL:  the base URL to use for the document
7162  * @encoding:  the document encoding, or NULL
7163  * @options:  a combination of htmlParserOption(s)
7164  *
7165  * parse an XML in-memory document and build a tree.
7166  * This reuses the existing @ctxt parser context
7167  *
7168  * Returns the resulting document tree
7169  */
7170 htmlDocPtr
7171 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7172                   const char *URL, const char *encoding, int options)
7173 {
7174     xmlParserInputBufferPtr input;
7175     xmlParserInputPtr stream;
7176
7177     if (ctxt == NULL)
7178         return (NULL);
7179     if (buffer == NULL)
7180         return (NULL);
7181     xmlInitParser();
7182
7183     htmlCtxtReset(ctxt);
7184
7185     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7186     if (input == NULL) {
7187         return(NULL);
7188     }
7189
7190     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7191     if (stream == NULL) {
7192         xmlFreeParserInputBuffer(input);
7193         return(NULL);
7194     }
7195
7196     inputPush(ctxt, stream);
7197     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7198 }
7199
7200 /**
7201  * htmlCtxtReadFd:
7202  * @ctxt:  an HTML parser context
7203  * @fd:  an open file descriptor
7204  * @URL:  the base URL to use for the document
7205  * @encoding:  the document encoding, or NULL
7206  * @options:  a combination of htmlParserOption(s)
7207  *
7208  * parse an XML from a file descriptor and build a tree.
7209  * This reuses the existing @ctxt parser context
7210  *
7211  * Returns the resulting document tree
7212  */
7213 htmlDocPtr
7214 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7215               const char *URL, const char *encoding, int options)
7216 {
7217     xmlParserInputBufferPtr input;
7218     xmlParserInputPtr stream;
7219
7220     if (fd < 0)
7221         return (NULL);
7222     if (ctxt == NULL)
7223         return (NULL);
7224     xmlInitParser();
7225
7226     htmlCtxtReset(ctxt);
7227
7228
7229     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7230     if (input == NULL)
7231         return (NULL);
7232     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7233     if (stream == NULL) {
7234         xmlFreeParserInputBuffer(input);
7235         return (NULL);
7236     }
7237     inputPush(ctxt, stream);
7238     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7239 }
7240
7241 /**
7242  * htmlCtxtReadIO:
7243  * @ctxt:  an HTML parser context
7244  * @ioread:  an I/O read function
7245  * @ioclose:  an I/O close function
7246  * @ioctx:  an I/O handler
7247  * @URL:  the base URL to use for the document
7248  * @encoding:  the document encoding, or NULL
7249  * @options:  a combination of htmlParserOption(s)
7250  *
7251  * parse an HTML document from I/O functions and source and build a tree.
7252  * This reuses the existing @ctxt parser context
7253  *
7254  * Returns the resulting document tree
7255  */
7256 htmlDocPtr
7257 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7258               xmlInputCloseCallback ioclose, void *ioctx,
7259               const char *URL,
7260               const char *encoding, int options)
7261 {
7262     xmlParserInputBufferPtr input;
7263     xmlParserInputPtr stream;
7264
7265     if (ioread == NULL)
7266         return (NULL);
7267     if (ctxt == NULL)
7268         return (NULL);
7269     xmlInitParser();
7270
7271     htmlCtxtReset(ctxt);
7272
7273     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7274                                          XML_CHAR_ENCODING_NONE);
7275     if (input == NULL) {
7276         if (ioclose != NULL)
7277             ioclose(ioctx);
7278         return (NULL);
7279     }
7280     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7281     if (stream == NULL) {
7282         xmlFreeParserInputBuffer(input);
7283         return (NULL);
7284     }
7285     inputPush(ctxt, stream);
7286     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7287 }
7288
7289 #endif /* LIBXML_HTML_ENABLED */