libs/xml2/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #include <ctype.h>
  15 #include <stdlib.h>
  16
  17 #include <libxml/xmlmemory.h>
  18 #include <libxml/tree.h>
  19 #include <libxml/parser.h>
  20 #include <libxml/parserInternals.h>
  21 #include <libxml/xmlerror.h>
  22 #include <libxml/HTMLparser.h>
  23 #include <libxml/HTMLtree.h>
  24 #include <libxml/entities.h>
  25 #include <libxml/encoding.h>
  26 #include <libxml/valid.h>
  27 #include <libxml/xmlIO.h>
  28 #include <libxml/globals.h>
  29 #include <libxml/uri.h>
  30
  31 #include "private/buf.h"
  32 #include "private/enc.h"
  33 #include "private/error.h"
  34 #include "private/html.h"
  35 #include "private/parser.h"
  36 #include "private/tree.h"
  37
/*
 * Size constants for the HTML parser. Their users are not part of this
 * section of the file.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Compile-time debug switches, left commented out (i.e. disabled). */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * NOTE(review): not referenced in this section of the file; presumably a
 * flag consulted when implied elements are inserted -- confirm against
 * its users.
 */
static int htmlOmittedDefaultValue = 1;

/*
 * Forward declarations; the definitions are not part of this section of
 * the file.
 */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
  50
  51 /************************************************************************
  52  *                                                                      *
  53  *              Some factorized error routines                          *
  54  *                                                                      *
  55  ************************************************************************/
  56
  57 /**
  58  * htmlErrMemory:
  59  * @ctxt:  an HTML parser context
  60  * @extra:  extra information
  61  *
  62  * Handle a redefinition of attribute error
  63  */
  64 static void
  65 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  66 {
  67     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  68         (ctxt->instate == XML_PARSER_EOF))
  69         return;
  70     if (ctxt != NULL) {
  71         ctxt->errNo = XML_ERR_NO_MEMORY;
  72         ctxt->instate = XML_PARSER_EOF;
  73         ctxt->disableSAX = 1;
  74     }
  75     if (extra)
  76         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  77                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  78                         NULL, NULL, 0, 0,
  79                         "Memory allocation failed : %s\n", extra);
  80     else
  81         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  82                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  83                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  84 }
  85
  86 /**
  87  * htmlParseErr:
  88  * @ctxt:  an HTML parser context
  89  * @error:  the error number
  90  * @msg:  the error message
  91  * @str1:  string infor
  92  * @str2:  string infor
  93  *
  94  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  95  */
  96 static void LIBXML_ATTR_FORMAT(3,0)
  97 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  98              const char *msg, const xmlChar *str1, const xmlChar *str2)
  99 {
 100     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 101         (ctxt->instate == XML_PARSER_EOF))
 102         return;
 103     if (ctxt != NULL)
 104         ctxt->errNo = error;
 105     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 106                     XML_ERR_ERROR, NULL, 0,
 107                     (const char *) str1, (const char *) str2,
 108                     NULL, 0, 0,
 109                     msg, str1, str2);
 110     if (ctxt != NULL)
 111         ctxt->wellFormed = 0;
 112 }
 113
 114 /**
 115  * htmlParseErrInt:
 116  * @ctxt:  an HTML parser context
 117  * @error:  the error number
 118  * @msg:  the error message
 119  * @val:  integer info
 120  *
 121  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 122  */
 123 static void LIBXML_ATTR_FORMAT(3,0)
 124 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 125              const char *msg, int val)
 126 {
 127     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 128         (ctxt->instate == XML_PARSER_EOF))
 129         return;
 130     if (ctxt != NULL)
 131         ctxt->errNo = error;
 132     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 133                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 134                     NULL, val, 0, msg, val);
 135     if (ctxt != NULL)
 136         ctxt->wellFormed = 0;
 137 }
 138
 139 /************************************************************************
 140  *                                                                      *
 141  *      Parser stacks related functions and macros              *
 142  *                                                                      *
 143  ************************************************************************/
 144
 145 /**
 146  * htmlnamePush:
 147  * @ctxt:  an HTML parser context
 148  * @value:  the element name
 149  *
 150  * Pushes a new element name on top of the name stack
 151  *
 152  * Returns -1 in case of error, the index in the stack otherwise
 153  */
 154 static int
 155 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 156 {
 157     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 158         ctxt->html = 3;
 159     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 160         ctxt->html = 10;
 161     if (ctxt->nameNr >= ctxt->nameMax) {
 162         size_t newSize = ctxt->nameMax * 2;
 163         const xmlChar **tmp;
 164
 165         tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
 166                          newSize * sizeof(ctxt->nameTab[0]));
 167         if (tmp == NULL) {
 168             htmlErrMemory(ctxt, NULL);
 169             return (-1);
 170         }
 171         ctxt->nameTab = tmp;
 172         ctxt->nameMax = newSize;
 173     }
 174     ctxt->nameTab[ctxt->nameNr] = value;
 175     ctxt->name = value;
 176     return (ctxt->nameNr++);
 177 }
 178 /**
 179  * htmlnamePop:
 180  * @ctxt: an HTML parser context
 181  *
 182  * Pops the top element name from the name stack
 183  *
 184  * Returns the name just removed
 185  */
 186 static const xmlChar *
 187 htmlnamePop(htmlParserCtxtPtr ctxt)
 188 {
 189     const xmlChar *ret;
 190
 191     if (ctxt->nameNr <= 0)
 192         return (NULL);
 193     ctxt->nameNr--;
 194     if (ctxt->nameNr < 0)
 195         return (NULL);
 196     if (ctxt->nameNr > 0)
 197         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 198     else
 199         ctxt->name = NULL;
 200     ret = ctxt->nameTab[ctxt->nameNr];
 201     ctxt->nameTab[ctxt->nameNr] = NULL;
 202     return (ret);
 203 }
 204
 205 /**
 206  * htmlNodeInfoPush:
 207  * @ctxt:  an HTML parser context
 208  * @value:  the node info
 209  *
 210  * Pushes a new element name on top of the node info stack
 211  *
 212  * Returns 0 in case of error, the index in the stack otherwise
 213  */
 214 static int
 215 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 216 {
 217     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 218         if (ctxt->nodeInfoMax == 0)
 219                 ctxt->nodeInfoMax = 5;
 220         ctxt->nodeInfoMax *= 2;
 221         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 222                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 223                                     ctxt->nodeInfoMax *
 224                                     sizeof(ctxt->nodeInfoTab[0]));
 225         if (ctxt->nodeInfoTab == NULL) {
 226             htmlErrMemory(ctxt, NULL);
 227             return (0);
 228         }
 229     }
 230     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 231     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 232     return (ctxt->nodeInfoNr++);
 233 }
 234
 235 /**
 236  * htmlNodeInfoPop:
 237  * @ctxt:  an HTML parser context
 238  *
 239  * Pops the top element name from the node info stack
 240  *
 241  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 242  */
 243 static htmlParserNodeInfo *
 244 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 245 {
 246     if (ctxt->nodeInfoNr <= 0)
 247         return (NULL);
 248     ctxt->nodeInfoNr--;
 249     if (ctxt->nodeInfoNr < 0)
 250         return (NULL);
 251     if (ctxt->nodeInfoNr > 0)
 252         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 253     else
 254         ctxt->nodeInfo = NULL;
 255     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 256 }
 257
 258 /*
 259  * Macros for accessing the content. Those should be used only by the parser,
 260  * and not exported.
 261  *
 262  * Dirty macros, i.e. one need to make assumption on the context to use them
 263  *
 264  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 265  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 266  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 267  *           in UNICODE mode. This should be used internally by the parser
 268  *           only to compare to ASCII values otherwise it would break when
 269  *           running with UTF-8 encoding.
 270  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
 271  *           to compare on ASCII based substring.
 272  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 273  *           it should be used only to compare on ASCII based substring.
 274  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 275  *           strings without newlines within the parser.
 276  *
 277  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 278  *
 279  *   NEXT    Skip to the next character, this does the proper decoding
 280  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 281  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 282  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 283  */
 284
 285 #define UPPER (toupper(*ctxt->input->cur))
 286
 287 #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
 288
 289 #define NXT(val) ctxt->input->cur[(val)]
 290
 291 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 292
 293 #define CUR_PTR ctxt->input->cur
 294 #define BASE_PTR ctxt->input->base
 295
 296 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 297                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 298         xmlParserShrink(ctxt)
 299
 300 #define GROW if ((ctxt->progressive == 0) &&                            \
 301                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 302         xmlParserGrow(ctxt)
 303
 304 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 305
 306 /* Imported from XML */
 307
 308 #define CUR (*ctxt->input->cur)
 309 #define NEXT xmlNextChar(ctxt)
 310
 311 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 312
 313
 314 #define NEXTL(l) do {                                                   \
 315     if (*(ctxt->input->cur) == '\n') {                                  \
 316         ctxt->input->line++; ctxt->input->col = 1;                      \
 317     } else ctxt->input->col++;                                          \
 318     ctxt->token = 0; ctxt->input->cur += l;                             \
 319   } while (0)
 320
 321 /************
 322     \
 323     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 324     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 325  ************/
 326
 327 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 328 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 329
 330 #define COPY_BUF(l,b,i,v)                                               \
 331     if (l == 1) b[i++] = v;                                             \
 332     else i += xmlCopyChar(l,&b[i],v)
 333
 334 /**
 335  * htmlFindEncoding:
 336  * @the HTML parser context
 337  *
 338  * Ty to find and encoding in the current data available in the input
 339  * buffer this is needed to try to switch to the proper encoding when
 340  * one face a character error.
 341  * That's an heuristic, since it's operating outside of parsing it could
 342  * try to use a meta which had been commented out, that's the reason it
 343  * should only be used in case of error, not as a default.
 344  *
 345  * Returns an encoding string or NULL if not found, the string need to
 346  *   be freed
 347  */
 348 static xmlChar *
 349 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 350     const xmlChar *start, *cur, *end;
 351
 352     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 353         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 354         (ctxt->input->buf->encoder != NULL))
 355         return(NULL);
 356     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 357         return(NULL);
 358
 359     start = ctxt->input->cur;
 360     end = ctxt->input->end;
 361     /* we also expect the input buffer to be zero terminated */
 362     if (*end != 0)
 363         return(NULL);
 364
 365     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 366     if (cur == NULL)
 367         return(NULL);
 368     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 369     if (cur == NULL)
 370         return(NULL);
 371     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 372     if (cur == NULL)
 373         return(NULL);
 374     cur += 8;
 375     start = cur;
 376     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 377            ((*cur >= 'a') && (*cur <= 'z')) ||
 378            ((*cur >= '0') && (*cur <= '9')) ||
 379            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 380            cur++;
 381     if (cur == start)
 382         return(NULL);
 383     return(xmlStrndup(start, cur - start));
 384 }
 385
/**
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer.
 * If the encoding is unspecified, in the case we find a non-ASCII
 * char, then an encoding converter is plugged in automatically: the
 * encoding guessed by htmlFindEncoding() if any, ISO-Latin-1 otherwise.
 *
 * A zero byte found before the end of the input is reported as
 * XML_ERR_INVALID_CHAR and read as a space. Invalid UTF-8 is reported
 * as XML_ERR_INVALID_ENCODING and the offending byte is returned with a
 * length of 1.
 *
 * *len is set to 0 when a pending ctxt->token is returned. It is not
 * written at all on the two early "return(0)" paths (parser in the
 * XML_PARSER_EOF state, or xmlParserGrow() failure).
 *
 * NOTE(review): earlier documentation of this function mentioned end of
 * line normalization (2.11 End-of-Line Handling); no such handling is
 * visible in this function.
 *
 * Returns the current char value and its length
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    const unsigned char *cur;
    unsigned char c;
    unsigned int val;

    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    /* A pending token takes precedence over the input and has no length */
    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }

    /* Try to have at least INPUT_CHUNK bytes available */
    if ((ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) &&
        (xmlParserGrow(ctxt) < 0))
        return(0);

    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
        xmlChar * guess;
        xmlCharEncodingHandlerPtr handler;

        /*
         * Assume it's a fixed length encoding (1) with
         * a compatible encoding for the ASCII set, since
         * HTML constructs only use < 128 chars
         */
        if (*ctxt->input->cur < 0x80) {
            *len = 1;
            /* a zero byte before the end of the input is not the terminator */
            if ((*ctxt->input->cur == 0) &&
                (ctxt->input->cur < ctxt->input->end)) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                return(' ');
            }
            return(*ctxt->input->cur);
        }

        /*
         * Humm this is bad, do an automatic flow conversion
         */
        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            /* the input takes ownership of the guessed encoding name */
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = guess;
            handler = xmlFindCharEncodingHandler((const char *) guess);
            if (handler != NULL) {
                /*
                 * Don't use UTF-8 encoder which isn't required and
                 * can produce invalid UTF-8.
                 */
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
                    xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                             "Unsupported encoding %s", guess, NULL);
            }
        }
        /* from now on the input is decoded as UTF-8 below */
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
    }

    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
     *
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     *
     * Check for the 0x110000 limit too
     * NOTE(review): no explicit 0x110000 comparison appears below;
     * presumably it is covered by IS_CHAR() -- confirm.
     */
    cur = ctxt->input->cur;
    c = *cur;
    if (c & 0x80) {
        size_t avail;

        /* 10xxxxxx is a continuation byte, it cannot start a sequence */
        if ((c & 0x40) == 0)
            goto encoding_error;

        avail = ctxt->input->end - ctxt->input->cur;

        if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
            goto encoding_error;
        if ((c & 0xe0) == 0xe0) {
            if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
                goto encoding_error;
            if ((c & 0xf0) == 0xf0) {
                /* lead bytes 11111xxx (5 bytes and more) are rejected */
                if (((c & 0xf8) != 0xf0) ||
                    (avail < 4) || ((cur[3] & 0xc0) != 0x80))
                    goto encoding_error;
                /* 4-byte code */
                *len = 4;
                val = (cur[0] & 0x7) << 18;
                val |= (cur[1] & 0x3f) << 12;
                val |= (cur[2] & 0x3f) << 6;
                val |= cur[3] & 0x3f;
                /* overlong encoding */
                if (val < 0x10000)
                    goto encoding_error;
            } else {
              /* 3-byte code */
                *len = 3;
                val = (cur[0] & 0xf) << 12;
                val |= (cur[1] & 0x3f) << 6;
                val |= cur[2] & 0x3f;
                /* overlong encoding */
                if (val < 0x800)
                    goto encoding_error;
            }
        } else {
          /* 2-byte code */
            *len = 2;
            val = (cur[0] & 0x1f) << 6;
            val |= cur[1] & 0x3f;
            /* overlong encoding */
            if (val < 0x80)
                goto encoding_error;
        }
        /* the error is only reported, the value is still returned */
        if (!IS_CHAR(val)) {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Char 0x%X out of allowed range\n", val);
        }
        return(val);
    } else {
        /* a zero byte before the end of the input is read as a space */
        if ((*ctxt->input->cur == 0) &&
            (ctxt->input->cur < ctxt->input->end)) {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Char 0x%X out of allowed range\n", 0);
            *len = 1;
            return(' ');
        }
        /* 1-byte code */
        *len = 1;
        return(*ctxt->input->cur);
    }

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

        /* dump four bytes when that many are available, one otherwise */
        if (ctxt->input->end - ctxt->input->cur >= 4) {
            snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                            ctxt->input->cur[0], ctxt->input->cur[1],
                            ctxt->input->cur[2], ctxt->input->cur[3]);
        } else {
            snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
        }
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                     "Input is not proper UTF-8, indicate encoding !\n",
                     BAD_CAST buffer, NULL);
    }

    /*
     * Don't switch encodings twice. Note that if there's an encoder, we
     * shouldn't receive invalid UTF-8 anyway.
     *
     * Note that if ctxt->input->buf == NULL, switching encodings is
     * impossible, see Gitlab issue #34.
     */
    if ((ctxt->input->buf != NULL) &&
        (ctxt->input->buf->encoder == NULL))
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    /* hand back the byte now under the cursor as a 1-byte character */
    *len = 1;
    return(*ctxt->input->cur);
}
 573
 574 /**
 575  * htmlSkipBlankChars:
 576  * @ctxt:  the HTML parser context
 577  *
 578  * skip all blanks character found at that point in the input streams.
 579  *
 580  * Returns the number of space chars skipped
 581  */
 582
 583 static int
 584 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 585     int res = 0;
 586
 587     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 588         if (*(ctxt->input->cur) == '\n') {
 589             ctxt->input->line++; ctxt->input->col = 1;
 590         } else ctxt->input->col++;
 591         ctxt->input->cur++;
 592         if (*ctxt->input->cur == 0)
 593             xmlParserGrow(ctxt);
 594         if (res < INT_MAX)
 595             res++;
 596     }
 597     return(res);
 598 }
 599
 600
 601
 602 /************************************************************************
 603  *                                                                      *
 604  *      The list of HTML elements and their properties          *
 605  *                                                                      *
 606  ************************************************************************/
 607
 608 /*
 609  *  Start Tag: 1 means the start tag can be omitted
 610  *  End Tag:   1 means the end tag can be omitted
 611  *             2 means it's forbidden (empty elements)
 612  *             3 means the tag is stylistic and should be closed easily
 613  *  Depr:      this element is deprecated
 614  *  DTD:       1 means that this element is valid only in the Loose DTD
 615  *             2 means that this element is valid only in the Frameset DTD
 616  *
 617  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 618         , subElements , impliedsubelt , Attributes, userdata
 619  */
 620
 621 /* Definitions and a couple of vars for HTML Elements */
 622
 623 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 624 #define NB_FONTSTYLE 8
 625 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 626 #define NB_PHRASE 10
 627 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 628 #define NB_SPECIAL 16
 629 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 630 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 631 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 632 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 633 #define FORMCTRL "input", "select", "textarea", "label", "button"
 634 #define NB_FORMCTRL 5
 635 #define PCDATA
 636 #define NB_PCDATA 0
 637 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 638 #define NB_HEADING 6
 639 #define LIST "ul", "ol", "dir", "menu"
 640 #define NB_LIST 4
 641 #define MODIFIER
 642 #define NB_MODIFIER 0
 643 #define FLOW BLOCK,INLINE
 644 #define NB_FLOW NB_BLOCK + NB_INLINE
 645 #define EMPTY NULL
 646
 647
 648 static const char* const html_flow[] = { FLOW, NULL } ;
 649 static const char* const html_inline[] = { INLINE, NULL } ;
 650
 651 /* placeholders: elts with content but no subelements */
 652 static const char* const html_pcdata[] = { NULL } ;
 653 #define html_cdata html_pcdata
 654
 655
 656 /* ... and for HTML Attributes */
 657
 658 #define COREATTRS "id", "class", "style", "title"
 659 #define NB_COREATTRS 4
 660 #define I18N "lang", "dir"
 661 #define NB_I18N 2
 662 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 663 #define NB_EVENTS 9
 664 #define ATTRS COREATTRS,I18N,EVENTS
 665 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 666 #define CELLHALIGN "align", "char", "charoff"
 667 #define NB_CELLHALIGN 3
 668 #define CELLVALIGN "valign"
 669 #define NB_CELLVALIGN 1
 670
 671 static const char* const html_attrs[] = { ATTRS, NULL } ;
 672 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 673 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 674 static const char* const i18n_attrs[] = { I18N, NULL } ;
 675
 676
 677 /* Other declarations that should go inline ... */
 678 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 679         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 680         "tabindex", "onfocus", "onblur", NULL } ;
 681 static const char* const target_attr[] = { "target", NULL } ;
 682 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 683 static const char* const alt_attr[] = { "alt", NULL } ;
 684 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 685 static const char* const href_attrs[] = { "href", NULL } ;
 686 static const char* const clear_attrs[] = { "clear", NULL } ;
 687 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 688
 689 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 690 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 691                 "archive", "alt", "name", "height", "width", "align",
 692                 "hspace", "vspace", NULL } ;
 693 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 694         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 695 static const char* const basefont_attrs[] =
 696         { "id", "size", "color", "face", NULL } ;
 697 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 698 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 699 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 700 static const char* const body_depr[] = { "background", "bgcolor", "text",
 701         "link", "vlink", "alink", NULL } ;
 702 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 703         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 704
 705
 706 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 707 static const char* const col_elt[] = { "col", NULL } ;
 708 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 709 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 710 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 711 static const char* const compact_attr[] = { "compact", NULL } ;
 712 static const char* const label_attr[] = { "label", NULL } ;
 713 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 714 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 715 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 716 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 717 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 718 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 719 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 720 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 721 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 722 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 723 static const char* const version_attr[] = { "version", NULL } ;
 724 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 725 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 726 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 727 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
/*
 * NULL-terminated lists of attribute names and of child element names.
 * Every list below is referenced, through a DECL cast, by at least one
 * entry of html40ElementTable.
 * NOTE(review): ATTRS, COREATTRS, I18N, BLOCK, FLOW, PHRASE, MODIFIER,
 * CELLHALIGN and CELLVALIGN are used as macros here; they are presumably
 * defined earlier in this file -- confirm before changing a list.
 */
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
 770
/*
 * DECL casts one of the `const char * const []` name lists to
 * `const char **` so it can be used in the htmlElemDesc initializers
 * below (presumably the type of the corresponding struct members).
 */
#define DECL (const char**)

/*
 * Description table of the known HTML 4.0 elements, one htmlElemDesc
 * initializer per element.
 *
 * The entries MUST stay sorted by element name: htmlTagLookup() searches
 * this table with bsearch() and a case-insensitive name comparison, so an
 * out-of-order entry can no longer be found reliably.
 *
 * NOTE(review): the meaning of the seven integer columns and of the five
 * trailing columns is given by the htmlElemDesc declaration, which is not
 * part of this section of the file -- confirm there before editing or
 * adding an entry.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",    0, 0, 0, 0, 0, 0, 1, "",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
        DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
        EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
        EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
        EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
        EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
        DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
        DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",      0, 0, 0, 0, 0, 0, 0, "",
        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1052
/*
 * One auto-close rule: a start tag named newTag implies the end of a
 * currently open element named oldTag.
 */
typedef struct {
    const char *oldTag;    /* name of the open element that gets closed */
    const char *newTag;    /* name of the start tag that closes it */
} htmlStartCloseEntry;
1057
/*
 * start tags that imply the end of current element
 *
 * Each entry is an { oldTag, newTag } pair. The array MUST stay sorted by
 * oldTag and then by newTag, both in strcmp() order, because
 * htmlCheckAutoClose() looks pairs up with bsearch() using
 * htmlCompareStartClose() as comparator.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    { "a", "a" },
    { "a", "fieldset" },
    { "a", "table" },
    { "a", "td" },
    { "a", "th" },
    { "address", "dd" },
    { "address", "dl" },
    { "address", "dt" },
    { "address", "form" },
    { "address", "li" },
    { "address", "ul" },
    { "b", "center" },
    { "b", "p" },
    { "b", "td" },
    { "b", "th" },
    { "big", "p" },
    { "caption", "col" },
    { "caption", "colgroup" },
    { "caption", "tbody" },
    { "caption", "tfoot" },
    { "caption", "thead" },
    { "caption", "tr" },
    { "col", "col" },
    { "col", "colgroup" },
    { "col", "tbody" },
    { "col", "tfoot" },
    { "col", "thead" },
    { "col", "tr" },
    { "colgroup", "colgroup" },
    { "colgroup", "tbody" },
    { "colgroup", "tfoot" },
    { "colgroup", "thead" },
    { "colgroup", "tr" },
    { "dd", "dt" },
    { "dir", "dd" },
    { "dir", "dl" },
    { "dir", "dt" },
    { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" },
    { "dl", "li" },
    { "dt", "dd" },
    { "dt", "dl" },
    { "font", "center" },
    { "font", "td" },
    { "font", "th" },
    { "form", "form" },
    { "h1", "fieldset" },
    { "h1", "form" },
    { "h1", "li" },
    { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" },
    { "h2", "form" },
    { "h2", "li" },
    { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" },
    { "h3", "form" },
    { "h3", "li" },
    { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" },
    { "h4", "form" },
    { "h4", "li" },
    { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" },
    { "h5", "form" },
    { "h5", "li" },
    { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" },
    { "h6", "form" },
    { "h6", "li" },
    { "h6", "p" },
    { "h6", "table" },
    { "head", "a" },
    { "head", "abbr" },
    { "head", "acronym" },
    { "head", "address" },
    { "head", "b" },
    { "head", "bdo" },
    { "head", "big" },
    { "head", "blockquote" },
    { "head", "body" },
    { "head", "br" },
    { "head", "center" },
    { "head", "cite" },
    { "head", "code" },
    { "head", "dd" },
    { "head", "dfn" },
    { "head", "dir" },
    { "head", "div" },
    { "head", "dl" },
    { "head", "dt" },
    { "head", "em" },
    { "head", "fieldset" },
    { "head", "font" },
    { "head", "form" },
    { "head", "frameset" },
    { "head", "h1" },
    { "head", "h2" },
    { "head", "h3" },
    { "head", "h4" },
    { "head", "h5" },
    { "head", "h6" },
    { "head", "hr" },
    { "head", "i" },
    { "head", "iframe" },
    { "head", "img" },
    { "head", "kbd" },
    { "head", "li" },
    { "head", "listing" },
    { "head", "map" },
    { "head", "menu" },
    { "head", "ol" },
    { "head", "p" },
    { "head", "pre" },
    { "head", "q" },
    { "head", "s" },
    { "head", "samp" },
    { "head", "small" },
    { "head", "span" },
    { "head", "strike" },
    { "head", "strong" },
    { "head", "sub" },
    { "head", "sup" },
    { "head", "table" },
    { "head", "tt" },
    { "head", "u" },
    { "head", "ul" },
    { "head", "var" },
    { "head", "xmp" },
    { "hr", "form" },
    { "i", "center" },
    { "i", "p" },
    { "i", "td" },
    { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" },
    { "link", "frameset" },
    { "listing", "dd" },
    { "listing", "dl" },
    { "listing", "dt" },
    { "listing", "fieldset" },
    { "listing", "form" },
    { "listing", "li" },
    { "listing", "table" },
    { "listing", "ul" },
    { "menu", "dd" },
    { "menu", "dl" },
    { "menu", "dt" },
    { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" },
    { "option", "optgroup" },
    { "option", "option" },
    { "p", "address" },
    { "p", "blockquote" },
    { "p", "body" },
    { "p", "caption" },
    { "p", "center" },
    { "p", "col" },
    { "p", "colgroup" },
    { "p", "dd" },
    { "p", "dir" },
    { "p", "div" },
    { "p", "dl" },
    { "p", "dt" },
    { "p", "fieldset" },
    { "p", "form" },
    { "p", "frameset" },
    { "p", "h1" },
    { "p", "h2" },
    { "p", "h3" },
    { "p", "h4" },
    { "p", "h5" },
    { "p", "h6" },
    { "p", "head" },
    { "p", "hr" },
    { "p", "li" },
    { "p", "listing" },
    { "p", "menu" },
    { "p", "ol" },
    { "p", "p" },
    { "p", "pre" },
    { "p", "table" },
    { "p", "tbody" },
    { "p", "td" },
    { "p", "tfoot" },
    { "p", "th" },
    { "p", "title" },
    { "p", "tr" },
    { "p", "ul" },
    { "p", "xmp" },
    { "pre", "dd" },
    { "pre", "dl" },
    { "pre", "dt" },
    { "pre", "fieldset" },
    { "pre", "form" },
    { "pre", "li" },
    { "pre", "table" },
    { "pre", "ul" },
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" },
    { "span", "th" },
    { "strike", "p" },
    { "style", "body" },
    { "style", "frameset" },
    { "tbody", "tbody" },
    { "tbody", "tfoot" },
    { "td", "tbody" },
    { "td", "td" },
    { "td", "tfoot" },
    { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" },
    { "th", "td" },
    { "th", "tfoot" },
    { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" },
    { "thead", "tfoot" },
    { "title", "body" },
    { "title", "frameset" },
    { "tr", "tbody" },
    { "tr", "tfoot" },
    { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" },
    { "u", "td" },
    { "u", "th" },
    { "ul", "address" },
    { "ul", "form" },
    { "ul", "menu" },
    { "ul", "pre" },
    { "xmp", "dd" },
    { "xmp", "dl" },
    { "xmp", "dt" },
    { "xmp", "fieldset" },
    { "xmp", "form" },
    { "xmp", "li" },
    { "xmp", "table" },
    { "xmp", "ul" }
};
1312
/*
 * The list of HTML elements which are not supposed to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1325
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * NOTE: this array has no NULL terminator, so any walk over it must be
 *       bounded by its element count.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1351
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* Pairs an element name with its end-tag priority. */
typedef struct {
    const char *name;      /* element name; NULL in the final, default entry */
    int priority;          /* an end tag cannot close an open element whose
                              priority is higher than its own */
} elementPriority;

/*
 * Scanned linearly by htmlGetEndPriority(). The NULL-named last entry both
 * stops the scan and supplies the priority of every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1379
1380 /************************************************************************
1381  *                                                                      *
1382  *      functions to handle HTML specific data                  *
1383  *                                                                      *
1384  ************************************************************************/
1385
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function does nothing.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1394
1395 static int
1396 htmlCompareTags(const void *key, const void *member) {
1397     const xmlChar *tag = (const xmlChar *) key;
1398     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1399
1400     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1401 }
1402
1403 /**
1404  * htmlTagLookup:
1405  * @tag:  The tag name in lowercase
1406  *
1407  * Lookup the HTML tag in the ElementTable
1408  *
1409  * Returns the related htmlElemDescPtr or NULL if not found.
1410  */
1411 const htmlElemDesc *
1412 htmlTagLookup(const xmlChar *tag) {
1413     if (tag == NULL)
1414         return(NULL);
1415
1416     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1417                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1418                 sizeof(htmlElemDesc), htmlCompareTags));
1419 }
1420
1421 /**
1422  * htmlGetEndPriority:
1423  * @name: The name of the element to look up the priority for.
1424  *
1425  * Return value: The "endtag" priority.
1426  **/
1427 static int
1428 htmlGetEndPriority (const xmlChar *name) {
1429     int i = 0;
1430
1431     while ((htmlEndPriority[i].name != NULL) &&
1432            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1433         i++;
1434
1435     return(htmlEndPriority[i].priority);
1436 }
1437
1438
1439 static int
1440 htmlCompareStartClose(const void *vkey, const void *member) {
1441     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1442     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1443     int ret;
1444
1445     ret = strcmp(key->oldTag, entry->oldTag);
1446     if (ret == 0)
1447         ret = strcmp(key->newTag, entry->newTag);
1448
1449     return(ret);
1450 }
1451
1452 /**
1453  * htmlCheckAutoClose:
1454  * @newtag:  The new tag name
1455  * @oldtag:  The old tag name
1456  *
1457  * Checks whether the new tag is one of the registered valid tags for
1458  * closing old.
1459  *
1460  * Returns 0 if no, 1 if yes.
1461  */
1462 static int
1463 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1464 {
1465     htmlStartCloseEntry key;
1466     void *res;
1467
1468     key.oldTag = (const char *) oldtag;
1469     key.newTag = (const char *) newtag;
1470     res = bsearch(&key, htmlStartClose,
1471             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1472             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1473     return(res != NULL);
1474 }
1475
1476 /**
1477  * htmlAutoCloseOnClose:
1478  * @ctxt:  an HTML parser context
1479  * @newtag:  The new tag name
1480  * @force:  force the tag closure
1481  *
1482  * The HTML DTD allows an ending tag to implicitly close other tags.
1483  */
1484 static void
1485 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1486 {
1487     const htmlElemDesc *info;
1488     int i, priority;
1489
1490     priority = htmlGetEndPriority(newtag);
1491
1492     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1493
1494         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1495             break;
1496         /*
1497          * A misplaced endtag can only close elements with lower
1498          * or equal priority, so if we find an element with higher
1499          * priority before we find an element with
1500          * matching name, we just ignore this endtag
1501          */
1502         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1503             return;
1504     }
1505     if (i < 0)
1506         return;
1507
1508     while (!xmlStrEqual(newtag, ctxt->name)) {
1509         info = htmlTagLookup(ctxt->name);
1510         if ((info != NULL) && (info->endTag == 3)) {
1511             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1512                          "Opening and ending tag mismatch: %s and %s\n",
1513                          newtag, ctxt->name);
1514         }
1515         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1516             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1517         htmlnamePop(ctxt);
1518     }
1519 }
1520
1521 /**
1522  * htmlAutoCloseOnEnd:
1523  * @ctxt:  an HTML parser context
1524  *
1525  * Close all remaining tags at the end of the stream
1526  */
1527 static void
1528 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1529 {
1530     int i;
1531
1532     if (ctxt->nameNr == 0)
1533         return;
1534     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1535         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1536             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1537         htmlnamePop(ctxt);
1538     }
1539 }
1540
1541 /**
1542  * htmlAutoClose:
1543  * @ctxt:  an HTML parser context
1544  * @newtag:  The new tag name or NULL
1545  *
1546  * The HTML DTD allows a tag to implicitly close other tags.
1547  * The list is kept in htmlStartClose array. This function is
1548  * called when a new tag has been detected and generates the
1549  * appropriates closes if possible/needed.
1550  * If newtag is NULL this mean we are at the end of the resource
1551  * and we should check
1552  */
1553 static void
1554 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1555 {
1556     while ((newtag != NULL) && (ctxt->name != NULL) &&
1557            (htmlCheckAutoClose(newtag, ctxt->name))) {
1558         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1559             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1560         htmlnamePop(ctxt);
1561     }
1562     if (newtag == NULL) {
1563         htmlAutoCloseOnEnd(ctxt);
1564         return;
1565     }
1566     while ((newtag == NULL) && (ctxt->name != NULL) &&
1567            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1568             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1569             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1570         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1571             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1572         htmlnamePop(ctxt);
1573     }
1574 }
1575
1576 /**
1577  * htmlAutoCloseTag:
1578  * @doc:  the HTML document
1579  * @name:  The tag name
1580  * @elem:  the HTML element
1581  *
1582  * The HTML DTD allows a tag to implicitly close other tags.
1583  * The list is kept in htmlStartClose array. This function checks
1584  * if the element or one of it's children would autoclose the
1585  * given tag.
1586  *
1587  * Returns 1 if autoclose, 0 otherwise
1588  */
1589 int
1590 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1591     htmlNodePtr child;
1592
1593     if (elem == NULL) return(1);
1594     if (xmlStrEqual(name, elem->name)) return(0);
1595     if (htmlCheckAutoClose(elem->name, name)) return(1);
1596     child = elem->children;
1597     while (child != NULL) {
1598         if (htmlAutoCloseTag(doc, name, child)) return(1);
1599         child = child->next;
1600     }
1601     return(0);
1602 }
1603
1604 /**
1605  * htmlIsAutoClosed:
1606  * @doc:  the HTML document
1607  * @elem:  the HTML element
1608  *
1609  * The HTML DTD allows a tag to implicitly close other tags.
1610  * The list is kept in htmlStartClose array. This function checks
1611  * if a tag is autoclosed by one of it's child
1612  *
1613  * Returns 1 if autoclosed, 0 otherwise
1614  */
1615 int
1616 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1617     htmlNodePtr child;
1618
1619     if (elem == NULL) return(1);
1620     child = elem->children;
1621     while (child != NULL) {
1622         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1623         child = child->next;
1624     }
1625     return(0);
1626 }
1627
1628 /**
1629  * htmlCheckImplied:
1630  * @ctxt:  an HTML parser context
1631  * @newtag:  The new tag name
1632  *
1633  * The HTML DTD allows a tag to exists only implicitly
1634  * called when a new tag has been detected and generates the
1635  * appropriates implicit tags if missing
1636  */
1637 static void
1638 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1639     int i;
1640
1641     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1642         return;
1643     if (!htmlOmittedDefaultValue)
1644         return;
1645     if (xmlStrEqual(newtag, BAD_CAST"html"))
1646         return;
1647     if (ctxt->nameNr <= 0) {
1648         htmlnamePush(ctxt, BAD_CAST"html");
1649         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1650             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1651     }
1652     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1653         return;
1654     if ((ctxt->nameNr <= 1) &&
1655         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1656          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1657          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1658          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1659          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1660          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1661         if (ctxt->html >= 3) {
1662             /* we already saw or generated an <head> before */
1663             return;
1664         }
1665         /*
1666          * dropped OBJECT ... i you put it first BODY will be
1667          * assumed !
1668          */
1669         htmlnamePush(ctxt, BAD_CAST"head");
1670         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1671             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1672     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1673                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1674                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1675         if (ctxt->html >= 10) {
1676             /* we already saw or generated a <body> before */
1677             return;
1678         }
1679         for (i = 0;i < ctxt->nameNr;i++) {
1680             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1681                 return;
1682             }
1683             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1684                 return;
1685             }
1686         }
1687
1688         htmlnamePush(ctxt, BAD_CAST"body");
1689         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1690             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1691     }
1692 }
1693
1694 /**
1695  * htmlCheckParagraph
1696  * @ctxt:  an HTML parser context
1697  *
1698  * Check whether a p element need to be implied before inserting
1699  * characters in the current element.
1700  *
1701  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1702  *         in case of error.
1703  */
1704
1705 static int
1706 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1707     const xmlChar *tag;
1708     int i;
1709
1710     if (ctxt == NULL)
1711         return(-1);
1712     tag = ctxt->name;
1713     if (tag == NULL) {
1714         htmlAutoClose(ctxt, BAD_CAST"p");
1715         htmlCheckImplied(ctxt, BAD_CAST"p");
1716         htmlnamePush(ctxt, BAD_CAST"p");
1717         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1718             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1719         return(1);
1720     }
1721     if (!htmlOmittedDefaultValue)
1722         return(0);
1723     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1724         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1725             htmlAutoClose(ctxt, BAD_CAST"p");
1726             htmlCheckImplied(ctxt, BAD_CAST"p");
1727             htmlnamePush(ctxt, BAD_CAST"p");
1728             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1729                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1730             return(1);
1731         }
1732     }
1733     return(0);
1734 }
1735
1736 /**
1737  * htmlIsScriptAttribute:
1738  * @name:  an attribute name
1739  *
1740  * Check if an attribute is of content type Script
1741  *
1742  * Returns 1 is the attribute is a script 0 otherwise
1743  */
1744 int
1745 htmlIsScriptAttribute(const xmlChar *name) {
1746     unsigned int i;
1747
1748     if (name == NULL)
1749       return(0);
1750     /*
1751      * all script attributes start with 'on'
1752      */
1753     if ((name[0] != 'o') || (name[1] != 'n'))
1754       return(0);
1755     for (i = 0;
1756          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1757          i++) {
1758         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1759             return(1);
1760     }
1761     return(0);
1762 }
1763
1764 /************************************************************************
1765  *                                                                      *
1766  *      The list of HTML predefined entities                    *
1767  *                                                                      *
1768  ************************************************************************/
1769
1770
/*
 * Table of the predefined HTML 4.0 character entities. Each entry gives
 * the character value, the entity name and a descriptive string.
 *
 * The entries are listed by increasing value: htmlEntityValueLookup()
 * stops scanning at the first entry whose value is greater than the one
 * searched, so a new entry must be inserted at its sorted position.
 * htmlEntityLookup() scans the whole table comparing names.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38,   "amp",  "ampersand, U+0026 ISOnum" },
{ 39,   "apos", "single quote" },
{ 60,   "lt",   "less-than sign, U+003C ISOnum" },
{ 62,   "gt",   "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,  "not",  "not sign, U+00AC ISOnum" },
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,  "divide","division sign, U+00F7 ISOnum" },
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,  "tilde","small tilde, U+02DC ISOdia" },

{ 913,  "Alpha","greek capital letter alpha, U+0391" },
{ 914,  "Beta", "greek capital letter beta, U+0392" },
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,  "Iota", "greek capital letter iota, U+0399" },
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,  "Mu",   "greek capital letter mu, U+039C" },
{ 925,  "Nu",   "greek capital letter nu, U+039D" },
{ 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
{ 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
{ 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
{ 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
{ 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni",   "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang",  "angle, U+2220 ISOamso" },
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or",   "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
{ 8747, "int",  "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne",   "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
2053
2054 /************************************************************************
2055  *                                                                      *
2056  *              Commodity functions to handle entities                  *
2057  *                                                                      *
2058  ************************************************************************/
2059
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the variable named <buffer>_size and reallocates @buffer to
 * that size with xmlRealloc(). The expansion relies on names that must
 * exist in the function using it: the <buffer>_size variable and a
 * parser context called ctxt, which is handed to htmlErrMemory().
 *
 * If the reallocation fails, the error is reported, the old buffer is
 * released with xmlFree() and the enclosing function returns NULL, so
 * the macro is only usable in functions returning a pointer. The size
 * variable has already been doubled at that point.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);                \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
2074
2075 /**
2076  * htmlEntityLookup:
2077  * @name: the entity name
2078  *
2079  * Lookup the given entity in EntitiesTable
2080  *
2081  * TODO: the linear scan is really ugly, an hash table is really needed.
2082  *
2083  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2084  */
2085 const htmlEntityDesc *
2086 htmlEntityLookup(const xmlChar *name) {
2087     unsigned int i;
2088
2089     for (i = 0;i < (sizeof(html40EntitiesTable)/
2090                     sizeof(html40EntitiesTable[0]));i++) {
2091         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2092             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2093         }
2094     }
2095     return(NULL);
2096 }
2097
2098 /**
2099  * htmlEntityValueLookup:
2100  * @value: the entity's unicode value
2101  *
2102  * Lookup the given entity in EntitiesTable
2103  *
2104  * TODO: the linear scan is really ugly, an hash table is really needed.
2105  *
2106  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2107  */
2108 const htmlEntityDesc *
2109 htmlEntityValueLookup(unsigned int value) {
2110     unsigned int i;
2111
2112     for (i = 0;i < (sizeof(html40EntitiesTable)/
2113                     sizeof(html40EntitiesTable[0]));i++) {
2114         if (html40EntitiesTable[i].value >= value) {
2115             if (html40EntitiesTable[i].value > value)
2116                 break;
2117             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2118         }
2119     }
2120     return(NULL);
2121 }
2122
2123 /**
2124  * UTF8ToHtml:
2125  * @out:  a pointer to an array of bytes to store the result
2126  * @outlen:  the length of @out
2127  * @in:  a pointer to an array of UTF-8 chars
2128  * @inlen:  the length of @in
2129  *
2130  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2131  * plus HTML entities block of chars out.
2132  *
2133  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2134  * The value of @inlen after return is the number of octets consumed
2135  *     as the return value is positive, else unpredictable.
2136  * The value of @outlen after return is the number of octets consumed.
2137  */
2138 int
2139 UTF8ToHtml(unsigned char* out, int *outlen,
2140               const unsigned char* in, int *inlen) {
2141     const unsigned char* processed = in;
2142     const unsigned char* outend;
2143     const unsigned char* outstart = out;
2144     const unsigned char* instart = in;
2145     const unsigned char* inend;
2146     unsigned int c, d;
2147     int trailing;
2148
2149     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2150     if (in == NULL) {
2151         /*
2152          * initialization nothing to do
2153          */
2154         *outlen = 0;
2155         *inlen = 0;
2156         return(0);
2157     }
2158     inend = in + (*inlen);
2159     outend = out + (*outlen);
2160     while (in < inend) {
2161         d = *in++;
2162         if      (d < 0x80)  { c= d; trailing= 0; }
2163         else if (d < 0xC0) {
2164             /* trailing byte in leading position */
2165             *outlen = out - outstart;
2166             *inlen = processed - instart;
2167             return(-2);
2168         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2169         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2170         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2171         else {
2172             /* no chance for this in Ascii */
2173             *outlen = out - outstart;
2174             *inlen = processed - instart;
2175             return(-2);
2176         }
2177
2178         if (inend - in < trailing) {
2179             break;
2180         }
2181
2182         for ( ; trailing; trailing--) {
2183             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2184                 break;
2185             c <<= 6;
2186             c |= d & 0x3F;
2187         }
2188
2189         /* assertion: c is a single UTF-4 value */
2190         if (c < 0x80) {
2191             if (out + 1 >= outend)
2192                 break;
2193             *out++ = c;
2194         } else {
2195             int len;
2196             const htmlEntityDesc * ent;
2197             const char *cp;
2198             char nbuf[16];
2199
2200             /*
2201              * Try to lookup a predefined HTML entity for it
2202              */
2203
2204             ent = htmlEntityValueLookup(c);
2205             if (ent == NULL) {
2206               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2207               cp = nbuf;
2208             }
2209             else
2210               cp = ent->name;
2211             len = strlen(cp);
2212             if (out + 2 + len >= outend)
2213                 break;
2214             *out++ = '&';
2215             memcpy(out, cp, len);
2216             out += len;
2217             *out++ = ';';
2218         }
2219         processed = in;
2220     }
2221     *outlen = out - outstart;
2222     *inlen = processed - instart;
2223     return(0);
2224 }
2225
2226 /**
2227  * htmlEncodeEntities:
2228  * @out:  a pointer to an array of bytes to store the result
2229  * @outlen:  the length of @out
2230  * @in:  a pointer to an array of UTF-8 chars
2231  * @inlen:  the length of @in
2232  * @quoteChar: the quote character to escape (' or ") or zero.
2233  *
2234  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2235  * plus HTML entities block of chars out.
2236  *
2237  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2238  * The value of @inlen after return is the number of octets consumed
2239  *     as the return value is positive, else unpredictable.
2240  * The value of @outlen after return is the number of octets consumed.
2241  */
2242 int
2243 htmlEncodeEntities(unsigned char* out, int *outlen,
2244                    const unsigned char* in, int *inlen, int quoteChar) {
2245     const unsigned char* processed = in;
2246     const unsigned char* outend;
2247     const unsigned char* outstart = out;
2248     const unsigned char* instart = in;
2249     const unsigned char* inend;
2250     unsigned int c, d;
2251     int trailing;
2252
2253     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2254         return(-1);
2255     outend = out + (*outlen);
2256     inend = in + (*inlen);
2257     while (in < inend) {
2258         d = *in++;
2259         if      (d < 0x80)  { c= d; trailing= 0; }
2260         else if (d < 0xC0) {
2261             /* trailing byte in leading position */
2262             *outlen = out - outstart;
2263             *inlen = processed - instart;
2264             return(-2);
2265         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2266         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2267         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2268         else {
2269             /* no chance for this in Ascii */
2270             *outlen = out - outstart;
2271             *inlen = processed - instart;
2272             return(-2);
2273         }
2274
2275         if (inend - in < trailing)
2276             break;
2277
2278         while (trailing--) {
2279             if (((d= *in++) & 0xC0) != 0x80) {
2280                 *outlen = out - outstart;
2281                 *inlen = processed - instart;
2282                 return(-2);
2283             }
2284             c <<= 6;
2285             c |= d & 0x3F;
2286         }
2287
2288         /* assertion: c is a single UTF-4 value */
2289         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2290             (c != '&') && (c != '<') && (c != '>')) {
2291             if (out >= outend)
2292                 break;
2293             *out++ = c;
2294         } else {
2295             const htmlEntityDesc * ent;
2296             const char *cp;
2297             char nbuf[16];
2298             int len;
2299
2300             /*
2301              * Try to lookup a predefined HTML entity for it
2302              */
2303             ent = htmlEntityValueLookup(c);
2304             if (ent == NULL) {
2305                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2306                 cp = nbuf;
2307             }
2308             else
2309                 cp = ent->name;
2310             len = strlen(cp);
2311             if (outend - out < len + 2)
2312                 break;
2313             *out++ = '&';
2314             memcpy(out, cp, len);
2315             out += len;
2316             *out++ = ';';
2317         }
2318         processed = in;
2319     }
2320     *outlen = out - outstart;
2321     *inlen = processed - instart;
2322     return(0);
2323 }
2324
2325 /************************************************************************
2326  *                                                                      *
2327  *              Commodity functions to handle streams                   *
2328  *                                                                      *
2329  ************************************************************************/
2330
2331 #ifdef LIBXML_PUSH_ENABLED
2332 /**
2333  * htmlNewInputStream:
2334  * @ctxt:  an HTML parser context
2335  *
2336  * Create a new input stream structure
2337  * Returns the new input stream or NULL
2338  */
2339 static htmlParserInputPtr
2340 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2341     htmlParserInputPtr input;
2342
2343     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2344     if (input == NULL) {
2345         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2346         return(NULL);
2347     }
2348     memset(input, 0, sizeof(htmlParserInput));
2349     input->filename = NULL;
2350     input->directory = NULL;
2351     input->base = NULL;
2352     input->cur = NULL;
2353     input->buf = NULL;
2354     input->line = 1;
2355     input->col = 1;
2356     input->buf = NULL;
2357     input->free = NULL;
2358     input->version = NULL;
2359     input->consumed = 0;
2360     input->length = 0;
2361     return(input);
2362 }
2363 #endif
2364
2365
2366 /************************************************************************
2367  *                                                                      *
2368  *              Commodity functions, cleanup needed ?                   *
2369  *                                                                      *
2370  ************************************************************************/
/*
 * Names of all elements allowing PCDATA according to the HTML 4.01
 * loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * Both the strings and the table of pointers are constant: nothing
 * in this file writes to the table, it is only scanned.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2385
2386 /**
2387  * areBlanks:
2388  * @ctxt:  an HTML parser context
2389  * @str:  a xmlChar *
2390  * @len:  the size of @str
2391  *
2392  * Is this a sequence of blank chars that one can ignore ?
2393  *
2394  * Returns 1 if ignorable 0 otherwise.
2395  */
2396
2397 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2398     unsigned int i;
2399     int j;
2400     xmlNodePtr lastChild;
2401     xmlDtdPtr dtd;
2402
2403     for (j = 0;j < len;j++)
2404         if (!(IS_BLANK_CH(str[j]))) return(0);
2405
2406     if (CUR == 0) return(1);
2407     if (CUR != '<') return(0);
2408     if (ctxt->name == NULL)
2409         return(1);
2410     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2411         return(1);
2412     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2413         return(1);
2414
2415     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2416     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2417         dtd = xmlGetIntSubset(ctxt->myDoc);
2418         if (dtd != NULL && dtd->ExternalID != NULL) {
2419             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2420                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2421                 return(1);
2422         }
2423     }
2424
2425     if (ctxt->node == NULL) return(0);
2426     lastChild = xmlGetLastChild(ctxt->node);
2427     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2428         lastChild = lastChild->prev;
2429     if (lastChild == NULL) {
2430         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2431             (ctxt->node->content != NULL)) return(0);
2432         /* keep ws in constructs like ...<b> </b>...
2433            for all tags "b" allowing PCDATA */
2434         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2435             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2436                 return(0);
2437             }
2438         }
2439     } else if (xmlNodeIsText(lastChild)) {
2440         return(0);
2441     } else {
2442         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2443            for all tags "p" allowing PCDATA */
2444         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2445             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2446                 return(0);
2447             }
2448         }
2449     }
2450     return(1);
2451 }
2452
2453 /**
2454  * htmlNewDocNoDtD:
2455  * @URI:  URI for the dtd, or NULL
2456  * @ExternalID:  the external ID of the DTD, or NULL
2457  *
2458  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2459  * are NULL
2460  *
2461  * Returns a new document, do not initialize the DTD if not provided
2462  */
2463 htmlDocPtr
2464 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2465     xmlDocPtr cur;
2466
2467     /*
2468      * Allocate a new document and fill the fields.
2469      */
2470     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2471     if (cur == NULL) {
2472         htmlErrMemory(NULL, "HTML document creation failed\n");
2473         return(NULL);
2474     }
2475     memset(cur, 0, sizeof(xmlDoc));
2476
2477     cur->type = XML_HTML_DOCUMENT_NODE;
2478     cur->version = NULL;
2479     cur->intSubset = NULL;
2480     cur->doc = cur;
2481     cur->name = NULL;
2482     cur->children = NULL;
2483     cur->extSubset = NULL;
2484     cur->oldNs = NULL;
2485     cur->encoding = NULL;
2486     cur->standalone = 1;
2487     cur->compression = 0;
2488     cur->ids = NULL;
2489     cur->refs = NULL;
2490     cur->_private = NULL;
2491     cur->charset = XML_CHAR_ENCODING_UTF8;
2492     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2493     if ((ExternalID != NULL) ||
2494         (URI != NULL))
2495         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2496     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2497         xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2498     return(cur);
2499 }
2500
2501 /**
2502  * htmlNewDoc:
2503  * @URI:  URI for the dtd, or NULL
2504  * @ExternalID:  the external ID of the DTD, or NULL
2505  *
2506  * Creates a new HTML document
2507  *
2508  * Returns a new document
2509  */
2510 htmlDocPtr
2511 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2512     if ((URI == NULL) && (ExternalID == NULL))
2513         return(htmlNewDocNoDtD(
2514                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2515                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2516
2517     return(htmlNewDocNoDtD(URI, ExternalID));
2518 }
2519
2520
2521 /************************************************************************
2522  *                                                                      *
2523  *                      The parser itself                               *
2524  *      Relates to http://www.w3.org/TR/html40                          *
2525  *                                                                      *
2526  ************************************************************************/
2527
2528 /************************************************************************
2529  *                                                                      *
2530  *                      The parser itself                               *
2531  *                                                                      *
2532  ************************************************************************/
2533
2534 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2535
2536 static void
2537 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2538     int c;
2539
2540     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2541                  "Incorrectly opened comment\n", NULL, NULL);
2542
2543     do {
2544         c = CUR;
2545         if (c == 0)
2546             break;
2547         NEXT;
2548     } while (c != '>');
2549 }
2550
2551 /**
2552  * htmlParseHTMLName:
2553  * @ctxt:  an HTML parser context
2554  *
2555  * parse an HTML tag or attribute name, note that we convert it to lowercase
2556  * since HTML names are not case-sensitive.
2557  *
2558  * Returns the Tag Name parsed or NULL
2559  */
2560
2561 static const xmlChar *
2562 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2563     const xmlChar *ret;
2564     int i = 0;
2565     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2566
2567     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2568         (CUR != ':') && (CUR != '.')) return(NULL);
2569
2570     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2571            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2572            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2573            (CUR == '.'))) {
2574         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2575         else loc[i] = CUR;
2576         i++;
2577
2578         NEXT;
2579     }
2580
2581     ret = xmlDictLookup(ctxt->dict, loc, i);
2582     if (ret == NULL)
2583         htmlErrMemory(ctxt, NULL);
2584
2585     return(ret);
2586 }
2587
2588
2589 /**
2590  * htmlParseHTMLName_nonInvasive:
2591  * @ctxt:  an HTML parser context
2592  *
2593  * parse an HTML tag or attribute name, note that we convert it to lowercase
2594  * since HTML names are not case-sensitive, this doesn't consume the data
2595  * from the stream, it's a look-ahead
2596  *
2597  * Returns the Tag Name parsed or NULL
2598  */
2599
2600 static const xmlChar *
2601 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2602     int i = 0;
2603     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2604
2605     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2606         (NXT(1) != ':')) return(NULL);
2607
2608     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2609            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2610            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2611         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2612         else loc[i] = NXT(1+i);
2613         i++;
2614     }
2615
2616     return(xmlDictLookup(ctxt->dict, loc, i));
2617 }
2618
2619
2620 /**
2621  * htmlParseName:
2622  * @ctxt:  an HTML parser context
2623  *
2624  * parse an HTML name, this routine is case sensitive.
2625  *
2626  * Returns the Name parsed or NULL
2627  */
2628
2629 static const xmlChar *
2630 htmlParseName(htmlParserCtxtPtr ctxt) {
2631     const xmlChar *in;
2632     const xmlChar *ret;
2633     int count = 0;
2634
2635     GROW;
2636
2637     /*
2638      * Accelerator for simple ASCII names
2639      */
2640     in = ctxt->input->cur;
2641     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2642         ((*in >= 0x41) && (*in <= 0x5A)) ||
2643         (*in == '_') || (*in == ':')) {
2644         in++;
2645         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2646                ((*in >= 0x41) && (*in <= 0x5A)) ||
2647                ((*in >= 0x30) && (*in <= 0x39)) ||
2648                (*in == '_') || (*in == '-') ||
2649                (*in == ':') || (*in == '.'))
2650             in++;
2651
2652         if (in == ctxt->input->end)
2653             return(NULL);
2654
2655         if ((*in > 0) && (*in < 0x80)) {
2656             count = in - ctxt->input->cur;
2657             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2658             ctxt->input->cur = in;
2659             ctxt->input->col += count;
2660             return(ret);
2661         }
2662     }
2663     return(htmlParseNameComplex(ctxt));
2664 }
2665
2666 static const xmlChar *
2667 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2668     int len = 0, l;
2669     int c;
2670     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2671                     XML_MAX_TEXT_LENGTH :
2672                     XML_MAX_NAME_LENGTH;
2673     const xmlChar *base = ctxt->input->base;
2674
2675     /*
2676      * Handler for more complex cases
2677      */
2678     c = CUR_CHAR(l);
2679     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2680         (!IS_LETTER(c) && (c != '_') &&
2681          (c != ':'))) {
2682         return(NULL);
2683     }
2684
2685     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2686            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2687             (c == '.') || (c == '-') ||
2688             (c == '_') || (c == ':') ||
2689             (IS_COMBINING(c)) ||
2690             (IS_EXTENDER(c)))) {
2691         len += l;
2692         if (len > maxLength) {
2693             htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2694             return(NULL);
2695         }
2696         NEXTL(l);
2697         c = CUR_CHAR(l);
2698         if (ctxt->input->base != base) {
2699             /*
2700              * We changed encoding from an unknown encoding
2701              * Input buffer changed location, so we better start again
2702              */
2703             return(htmlParseNameComplex(ctxt));
2704         }
2705     }
2706     if (ctxt->instate == XML_PARSER_EOF)
2707         return(NULL);
2708
2709     if (ctxt->input->cur - ctxt->input->base < len) {
2710         /* Sanity check */
2711         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2712                      "unexpected change of input buffer", NULL, NULL);
2713         return (NULL);
2714     }
2715
2716     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2717 }
2718
2719
2720 /**
2721  * htmlParseHTMLAttribute:
2722  * @ctxt:  an HTML parser context
2723  * @stop:  a char stop value
2724  *
2725  * parse an HTML attribute value till the stop (quote), if
2726  * stop is 0 then it stops at the first space
2727  *
2728  * Returns the attribute parsed or NULL
2729  */
2730
2731 static xmlChar *
2732 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2733     xmlChar *buffer = NULL;
2734     int buffer_size = 0;
2735     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2736                     XML_MAX_HUGE_LENGTH :
2737                     XML_MAX_TEXT_LENGTH;
2738     xmlChar *out = NULL;
2739     const xmlChar *name = NULL;
2740     const xmlChar *cur = NULL;
2741     const htmlEntityDesc * ent;
2742
2743     /*
2744      * allocate a translation buffer.
2745      */
2746     buffer_size = HTML_PARSER_BUFFER_SIZE;
2747     buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2748     if (buffer == NULL) {
2749         htmlErrMemory(ctxt, "buffer allocation failed\n");
2750         return(NULL);
2751     }
2752     out = buffer;
2753
2754     /*
2755      * Ok loop until we reach one of the ending chars
2756      */
2757     while ((CUR != 0) && (CUR != stop)) {
2758         if ((stop == 0) && (CUR == '>')) break;
2759         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2760         if (CUR == '&') {
2761             if (NXT(1) == '#') {
2762                 unsigned int c;
2763                 int bits;
2764
2765                 c = htmlParseCharRef(ctxt);
2766                 if      (c <    0x80)
2767                         { *out++  = c;                bits= -6; }
2768                 else if (c <   0x800)
2769                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2770                 else if (c < 0x10000)
2771                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2772                 else
2773                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2774
2775                 for ( ; bits >= 0; bits-= 6) {
2776                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2777                 }
2778
2779                 if (out - buffer > buffer_size - 100) {
2780                         int indx = out - buffer;
2781
2782                         growBuffer(buffer);
2783                         out = &buffer[indx];
2784                 }
2785             } else {
2786                 ent = htmlParseEntityRef(ctxt, &name);
2787                 if (name == NULL) {
2788                     *out++ = '&';
2789                     if (out - buffer > buffer_size - 100) {
2790                         int indx = out - buffer;
2791
2792                         growBuffer(buffer);
2793                         out = &buffer[indx];
2794                     }
2795                 } else if (ent == NULL) {
2796                     *out++ = '&';
2797                     cur = name;
2798                     while (*cur != 0) {
2799                         if (out - buffer > buffer_size - 100) {
2800                             int indx = out - buffer;
2801
2802                             growBuffer(buffer);
2803                             out = &buffer[indx];
2804                         }
2805                         *out++ = *cur++;
2806                     }
2807                 } else {
2808                     unsigned int c;
2809                     int bits;
2810
2811                     if (out - buffer > buffer_size - 100) {
2812                         int indx = out - buffer;
2813
2814                         growBuffer(buffer);
2815                         out = &buffer[indx];
2816                     }
2817                     c = ent->value;
2818                     if      (c <    0x80)
2819                         { *out++  = c;                bits= -6; }
2820                     else if (c <   0x800)
2821                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2822                     else if (c < 0x10000)
2823                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2824                     else
2825                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2826
2827                     for ( ; bits >= 0; bits-= 6) {
2828                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2829                     }
2830                 }
2831             }
2832         } else {
2833             unsigned int c;
2834             int bits, l;
2835
2836             if (out - buffer > buffer_size - 100) {
2837                 int indx = out - buffer;
2838
2839                 growBuffer(buffer);
2840                 out = &buffer[indx];
2841             }
2842             c = CUR_CHAR(l);
2843             if (ctxt->instate == XML_PARSER_EOF) {
2844                 xmlFree(buffer);
2845                 return(NULL);
2846             }
2847             if      (c <    0x80)
2848                     { *out++  = c;                bits= -6; }
2849             else if (c <   0x800)
2850                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2851             else if (c < 0x10000)
2852                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2853             else
2854                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2855
2856             for ( ; bits >= 0; bits-= 6) {
2857                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2858             }
2859             NEXTL(l);
2860         }
2861         if (out - buffer > maxLength) {
2862             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2863                          "attribute value too long\n", NULL, NULL);
2864             xmlFree(buffer);
2865             return(NULL);
2866         }
2867     }
2868     *out = 0;
2869     return(buffer);
2870 }
2871
2872 /**
2873  * htmlParseEntityRef:
2874  * @ctxt:  an HTML parser context
2875  * @str:  location to store the entity name
2876  *
2877  * DEPRECATED: Internal function, don't use.
2878  *
2879  * parse an HTML ENTITY references
2880  *
2881  * [68] EntityRef ::= '&' Name ';'
2882  *
2883  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2884  *         if non-NULL *str will have to be freed by the caller.
2885  */
2886 const htmlEntityDesc *
2887 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2888     const xmlChar *name;
2889     const htmlEntityDesc * ent = NULL;
2890
2891     if (str != NULL) *str = NULL;
2892     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2893
2894     if (CUR == '&') {
2895         NEXT;
2896         name = htmlParseName(ctxt);
2897         if (name == NULL) {
2898             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2899                          "htmlParseEntityRef: no name\n", NULL, NULL);
2900         } else {
2901             GROW;
2902             if (CUR == ';') {
2903                 if (str != NULL)
2904                     *str = name;
2905
2906                 /*
2907                  * Lookup the entity in the table.
2908                  */
2909                 ent = htmlEntityLookup(name);
2910                 if (ent != NULL) /* OK that's ugly !!! */
2911                     NEXT;
2912             } else {
2913                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2914                              "htmlParseEntityRef: expecting ';'\n",
2915                              NULL, NULL);
2916                 if (str != NULL)
2917                     *str = name;
2918             }
2919         }
2920     }
2921     return(ent);
2922 }
2923
2924 /**
2925  * htmlParseAttValue:
2926  * @ctxt:  an HTML parser context
2927  *
2928  * parse a value for an attribute
2929  * Note: the parser won't do substitution of entities here, this
2930  * will be handled later in xmlStringGetNodeList, unless it was
2931  * asked for ctxt->replaceEntities != 0
2932  *
2933  * Returns the AttValue parsed or NULL.
2934  */
2935
2936 static xmlChar *
2937 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2938     xmlChar *ret = NULL;
2939
2940     if (CUR == '"') {
2941         NEXT;
2942         ret = htmlParseHTMLAttribute(ctxt, '"');
2943         if (CUR != '"') {
2944             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2945                          "AttValue: \" expected\n", NULL, NULL);
2946         } else
2947             NEXT;
2948     } else if (CUR == '\'') {
2949         NEXT;
2950         ret = htmlParseHTMLAttribute(ctxt, '\'');
2951         if (CUR != '\'') {
2952             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2953                          "AttValue: ' expected\n", NULL, NULL);
2954         } else
2955             NEXT;
2956     } else {
2957         /*
2958          * That's an HTMLism, the attribute value may not be quoted
2959          */
2960         ret = htmlParseHTMLAttribute(ctxt, 0);
2961         if (ret == NULL) {
2962             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2963                          "AttValue: no value found\n", NULL, NULL);
2964         }
2965     }
2966     return(ret);
2967 }
2968
2969 /**
2970  * htmlParseSystemLiteral:
2971  * @ctxt:  an HTML parser context
2972  *
2973  * parse an HTML Literal
2974  *
2975  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2976  *
2977  * Returns the SystemLiteral parsed or NULL
2978  */
2979
2980 static xmlChar *
2981 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2982     size_t len = 0, startPosition = 0;
2983     int err = 0;
2984     int quote;
2985     xmlChar *ret = NULL;
2986
2987     if ((CUR != '"') && (CUR != '\'')) {
2988         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2989                      "SystemLiteral \" or ' expected\n", NULL, NULL);
2990         return(NULL);
2991     }
2992     quote = CUR;
2993     NEXT;
2994
2995     if (CUR_PTR < BASE_PTR)
2996         return(ret);
2997     startPosition = CUR_PTR - BASE_PTR;
2998
2999     while ((CUR != 0) && (CUR != quote)) {
3000         /* TODO: Handle UTF-8 */
3001         if (!IS_CHAR_CH(CUR)) {
3002             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3003                             "Invalid char in SystemLiteral 0x%X\n", CUR);
3004             err = 1;
3005         }
3006         NEXT;
3007         len++;
3008     }
3009     if (CUR != quote) {
3010         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3011                      "Unfinished SystemLiteral\n", NULL, NULL);
3012     } else {
3013         if (err == 0)
3014             ret = xmlStrndup((BASE_PTR+startPosition), len);
3015         NEXT;
3016     }
3017
3018     return(ret);
3019 }
3020
3021 /**
3022  * htmlParsePubidLiteral:
3023  * @ctxt:  an HTML parser context
3024  *
3025  * parse an HTML public literal
3026  *
3027  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3028  *
3029  * Returns the PubidLiteral parsed or NULL.
3030  */
3031
3032 static xmlChar *
3033 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3034     size_t len = 0, startPosition = 0;
3035     int err = 0;
3036     int quote;
3037     xmlChar *ret = NULL;
3038
3039     if ((CUR != '"') && (CUR != '\'')) {
3040         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3041                      "PubidLiteral \" or ' expected\n", NULL, NULL);
3042         return(NULL);
3043     }
3044     quote = CUR;
3045     NEXT;
3046
3047     /*
3048      * Name ::= (Letter | '_') (NameChar)*
3049      */
3050     if (CUR_PTR < BASE_PTR)
3051         return(ret);
3052     startPosition = CUR_PTR - BASE_PTR;
3053
3054     while ((CUR != 0) && (CUR != quote)) {
3055         if (!IS_PUBIDCHAR_CH(CUR)) {
3056             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3057                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3058             err = 1;
3059         }
3060         len++;
3061         NEXT;
3062     }
3063
3064     if (CUR != quote) {
3065         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3066                      "Unfinished PubidLiteral\n", NULL, NULL);
3067     } else {
3068         if (err == 0)
3069             ret = xmlStrndup((BASE_PTR + startPosition), len);
3070         NEXT;
3071     }
3072
3073     return(ret);
3074 }
3075
3076 /**
3077  * htmlParseScript:
3078  * @ctxt:  an HTML parser context
3079  *
3080  * parse the content of an HTML SCRIPT or STYLE element
3081  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3082  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3083  * http://www.w3.org/TR/html4/types.html#type-script
3084  * http://www.w3.org/TR/html4/types.html#h-6.15
3085  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3086  *
3087  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3088  * element and the value of intrinsic event attributes. User agents must
3089  * not evaluate script data as HTML markup but instead must pass it on as
3090  * data to a script engine.
3091  * NOTES:
3092  * - The content is passed like CDATA
3093  * - the attributes for style and scripting "onXXX" are also described
3094  *   as CDATA but SGML allows entities references in attributes so their
3095  *   processing is identical as other attributes
3096  */
3097 static void
3098 htmlParseScript(htmlParserCtxtPtr ctxt) {
3099     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3100     int nbchar = 0;
3101     int cur,l;
3102
3103     cur = CUR_CHAR(l);
3104     while (cur != 0) {
3105         if ((cur == '<') && (NXT(1) == '/')) {
3106             /*
3107              * One should break here, the specification is clear:
3108              * Authors should therefore escape "</" within the content.
3109              * Escape mechanisms are specific to each scripting or
3110              * style sheet language.
3111              *
3112              * In recovery mode, only break if end tag match the
3113              * current tag, effectively ignoring all tags inside the
3114              * script/style block and treating the entire block as
3115              * CDATA.
3116              */
3117             if (ctxt->recovery) {
3118                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3119                                    xmlStrlen(ctxt->name)) == 0)
3120                 {
3121                     break; /* while */
3122                 } else {
3123                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3124                                  "Element %s embeds close tag\n",
3125                                  ctxt->name, NULL);
3126                 }
3127             } else {
3128                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3129                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3130                 {
3131                     break; /* while */
3132                 }
3133             }
3134         }
3135         if (IS_CHAR(cur)) {
3136             COPY_BUF(l,buf,nbchar,cur);
3137         } else {
3138             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3139                             "Invalid char in CDATA 0x%X\n", cur);
3140         }
3141         NEXTL(l);
3142         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3143             buf[nbchar] = 0;
3144             if (ctxt->sax->cdataBlock!= NULL) {
3145                 /*
3146                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3147                  */
3148                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3149             } else if (ctxt->sax->characters != NULL) {
3150                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3151             }
3152             nbchar = 0;
3153             SHRINK;
3154         }
3155         cur = CUR_CHAR(l);
3156     }
3157
3158     if (ctxt->instate == XML_PARSER_EOF)
3159         return;
3160
3161     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3162         buf[nbchar] = 0;
3163         if (ctxt->sax->cdataBlock!= NULL) {
3164             /*
3165              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3166              */
3167             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3168         } else if (ctxt->sax->characters != NULL) {
3169             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3170         }
3171     }
3172 }
3173
3174
3175 /**
3176  * htmlParseCharDataInternal:
3177  * @ctxt:  an HTML parser context
3178  * @readahead: optional read ahead character in ascii range
3179  *
3180  * parse a CharData section.
3181  * if we are within a CDATA section ']]>' marks an end of section.
3182  *
3183  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3184  */
3185
3186 static void
3187 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3188     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3189     int nbchar = 0;
3190     int cur, l;
3191
3192     if (readahead)
3193         buf[nbchar++] = readahead;
3194
3195     cur = CUR_CHAR(l);
3196     while (((cur != '<') || (ctxt->token == '<')) &&
3197            ((cur != '&') || (ctxt->token == '&')) &&
3198            (cur != 0)) {
3199         if (!(IS_CHAR(cur))) {
3200             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3201                         "Invalid char in CDATA 0x%X\n", cur);
3202         } else {
3203             COPY_BUF(l,buf,nbchar,cur);
3204         }
3205         NEXTL(l);
3206         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3207             buf[nbchar] = 0;
3208
3209             /*
3210              * Ok the segment is to be consumed as chars.
3211              */
3212             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3213                 if (areBlanks(ctxt, buf, nbchar)) {
3214                     if (ctxt->keepBlanks) {
3215                         if (ctxt->sax->characters != NULL)
3216                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3217                     } else {
3218                         if (ctxt->sax->ignorableWhitespace != NULL)
3219                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3220                                                            buf, nbchar);
3221                     }
3222                 } else {
3223                     htmlCheckParagraph(ctxt);
3224                     if (ctxt->sax->characters != NULL)
3225                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3226                 }
3227             }
3228             nbchar = 0;
3229             SHRINK;
3230         }
3231         cur = CUR_CHAR(l);
3232     }
3233     if (ctxt->instate == XML_PARSER_EOF)
3234         return;
3235     if (nbchar != 0) {
3236         buf[nbchar] = 0;
3237
3238         /*
3239          * Ok the segment is to be consumed as chars.
3240          */
3241         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3242             if (areBlanks(ctxt, buf, nbchar)) {
3243                 if (ctxt->keepBlanks) {
3244                     if (ctxt->sax->characters != NULL)
3245                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3246                 } else {
3247                     if (ctxt->sax->ignorableWhitespace != NULL)
3248                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3249                                                        buf, nbchar);
3250                 }
3251             } else {
3252                 htmlCheckParagraph(ctxt);
3253                 if (ctxt->sax->characters != NULL)
3254                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3255             }
3256         }
3257     }
3258 }
3259
3260 /**
3261  * htmlParseCharData:
3262  * @ctxt:  an HTML parser context
3263  *
3264  * parse a CharData section.
3265  * if we are within a CDATA section ']]>' marks an end of section.
3266  *
3267  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268  */
3269
3270 static void
3271 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272     htmlParseCharDataInternal(ctxt, 0);
3273 }
3274
3275 /**
3276  * htmlParseExternalID:
3277  * @ctxt:  an HTML parser context
3278  * @publicID:  a xmlChar** receiving PubidLiteral
3279  *
3280  * Parse an External ID or a Public ID
3281  *
3282  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3284  *
3285  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286  *
3287  * Returns the function returns SystemLiteral and in the second
3288  *                case publicID receives PubidLiteral, is strict is off
3289  *                it is possible to return NULL and have publicID set.
3290  */
3291
3292 static xmlChar *
3293 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294     xmlChar *URI = NULL;
3295
3296     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299         SKIP(6);
3300         if (!IS_BLANK_CH(CUR)) {
3301             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302                          "Space required after 'SYSTEM'\n", NULL, NULL);
3303         }
3304         SKIP_BLANKS;
3305         URI = htmlParseSystemLiteral(ctxt);
3306         if (URI == NULL) {
3307             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309         }
3310     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313         SKIP(6);
3314         if (!IS_BLANK_CH(CUR)) {
3315             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316                          "Space required after 'PUBLIC'\n", NULL, NULL);
3317         }
3318         SKIP_BLANKS;
3319         *publicID = htmlParsePubidLiteral(ctxt);
3320         if (*publicID == NULL) {
3321             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323                          NULL, NULL);
3324         }
3325         SKIP_BLANKS;
3326         if ((CUR == '"') || (CUR == '\'')) {
3327             URI = htmlParseSystemLiteral(ctxt);
3328         }
3329     }
3330     return(URI);
3331 }
3332
3333 /**
3334  * xmlParsePI:
3335  * @ctxt:  an XML parser context
3336  *
3337  * parse an XML Processing Instruction.
3338  *
3339  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3340  */
3341 static void
3342 htmlParsePI(htmlParserCtxtPtr ctxt) {
3343     xmlChar *buf = NULL;
3344     int len = 0;
3345     int size = HTML_PARSER_BUFFER_SIZE;
3346     int cur, l;
3347     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3348                     XML_MAX_HUGE_LENGTH :
3349                     XML_MAX_TEXT_LENGTH;
3350     const xmlChar *target;
3351     xmlParserInputState state;
3352
3353     if ((RAW == '<') && (NXT(1) == '?')) {
3354         state = ctxt->instate;
3355         ctxt->instate = XML_PARSER_PI;
3356         /*
3357          * this is a Processing Instruction.
3358          */
3359         SKIP(2);
3360
3361         /*
3362          * Parse the target name and check for special support like
3363          * namespace.
3364          */
3365         target = htmlParseName(ctxt);
3366         if (target != NULL) {
3367             if (RAW == '>') {
3368                 SKIP(1);
3369
3370                 /*
3371                  * SAX: PI detected.
3372                  */
3373                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3374                     (ctxt->sax->processingInstruction != NULL))
3375                     ctxt->sax->processingInstruction(ctxt->userData,
3376                                                      target, NULL);
3377                 ctxt->instate = state;
3378                 return;
3379             }
3380             buf = (xmlChar *) xmlMallocAtomic(size);
3381             if (buf == NULL) {
3382                 htmlErrMemory(ctxt, NULL);
3383                 ctxt->instate = state;
3384                 return;
3385             }
3386             cur = CUR;
3387             if (!IS_BLANK(cur)) {
3388                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3389                           "ParsePI: PI %s space expected\n", target, NULL);
3390             }
3391             SKIP_BLANKS;
3392             cur = CUR_CHAR(l);
3393             while ((cur != 0) && (cur != '>')) {
3394                 if (len + 5 >= size) {
3395                     xmlChar *tmp;
3396
3397                     size *= 2;
3398                     tmp = (xmlChar *) xmlRealloc(buf, size);
3399                     if (tmp == NULL) {
3400                         htmlErrMemory(ctxt, NULL);
3401                         xmlFree(buf);
3402                         ctxt->instate = state;
3403                         return;
3404                     }
3405                     buf = tmp;
3406                 }
3407                 if (IS_CHAR(cur)) {
3408                     COPY_BUF(l,buf,len,cur);
3409                 } else {
3410                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3411                                     "Invalid char in processing instruction "
3412                                     "0x%X\n", cur);
3413                 }
3414                 if (len > maxLength) {
3415                     htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3416                                  "PI %s too long", target, NULL);
3417                     xmlFree(buf);
3418                     ctxt->instate = state;
3419                     return;
3420                 }
3421                 NEXTL(l);
3422                 cur = CUR_CHAR(l);
3423             }
3424             buf[len] = 0;
3425             if (ctxt->instate == XML_PARSER_EOF) {
3426                 xmlFree(buf);
3427                 return;
3428             }
3429             if (cur != '>') {
3430                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3431                       "ParsePI: PI %s never end ...\n", target, NULL);
3432             } else {
3433                 SKIP(1);
3434
3435                 /*
3436                  * SAX: PI detected.
3437                  */
3438                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3439                     (ctxt->sax->processingInstruction != NULL))
3440                     ctxt->sax->processingInstruction(ctxt->userData,
3441                                                      target, buf);
3442             }
3443             xmlFree(buf);
3444         } else {
3445             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3446                          "PI is not started correctly", NULL, NULL);
3447         }
3448         ctxt->instate = state;
3449     }
3450 }
3451
3452 /**
3453  * htmlParseComment:
3454  * @ctxt:  an HTML parser context
3455  *
3456  * Parse an XML (SGML) comment <!-- .... -->
3457  *
3458  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3459  */
3460 static void
3461 htmlParseComment(htmlParserCtxtPtr ctxt) {
3462     xmlChar *buf = NULL;
3463     int len;
3464     int size = HTML_PARSER_BUFFER_SIZE;
3465     int q, ql;
3466     int r, rl;
3467     int cur, l;
3468     int next, nl;
3469     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3470                     XML_MAX_HUGE_LENGTH :
3471                     XML_MAX_TEXT_LENGTH;
3472     xmlParserInputState state;
3473
3474     /*
3475      * Check that there is a comment right here.
3476      */
3477     if ((RAW != '<') || (NXT(1) != '!') ||
3478         (NXT(2) != '-') || (NXT(3) != '-')) return;
3479
3480     state = ctxt->instate;
3481     ctxt->instate = XML_PARSER_COMMENT;
3482     SKIP(4);
3483     buf = (xmlChar *) xmlMallocAtomic(size);
3484     if (buf == NULL) {
3485         htmlErrMemory(ctxt, "buffer allocation failed\n");
3486         ctxt->instate = state;
3487         return;
3488     }
3489     len = 0;
3490     buf[len] = 0;
3491     q = CUR_CHAR(ql);
3492     if (q == 0)
3493         goto unfinished;
3494     if (q == '>') {
3495         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3496         cur = '>';
3497         goto finished;
3498     }
3499     NEXTL(ql);
3500     r = CUR_CHAR(rl);
3501     if (r == 0)
3502         goto unfinished;
3503     if (q == '-' && r == '>') {
3504         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3505         cur = '>';
3506         goto finished;
3507     }
3508     NEXTL(rl);
3509     cur = CUR_CHAR(l);
3510     while ((cur != 0) &&
3511            ((cur != '>') ||
3512             (r != '-') || (q != '-'))) {
3513         NEXTL(l);
3514         next = CUR_CHAR(nl);
3515
3516         if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3517           htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3518                        "Comment incorrectly closed by '--!>'", NULL, NULL);
3519           cur = '>';
3520           break;
3521         }
3522
3523         if (len + 5 >= size) {
3524             xmlChar *tmp;
3525
3526             size *= 2;
3527             tmp = (xmlChar *) xmlRealloc(buf, size);
3528             if (tmp == NULL) {
3529                 xmlFree(buf);
3530                 htmlErrMemory(ctxt, "growing buffer failed\n");
3531                 ctxt->instate = state;
3532                 return;
3533             }
3534             buf = tmp;
3535         }
3536         if (IS_CHAR(q)) {
3537             COPY_BUF(ql,buf,len,q);
3538         } else {
3539             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3540                             "Invalid char in comment 0x%X\n", q);
3541         }
3542         if (len > maxLength) {
3543             htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3544                          "comment too long", NULL, NULL);
3545             xmlFree(buf);
3546             ctxt->instate = state;
3547             return;
3548         }
3549
3550         q = r;
3551         ql = rl;
3552         r = cur;
3553         rl = l;
3554         cur = next;
3555         l = nl;
3556     }
3557 finished:
3558     buf[len] = 0;
3559     if (ctxt->instate == XML_PARSER_EOF) {
3560         xmlFree(buf);
3561         return;
3562     }
3563     if (cur == '>') {
3564         NEXT;
3565         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3566             (!ctxt->disableSAX))
3567             ctxt->sax->comment(ctxt->userData, buf);
3568         xmlFree(buf);
3569         ctxt->instate = state;
3570         return;
3571     }
3572
3573 unfinished:
3574     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3575                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3576     xmlFree(buf);
3577 }
3578
3579 /**
3580  * htmlParseCharRef:
3581  * @ctxt:  an HTML parser context
3582  *
3583  * DEPRECATED: Internal function, don't use.
3584  *
3585  * parse Reference declarations
3586  *
3587  * [66] CharRef ::= '&#' [0-9]+ ';' |
3588  *                  '&#x' [0-9a-fA-F]+ ';'
3589  *
3590  * Returns the value parsed (as an int)
3591  */
3592 int
3593 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3594     int val = 0;
3595
3596     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3597         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3598                      "htmlParseCharRef: context error\n",
3599                      NULL, NULL);
3600         return(0);
3601     }
3602     if ((CUR == '&') && (NXT(1) == '#') &&
3603         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3604         SKIP(3);
3605         while (CUR != ';') {
3606             if ((CUR >= '0') && (CUR <= '9')) {
3607                 if (val < 0x110000)
3608                     val = val * 16 + (CUR - '0');
3609             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3610                 if (val < 0x110000)
3611                     val = val * 16 + (CUR - 'a') + 10;
3612             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3613                 if (val < 0x110000)
3614                     val = val * 16 + (CUR - 'A') + 10;
3615             } else {
3616                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3617                              "htmlParseCharRef: missing semicolon\n",
3618                              NULL, NULL);
3619                 break;
3620             }
3621             NEXT;
3622         }
3623         if (CUR == ';')
3624             NEXT;
3625     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3626         SKIP(2);
3627         while (CUR != ';') {
3628             if ((CUR >= '0') && (CUR <= '9')) {
3629                 if (val < 0x110000)
3630                     val = val * 10 + (CUR - '0');
3631             } else {
3632                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3633                              "htmlParseCharRef: missing semicolon\n",
3634                              NULL, NULL);
3635                 break;
3636             }
3637             NEXT;
3638         }
3639         if (CUR == ';')
3640             NEXT;
3641     } else {
3642         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3643                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3644     }
3645     /*
3646      * Check the value IS_CHAR ...
3647      */
3648     if (IS_CHAR(val)) {
3649         return(val);
3650     } else if (val >= 0x110000) {
3651         htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3652                      "htmlParseCharRef: value too large\n", NULL, NULL);
3653     } else {
3654         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3655                         "htmlParseCharRef: invalid xmlChar value %d\n",
3656                         val);
3657     }
3658     return(0);
3659 }
3660
3661
3662 /**
3663  * htmlParseDocTypeDecl:
3664  * @ctxt:  an HTML parser context
3665  *
3666  * parse a DOCTYPE declaration
3667  *
3668  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3669  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3670  */
3671
3672 static void
3673 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3674     const xmlChar *name;
3675     xmlChar *ExternalID = NULL;
3676     xmlChar *URI = NULL;
3677
3678     /*
3679      * We know that '<!DOCTYPE' has been detected.
3680      */
3681     SKIP(9);
3682
3683     SKIP_BLANKS;
3684
3685     /*
3686      * Parse the DOCTYPE name.
3687      */
3688     name = htmlParseName(ctxt);
3689     if (name == NULL) {
3690         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3691                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3692                      NULL, NULL);
3693     }
3694     /*
3695      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3696      */
3697
3698     SKIP_BLANKS;
3699
3700     /*
3701      * Check for SystemID and ExternalID
3702      */
3703     URI = htmlParseExternalID(ctxt, &ExternalID);
3704     SKIP_BLANKS;
3705
3706     /*
3707      * We should be at the end of the DOCTYPE declaration.
3708      */
3709     if (CUR != '>') {
3710         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3711                      "DOCTYPE improperly terminated\n", NULL, NULL);
3712         /* Ignore bogus content */
3713         while ((CUR != 0) && (CUR != '>') &&
3714                (ctxt->instate != XML_PARSER_EOF))
3715             NEXT;
3716     }
3717     if (CUR == '>')
3718         NEXT;
3719
3720     /*
3721      * Create or update the document accordingly to the DOCTYPE
3722      */
3723     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3724         (!ctxt->disableSAX))
3725         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3726
3727     /*
3728      * Cleanup, since we don't use all those identifiers
3729      */
3730     if (URI != NULL) xmlFree(URI);
3731     if (ExternalID != NULL) xmlFree(ExternalID);
3732 }
3733
3734 /**
3735  * htmlParseAttribute:
3736  * @ctxt:  an HTML parser context
3737  * @value:  a xmlChar ** used to store the value of the attribute
3738  *
3739  * parse an attribute
3740  *
3741  * [41] Attribute ::= Name Eq AttValue
3742  *
3743  * [25] Eq ::= S? '=' S?
3744  *
3745  * With namespace:
3746  *
3747  * [NS 11] Attribute ::= QName Eq AttValue
3748  *
3749  * Also the case QName == xmlns:??? is handled independently as a namespace
3750  * definition.
3751  *
3752  * Returns the attribute name, and the value in *value.
3753  */
3754
3755 static const xmlChar *
3756 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3757     const xmlChar *name;
3758     xmlChar *val = NULL;
3759
3760     *value = NULL;
3761     name = htmlParseHTMLName(ctxt);
3762     if (name == NULL) {
3763         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3764                      "error parsing attribute name\n", NULL, NULL);
3765         return(NULL);
3766     }
3767
3768     /*
3769      * read the value
3770      */
3771     SKIP_BLANKS;
3772     if (CUR == '=') {
3773         NEXT;
3774         SKIP_BLANKS;
3775         val = htmlParseAttValue(ctxt);
3776     }
3777
3778     *value = val;
3779     return(name);
3780 }
3781
3782 /**
3783  * htmlCheckEncodingDirect:
3784  * @ctxt:  an HTML parser context
3785  * @attvalue: the attribute value
3786  *
3787  * Checks an attribute value to detect
3788  * the encoding
3789  * If a new encoding is detected the parser is switched to decode
3790  * it and pass UTF8
3791  */
3792 static void
3793 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3794
3795     if ((ctxt == NULL) || (encoding == NULL) ||
3796         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3797         return;
3798
3799     /* do not change encoding */
3800     if (ctxt->input->encoding != NULL)
3801         return;
3802
3803     if (encoding != NULL) {
3804         xmlCharEncoding enc;
3805         xmlCharEncodingHandlerPtr handler;
3806
3807         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3808
3809         if (ctxt->input->encoding != NULL)
3810             xmlFree((xmlChar *) ctxt->input->encoding);
3811         ctxt->input->encoding = xmlStrdup(encoding);
3812
3813         enc = xmlParseCharEncoding((const char *) encoding);
3814         /*
3815          * registered set of known encodings
3816          */
3817         if (enc != XML_CHAR_ENCODING_ERROR) {
3818             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3819                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3820                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3821                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3822                 (ctxt->input->buf != NULL) &&
3823                 (ctxt->input->buf->encoder == NULL)) {
3824                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3825                              "htmlCheckEncoding: wrong encoding meta\n",
3826                              NULL, NULL);
3827             } else {
3828                 xmlSwitchEncoding(ctxt, enc);
3829             }
3830             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3831         } else {
3832             /*
3833              * fallback for unknown encodings
3834              */
3835             handler = xmlFindCharEncodingHandler((const char *) encoding);
3836             if (handler != NULL) {
3837                 xmlSwitchToEncoding(ctxt, handler);
3838                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3839             } else {
3840                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3841                              "htmlCheckEncoding: unknown encoding %s\n",
3842                              encoding, NULL);
3843             }
3844         }
3845
3846         if ((ctxt->input->buf != NULL) &&
3847             (ctxt->input->buf->encoder != NULL) &&
3848             (ctxt->input->buf->raw != NULL) &&
3849             (ctxt->input->buf->buffer != NULL)) {
3850             int nbchars;
3851             size_t processed;
3852
3853             /*
3854              * convert as much as possible to the parser reading buffer.
3855              */
3856             processed = ctxt->input->cur - ctxt->input->base;
3857             xmlBufShrink(ctxt->input->buf->buffer, processed);
3858             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3859             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3860             if (nbchars < 0) {
3861                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3862                              "htmlCheckEncoding: encoder error\n",
3863                              NULL, NULL);
3864             }
3865         }
3866     }
3867 }
3868
3869 /**
3870  * htmlCheckEncoding:
3871  * @ctxt:  an HTML parser context
3872  * @attvalue: the attribute value
3873  *
3874  * Checks an http-equiv attribute from a Meta tag to detect
3875  * the encoding
3876  * If a new encoding is detected the parser is switched to decode
3877  * it and pass UTF8
3878  */
3879 static void
3880 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3881     const xmlChar *encoding;
3882
3883     if (!attvalue)
3884         return;
3885
3886     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3887     if (encoding != NULL) {
3888         encoding += 7;
3889     }
3890     /*
3891      * skip blank
3892      */
3893     if (encoding && IS_BLANK_CH(*encoding))
3894         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3895     if (encoding && *encoding == '=') {
3896         encoding ++;
3897         htmlCheckEncodingDirect(ctxt, encoding);
3898     }
3899 }
3900
3901 /**
3902  * htmlCheckMeta:
3903  * @ctxt:  an HTML parser context
3904  * @atts:  the attributes values
3905  *
3906  * Checks an attributes from a Meta tag
3907  */
3908 static void
3909 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3910     int i;
3911     const xmlChar *att, *value;
3912     int http = 0;
3913     const xmlChar *content = NULL;
3914
3915     if ((ctxt == NULL) || (atts == NULL))
3916         return;
3917
3918     i = 0;
3919     att = atts[i++];
3920     while (att != NULL) {
3921         value = atts[i++];
3922         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3923          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3924             http = 1;
3925         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3926             htmlCheckEncodingDirect(ctxt, value);
3927         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3928             content = value;
3929         att = atts[i++];
3930     }
3931     if ((http) && (content != NULL))
3932         htmlCheckEncoding(ctxt, content);
3933
3934 }
3935
3936 /**
3937  * htmlParseStartTag:
3938  * @ctxt:  an HTML parser context
3939  *
3940  * parse a start of tag either for rule element or
3941  * EmptyElement. In both case we don't parse the tag closing chars.
3942  *
3943  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3944  *
3945  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3946  *
3947  * With namespace:
3948  *
3949  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3950  *
3951  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3952  *
3953  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3954  */
3955
3956 static int
3957 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3958     const xmlChar *name;
3959     const xmlChar *attname;
3960     xmlChar *attvalue;
3961     const xmlChar **atts;
3962     int nbatts = 0;
3963     int maxatts;
3964     int meta = 0;
3965     int i;
3966     int discardtag = 0;
3967
3968     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3969         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3970                      "htmlParseStartTag: context error\n", NULL, NULL);
3971         return -1;
3972     }
3973     if (ctxt->instate == XML_PARSER_EOF)
3974         return(-1);
3975     if (CUR != '<') return -1;
3976     NEXT;
3977
3978     atts = ctxt->atts;
3979     maxatts = ctxt->maxatts;
3980
3981     GROW;
3982     name = htmlParseHTMLName(ctxt);
3983     if (name == NULL) {
3984         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3985                      "htmlParseStartTag: invalid element name\n",
3986                      NULL, NULL);
3987         /* Dump the bogus tag like browsers do */
3988         while ((CUR != 0) && (CUR != '>') &&
3989                (ctxt->instate != XML_PARSER_EOF))
3990             NEXT;
3991         return -1;
3992     }
3993     if (xmlStrEqual(name, BAD_CAST"meta"))
3994         meta = 1;
3995
3996     /*
3997      * Check for auto-closure of HTML elements.
3998      */
3999     htmlAutoClose(ctxt, name);
4000
4001     /*
4002      * Check for implied HTML elements.
4003      */
4004     htmlCheckImplied(ctxt, name);
4005
4006     /*
4007      * Avoid html at any level > 0, head at any level != 1
4008      * or any attempt to recurse body
4009      */
4010     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4011         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012                      "htmlParseStartTag: misplaced <html> tag\n",
4013                      name, NULL);
4014         discardtag = 1;
4015         ctxt->depth++;
4016     }
4017     if ((ctxt->nameNr != 1) &&
4018         (xmlStrEqual(name, BAD_CAST"head"))) {
4019         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4020                      "htmlParseStartTag: misplaced <head> tag\n",
4021                      name, NULL);
4022         discardtag = 1;
4023         ctxt->depth++;
4024     }
4025     if (xmlStrEqual(name, BAD_CAST"body")) {
4026         int indx;
4027         for (indx = 0;indx < ctxt->nameNr;indx++) {
4028             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4029                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4030                              "htmlParseStartTag: misplaced <body> tag\n",
4031                              name, NULL);
4032                 discardtag = 1;
4033                 ctxt->depth++;
4034             }
4035         }
4036     }
4037
4038     /*
4039      * Now parse the attributes, it ends up with the ending
4040      *
4041      * (S Attribute)* S?
4042      */
4043     SKIP_BLANKS;
4044     while ((CUR != 0) &&
4045            (CUR != '>') &&
4046            ((CUR != '/') || (NXT(1) != '>')) &&
4047            (ctxt->instate != XML_PARSER_EOF)) {
4048         GROW;
4049         attname = htmlParseAttribute(ctxt, &attvalue);
4050         if (attname != NULL) {
4051
4052             /*
4053              * Well formedness requires at most one declaration of an attribute
4054              */
4055             for (i = 0; i < nbatts;i += 2) {
4056                 if (xmlStrEqual(atts[i], attname)) {
4057                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4058                                  "Attribute %s redefined\n", attname, NULL);
4059                     if (attvalue != NULL)
4060                         xmlFree(attvalue);
4061                     goto failed;
4062                 }
4063             }
4064
4065             /*
4066              * Add the pair to atts
4067              */
4068             if (atts == NULL) {
4069                 maxatts = 22; /* allow for 10 attrs by default */
4070                 atts = (const xmlChar **)
4071                        xmlMalloc(maxatts * sizeof(xmlChar *));
4072                 if (atts == NULL) {
4073                     htmlErrMemory(ctxt, NULL);
4074                     if (attvalue != NULL)
4075                         xmlFree(attvalue);
4076                     goto failed;
4077                 }
4078                 ctxt->atts = atts;
4079                 ctxt->maxatts = maxatts;
4080             } else if (nbatts + 4 > maxatts) {
4081                 const xmlChar **n;
4082
4083                 maxatts *= 2;
4084                 n = (const xmlChar **) xmlRealloc((void *) atts,
4085                                              maxatts * sizeof(const xmlChar *));
4086                 if (n == NULL) {
4087                     htmlErrMemory(ctxt, NULL);
4088                     if (attvalue != NULL)
4089                         xmlFree(attvalue);
4090                     goto failed;
4091                 }
4092                 atts = n;
4093                 ctxt->atts = atts;
4094                 ctxt->maxatts = maxatts;
4095             }
4096             atts[nbatts++] = attname;
4097             atts[nbatts++] = attvalue;
4098             atts[nbatts] = NULL;
4099             atts[nbatts + 1] = NULL;
4100         }
4101         else {
4102             if (attvalue != NULL)
4103                 xmlFree(attvalue);
4104             /* Dump the bogus attribute string up to the next blank or
4105              * the end of the tag. */
4106             while ((CUR != 0) &&
4107                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4108                    ((CUR != '/') || (NXT(1) != '>')) &&
4109                    (ctxt->instate != XML_PARSER_EOF))
4110                 NEXT;
4111         }
4112
4113 failed:
4114         SKIP_BLANKS;
4115     }
4116
4117     /*
4118      * Handle specific association to the META tag
4119      */
4120     if (meta && (nbatts != 0))
4121         htmlCheckMeta(ctxt, atts);
4122
4123     /*
4124      * SAX: Start of Element !
4125      */
4126     if (!discardtag) {
4127         htmlnamePush(ctxt, name);
4128         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4129             if (nbatts != 0)
4130                 ctxt->sax->startElement(ctxt->userData, name, atts);
4131             else
4132                 ctxt->sax->startElement(ctxt->userData, name, NULL);
4133         }
4134     }
4135
4136     if (atts != NULL) {
4137         for (i = 1;i < nbatts;i += 2) {
4138             if (atts[i] != NULL)
4139                 xmlFree((xmlChar *) atts[i]);
4140         }
4141     }
4142
4143     return(discardtag);
4144 }
4145
4146 /**
4147  * htmlParseEndTag:
4148  * @ctxt:  an HTML parser context
4149  *
4150  * parse an end of tag
4151  *
4152  * [42] ETag ::= '</' Name S? '>'
4153  *
4154  * With namespace
4155  *
4156  * [NS 9] ETag ::= '</' QName S? '>'
4157  *
4158  * Returns 1 if the current level should be closed.
4159  */
4160
4161 static int
4162 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4163 {
4164     const xmlChar *name;
4165     const xmlChar *oldname;
4166     int i, ret;
4167
4168     if ((CUR != '<') || (NXT(1) != '/')) {
4169         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4170                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
4171         return (0);
4172     }
4173     SKIP(2);
4174
4175     name = htmlParseHTMLName(ctxt);
4176     if (name == NULL)
4177         return (0);
4178     /*
4179      * We should definitely be at the ending "S? '>'" part
4180      */
4181     SKIP_BLANKS;
4182     if (CUR != '>') {
4183         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4184                      "End tag : expected '>'\n", NULL, NULL);
4185         /* Skip to next '>' */
4186         while ((CUR != 0) && (CUR != '>'))
4187             NEXT;
4188     }
4189     if (CUR == '>')
4190         NEXT;
4191
4192     /*
4193      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4194      * out now.
4195      */
4196     if ((ctxt->depth > 0) &&
4197         (xmlStrEqual(name, BAD_CAST "html") ||
4198          xmlStrEqual(name, BAD_CAST "body") ||
4199          xmlStrEqual(name, BAD_CAST "head"))) {
4200         ctxt->depth--;
4201         return (0);
4202     }
4203
4204     /*
4205      * If the name read is not one of the element in the parsing stack
4206      * then return, it's just an error.
4207      */
4208     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4209         if (xmlStrEqual(name, ctxt->nameTab[i]))
4210             break;
4211     }
4212     if (i < 0) {
4213         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4214                      "Unexpected end tag : %s\n", name, NULL);
4215         return (0);
4216     }
4217
4218
4219     /*
4220      * Check for auto-closure of HTML elements.
4221      */
4222
4223     htmlAutoCloseOnClose(ctxt, name);
4224
4225     /*
4226      * Well formedness constraints, opening and closing must match.
4227      * With the exception that the autoclose may have popped stuff out
4228      * of the stack.
4229      */
4230     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4231         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4232                      "Opening and ending tag mismatch: %s and %s\n",
4233                      name, ctxt->name);
4234     }
4235
4236     /*
4237      * SAX: End of Tag
4238      */
4239     oldname = ctxt->name;
4240     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4241         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4242             ctxt->sax->endElement(ctxt->userData, name);
4243         htmlNodeInfoPop(ctxt);
4244         htmlnamePop(ctxt);
4245         ret = 1;
4246     } else {
4247         ret = 0;
4248     }
4249
4250     return (ret);
4251 }
4252
4253
4254 /**
4255  * htmlParseReference:
4256  * @ctxt:  an HTML parser context
4257  *
4258  * parse and handle entity references in content,
4259  * this will end-up in a call to character() since this is either a
4260  * CharRef, or a predefined entity.
4261  */
4262 static void
4263 htmlParseReference(htmlParserCtxtPtr ctxt) {
4264     const htmlEntityDesc * ent;
4265     xmlChar out[6];
4266     const xmlChar *name;
4267     if (CUR != '&') return;
4268
4269     if (NXT(1) == '#') {
4270         unsigned int c;
4271         int bits, i = 0;
4272
4273         c = htmlParseCharRef(ctxt);
4274         if (c == 0)
4275             return;
4276
4277         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4278         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4279         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4280         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4281
4282         for ( ; bits >= 0; bits-= 6) {
4283             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4284         }
4285         out[i] = 0;
4286
4287         htmlCheckParagraph(ctxt);
4288         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4289             ctxt->sax->characters(ctxt->userData, out, i);
4290     } else {
4291         ent = htmlParseEntityRef(ctxt, &name);
4292         if (name == NULL) {
4293             htmlCheckParagraph(ctxt);
4294             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4295                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4296             return;
4297         }
4298         if ((ent == NULL) || !(ent->value > 0)) {
4299             htmlCheckParagraph(ctxt);
4300             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4301                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4302                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4303                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4304             }
4305         } else {
4306             unsigned int c;
4307             int bits, i = 0;
4308
4309             c = ent->value;
4310             if      (c <    0x80)
4311                     { out[i++]= c;                bits= -6; }
4312             else if (c <   0x800)
4313                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4314             else if (c < 0x10000)
4315                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4316             else
4317                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4318
4319             for ( ; bits >= 0; bits-= 6) {
4320                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4321             }
4322             out[i] = 0;
4323
4324             htmlCheckParagraph(ctxt);
4325             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4326                 ctxt->sax->characters(ctxt->userData, out, i);
4327         }
4328     }
4329 }
4330
4331 /**
4332  * htmlParseContent:
4333  * @ctxt:  an HTML parser context
4334  *
4335  * Parse a content: comment, sub-element, reference or text.
4336  * Kept for compatibility with old code
4337  */
4338
4339 static void
4340 htmlParseContent(htmlParserCtxtPtr ctxt) {
4341     xmlChar *currentNode;
4342     int depth;
4343     const xmlChar *name;
4344
4345     currentNode = xmlStrdup(ctxt->name);
4346     depth = ctxt->nameNr;
4347     while (1) {
4348         GROW;
4349
4350         if (ctxt->instate == XML_PARSER_EOF)
4351             break;
4352
4353         /*
4354          * Our tag or one of it's parent or children is ending.
4355          */
4356         if ((CUR == '<') && (NXT(1) == '/')) {
4357             if (htmlParseEndTag(ctxt) &&
4358                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4359                 if (currentNode != NULL)
4360                     xmlFree(currentNode);
4361                 return;
4362             }
4363             continue; /* while */
4364         }
4365
4366         else if ((CUR == '<') &&
4367                  ((IS_ASCII_LETTER(NXT(1))) ||
4368                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4369             name = htmlParseHTMLName_nonInvasive(ctxt);
4370             if (name == NULL) {
4371                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4372                          "htmlParseStartTag: invalid element name\n",
4373                          NULL, NULL);
4374                 /* Dump the bogus tag like browsers do */
4375                 while ((CUR != 0) && (CUR != '>'))
4376                     NEXT;
4377
4378                 if (currentNode != NULL)
4379                     xmlFree(currentNode);
4380                 return;
4381             }
4382
4383             if (ctxt->name != NULL) {
4384                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4385                     htmlAutoClose(ctxt, name);
4386                     continue;
4387                 }
4388             }
4389         }
4390
4391         /*
4392          * Has this node been popped out during parsing of
4393          * the next element
4394          */
4395         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4396             (!xmlStrEqual(currentNode, ctxt->name)))
4397              {
4398             if (currentNode != NULL) xmlFree(currentNode);
4399             return;
4400         }
4401
4402         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4403             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4404             /*
4405              * Handle SCRIPT/STYLE separately
4406              */
4407             htmlParseScript(ctxt);
4408         }
4409
4410         else if ((CUR == '<') && (NXT(1) == '!')) {
4411             /*
4412              * Sometimes DOCTYPE arrives in the middle of the document
4413              */
4414             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4415                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4416                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4417                 (UPP(8) == 'E')) {
4418                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4419                              "Misplaced DOCTYPE declaration\n",
4420                              BAD_CAST "DOCTYPE" , NULL);
4421                 htmlParseDocTypeDecl(ctxt);
4422             }
4423             /*
4424              * First case :  a comment
4425              */
4426             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4427                 htmlParseComment(ctxt);
4428             }
4429             else {
4430                 htmlSkipBogusComment(ctxt);
4431             }
4432         }
4433
4434         /*
4435          * Second case : a Processing Instruction.
4436          */
4437         else if ((CUR == '<') && (NXT(1) == '?')) {
4438             htmlParsePI(ctxt);
4439         }
4440
4441         /*
4442          * Third case :  a sub-element.
4443          */
4444         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4445             htmlParseElement(ctxt);
4446         }
4447         else if (CUR == '<') {
4448             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4449                 (ctxt->sax->characters != NULL))
4450                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4451             NEXT;
4452         }
4453
4454         /*
4455          * Fourth case : a reference. If if has not been resolved,
4456          *    parsing returns it's Name, create the node
4457          */
4458         else if (CUR == '&') {
4459             htmlParseReference(ctxt);
4460         }
4461
4462         /*
4463          * Fifth case : end of the resource
4464          */
4465         else if (CUR == 0) {
4466             htmlAutoCloseOnEnd(ctxt);
4467             break;
4468         }
4469
4470         /*
4471          * Last case, text. Note that References are handled directly.
4472          */
4473         else {
4474             htmlParseCharData(ctxt);
4475         }
4476
4477         SHRINK;
4478         GROW;
4479     }
4480     if (currentNode != NULL) xmlFree(currentNode);
4481 }
4482
4483 /**
4484  * htmlParseElement:
4485  * @ctxt:  an HTML parser context
4486  *
4487  * DEPRECATED: Internal function, don't use.
4488  *
4489  * parse an HTML element, this is highly recursive
4490  * this is kept for compatibility with previous code versions
4491  *
4492  * [39] element ::= EmptyElemTag | STag content ETag
4493  *
4494  * [41] Attribute ::= Name Eq AttValue
4495  */
4496
4497 void
4498 htmlParseElement(htmlParserCtxtPtr ctxt) {
4499     const xmlChar *name;
4500     xmlChar *currentNode = NULL;
4501     const htmlElemDesc * info;
4502     htmlParserNodeInfo node_info;
4503     int failed;
4504     int depth;
4505     const xmlChar *oldptr;
4506
4507     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4508         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4509                      "htmlParseElement: context error\n", NULL, NULL);
4510         return;
4511     }
4512
4513     if (ctxt->instate == XML_PARSER_EOF)
4514         return;
4515
4516     /* Capture start position */
4517     if (ctxt->record_info) {
4518         node_info.begin_pos = ctxt->input->consumed +
4519                           (CUR_PTR - ctxt->input->base);
4520         node_info.begin_line = ctxt->input->line;
4521     }
4522
4523     failed = htmlParseStartTag(ctxt);
4524     name = ctxt->name;
4525     if ((failed == -1) || (name == NULL)) {
4526         if (CUR == '>')
4527             NEXT;
4528         return;
4529     }
4530
4531     /*
4532      * Lookup the info for that element.
4533      */
4534     info = htmlTagLookup(name);
4535     if (info == NULL) {
4536         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4537                      "Tag %s invalid\n", name, NULL);
4538     }
4539
4540     /*
4541      * Check for an Empty Element labeled the XML/SGML way
4542      */
4543     if ((CUR == '/') && (NXT(1) == '>')) {
4544         SKIP(2);
4545         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4546             ctxt->sax->endElement(ctxt->userData, name);
4547         htmlnamePop(ctxt);
4548         return;
4549     }
4550
4551     if (CUR == '>') {
4552         NEXT;
4553     } else {
4554         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4555                      "Couldn't find end of Start Tag %s\n", name, NULL);
4556
4557         /*
4558          * end of parsing of this node.
4559          */
4560         if (xmlStrEqual(name, ctxt->name)) {
4561             nodePop(ctxt);
4562             htmlnamePop(ctxt);
4563         }
4564
4565         /*
4566          * Capture end position and add node
4567          */
4568         if (ctxt->record_info) {
4569            node_info.end_pos = ctxt->input->consumed +
4570                               (CUR_PTR - ctxt->input->base);
4571            node_info.end_line = ctxt->input->line;
4572            node_info.node = ctxt->node;
4573            xmlParserAddNodeInfo(ctxt, &node_info);
4574         }
4575         return;
4576     }
4577
4578     /*
4579      * Check for an Empty Element from DTD definition
4580      */
4581     if ((info != NULL) && (info->empty)) {
4582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4583             ctxt->sax->endElement(ctxt->userData, name);
4584         htmlnamePop(ctxt);
4585         return;
4586     }
4587
4588     /*
4589      * Parse the content of the element:
4590      */
4591     currentNode = xmlStrdup(ctxt->name);
4592     depth = ctxt->nameNr;
4593     while (CUR != 0) {
4594         oldptr = ctxt->input->cur;
4595         htmlParseContent(ctxt);
4596         if (oldptr==ctxt->input->cur) break;
4597         if (ctxt->nameNr < depth) break;
4598     }
4599
4600     /*
4601      * Capture end position and add node
4602      */
4603     if ( currentNode != NULL && ctxt->record_info ) {
4604        node_info.end_pos = ctxt->input->consumed +
4605                           (CUR_PTR - ctxt->input->base);
4606        node_info.end_line = ctxt->input->line;
4607        node_info.node = ctxt->node;
4608        xmlParserAddNodeInfo(ctxt, &node_info);
4609     }
4610     if (CUR == 0) {
4611         htmlAutoCloseOnEnd(ctxt);
4612     }
4613
4614     if (currentNode != NULL)
4615         xmlFree(currentNode);
4616 }
4617
4618 static void
4619 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4620     /*
4621      * Capture end position and add node
4622      */
4623     if ( ctxt->node != NULL && ctxt->record_info ) {
4624        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4625                                 (CUR_PTR - ctxt->input->base);
4626        ctxt->nodeInfo->end_line = ctxt->input->line;
4627        ctxt->nodeInfo->node = ctxt->node;
4628        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4629        htmlNodeInfoPop(ctxt);
4630     }
4631     if (CUR == 0) {
4632        htmlAutoCloseOnEnd(ctxt);
4633     }
4634 }
4635
4636 /**
4637  * htmlParseElementInternal:
4638  * @ctxt:  an HTML parser context
4639  *
4640  * parse an HTML element, new version, non recursive
4641  *
4642  * [39] element ::= EmptyElemTag | STag content ETag
4643  *
4644  * [41] Attribute ::= Name Eq AttValue
4645  */
4646
4647 static void
4648 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4649     const xmlChar *name;
4650     const htmlElemDesc * info;
4651     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4652     int failed;
4653
4654     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4655         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4656                      "htmlParseElementInternal: context error\n", NULL, NULL);
4657         return;
4658     }
4659
4660     if (ctxt->instate == XML_PARSER_EOF)
4661         return;
4662
4663     /* Capture start position */
4664     if (ctxt->record_info) {
4665         node_info.begin_pos = ctxt->input->consumed +
4666                           (CUR_PTR - ctxt->input->base);
4667         node_info.begin_line = ctxt->input->line;
4668     }
4669
4670     failed = htmlParseStartTag(ctxt);
4671     name = ctxt->name;
4672     if ((failed == -1) || (name == NULL)) {
4673         if (CUR == '>')
4674             NEXT;
4675         return;
4676     }
4677
4678     /*
4679      * Lookup the info for that element.
4680      */
4681     info = htmlTagLookup(name);
4682     if (info == NULL) {
4683         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4684                      "Tag %s invalid\n", name, NULL);
4685     }
4686
4687     /*
4688      * Check for an Empty Element labeled the XML/SGML way
4689      */
4690     if ((CUR == '/') && (NXT(1) == '>')) {
4691         SKIP(2);
4692         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4693             ctxt->sax->endElement(ctxt->userData, name);
4694         htmlnamePop(ctxt);
4695         return;
4696     }
4697
4698     if (CUR == '>') {
4699         NEXT;
4700     } else {
4701         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4702                      "Couldn't find end of Start Tag %s\n", name, NULL);
4703
4704         /*
4705          * end of parsing of this node.
4706          */
4707         if (xmlStrEqual(name, ctxt->name)) {
4708             nodePop(ctxt);
4709             htmlnamePop(ctxt);
4710         }
4711
4712         if (ctxt->record_info)
4713             htmlNodeInfoPush(ctxt, &node_info);
4714         htmlParserFinishElementParsing(ctxt);
4715         return;
4716     }
4717
4718     /*
4719      * Check for an Empty Element from DTD definition
4720      */
4721     if ((info != NULL) && (info->empty)) {
4722         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4723             ctxt->sax->endElement(ctxt->userData, name);
4724         htmlnamePop(ctxt);
4725         return;
4726     }
4727
4728     if (ctxt->record_info)
4729         htmlNodeInfoPush(ctxt, &node_info);
4730 }
4731
4732 /**
4733  * htmlParseContentInternal:
4734  * @ctxt:  an HTML parser context
4735  *
4736  * Parse a content: comment, sub-element, reference or text.
4737  * New version for non recursive htmlParseElementInternal
4738  */
4739
4740 static void
4741 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4742     xmlChar *currentNode;
4743     int depth;
4744     const xmlChar *name;
4745
4746     depth = ctxt->nameNr;
4747     if (depth <= 0) {
4748         currentNode = NULL;
4749     } else {
4750         currentNode = xmlStrdup(ctxt->name);
4751         if (currentNode == NULL) {
4752             htmlErrMemory(ctxt, NULL);
4753             return;
4754         }
4755     }
4756     while (1) {
4757         GROW;
4758
4759         if (ctxt->instate == XML_PARSER_EOF)
4760             break;
4761
4762         /*
4763          * Our tag or one of it's parent or children is ending.
4764          */
4765         if ((CUR == '<') && (NXT(1) == '/')) {
4766             if (htmlParseEndTag(ctxt) &&
4767                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4768                 if (currentNode != NULL)
4769                     xmlFree(currentNode);
4770
4771                 depth = ctxt->nameNr;
4772                 if (depth <= 0) {
4773                     currentNode = NULL;
4774                 } else {
4775                     currentNode = xmlStrdup(ctxt->name);
4776                     if (currentNode == NULL) {
4777                         htmlErrMemory(ctxt, NULL);
4778                         break;
4779                     }
4780                 }
4781             }
4782             continue; /* while */
4783         }
4784
4785         else if ((CUR == '<') &&
4786                  ((IS_ASCII_LETTER(NXT(1))) ||
4787                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4788             name = htmlParseHTMLName_nonInvasive(ctxt);
4789             if (name == NULL) {
4790                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4791                          "htmlParseStartTag: invalid element name\n",
4792                          NULL, NULL);
4793                 /* Dump the bogus tag like browsers do */
4794                 while ((CUR == 0) && (CUR != '>'))
4795                     NEXT;
4796
4797                 htmlParserFinishElementParsing(ctxt);
4798                 if (currentNode != NULL)
4799                     xmlFree(currentNode);
4800
4801                 currentNode = xmlStrdup(ctxt->name);
4802                 if (currentNode == NULL) {
4803                     htmlErrMemory(ctxt, NULL);
4804                     break;
4805                 }
4806                 depth = ctxt->nameNr;
4807                 continue;
4808             }
4809
4810             if (ctxt->name != NULL) {
4811                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4812                     htmlAutoClose(ctxt, name);
4813                     continue;
4814                 }
4815             }
4816         }
4817
4818         /*
4819          * Has this node been popped out during parsing of
4820          * the next element
4821          */
4822         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4823             (!xmlStrEqual(currentNode, ctxt->name)))
4824              {
4825             htmlParserFinishElementParsing(ctxt);
4826             if (currentNode != NULL) xmlFree(currentNode);
4827
4828             currentNode = xmlStrdup(ctxt->name);
4829             if (currentNode == NULL) {
4830                 htmlErrMemory(ctxt, NULL);
4831                 break;
4832             }
4833             depth = ctxt->nameNr;
4834             continue;
4835         }
4836
4837         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4838             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4839             /*
4840              * Handle SCRIPT/STYLE separately
4841              */
4842             htmlParseScript(ctxt);
4843         }
4844
4845         else if ((CUR == '<') && (NXT(1) == '!')) {
4846             /*
4847              * Sometimes DOCTYPE arrives in the middle of the document
4848              */
4849             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4850                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4851                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4852                 (UPP(8) == 'E')) {
4853                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4854                              "Misplaced DOCTYPE declaration\n",
4855                              BAD_CAST "DOCTYPE" , NULL);
4856                 htmlParseDocTypeDecl(ctxt);
4857             }
4858             /*
4859              * First case :  a comment
4860              */
4861             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4862                 htmlParseComment(ctxt);
4863             }
4864             else {
4865                 htmlSkipBogusComment(ctxt);
4866             }
4867         }
4868
4869         /*
4870          * Second case : a Processing Instruction.
4871          */
4872         else if ((CUR == '<') && (NXT(1) == '?')) {
4873             htmlParsePI(ctxt);
4874         }
4875
4876         /*
4877          * Third case :  a sub-element.
4878          */
4879         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4880             htmlParseElementInternal(ctxt);
4881             if (currentNode != NULL) xmlFree(currentNode);
4882
4883             currentNode = xmlStrdup(ctxt->name);
4884             if (currentNode == NULL) {
4885                 htmlErrMemory(ctxt, NULL);
4886                 break;
4887             }
4888             depth = ctxt->nameNr;
4889         }
4890         else if (CUR == '<') {
4891             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4892                 (ctxt->sax->characters != NULL))
4893                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4894             NEXT;
4895         }
4896
4897         /*
4898          * Fourth case : a reference. If if has not been resolved,
4899          *    parsing returns it's Name, create the node
4900          */
4901         else if (CUR == '&') {
4902             htmlParseReference(ctxt);
4903         }
4904
4905         /*
4906          * Fifth case : end of the resource
4907          */
4908         else if (CUR == 0) {
4909             htmlAutoCloseOnEnd(ctxt);
4910             break;
4911         }
4912
4913         /*
4914          * Last case, text. Note that References are handled directly.
4915          */
4916         else {
4917             htmlParseCharData(ctxt);
4918         }
4919
4920         SHRINK;
4921         GROW;
4922     }
4923     if (currentNode != NULL) xmlFree(currentNode);
4924 }
4925
4926 /**
4927  * htmlParseContent:
4928  * @ctxt:  an HTML parser context
4929  *
4930  * Parse a content: comment, sub-element, reference or text.
4931  * This is the entry point when called from parser.c
4932  */
4933
4934 void
4935 __htmlParseContent(void *ctxt) {
4936     if (ctxt != NULL)
4937         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4938 }
4939
4940 /**
4941  * htmlParseDocument:
4942  * @ctxt:  an HTML parser context
4943  *
4944  * parse an HTML document (and build a tree if using the standard SAX
4945  * interface).
4946  *
4947  * Returns 0, -1 in case of error. the parser context is augmented
4948  *                as a result of the parsing.
4949  */
4950
4951 int
4952 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4953     xmlChar start[4];
4954     xmlCharEncoding enc;
4955     xmlDtdPtr dtd;
4956
4957     xmlInitParser();
4958
4959     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4960         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4961                      "htmlParseDocument: context error\n", NULL, NULL);
4962         return(XML_ERR_INTERNAL_ERROR);
4963     }
4964     GROW;
4965     /*
4966      * SAX: beginning of the document processing.
4967      */
4968     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4969         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4970
4971     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4972         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4973         /*
4974          * Get the 4 first bytes and decode the charset
4975          * if enc != XML_CHAR_ENCODING_NONE
4976          * plug some encoding conversion routines.
4977          */
4978         start[0] = RAW;
4979         start[1] = NXT(1);
4980         start[2] = NXT(2);
4981         start[3] = NXT(3);
4982         enc = xmlDetectCharEncoding(&start[0], 4);
4983         if (enc != XML_CHAR_ENCODING_NONE) {
4984             xmlSwitchEncoding(ctxt, enc);
4985         }
4986     }
4987
4988     /*
4989      * Wipe out everything which is before the first '<'
4990      */
4991     SKIP_BLANKS;
4992     if (CUR == 0) {
4993         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4994                      "Document is empty\n", NULL, NULL);
4995     }
4996
4997     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4998         ctxt->sax->startDocument(ctxt->userData);
4999
5000
5001     /*
5002      * Parse possible comments and PIs before any content
5003      */
5004     while (((CUR == '<') && (NXT(1) == '!') &&
5005             (NXT(2) == '-') && (NXT(3) == '-')) ||
5006            ((CUR == '<') && (NXT(1) == '?'))) {
5007         htmlParseComment(ctxt);
5008         htmlParsePI(ctxt);
5009         SKIP_BLANKS;
5010     }
5011
5012
5013     /*
5014      * Then possibly doc type declaration(s) and more Misc
5015      * (doctypedecl Misc*)?
5016      */
5017     if ((CUR == '<') && (NXT(1) == '!') &&
5018         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5019         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5020         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5021         (UPP(8) == 'E')) {
5022         htmlParseDocTypeDecl(ctxt);
5023     }
5024     SKIP_BLANKS;
5025
5026     /*
5027      * Parse possible comments and PIs before any content
5028      */
5029     while (((CUR == '<') && (NXT(1) == '!') &&
5030             (NXT(2) == '-') && (NXT(3) == '-')) ||
5031            ((CUR == '<') && (NXT(1) == '?'))) {
5032         htmlParseComment(ctxt);
5033         htmlParsePI(ctxt);
5034         SKIP_BLANKS;
5035     }
5036
5037     /*
5038      * Time to start parsing the tree itself
5039      */
5040     htmlParseContentInternal(ctxt);
5041
5042     /*
5043      * autoclose
5044      */
5045     if (CUR == 0)
5046         htmlAutoCloseOnEnd(ctxt);
5047
5048
5049     /*
5050      * SAX: end of the document processing.
5051      */
5052     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5053         ctxt->sax->endDocument(ctxt->userData);
5054
5055     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5056         dtd = xmlGetIntSubset(ctxt->myDoc);
5057         if (dtd == NULL)
5058             ctxt->myDoc->intSubset =
5059                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5060                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5061                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5062     }
5063     if (! ctxt->wellFormed) return(-1);
5064     return(0);
5065 }
5066
5067
5068 /************************************************************************
5069  *                                                                      *
5070  *                      Parser contexts handling                        *
5071  *                                                                      *
5072  ************************************************************************/
5073
5074 /**
5075  * htmlInitParserCtxt:
5076  * @ctxt:  an HTML parser context
5077  * @sax:  SAX handler
5078  * @userData:  user data
5079  *
5080  * Initialize a parser context
5081  *
5082  * Returns 0 in case of success and -1 in case of error
5083  */
5084
5085 static int
5086 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
5087                    void *userData)
5088 {
5089     if (ctxt == NULL) return(-1);
5090     memset(ctxt, 0, sizeof(htmlParserCtxt));
5091
5092     ctxt->dict = xmlDictCreate();
5093     if (ctxt->dict == NULL) {
5094         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5095         return(-1);
5096     }
5097
5098     if (ctxt->sax == NULL)
5099         ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5100     if (ctxt->sax == NULL) {
5101         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5102         return(-1);
5103     }
5104     if (sax == NULL) {
5105         memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
5106         xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
5107         ctxt->userData = ctxt;
5108     } else {
5109         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5110         ctxt->userData = userData ? userData : ctxt;
5111     }
5112
5113     /* Allocate the Input stack */
5114     ctxt->inputTab = (htmlParserInputPtr *)
5115                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5116     if (ctxt->inputTab == NULL) {
5117         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5118         ctxt->inputNr = 0;
5119         ctxt->inputMax = 0;
5120         ctxt->input = NULL;
5121         return(-1);
5122     }
5123     ctxt->inputNr = 0;
5124     ctxt->inputMax = 5;
5125     ctxt->input = NULL;
5126     ctxt->version = NULL;
5127     ctxt->encoding = NULL;
5128     ctxt->standalone = -1;
5129     ctxt->instate = XML_PARSER_START;
5130
5131     /* Allocate the Node stack */
5132     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5133     if (ctxt->nodeTab == NULL) {
5134         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5135         ctxt->nodeNr = 0;
5136         ctxt->nodeMax = 0;
5137         ctxt->node = NULL;
5138         ctxt->inputNr = 0;
5139         ctxt->inputMax = 0;
5140         ctxt->input = NULL;
5141         return(-1);
5142     }
5143     ctxt->nodeNr = 0;
5144     ctxt->nodeMax = 10;
5145     ctxt->node = NULL;
5146
5147     /* Allocate the Name stack */
5148     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5149     if (ctxt->nameTab == NULL) {
5150         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5151         ctxt->nameNr = 0;
5152         ctxt->nameMax = 0;
5153         ctxt->name = NULL;
5154         ctxt->nodeNr = 0;
5155         ctxt->nodeMax = 0;
5156         ctxt->node = NULL;
5157         ctxt->inputNr = 0;
5158         ctxt->inputMax = 0;
5159         ctxt->input = NULL;
5160         return(-1);
5161     }
5162     ctxt->nameNr = 0;
5163     ctxt->nameMax = 10;
5164     ctxt->name = NULL;
5165
5166     ctxt->nodeInfoTab = NULL;
5167     ctxt->nodeInfoNr  = 0;
5168     ctxt->nodeInfoMax = 0;
5169
5170     ctxt->myDoc = NULL;
5171     ctxt->wellFormed = 1;
5172     ctxt->replaceEntities = 0;
5173     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5174     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5175     ctxt->html = 1;
5176     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5177     ctxt->vctxt.userData = ctxt;
5178     ctxt->vctxt.error = xmlParserValidityError;
5179     ctxt->vctxt.warning = xmlParserValidityWarning;
5180     ctxt->record_info = 0;
5181     ctxt->validate = 0;
5182     ctxt->checkIndex = 0;
5183     ctxt->catalogs = NULL;
5184     xmlInitNodeInfoSeq(&ctxt->node_seq);
5185     return(0);
5186 }
5187
5188 /**
5189  * htmlFreeParserCtxt:
5190  * @ctxt:  an HTML parser context
5191  *
5192  * Free all the memory used by a parser context. However the parsed
5193  * document in ctxt->myDoc is not freed.
5194  */
5195
5196 void
5197 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5198 {
5199     xmlFreeParserCtxt(ctxt);
5200 }
5201
5202 /**
5203  * htmlNewParserCtxt:
5204  *
5205  * Allocate and initialize a new parser context.
5206  *
5207  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5208  */
5209
5210 htmlParserCtxtPtr
5211 htmlNewParserCtxt(void)
5212 {
5213     return(htmlNewSAXParserCtxt(NULL, NULL));
5214 }
5215
5216 /**
5217  * htmlNewSAXParserCtxt:
5218  * @sax:  SAX handler
5219  * @userData:  user data
5220  *
5221  * Allocate and initialize a new SAX parser context. If userData is NULL,
5222  * the parser context will be passed as user data.
5223  *
5224  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5225  */
5226
5227 htmlParserCtxtPtr
5228 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5229 {
5230     xmlParserCtxtPtr ctxt;
5231
5232     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5233     if (ctxt == NULL) {
5234         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5235         return(NULL);
5236     }
5237     memset(ctxt, 0, sizeof(xmlParserCtxt));
5238     if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5239         htmlFreeParserCtxt(ctxt);
5240         return(NULL);
5241     }
5242     return(ctxt);
5243 }
5244
5245 /**
5246  * htmlCreateMemoryParserCtxt:
5247  * @buffer:  a pointer to a char array
5248  * @size:  the size of the array
5249  *
5250  * Create a parser context for an HTML in-memory document.
5251  *
5252  * Returns the new parser context or NULL
5253  */
5254 htmlParserCtxtPtr
5255 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5256     xmlParserCtxtPtr ctxt;
5257     xmlParserInputPtr input;
5258     xmlParserInputBufferPtr buf;
5259
5260     if (buffer == NULL)
5261         return(NULL);
5262     if (size <= 0)
5263         return(NULL);
5264
5265     ctxt = htmlNewParserCtxt();
5266     if (ctxt == NULL)
5267         return(NULL);
5268
5269     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5270     if (buf == NULL) {
5271         xmlFreeParserCtxt(ctxt);
5272         return(NULL);
5273     }
5274
5275     input = xmlNewInputStream(ctxt);
5276     if (input == NULL) {
5277         xmlFreeParserInputBuffer(buf);
5278         xmlFreeParserCtxt(ctxt);
5279         return(NULL);
5280     }
5281
5282     input->filename = NULL;
5283     input->buf = buf;
5284     xmlBufResetInput(buf->buffer, input);
5285
5286     inputPush(ctxt, input);
5287     return(ctxt);
5288 }
5289
5290 /**
5291  * htmlCreateDocParserCtxt:
5292  * @cur:  a pointer to an array of xmlChar
5293  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5294  *
5295  * Create a parser context for an HTML document.
5296  *
5297  * TODO: check the need to add encoding handling there
5298  *
5299  * Returns the new parser context or NULL
5300  */
5301 static htmlParserCtxtPtr
5302 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5303     int len;
5304     htmlParserCtxtPtr ctxt;
5305
5306     if (cur == NULL)
5307         return(NULL);
5308     len = xmlStrlen(cur);
5309     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5310     if (ctxt == NULL)
5311         return(NULL);
5312
5313     if (encoding != NULL) {
5314         xmlCharEncoding enc;
5315         xmlCharEncodingHandlerPtr handler;
5316
5317         if (ctxt->input->encoding != NULL)
5318             xmlFree((xmlChar *) ctxt->input->encoding);
5319         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5320
5321         enc = xmlParseCharEncoding(encoding);
5322         /*
5323          * registered set of known encodings
5324          */
5325         if (enc != XML_CHAR_ENCODING_ERROR) {
5326             xmlSwitchEncoding(ctxt, enc);
5327             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5328                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5329                              "Unsupported encoding %s\n",
5330                              (const xmlChar *) encoding, NULL);
5331             }
5332         } else {
5333             /*
5334              * fallback for unknown encodings
5335              */
5336             handler = xmlFindCharEncodingHandler((const char *) encoding);
5337             if (handler != NULL) {
5338                 xmlSwitchToEncoding(ctxt, handler);
5339             } else {
5340                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5341                              "Unsupported encoding %s\n",
5342                              (const xmlChar *) encoding, NULL);
5343             }
5344         }
5345     }
5346     return(ctxt);
5347 }
5348
5349 #ifdef LIBXML_PUSH_ENABLED
5350 /************************************************************************
5351  *                                                                      *
5352  *      Progressive parsing interfaces                          *
5353  *                                                                      *
5354  ************************************************************************/
5355
5356 /**
5357  * htmlParseLookupSequence:
5358  * @ctxt:  an HTML parser context
5359  * @first:  the first char to lookup
5360  * @next:  the next char to lookup or zero
5361  * @third:  the next char to lookup or zero
5362  * @ignoreattrval: skip over attribute values
5363  *
5364  * Try to find if a sequence (first, next, third) or  just (first next) or
5365  * (first) is available in the input stream.
5366  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5367  * to avoid rescanning sequences of bytes, it DOES change the state of the
5368  * parser, do not use liberally.
5369  * This is basically similar to xmlParseLookupSequence()
5370  *
5371  * Returns the index to the current parsing point if the full sequence
5372  *      is available, -1 otherwise.
5373  */
5374 static int
5375 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5376                         xmlChar next, xmlChar third, int ignoreattrval)
5377 {
5378     size_t base, len;
5379     htmlParserInputPtr in;
5380     const xmlChar *buf;
5381     int quote;
5382
5383     in = ctxt->input;
5384     if (in == NULL)
5385         return (-1);
5386
5387     base = ctxt->checkIndex;
5388     quote = ctxt->endCheckState;
5389
5390     buf = in->cur;
5391     len = in->end - in->cur;
5392
5393     /* take into account the sequence length */
5394     if (third)
5395         len -= 2;
5396     else if (next)
5397         len--;
5398     for (; base < len; base++) {
5399         if (base >= INT_MAX / 2) {
5400             ctxt->checkIndex = 0;
5401             ctxt->endCheckState = 0;
5402             return (base - 2);
5403         }
5404         if (ignoreattrval) {
5405             if (quote) {
5406                 if (buf[base] == quote)
5407                     quote = 0;
5408                 continue;
5409             }
5410             if (buf[base] == '"' || buf[base] == '\'') {
5411                 quote = buf[base];
5412                 continue;
5413             }
5414         }
5415         if (buf[base] == first) {
5416             if (third != 0) {
5417                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5418                     continue;
5419             } else if (next != 0) {
5420                 if (buf[base + 1] != next)
5421                     continue;
5422             }
5423             ctxt->checkIndex = 0;
5424             ctxt->endCheckState = 0;
5425             return (base);
5426         }
5427     }
5428     ctxt->checkIndex = base;
5429     ctxt->endCheckState = quote;
5430 #ifdef DEBUG_PUSH
5431     if (next == 0)
5432         xmlGenericError(xmlGenericErrorContext,
5433                         "HPP: lookup '%c' failed\n", first);
5434     else if (third == 0)
5435         xmlGenericError(xmlGenericErrorContext,
5436                         "HPP: lookup '%c%c' failed\n", first, next);
5437     else
5438         xmlGenericError(xmlGenericErrorContext,
5439                         "HPP: lookup '%c%c%c' failed\n", first, next,
5440                         third);
5441 #endif
5442     return (-1);
5443 }
5444
5445 /**
5446  * htmlParseLookupCommentEnd:
5447  * @ctxt: an HTML parser context
5448  *
5449  * Try to find a comment end tag in the input stream
5450  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5451  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5452  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5453  * to avoid rescanning sequences of bytes, it DOES change the state of the
5454  * parser, do not use liberally.
5455  * This wraps to htmlParseLookupSequence()
5456  *
5457  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5458  */
5459 static int
5460 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5461 {
5462     int mark = 0;
5463     int offset;
5464
5465     while (1) {
5466         mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5467         if (mark < 0)
5468             break;
5469         if ((NXT(mark+2) == '>') ||
5470             ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5471             ctxt->checkIndex = 0;
5472             break;
5473         }
5474         offset = (NXT(mark+2) == '!') ? 3 : 2;
5475         if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5476             ctxt->checkIndex = mark;
5477             return(-1);
5478         }
5479         ctxt->checkIndex = mark + 1;
5480     }
5481     return mark;
5482 }
5483
5484
5485 /**
5486  * htmlParseTryOrFinish:
5487  * @ctxt:  an HTML parser context
5488  * @terminate:  last chunk indicator
5489  *
5490  * Try to progress on parsing
5491  *
5492  * Returns zero if no parsing was possible
5493  */
5494 static int
5495 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5496     int ret = 0;
5497     htmlParserInputPtr in;
5498     ptrdiff_t avail = 0;
5499     xmlChar cur, next;
5500
5501     htmlParserNodeInfo node_info;
5502
5503 #ifdef DEBUG_PUSH
5504     switch (ctxt->instate) {
5505         case XML_PARSER_EOF:
5506             xmlGenericError(xmlGenericErrorContext,
5507                     "HPP: try EOF\n"); break;
5508         case XML_PARSER_START:
5509             xmlGenericError(xmlGenericErrorContext,
5510                     "HPP: try START\n"); break;
5511         case XML_PARSER_MISC:
5512             xmlGenericError(xmlGenericErrorContext,
5513                     "HPP: try MISC\n");break;
5514         case XML_PARSER_COMMENT:
5515             xmlGenericError(xmlGenericErrorContext,
5516                     "HPP: try COMMENT\n");break;
5517         case XML_PARSER_PROLOG:
5518             xmlGenericError(xmlGenericErrorContext,
5519                     "HPP: try PROLOG\n");break;
5520         case XML_PARSER_START_TAG:
5521             xmlGenericError(xmlGenericErrorContext,
5522                     "HPP: try START_TAG\n");break;
5523         case XML_PARSER_CONTENT:
5524             xmlGenericError(xmlGenericErrorContext,
5525                     "HPP: try CONTENT\n");break;
5526         case XML_PARSER_CDATA_SECTION:
5527             xmlGenericError(xmlGenericErrorContext,
5528                     "HPP: try CDATA_SECTION\n");break;
5529         case XML_PARSER_END_TAG:
5530             xmlGenericError(xmlGenericErrorContext,
5531                     "HPP: try END_TAG\n");break;
5532         case XML_PARSER_ENTITY_DECL:
5533             xmlGenericError(xmlGenericErrorContext,
5534                     "HPP: try ENTITY_DECL\n");break;
5535         case XML_PARSER_ENTITY_VALUE:
5536             xmlGenericError(xmlGenericErrorContext,
5537                     "HPP: try ENTITY_VALUE\n");break;
5538         case XML_PARSER_ATTRIBUTE_VALUE:
5539             xmlGenericError(xmlGenericErrorContext,
5540                     "HPP: try ATTRIBUTE_VALUE\n");break;
5541         case XML_PARSER_DTD:
5542             xmlGenericError(xmlGenericErrorContext,
5543                     "HPP: try DTD\n");break;
5544         case XML_PARSER_EPILOG:
5545             xmlGenericError(xmlGenericErrorContext,
5546                     "HPP: try EPILOG\n");break;
5547         case XML_PARSER_PI:
5548             xmlGenericError(xmlGenericErrorContext,
5549                     "HPP: try PI\n");break;
5550         case XML_PARSER_SYSTEM_LITERAL:
5551             xmlGenericError(xmlGenericErrorContext,
5552                     "HPP: try SYSTEM_LITERAL\n");break;
5553     }
5554 #endif
5555
5556     while (1) {
5557
5558         in = ctxt->input;
5559         if (in == NULL) break;
5560         avail = in->end - in->cur;
5561         if ((avail == 0) && (terminate)) {
5562             htmlAutoCloseOnEnd(ctxt);
5563             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5564                 /*
5565                  * SAX: end of the document processing.
5566                  */
5567                 ctxt->instate = XML_PARSER_EOF;
5568                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5569                     ctxt->sax->endDocument(ctxt->userData);
5570             }
5571         }
5572         if (avail < 1)
5573             goto done;
5574         /*
5575          * This is done to make progress and avoid an infinite loop
5576          * if a parsing attempt was aborted by hitting a NUL byte. After
5577          * changing htmlCurrentChar, this probably isn't necessary anymore.
5578          * We should consider removing this check.
5579          */
5580         cur = in->cur[0];
5581         if (cur == 0) {
5582             SKIP(1);
5583             continue;
5584         }
5585
5586         switch (ctxt->instate) {
5587             case XML_PARSER_EOF:
5588                 /*
5589                  * Document parsing is done !
5590                  */
5591                 goto done;
5592             case XML_PARSER_START:
5593                 /*
5594                  * Very first chars read from the document flow.
5595                  */
5596                 cur = in->cur[0];
5597                 if (IS_BLANK_CH(cur)) {
5598                     SKIP_BLANKS;
5599                     avail = in->end - in->cur;
5600                 }
5601                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5602                     ctxt->sax->setDocumentLocator(ctxt->userData,
5603                                                   &xmlDefaultSAXLocator);
5604                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5605                     (!ctxt->disableSAX))
5606                     ctxt->sax->startDocument(ctxt->userData);
5607
5608                 cur = in->cur[0];
5609                 next = in->cur[1];
5610                 if ((cur == '<') && (next == '!') &&
5611                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5612                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5613                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5614                     (UPP(8) == 'E')) {
5615                     if ((!terminate) &&
5616                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5617                         goto done;
5618 #ifdef DEBUG_PUSH
5619                     xmlGenericError(xmlGenericErrorContext,
5620                             "HPP: Parsing internal subset\n");
5621 #endif
5622                     htmlParseDocTypeDecl(ctxt);
5623                     ctxt->instate = XML_PARSER_PROLOG;
5624 #ifdef DEBUG_PUSH
5625                     xmlGenericError(xmlGenericErrorContext,
5626                             "HPP: entering PROLOG\n");
5627 #endif
5628                 } else {
5629                     ctxt->instate = XML_PARSER_MISC;
5630 #ifdef DEBUG_PUSH
5631                     xmlGenericError(xmlGenericErrorContext,
5632                             "HPP: entering MISC\n");
5633 #endif
5634                 }
5635                 break;
5636             case XML_PARSER_MISC:
5637                 SKIP_BLANKS;
5638                 avail = in->end - in->cur;
5639                 /*
5640                  * no chars in buffer
5641                  */
5642                 if (avail < 1)
5643                     goto done;
5644                 /*
5645                  * not enough chars in buffer
5646                  */
5647                 if (avail < 2) {
5648                     if (!terminate)
5649                         goto done;
5650                     else
5651                         next = ' ';
5652                 } else {
5653                     next = in->cur[1];
5654                 }
5655                 cur = in->cur[0];
5656                 if ((cur == '<') && (next == '!') &&
5657                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5658                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5659                         goto done;
5660 #ifdef DEBUG_PUSH
5661                     xmlGenericError(xmlGenericErrorContext,
5662                             "HPP: Parsing Comment\n");
5663 #endif
5664                     htmlParseComment(ctxt);
5665                     ctxt->instate = XML_PARSER_MISC;
5666                 } else if ((cur == '<') && (next == '?')) {
5667                     if ((!terminate) &&
5668                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5669                         goto done;
5670 #ifdef DEBUG_PUSH
5671                     xmlGenericError(xmlGenericErrorContext,
5672                             "HPP: Parsing PI\n");
5673 #endif
5674                     htmlParsePI(ctxt);
5675                     ctxt->instate = XML_PARSER_MISC;
5676                 } else if ((cur == '<') && (next == '!') &&
5677                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5678                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5679                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5680                     (UPP(8) == 'E')) {
5681                     if ((!terminate) &&
5682                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5683                         goto done;
5684 #ifdef DEBUG_PUSH
5685                     xmlGenericError(xmlGenericErrorContext,
5686                             "HPP: Parsing internal subset\n");
5687 #endif
5688                     htmlParseDocTypeDecl(ctxt);
5689                     ctxt->instate = XML_PARSER_PROLOG;
5690 #ifdef DEBUG_PUSH
5691                     xmlGenericError(xmlGenericErrorContext,
5692                             "HPP: entering PROLOG\n");
5693 #endif
5694                 } else if ((cur == '<') && (next == '!') &&
5695                            (avail < 9)) {
5696                     goto done;
5697                 } else {
5698                     ctxt->instate = XML_PARSER_CONTENT;
5699 #ifdef DEBUG_PUSH
5700                     xmlGenericError(xmlGenericErrorContext,
5701                             "HPP: entering START_TAG\n");
5702 #endif
5703                 }
5704                 break;
5705             case XML_PARSER_PROLOG:
5706                 SKIP_BLANKS;
5707                 avail = in->end - in->cur;
5708                 if (avail < 2)
5709                     goto done;
5710                 cur = in->cur[0];
5711                 next = in->cur[1];
5712                 if ((cur == '<') && (next == '!') &&
5713                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5714                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5715                         goto done;
5716 #ifdef DEBUG_PUSH
5717                     xmlGenericError(xmlGenericErrorContext,
5718                             "HPP: Parsing Comment\n");
5719 #endif
5720                     htmlParseComment(ctxt);
5721                     ctxt->instate = XML_PARSER_PROLOG;
5722                 } else if ((cur == '<') && (next == '?')) {
5723                     if ((!terminate) &&
5724                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5725                         goto done;
5726 #ifdef DEBUG_PUSH
5727                     xmlGenericError(xmlGenericErrorContext,
5728                             "HPP: Parsing PI\n");
5729 #endif
5730                     htmlParsePI(ctxt);
5731                     ctxt->instate = XML_PARSER_PROLOG;
5732                 } else if ((cur == '<') && (next == '!') &&
5733                            (avail < 4)) {
5734                     goto done;
5735                 } else {
5736                     ctxt->instate = XML_PARSER_CONTENT;
5737 #ifdef DEBUG_PUSH
5738                     xmlGenericError(xmlGenericErrorContext,
5739                             "HPP: entering START_TAG\n");
5740 #endif
5741                 }
5742                 break;
5743             case XML_PARSER_EPILOG:
5744                 avail = in->end - in->cur;
5745                 if (avail < 1)
5746                     goto done;
5747                 cur = in->cur[0];
5748                 if (IS_BLANK_CH(cur)) {
5749                     htmlParseCharData(ctxt);
5750                     goto done;
5751                 }
5752                 if (avail < 2)
5753                     goto done;
5754                 next = in->cur[1];
5755                 if ((cur == '<') && (next == '!') &&
5756                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5757                     if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5758                         goto done;
5759 #ifdef DEBUG_PUSH
5760                     xmlGenericError(xmlGenericErrorContext,
5761                             "HPP: Parsing Comment\n");
5762 #endif
5763                     htmlParseComment(ctxt);
5764                     ctxt->instate = XML_PARSER_EPILOG;
5765                 } else if ((cur == '<') && (next == '?')) {
5766                     if ((!terminate) &&
5767                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5768                         goto done;
5769 #ifdef DEBUG_PUSH
5770                     xmlGenericError(xmlGenericErrorContext,
5771                             "HPP: Parsing PI\n");
5772 #endif
5773                     htmlParsePI(ctxt);
5774                     ctxt->instate = XML_PARSER_EPILOG;
5775                 } else if ((cur == '<') && (next == '!') &&
5776                            (avail < 4)) {
5777                     goto done;
5778                 } else {
5779                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5780                     ctxt->wellFormed = 0;
5781                     ctxt->instate = XML_PARSER_EOF;
5782 #ifdef DEBUG_PUSH
5783                     xmlGenericError(xmlGenericErrorContext,
5784                             "HPP: entering EOF\n");
5785 #endif
5786                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5787                         ctxt->sax->endDocument(ctxt->userData);
5788                     goto done;
5789                 }
5790                 break;
5791             case XML_PARSER_START_TAG: {
5792                 const xmlChar *name;
5793                 int failed;
5794                 const htmlElemDesc * info;
5795
5796                 /*
5797                  * no chars in buffer
5798                  */
5799                 if (avail < 1)
5800                     goto done;
5801                 /*
5802                  * not enough chars in buffer
5803                  */
5804                 if (avail < 2) {
5805                     if (!terminate)
5806                         goto done;
5807                     else
5808                         next = ' ';
5809                 } else {
5810                     next = in->cur[1];
5811                 }
5812                 cur = in->cur[0];
5813                 if (cur != '<') {
5814                     ctxt->instate = XML_PARSER_CONTENT;
5815 #ifdef DEBUG_PUSH
5816                     xmlGenericError(xmlGenericErrorContext,
5817                             "HPP: entering CONTENT\n");
5818 #endif
5819                     break;
5820                 }
5821                 if (next == '/') {
5822                     ctxt->instate = XML_PARSER_END_TAG;
5823                     ctxt->checkIndex = 0;
5824 #ifdef DEBUG_PUSH
5825                     xmlGenericError(xmlGenericErrorContext,
5826                             "HPP: entering END_TAG\n");
5827 #endif
5828                     break;
5829                 }
5830                 if ((!terminate) &&
5831                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5832                     goto done;
5833
5834                 /* Capture start position */
5835                 if (ctxt->record_info) {
5836                      node_info.begin_pos = ctxt->input->consumed +
5837                                         (CUR_PTR - ctxt->input->base);
5838                      node_info.begin_line = ctxt->input->line;
5839                 }
5840
5841
5842                 failed = htmlParseStartTag(ctxt);
5843                 name = ctxt->name;
5844                 if ((failed == -1) ||
5845                     (name == NULL)) {
5846                     if (CUR == '>')
5847                         NEXT;
5848                     break;
5849                 }
5850
5851                 /*
5852                  * Lookup the info for that element.
5853                  */
5854                 info = htmlTagLookup(name);
5855                 if (info == NULL) {
5856                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5857                                  "Tag %s invalid\n", name, NULL);
5858                 }
5859
5860                 /*
5861                  * Check for an Empty Element labeled the XML/SGML way
5862                  */
5863                 if ((CUR == '/') && (NXT(1) == '>')) {
5864                     SKIP(2);
5865                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5866                         ctxt->sax->endElement(ctxt->userData, name);
5867                     htmlnamePop(ctxt);
5868                     ctxt->instate = XML_PARSER_CONTENT;
5869 #ifdef DEBUG_PUSH
5870                     xmlGenericError(xmlGenericErrorContext,
5871                             "HPP: entering CONTENT\n");
5872 #endif
5873                     break;
5874                 }
5875
5876                 if (CUR == '>') {
5877                     NEXT;
5878                 } else {
5879                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5880                                  "Couldn't find end of Start Tag %s\n",
5881                                  name, NULL);
5882
5883                     /*
5884                      * end of parsing of this node.
5885                      */
5886                     if (xmlStrEqual(name, ctxt->name)) {
5887                         nodePop(ctxt);
5888                         htmlnamePop(ctxt);
5889                     }
5890
5891                     if (ctxt->record_info)
5892                         htmlNodeInfoPush(ctxt, &node_info);
5893
5894                     ctxt->instate = XML_PARSER_CONTENT;
5895 #ifdef DEBUG_PUSH
5896                     xmlGenericError(xmlGenericErrorContext,
5897                             "HPP: entering CONTENT\n");
5898 #endif
5899                     break;
5900                 }
5901
5902                 /*
5903                  * Check for an Empty Element from DTD definition
5904                  */
5905                 if ((info != NULL) && (info->empty)) {
5906                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5907                         ctxt->sax->endElement(ctxt->userData, name);
5908                     htmlnamePop(ctxt);
5909                 }
5910
5911                 if (ctxt->record_info)
5912                     htmlNodeInfoPush(ctxt, &node_info);
5913
5914                 ctxt->instate = XML_PARSER_CONTENT;
5915 #ifdef DEBUG_PUSH
5916                 xmlGenericError(xmlGenericErrorContext,
5917                         "HPP: entering CONTENT\n");
5918 #endif
5919                 break;
5920             }
5921             case XML_PARSER_CONTENT: {
5922                 xmlChar chr[2] = { 0, 0 };
5923
5924                 /*
5925                  * Handle preparsed entities and charRef
5926                  */
5927                 if (ctxt->token != 0) {
5928                     chr[0] = ctxt->token;
5929                     htmlCheckParagraph(ctxt);
5930                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5931                         ctxt->sax->characters(ctxt->userData, chr, 1);
5932                     ctxt->token = 0;
5933                     ctxt->checkIndex = 0;
5934                 }
5935                 if ((avail == 1) && (terminate)) {
5936                     cur = in->cur[0];
5937                     if ((cur != '<') && (cur != '&')) {
5938                         if (ctxt->sax != NULL) {
5939                             chr[0] = cur;
5940                             if (IS_BLANK_CH(cur)) {
5941                                 if (ctxt->keepBlanks) {
5942                                     if (ctxt->sax->characters != NULL)
5943                                         ctxt->sax->characters(
5944                                                 ctxt->userData, chr, 1);
5945                                 } else {
5946                                     if (ctxt->sax->ignorableWhitespace != NULL)
5947                                         ctxt->sax->ignorableWhitespace(
5948                                                 ctxt->userData, chr, 1);
5949                                 }
5950                             } else {
5951                                 htmlCheckParagraph(ctxt);
5952                                 if (ctxt->sax->characters != NULL)
5953                                     ctxt->sax->characters(
5954                                             ctxt->userData, chr, 1);
5955                             }
5956                         }
5957                         ctxt->token = 0;
5958                         ctxt->checkIndex = 0;
5959                         in->cur++;
5960                         break;
5961                     }
5962                 }
5963                 if (avail < 2)
5964                     goto done;
5965                 cur = in->cur[0];
5966                 next = in->cur[1];
5967                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5968                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5969                     /*
5970                      * Handle SCRIPT/STYLE separately
5971                      */
5972                     if (!terminate) {
5973                         int idx;
5974                         xmlChar val;
5975
5976                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5977                         if (idx < 0)
5978                             goto done;
5979                         val = in->cur[idx + 2];
5980                         if (val == 0) { /* bad cut of input */
5981                             /*
5982                              * FIXME: htmlParseScript checks for additional
5983                              * characters after '</'.
5984                              */
5985                             ctxt->checkIndex = idx;
5986                             goto done;
5987                         }
5988                     }
5989                     htmlParseScript(ctxt);
5990                     if ((cur == '<') && (next == '/')) {
5991                         ctxt->instate = XML_PARSER_END_TAG;
5992                         ctxt->checkIndex = 0;
5993 #ifdef DEBUG_PUSH
5994                         xmlGenericError(xmlGenericErrorContext,
5995                                 "HPP: entering END_TAG\n");
5996 #endif
5997                         break;
5998                     }
5999                 } else if ((cur == '<') && (next == '!')) {
6000                     if (avail < 4)
6001                         goto done;
6002                     /*
6003                      * Sometimes DOCTYPE arrives in the middle of the document
6004                      */
6005                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
6006                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
6007                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
6008                         (UPP(8) == 'E')) {
6009                         if ((!terminate) &&
6010                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
6011                             goto done;
6012                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
6013                                      "Misplaced DOCTYPE declaration\n",
6014                                      BAD_CAST "DOCTYPE" , NULL);
6015                         htmlParseDocTypeDecl(ctxt);
6016                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
6017                         if ((!terminate) &&
6018                             (htmlParseLookupCommentEnd(ctxt) < 0))
6019                             goto done;
6020 #ifdef DEBUG_PUSH
6021                         xmlGenericError(xmlGenericErrorContext,
6022                                 "HPP: Parsing Comment\n");
6023 #endif
6024                         htmlParseComment(ctxt);
6025                         ctxt->instate = XML_PARSER_CONTENT;
6026                     } else {
6027                         if ((!terminate) &&
6028                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6029                             goto done;
6030                         htmlSkipBogusComment(ctxt);
6031                     }
6032                 } else if ((cur == '<') && (next == '?')) {
6033                     if ((!terminate) &&
6034                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6035                         goto done;
6036 #ifdef DEBUG_PUSH
6037                     xmlGenericError(xmlGenericErrorContext,
6038                             "HPP: Parsing PI\n");
6039 #endif
6040                     htmlParsePI(ctxt);
6041                     ctxt->instate = XML_PARSER_CONTENT;
6042                 } else if ((cur == '<') && (next == '/')) {
6043                     ctxt->instate = XML_PARSER_END_TAG;
6044                     ctxt->checkIndex = 0;
6045 #ifdef DEBUG_PUSH
6046                     xmlGenericError(xmlGenericErrorContext,
6047                             "HPP: entering END_TAG\n");
6048 #endif
6049                     break;
6050                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6051                     if ((!terminate) && (next == 0))
6052                         goto done;
6053                     ctxt->instate = XML_PARSER_START_TAG;
6054                     ctxt->checkIndex = 0;
6055 #ifdef DEBUG_PUSH
6056                     xmlGenericError(xmlGenericErrorContext,
6057                             "HPP: entering START_TAG\n");
6058 #endif
6059                     break;
6060                 } else if (cur == '<') {
6061                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6062                         (ctxt->sax->characters != NULL))
6063                         ctxt->sax->characters(ctxt->userData,
6064                                               BAD_CAST "<", 1);
6065                     NEXT;
6066                 } else {
6067                     /*
6068                      * check that the text sequence is complete
6069                      * before handing out the data to the parser
6070                      * to avoid problems with erroneous end of
6071                      * data detection.
6072                      */
6073                     if ((!terminate) &&
6074                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6075                         goto done;
6076                     ctxt->checkIndex = 0;
6077 #ifdef DEBUG_PUSH
6078                     xmlGenericError(xmlGenericErrorContext,
6079                             "HPP: Parsing char data\n");
6080 #endif
6081                     while ((ctxt->instate != XML_PARSER_EOF) &&
6082                            (cur != '<') && (in->cur < in->end)) {
6083                         if (cur == '&') {
6084                             htmlParseReference(ctxt);
6085                         } else {
6086                             htmlParseCharData(ctxt);
6087                         }
6088                         cur = in->cur[0];
6089                     }
6090                 }
6091
6092                 break;
6093             }
6094             case XML_PARSER_END_TAG:
6095                 if (avail < 2)
6096                     goto done;
6097                 if ((!terminate) &&
6098                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6099                     goto done;
6100                 htmlParseEndTag(ctxt);
6101                 if (ctxt->nameNr == 0) {
6102                     ctxt->instate = XML_PARSER_EPILOG;
6103                 } else {
6104                     ctxt->instate = XML_PARSER_CONTENT;
6105                 }
6106                 ctxt->checkIndex = 0;
6107 #ifdef DEBUG_PUSH
6108                 xmlGenericError(xmlGenericErrorContext,
6109                         "HPP: entering CONTENT\n");
6110 #endif
6111                 break;
6112             case XML_PARSER_CDATA_SECTION:
6113                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6114                         "HPP: internal error, state == CDATA\n",
6115                              NULL, NULL);
6116                 ctxt->instate = XML_PARSER_CONTENT;
6117                 ctxt->checkIndex = 0;
6118 #ifdef DEBUG_PUSH
6119                 xmlGenericError(xmlGenericErrorContext,
6120                         "HPP: entering CONTENT\n");
6121 #endif
6122                 break;
6123             case XML_PARSER_DTD:
6124                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6125                         "HPP: internal error, state == DTD\n",
6126                              NULL, NULL);
6127                 ctxt->instate = XML_PARSER_CONTENT;
6128                 ctxt->checkIndex = 0;
6129 #ifdef DEBUG_PUSH
6130                 xmlGenericError(xmlGenericErrorContext,
6131                         "HPP: entering CONTENT\n");
6132 #endif
6133                 break;
6134             case XML_PARSER_COMMENT:
6135                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6136                         "HPP: internal error, state == COMMENT\n",
6137                              NULL, NULL);
6138                 ctxt->instate = XML_PARSER_CONTENT;
6139                 ctxt->checkIndex = 0;
6140 #ifdef DEBUG_PUSH
6141                 xmlGenericError(xmlGenericErrorContext,
6142                         "HPP: entering CONTENT\n");
6143 #endif
6144                 break;
6145             case XML_PARSER_PI:
6146                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6147                         "HPP: internal error, state == PI\n",
6148                              NULL, NULL);
6149                 ctxt->instate = XML_PARSER_CONTENT;
6150                 ctxt->checkIndex = 0;
6151 #ifdef DEBUG_PUSH
6152                 xmlGenericError(xmlGenericErrorContext,
6153                         "HPP: entering CONTENT\n");
6154 #endif
6155                 break;
6156             case XML_PARSER_ENTITY_DECL:
6157                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6158                         "HPP: internal error, state == ENTITY_DECL\n",
6159                              NULL, NULL);
6160                 ctxt->instate = XML_PARSER_CONTENT;
6161                 ctxt->checkIndex = 0;
6162 #ifdef DEBUG_PUSH
6163                 xmlGenericError(xmlGenericErrorContext,
6164                         "HPP: entering CONTENT\n");
6165 #endif
6166                 break;
6167             case XML_PARSER_ENTITY_VALUE:
6168                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6169                         "HPP: internal error, state == ENTITY_VALUE\n",
6170                              NULL, NULL);
6171                 ctxt->instate = XML_PARSER_CONTENT;
6172                 ctxt->checkIndex = 0;
6173 #ifdef DEBUG_PUSH
6174                 xmlGenericError(xmlGenericErrorContext,
6175                         "HPP: entering DTD\n");
6176 #endif
6177                 break;
6178             case XML_PARSER_ATTRIBUTE_VALUE:
6179                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6180                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6181                              NULL, NULL);
6182                 ctxt->instate = XML_PARSER_START_TAG;
6183                 ctxt->checkIndex = 0;
6184 #ifdef DEBUG_PUSH
6185                 xmlGenericError(xmlGenericErrorContext,
6186                         "HPP: entering START_TAG\n");
6187 #endif
6188                 break;
6189             case XML_PARSER_SYSTEM_LITERAL:
6190                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6191                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6192                              NULL, NULL);
6193                 ctxt->instate = XML_PARSER_CONTENT;
6194                 ctxt->checkIndex = 0;
6195 #ifdef DEBUG_PUSH
6196                 xmlGenericError(xmlGenericErrorContext,
6197                         "HPP: entering CONTENT\n");
6198 #endif
6199                 break;
6200             case XML_PARSER_IGNORE:
6201                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6202                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6203                              NULL, NULL);
6204                 ctxt->instate = XML_PARSER_CONTENT;
6205                 ctxt->checkIndex = 0;
6206 #ifdef DEBUG_PUSH
6207                 xmlGenericError(xmlGenericErrorContext,
6208                         "HPP: entering CONTENT\n");
6209 #endif
6210                 break;
6211             case XML_PARSER_PUBLIC_LITERAL:
6212                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6213                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6214                              NULL, NULL);
6215                 ctxt->instate = XML_PARSER_CONTENT;
6216                 ctxt->checkIndex = 0;
6217 #ifdef DEBUG_PUSH
6218                 xmlGenericError(xmlGenericErrorContext,
6219                         "HPP: entering CONTENT\n");
6220 #endif
6221                 break;
6222
6223         }
6224     }
6225 done:
6226     if ((avail == 0) && (terminate)) {
6227         htmlAutoCloseOnEnd(ctxt);
6228         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6229             /*
6230              * SAX: end of the document processing.
6231              */
6232             ctxt->instate = XML_PARSER_EOF;
6233             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6234                 ctxt->sax->endDocument(ctxt->userData);
6235         }
6236     }
6237     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6238         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6239          (ctxt->instate == XML_PARSER_EPILOG))) {
6240         xmlDtdPtr dtd;
6241         dtd = xmlGetIntSubset(ctxt->myDoc);
6242         if (dtd == NULL)
6243             ctxt->myDoc->intSubset =
6244                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6245                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6246                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6247     }
6248 #ifdef DEBUG_PUSH
6249     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6250 #endif
6251     return(ret);
6252 }
6253
6254 /**
6255  * htmlParseChunk:
6256  * @ctxt:  an HTML parser context
6257  * @chunk:  an char array
6258  * @size:  the size in byte of the chunk
6259  * @terminate:  last chunk indicator
6260  *
6261  * Parse a Chunk of memory
6262  *
6263  * Returns zero if no error, the xmlParserErrors otherwise.
6264  */
6265 int
6266 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6267               int terminate) {
6268     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6269         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6270                      "htmlParseChunk: context error\n", NULL, NULL);
6271         return(XML_ERR_INTERNAL_ERROR);
6272     }
6273     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6274         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6275         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6276         size_t cur = ctxt->input->cur - ctxt->input->base;
6277         int res;
6278
6279         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6280         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6281         if (res < 0) {
6282             htmlErrMemory(ctxt, NULL);
6283             return (ctxt->errNo);
6284         }
6285 #ifdef DEBUG_PUSH
6286         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6287 #endif
6288
6289 #if 0
6290         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6291             htmlParseTryOrFinish(ctxt, terminate);
6292 #endif
6293     } else if (ctxt->instate != XML_PARSER_EOF) {
6294         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6295             xmlParserInputBufferPtr in = ctxt->input->buf;
6296             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6297                     (in->raw != NULL)) {
6298                 int nbchars;
6299                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6300                 size_t current = ctxt->input->cur - ctxt->input->base;
6301
6302                 nbchars = xmlCharEncInput(in, terminate);
6303                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6304                 if (nbchars < 0) {
6305                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6306                                  "encoder error\n", NULL, NULL);
6307                     return(XML_ERR_INVALID_ENCODING);
6308                 }
6309             }
6310         }
6311     }
6312     htmlParseTryOrFinish(ctxt, terminate);
6313     if (terminate) {
6314         if ((ctxt->instate != XML_PARSER_EOF) &&
6315             (ctxt->instate != XML_PARSER_EPILOG) &&
6316             (ctxt->instate != XML_PARSER_MISC)) {
6317             ctxt->errNo = XML_ERR_DOCUMENT_END;
6318             ctxt->wellFormed = 0;
6319         }
6320         if (ctxt->instate != XML_PARSER_EOF) {
6321             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6322                 ctxt->sax->endDocument(ctxt->userData);
6323         }
6324         ctxt->instate = XML_PARSER_EOF;
6325     }
6326     return((xmlParserErrors) ctxt->errNo);
6327 }
6328
6329 /************************************************************************
6330  *                                                                      *
6331  *                      User entry points                               *
6332  *                                                                      *
6333  ************************************************************************/
6334
6335 /**
6336  * htmlCreatePushParserCtxt:
6337  * @sax:  a SAX handler
6338  * @user_data:  The user data returned on SAX callbacks
6339  * @chunk:  a pointer to an array of chars
6340  * @size:  number of chars in the array
6341  * @filename:  an optional file name or URI
6342  * @enc:  an optional encoding
6343  *
6344  * Create a parser context for using the HTML parser in push mode
6345  * The value of @filename is used for fetching external entities
6346  * and error/warning reports.
6347  *
6348  * Returns the new parser context or NULL
6349  */
6350 htmlParserCtxtPtr
6351 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6352                          const char *chunk, int size, const char *filename,
6353                          xmlCharEncoding enc) {
6354     htmlParserCtxtPtr ctxt;
6355     htmlParserInputPtr inputStream;
6356     xmlParserInputBufferPtr buf;
6357
6358     xmlInitParser();
6359
6360     buf = xmlAllocParserInputBuffer(enc);
6361     if (buf == NULL) return(NULL);
6362
6363     ctxt = htmlNewSAXParserCtxt(sax, user_data);
6364     if (ctxt == NULL) {
6365         xmlFreeParserInputBuffer(buf);
6366         return(NULL);
6367     }
6368     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6369         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6370     if (filename == NULL) {
6371         ctxt->directory = NULL;
6372     } else {
6373         ctxt->directory = xmlParserGetDirectory(filename);
6374     }
6375
6376     inputStream = htmlNewInputStream(ctxt);
6377     if (inputStream == NULL) {
6378         xmlFreeParserCtxt(ctxt);
6379         xmlFreeParserInputBuffer(buf);
6380         return(NULL);
6381     }
6382
6383     if (filename == NULL)
6384         inputStream->filename = NULL;
6385     else
6386         inputStream->filename = (char *)
6387             xmlCanonicPath((const xmlChar *) filename);
6388     inputStream->buf = buf;
6389     xmlBufResetInput(buf->buffer, inputStream);
6390
6391     inputPush(ctxt, inputStream);
6392
6393     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6394         (ctxt->input->buf != NULL))  {
6395         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6396         size_t cur = ctxt->input->cur - ctxt->input->base;
6397
6398         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6399
6400         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6401 #ifdef DEBUG_PUSH
6402         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6403 #endif
6404     }
6405     ctxt->progressive = 1;
6406
6407     return(ctxt);
6408 }
6409 #endif /* LIBXML_PUSH_ENABLED */
6410
6411 /**
6412  * htmlSAXParseDoc:
6413  * @cur:  a pointer to an array of xmlChar
6414  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6415  * @sax:  the SAX handler block
6416  * @userData: if using SAX, this pointer will be provided on callbacks.
6417  *
6418  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6419  *
6420  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6421  * to handle parse events. If sax is NULL, fallback to the default DOM
6422  * behavior and return a tree.
6423  *
6424  * Returns the resulting document tree unless SAX is NULL or the document is
6425  *     not well formed.
6426  */
6427
6428 htmlDocPtr
6429 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6430                 htmlSAXHandlerPtr sax, void *userData) {
6431     htmlDocPtr ret;
6432     htmlParserCtxtPtr ctxt;
6433
6434     xmlInitParser();
6435
6436     if (cur == NULL) return(NULL);
6437
6438
6439     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6440     if (ctxt == NULL) return(NULL);
6441     if (sax != NULL) {
6442         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6443         ctxt->sax = sax;
6444         ctxt->userData = userData;
6445     }
6446
6447     htmlParseDocument(ctxt);
6448     ret = ctxt->myDoc;
6449     if (sax != NULL) {
6450         ctxt->sax = NULL;
6451         ctxt->userData = NULL;
6452     }
6453     htmlFreeParserCtxt(ctxt);
6454
6455     return(ret);
6456 }
6457
6458 /**
6459  * htmlParseDoc:
6460  * @cur:  a pointer to an array of xmlChar
6461  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6462  *
6463  * parse an HTML in-memory document and build a tree.
6464  *
6465  * Returns the resulting document tree
6466  */
6467
6468 htmlDocPtr
6469 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6470     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6471 }
6472
6473
6474 /**
6475  * htmlCreateFileParserCtxt:
6476  * @filename:  the filename
6477  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6478  *
6479  * Create a parser context for a file content.
6480  * Automatic support for ZLIB/Compress compressed document is provided
6481  * by default if found at compile-time.
6482  *
6483  * Returns the new parser context or NULL
6484  */
6485 htmlParserCtxtPtr
6486 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6487 {
6488     htmlParserCtxtPtr ctxt;
6489     htmlParserInputPtr inputStream;
6490     char *canonicFilename;
6491     /* htmlCharEncoding enc; */
6492     xmlChar *content, *content_line = (xmlChar *) "charset=";
6493
6494     if (filename == NULL)
6495         return(NULL);
6496
6497     ctxt = htmlNewParserCtxt();
6498     if (ctxt == NULL) {
6499         return(NULL);
6500     }
6501     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6502     if (canonicFilename == NULL) {
6503         xmlFreeParserCtxt(ctxt);
6504         return(NULL);
6505     }
6506
6507     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6508     xmlFree(canonicFilename);
6509     if (inputStream == NULL) {
6510         xmlFreeParserCtxt(ctxt);
6511         return(NULL);
6512     }
6513
6514     inputPush(ctxt, inputStream);
6515
6516     /* set encoding */
6517     if (encoding) {
6518         size_t l = strlen(encoding);
6519
6520         if (l < 1000) {
6521             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6522             if (content) {
6523                 strcpy ((char *)content, (char *)content_line);
6524                 strcat ((char *)content, (char *)encoding);
6525                 htmlCheckEncoding (ctxt, content);
6526                 xmlFree (content);
6527             }
6528         }
6529     }
6530
6531     return(ctxt);
6532 }
6533
6534 /**
6535  * htmlSAXParseFile:
6536  * @filename:  the filename
6537  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6538  * @sax:  the SAX handler block
6539  * @userData: if using SAX, this pointer will be provided on callbacks.
6540  *
6541  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6542  *
6543  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6544  * compressed document is provided by default if found at compile-time.
6545  * It use the given SAX function block to handle the parsing callback.
6546  * If sax is NULL, fallback to the default DOM tree building routines.
6547  *
6548  * Returns the resulting document tree unless SAX is NULL or the document is
6549  *     not well formed.
6550  */
6551
6552 htmlDocPtr
6553 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6554                  void *userData) {
6555     htmlDocPtr ret;
6556     htmlParserCtxtPtr ctxt;
6557     htmlSAXHandlerPtr oldsax = NULL;
6558
6559     xmlInitParser();
6560
6561     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6562     if (ctxt == NULL) return(NULL);
6563     if (sax != NULL) {
6564         oldsax = ctxt->sax;
6565         ctxt->sax = sax;
6566         ctxt->userData = userData;
6567     }
6568
6569     htmlParseDocument(ctxt);
6570
6571     ret = ctxt->myDoc;
6572     if (sax != NULL) {
6573         ctxt->sax = oldsax;
6574         ctxt->userData = NULL;
6575     }
6576     htmlFreeParserCtxt(ctxt);
6577
6578     return(ret);
6579 }
6580
6581 /**
6582  * htmlParseFile:
6583  * @filename:  the filename
6584  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6585  *
6586  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6587  * compressed document is provided by default if found at compile-time.
6588  *
6589  * Returns the resulting document tree
6590  */
6591
6592 htmlDocPtr
6593 htmlParseFile(const char *filename, const char *encoding) {
6594     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6595 }
6596
6597 /**
6598  * htmlHandleOmittedElem:
6599  * @val:  int 0 or 1
6600  *
6601  * Set and return the previous value for handling HTML omitted tags.
6602  *
6603  * Returns the last value for 0 for no handling, 1 for auto insertion.
6604  */
6605
6606 int
6607 htmlHandleOmittedElem(int val) {
6608     int old = htmlOmittedDefaultValue;
6609
6610     htmlOmittedDefaultValue = val;
6611     return(old);
6612 }
6613
6614 /**
6615  * htmlElementAllowedHere:
6616  * @parent: HTML parent element
6617  * @elt: HTML element
6618  *
6619  * Checks whether an HTML element may be a direct child of a parent element.
6620  * Note - doesn't check for deprecated elements
6621  *
6622  * Returns 1 if allowed; 0 otherwise.
6623  */
6624 int
6625 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6626   const char** p ;
6627
6628   if ( ! elt || ! parent || ! parent->subelts )
6629         return 0 ;
6630
6631   for ( p = parent->subelts; *p; ++p )
6632     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6633       return 1 ;
6634
6635   return 0 ;
6636 }
6637 /**
6638  * htmlElementStatusHere:
6639  * @parent: HTML parent element
6640  * @elt: HTML element
6641  *
6642  * Checks whether an HTML element may be a direct child of a parent element.
6643  * and if so whether it is valid or deprecated.
6644  *
6645  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6646  */
6647 htmlStatus
6648 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6649   if ( ! parent || ! elt )
6650     return HTML_INVALID ;
6651   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6652     return HTML_INVALID ;
6653
6654   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6655 }
6656 /**
6657  * htmlAttrAllowed:
6658  * @elt: HTML element
6659  * @attr: HTML attribute
6660  * @legacy: whether to allow deprecated attributes
6661  *
6662  * Checks whether an attribute is valid for an element
6663  * Has full knowledge of Required and Deprecated attributes
6664  *
6665  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6666  */
6667 htmlStatus
6668 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6669   const char** p ;
6670
6671   if ( !elt || ! attr )
6672         return HTML_INVALID ;
6673
6674   if ( elt->attrs_req )
6675     for ( p = elt->attrs_req; *p; ++p)
6676       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6677         return HTML_REQUIRED ;
6678
6679   if ( elt->attrs_opt )
6680     for ( p = elt->attrs_opt; *p; ++p)
6681       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6682         return HTML_VALID ;
6683
6684   if ( legacy && elt->attrs_depr )
6685     for ( p = elt->attrs_depr; *p; ++p)
6686       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6687         return HTML_DEPRECATED ;
6688
6689   return HTML_INVALID ;
6690 }
6691 /**
6692  * htmlNodeStatus:
6693  * @node: an htmlNodePtr in a tree
6694  * @legacy: whether to allow deprecated elements (YES is faster here
6695  *      for Element nodes)
6696  *
6697  * Checks whether the tree node is valid.  Experimental (the author
6698  *     only uses the HTML enhancements in a SAX parser)
6699  *
6700  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6701  *      legacy allowed) or htmlElementStatusHere (otherwise).
6702  *      for Attribute nodes, a return from htmlAttrAllowed
6703  *      for other nodes, HTML_NA (no checks performed)
6704  */
6705 htmlStatus
6706 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6707   if ( ! node )
6708     return HTML_INVALID ;
6709
6710   switch ( node->type ) {
6711     case XML_ELEMENT_NODE:
6712       return legacy
6713         ? ( htmlElementAllowedHere (
6714                 htmlTagLookup(node->parent->name) , node->name
6715                 ) ? HTML_VALID : HTML_INVALID )
6716         : htmlElementStatusHere(
6717                 htmlTagLookup(node->parent->name) ,
6718                 htmlTagLookup(node->name) )
6719         ;
6720     case XML_ATTRIBUTE_NODE:
6721       return htmlAttrAllowed(
6722         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6723     default: return HTML_NA ;
6724   }
6725 }
6726 /************************************************************************
6727  *                                                                      *
6728  *      New set (2.6.0) of simpler and more flexible APIs               *
6729  *                                                                      *
6730  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named "dict" must be visible where the
 * macro is expanded; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and can be used safely inside if/else constructs.
 * The caller supplies the terminating semicolon.
 */
#define DICT_FREE(str)                                          \
    do {                                                        \
        if ((str) && ((!dict) ||                                \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
            xmlFree((char *)(str));                             \
    } while (0)
6742
6743 /**
6744  * htmlCtxtReset:
6745  * @ctxt: an HTML parser context
6746  *
6747  * Reset a parser context
6748  */
6749 void
6750 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6751 {
6752     xmlParserInputPtr input;
6753     xmlDictPtr dict;
6754
6755     if (ctxt == NULL)
6756         return;
6757
6758     xmlInitParser();
6759     dict = ctxt->dict;
6760
6761     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6762         xmlFreeInputStream(input);
6763     }
6764     ctxt->inputNr = 0;
6765     ctxt->input = NULL;
6766
6767     ctxt->spaceNr = 0;
6768     if (ctxt->spaceTab != NULL) {
6769         ctxt->spaceTab[0] = -1;
6770         ctxt->space = &ctxt->spaceTab[0];
6771     } else {
6772         ctxt->space = NULL;
6773     }
6774
6775
6776     ctxt->nodeNr = 0;
6777     ctxt->node = NULL;
6778
6779     ctxt->nameNr = 0;
6780     ctxt->name = NULL;
6781
6782     ctxt->nsNr = 0;
6783
6784     DICT_FREE(ctxt->version);
6785     ctxt->version = NULL;
6786     DICT_FREE(ctxt->encoding);
6787     ctxt->encoding = NULL;
6788     DICT_FREE(ctxt->directory);
6789     ctxt->directory = NULL;
6790     DICT_FREE(ctxt->extSubURI);
6791     ctxt->extSubURI = NULL;
6792     DICT_FREE(ctxt->extSubSystem);
6793     ctxt->extSubSystem = NULL;
6794     if (ctxt->myDoc != NULL)
6795         xmlFreeDoc(ctxt->myDoc);
6796     ctxt->myDoc = NULL;
6797
6798     ctxt->standalone = -1;
6799     ctxt->hasExternalSubset = 0;
6800     ctxt->hasPErefs = 0;
6801     ctxt->html = 1;
6802     ctxt->external = 0;
6803     ctxt->instate = XML_PARSER_START;
6804     ctxt->token = 0;
6805
6806     ctxt->wellFormed = 1;
6807     ctxt->nsWellFormed = 1;
6808     ctxt->disableSAX = 0;
6809     ctxt->valid = 1;
6810     ctxt->vctxt.userData = ctxt;
6811     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6812     ctxt->vctxt.error = xmlParserValidityError;
6813     ctxt->vctxt.warning = xmlParserValidityWarning;
6814     ctxt->record_info = 0;
6815     ctxt->checkIndex = 0;
6816     ctxt->endCheckState = 0;
6817     ctxt->inSubset = 0;
6818     ctxt->errNo = XML_ERR_OK;
6819     ctxt->depth = 0;
6820     ctxt->charset = XML_CHAR_ENCODING_NONE;
6821     ctxt->catalogs = NULL;
6822     xmlInitNodeInfoSeq(&ctxt->node_seq);
6823
6824     if (ctxt->attsDefault != NULL) {
6825         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6826         ctxt->attsDefault = NULL;
6827     }
6828     if (ctxt->attsSpecial != NULL) {
6829         xmlHashFree(ctxt->attsSpecial, NULL);
6830         ctxt->attsSpecial = NULL;
6831     }
6832
6833     ctxt->nbErrors = 0;
6834     ctxt->nbWarnings = 0;
6835     if (ctxt->lastError.code != XML_ERR_OK)
6836         xmlResetError(&ctxt->lastError);
6837 }
6838
6839 /**
6840  * htmlCtxtUseOptions:
6841  * @ctxt: an HTML parser context
6842  * @options:  a combination of htmlParserOption(s)
6843  *
6844  * Applies the options to the parser context
6845  *
6846  * Returns 0 in case of success, the set of unknown or unimplemented options
6847  *         in case of error.
6848  */
6849 int
6850 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6851 {
6852     if (ctxt == NULL)
6853         return(-1);
6854
6855     if (options & HTML_PARSE_NOWARNING) {
6856         ctxt->sax->warning = NULL;
6857         ctxt->vctxt.warning = NULL;
6858         options -= XML_PARSE_NOWARNING;
6859         ctxt->options |= XML_PARSE_NOWARNING;
6860     }
6861     if (options & HTML_PARSE_NOERROR) {
6862         ctxt->sax->error = NULL;
6863         ctxt->vctxt.error = NULL;
6864         ctxt->sax->fatalError = NULL;
6865         options -= XML_PARSE_NOERROR;
6866         ctxt->options |= XML_PARSE_NOERROR;
6867     }
6868     if (options & HTML_PARSE_PEDANTIC) {
6869         ctxt->pedantic = 1;
6870         options -= XML_PARSE_PEDANTIC;
6871         ctxt->options |= XML_PARSE_PEDANTIC;
6872     } else
6873         ctxt->pedantic = 0;
6874     if (options & XML_PARSE_NOBLANKS) {
6875         ctxt->keepBlanks = 0;
6876         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6877         options -= XML_PARSE_NOBLANKS;
6878         ctxt->options |= XML_PARSE_NOBLANKS;
6879     } else
6880         ctxt->keepBlanks = 1;
6881     if (options & HTML_PARSE_RECOVER) {
6882         ctxt->recovery = 1;
6883         options -= HTML_PARSE_RECOVER;
6884     } else
6885         ctxt->recovery = 0;
6886     if (options & HTML_PARSE_COMPACT) {
6887         ctxt->options |= HTML_PARSE_COMPACT;
6888         options -= HTML_PARSE_COMPACT;
6889     }
6890     if (options & XML_PARSE_HUGE) {
6891         ctxt->options |= XML_PARSE_HUGE;
6892         options -= XML_PARSE_HUGE;
6893     }
6894     if (options & HTML_PARSE_NODEFDTD) {
6895         ctxt->options |= HTML_PARSE_NODEFDTD;
6896         options -= HTML_PARSE_NODEFDTD;
6897     }
6898     if (options & HTML_PARSE_IGNORE_ENC) {
6899         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6900         options -= HTML_PARSE_IGNORE_ENC;
6901     }
6902     if (options & HTML_PARSE_NOIMPLIED) {
6903         ctxt->options |= HTML_PARSE_NOIMPLIED;
6904         options -= HTML_PARSE_NOIMPLIED;
6905     }
6906     ctxt->dictNames = 0;
6907     ctxt->linenumbers = 1;
6908     return (options);
6909 }
6910
6911 /**
6912  * htmlDoRead:
6913  * @ctxt:  an HTML parser context
6914  * @URL:  the base URL to use for the document
6915  * @encoding:  the document encoding, or NULL
6916  * @options:  a combination of htmlParserOption(s)
6917  * @reuse:  keep the context for reuse
6918  *
6919  * Common front-end for the htmlRead functions
6920  *
6921  * Returns the resulting document tree or NULL
6922  */
6923 static htmlDocPtr
6924 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6925           int options, int reuse)
6926 {
6927     htmlDocPtr ret;
6928
6929     htmlCtxtUseOptions(ctxt, options);
6930     ctxt->html = 1;
6931     if (encoding != NULL) {
6932         xmlCharEncodingHandlerPtr hdlr;
6933
6934         hdlr = xmlFindCharEncodingHandler(encoding);
6935         if (hdlr != NULL) {
6936             xmlSwitchToEncoding(ctxt, hdlr);
6937             if (ctxt->input->encoding != NULL)
6938               xmlFree((xmlChar *) ctxt->input->encoding);
6939             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6940         }
6941     }
6942     if ((URL != NULL) && (ctxt->input != NULL) &&
6943         (ctxt->input->filename == NULL))
6944         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6945     htmlParseDocument(ctxt);
6946     ret = ctxt->myDoc;
6947     ctxt->myDoc = NULL;
6948     if (!reuse) {
6949         if ((ctxt->dictNames) &&
6950             (ret != NULL) &&
6951             (ret->dict == ctxt->dict))
6952             ctxt->dict = NULL;
6953         xmlFreeParserCtxt(ctxt);
6954     }
6955     return (ret);
6956 }
6957
6958 /**
6959  * htmlReadDoc:
6960  * @cur:  a pointer to a zero terminated string
6961  * @URL:  the base URL to use for the document
6962  * @encoding:  the document encoding, or NULL
6963  * @options:  a combination of htmlParserOption(s)
6964  *
6965  * parse an XML in-memory document and build a tree.
6966  *
6967  * Returns the resulting document tree
6968  */
6969 htmlDocPtr
6970 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6971 {
6972     htmlParserCtxtPtr ctxt;
6973
6974     if (cur == NULL)
6975         return (NULL);
6976
6977     xmlInitParser();
6978     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6979     if (ctxt == NULL)
6980         return (NULL);
6981     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6982 }
6983
6984 /**
6985  * htmlReadFile:
6986  * @filename:  a file or URL
6987  * @encoding:  the document encoding, or NULL
6988  * @options:  a combination of htmlParserOption(s)
6989  *
6990  * parse an XML file from the filesystem or the network.
6991  *
6992  * Returns the resulting document tree
6993  */
6994 htmlDocPtr
6995 htmlReadFile(const char *filename, const char *encoding, int options)
6996 {
6997     htmlParserCtxtPtr ctxt;
6998
6999     xmlInitParser();
7000     ctxt = htmlCreateFileParserCtxt(filename, encoding);
7001     if (ctxt == NULL)
7002         return (NULL);
7003     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
7004 }
7005
7006 /**
7007  * htmlReadMemory:
7008  * @buffer:  a pointer to a char array
7009  * @size:  the size of the array
7010  * @URL:  the base URL to use for the document
7011  * @encoding:  the document encoding, or NULL
7012  * @options:  a combination of htmlParserOption(s)
7013  *
7014  * parse an XML in-memory document and build a tree.
7015  *
7016  * Returns the resulting document tree
7017  */
7018 htmlDocPtr
7019 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
7020 {
7021     htmlParserCtxtPtr ctxt;
7022
7023     xmlInitParser();
7024     ctxt = htmlCreateMemoryParserCtxt(buffer, size);
7025     if (ctxt == NULL)
7026         return (NULL);
7027     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7028 }
7029
7030 /**
7031  * htmlReadFd:
7032  * @fd:  an open file descriptor
7033  * @URL:  the base URL to use for the document
7034  * @encoding:  the document encoding, or NULL
7035  * @options:  a combination of htmlParserOption(s)
7036  *
7037  * parse an HTML from a file descriptor and build a tree.
7038  * NOTE that the file descriptor will not be closed when the
7039  *      reader is closed or reset.
7040  *
7041  * Returns the resulting document tree
7042  */
7043 htmlDocPtr
7044 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7045 {
7046     htmlParserCtxtPtr ctxt;
7047     xmlParserInputBufferPtr input;
7048     htmlParserInputPtr stream;
7049
7050     if (fd < 0)
7051         return (NULL);
7052
7053     xmlInitParser();
7054     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7055     if (input == NULL)
7056         return (NULL);
7057     input->closecallback = NULL;
7058     ctxt = htmlNewParserCtxt();
7059     if (ctxt == NULL) {
7060         xmlFreeParserInputBuffer(input);
7061         return (NULL);
7062     }
7063     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7064     if (stream == NULL) {
7065         xmlFreeParserInputBuffer(input);
7066         htmlFreeParserCtxt(ctxt);
7067         return (NULL);
7068     }
7069     inputPush(ctxt, stream);
7070     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7071 }
7072
7073 /**
7074  * htmlReadIO:
7075  * @ioread:  an I/O read function
7076  * @ioclose:  an I/O close function
7077  * @ioctx:  an I/O handler
7078  * @URL:  the base URL to use for the document
7079  * @encoding:  the document encoding, or NULL
7080  * @options:  a combination of htmlParserOption(s)
7081  *
7082  * parse an HTML document from I/O functions and source and build a tree.
7083  *
7084  * Returns the resulting document tree
7085  */
7086 htmlDocPtr
7087 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7088           void *ioctx, const char *URL, const char *encoding, int options)
7089 {
7090     htmlParserCtxtPtr ctxt;
7091     xmlParserInputBufferPtr input;
7092     xmlParserInputPtr stream;
7093
7094     if (ioread == NULL)
7095         return (NULL);
7096     xmlInitParser();
7097
7098     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7099                                          XML_CHAR_ENCODING_NONE);
7100     if (input == NULL) {
7101         if (ioclose != NULL)
7102             ioclose(ioctx);
7103         return (NULL);
7104     }
7105     ctxt = htmlNewParserCtxt();
7106     if (ctxt == NULL) {
7107         xmlFreeParserInputBuffer(input);
7108         return (NULL);
7109     }
7110     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7111     if (stream == NULL) {
7112         xmlFreeParserInputBuffer(input);
7113         xmlFreeParserCtxt(ctxt);
7114         return (NULL);
7115     }
7116     inputPush(ctxt, stream);
7117     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7118 }
7119
7120 /**
7121  * htmlCtxtReadDoc:
7122  * @ctxt:  an HTML parser context
7123  * @cur:  a pointer to a zero terminated string
7124  * @URL:  the base URL to use for the document
7125  * @encoding:  the document encoding, or NULL
7126  * @options:  a combination of htmlParserOption(s)
7127  *
7128  * parse an XML in-memory document and build a tree.
7129  * This reuses the existing @ctxt parser context
7130  *
7131  * Returns the resulting document tree
7132  */
7133 htmlDocPtr
7134 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7135                const char *URL, const char *encoding, int options)
7136 {
7137     if (cur == NULL)
7138         return (NULL);
7139     return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7140                                encoding, options));
7141 }
7142
7143 /**
7144  * htmlCtxtReadFile:
7145  * @ctxt:  an HTML parser context
7146  * @filename:  a file or URL
7147  * @encoding:  the document encoding, or NULL
7148  * @options:  a combination of htmlParserOption(s)
7149  *
7150  * parse an XML file from the filesystem or the network.
7151  * This reuses the existing @ctxt parser context
7152  *
7153  * Returns the resulting document tree
7154  */
7155 htmlDocPtr
7156 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7157                 const char *encoding, int options)
7158 {
7159     xmlParserInputPtr stream;
7160
7161     if (filename == NULL)
7162         return (NULL);
7163     if (ctxt == NULL)
7164         return (NULL);
7165     xmlInitParser();
7166
7167     htmlCtxtReset(ctxt);
7168
7169     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7170     if (stream == NULL) {
7171         return (NULL);
7172     }
7173     inputPush(ctxt, stream);
7174     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7175 }
7176
7177 /**
7178  * htmlCtxtReadMemory:
7179  * @ctxt:  an HTML parser context
7180  * @buffer:  a pointer to a char array
7181  * @size:  the size of the array
7182  * @URL:  the base URL to use for the document
7183  * @encoding:  the document encoding, or NULL
7184  * @options:  a combination of htmlParserOption(s)
7185  *
7186  * parse an XML in-memory document and build a tree.
7187  * This reuses the existing @ctxt parser context
7188  *
7189  * Returns the resulting document tree
7190  */
7191 htmlDocPtr
7192 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7193                   const char *URL, const char *encoding, int options)
7194 {
7195     xmlParserInputBufferPtr input;
7196     xmlParserInputPtr stream;
7197
7198     if (ctxt == NULL)
7199         return (NULL);
7200     if (buffer == NULL)
7201         return (NULL);
7202     xmlInitParser();
7203
7204     htmlCtxtReset(ctxt);
7205
7206     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7207     if (input == NULL) {
7208         return(NULL);
7209     }
7210
7211     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7212     if (stream == NULL) {
7213         xmlFreeParserInputBuffer(input);
7214         return(NULL);
7215     }
7216
7217     inputPush(ctxt, stream);
7218     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7219 }
7220
7221 /**
7222  * htmlCtxtReadFd:
7223  * @ctxt:  an HTML parser context
7224  * @fd:  an open file descriptor
7225  * @URL:  the base URL to use for the document
7226  * @encoding:  the document encoding, or NULL
7227  * @options:  a combination of htmlParserOption(s)
7228  *
7229  * parse an XML from a file descriptor and build a tree.
7230  * This reuses the existing @ctxt parser context
7231  *
7232  * Returns the resulting document tree
7233  */
7234 htmlDocPtr
7235 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7236               const char *URL, const char *encoding, int options)
7237 {
7238     xmlParserInputBufferPtr input;
7239     xmlParserInputPtr stream;
7240
7241     if (fd < 0)
7242         return (NULL);
7243     if (ctxt == NULL)
7244         return (NULL);
7245     xmlInitParser();
7246
7247     htmlCtxtReset(ctxt);
7248
7249
7250     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7251     if (input == NULL)
7252         return (NULL);
7253     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7254     if (stream == NULL) {
7255         xmlFreeParserInputBuffer(input);
7256         return (NULL);
7257     }
7258     inputPush(ctxt, stream);
7259     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7260 }
7261
7262 /**
7263  * htmlCtxtReadIO:
7264  * @ctxt:  an HTML parser context
7265  * @ioread:  an I/O read function
7266  * @ioclose:  an I/O close function
7267  * @ioctx:  an I/O handler
7268  * @URL:  the base URL to use for the document
7269  * @encoding:  the document encoding, or NULL
7270  * @options:  a combination of htmlParserOption(s)
7271  *
7272  * parse an HTML document from I/O functions and source and build a tree.
7273  * This reuses the existing @ctxt parser context
7274  *
7275  * Returns the resulting document tree
7276  */
7277 htmlDocPtr
7278 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7279               xmlInputCloseCallback ioclose, void *ioctx,
7280               const char *URL,
7281               const char *encoding, int options)
7282 {
7283     xmlParserInputBufferPtr input;
7284     xmlParserInputPtr stream;
7285
7286     if (ioread == NULL)
7287         return (NULL);
7288     if (ctxt == NULL)
7289         return (NULL);
7290     xmlInitParser();
7291
7292     htmlCtxtReset(ctxt);
7293
7294     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7295                                          XML_CHAR_ENCODING_NONE);
7296     if (input == NULL) {
7297         if (ioclose != NULL)
7298             ioclose(ioctx);
7299         return (NULL);
7300     }
7301     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7302     if (stream == NULL) {
7303         xmlFreeParserInputBuffer(input);
7304         return (NULL);
7305     }
7306     inputPush(ctxt, stream);
7307     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7308 }
7309
7310 #endif /* LIBXML_HTML_ENABLED */