libs/xml2/HTMLtree.c

   1 /*
   2  * HTMLtree.c : implementation of access function for an HTML tree.
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9
  10 #define IN_LIBXML
  11 #include "libxml.h"
  12 #ifdef LIBXML_HTML_ENABLED
  13
  14 #include <string.h> /* for memset() only ! */
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17
  18 #include <libxml/xmlmemory.h>
  19 #include <libxml/HTMLparser.h>
  20 #include <libxml/HTMLtree.h>
  21 #include <libxml/entities.h>
  22 #include <libxml/valid.h>
  23 #include <libxml/xmlerror.h>
  24 #include <libxml/parserInternals.h>
  25 #include <libxml/globals.h>
  26 #include <libxml/uri.h>
  27
  28 #include "buf.h"
  29
  30 /************************************************************************
  31  *                                                                      *
  32  *              Getting/Setting encoding meta tags                      *
  33  *                                                                      *
  34  ************************************************************************/
  35
  36 /**
  37  * htmlGetMetaEncoding:
  38  * @doc:  the document
  39  *
  40  * Encoding definition lookup in the Meta tags
  41  *
  42  * Returns the current encoding as flagged in the HTML source
  43  */
  44 const xmlChar *
  45 htmlGetMetaEncoding(htmlDocPtr doc) {
  46     htmlNodePtr cur;
  47     const xmlChar *content;
  48     const xmlChar *encoding;
  49
  50     if (doc == NULL)
  51         return(NULL);
  52     cur = doc->children;
  53
  54     /*
  55      * Search the html
  56      */
  57     while (cur != NULL) {
  58         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  59             if (xmlStrEqual(cur->name, BAD_CAST"html"))
  60                 break;
  61             if (xmlStrEqual(cur->name, BAD_CAST"head"))
  62                 goto found_head;
  63             if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  64                 goto found_meta;
  65         }
  66         cur = cur->next;
  67     }
  68     if (cur == NULL)
  69         return(NULL);
  70     cur = cur->children;
  71
  72     /*
  73      * Search the head
  74      */
  75     while (cur != NULL) {
  76         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  77             if (xmlStrEqual(cur->name, BAD_CAST"head"))
  78                 break;
  79             if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  80                 goto found_meta;
  81         }
  82         cur = cur->next;
  83     }
  84     if (cur == NULL)
  85         return(NULL);
  86 found_head:
  87     cur = cur->children;
  88
  89     /*
  90      * Search the meta elements
  91      */
  92 found_meta:
  93     while (cur != NULL) {
  94         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  95             if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
  96                 xmlAttrPtr attr = cur->properties;
  97                 int http;
  98                 const xmlChar *value;
  99
 100                 content = NULL;
 101                 http = 0;
 102                 while (attr != NULL) {
 103                     if ((attr->children != NULL) &&
 104                         (attr->children->type == XML_TEXT_NODE) &&
 105                         (attr->children->next == NULL)) {
 106                         value = attr->children->content;
 107                         if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 108                          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 109                             http = 1;
 110                         else if ((value != NULL)
 111                          && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 112                             content = value;
 113                         if ((http != 0) && (content != NULL))
 114                             goto found_content;
 115                     }
 116                     attr = attr->next;
 117                 }
 118             }
 119         }
 120         cur = cur->next;
 121     }
 122     return(NULL);
 123
 124 found_content:
 125     encoding = xmlStrstr(content, BAD_CAST"charset=");
 126     if (encoding == NULL)
 127         encoding = xmlStrstr(content, BAD_CAST"Charset=");
 128     if (encoding == NULL)
 129         encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
 130     if (encoding != NULL) {
 131         encoding += 8;
 132     } else {
 133         encoding = xmlStrstr(content, BAD_CAST"charset =");
 134         if (encoding == NULL)
 135             encoding = xmlStrstr(content, BAD_CAST"Charset =");
 136         if (encoding == NULL)
 137             encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
 138         if (encoding != NULL)
 139             encoding += 9;
 140     }
 141     if (encoding != NULL) {
 142         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
 143     }
 144     return(encoding);
 145 }
 146
 147 /**
 148  * htmlSetMetaEncoding:
 149  * @doc:  the document
 150  * @encoding:  the encoding string
 151  *
 152  * Sets the current encoding in the Meta tags
 153  * NOTE: this will not change the document content encoding, just
 154  * the META flag associated.
 155  *
 156  * Returns 0 in case of success and -1 in case of error
 157  */
 158 int
 159 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
 160     htmlNodePtr cur, meta = NULL, head = NULL;
 161     const xmlChar *content = NULL;
 162     char newcontent[100];
 163
 164     newcontent[0] = 0;
 165
 166     if (doc == NULL)
 167         return(-1);
 168
 169     /* html isn't a real encoding it's just libxml2 way to get entities */
 170     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
 171         return(-1);
 172
 173     if (encoding != NULL) {
 174         snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
 175                 (char *)encoding);
 176         newcontent[sizeof(newcontent) - 1] = 0;
 177     }
 178
 179     cur = doc->children;
 180
 181     /*
 182      * Search the html
 183      */
 184     while (cur != NULL) {
 185         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 186             if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
 187                 break;
 188             if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 189                 goto found_head;
 190             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
 191                 goto found_meta;
 192         }
 193         cur = cur->next;
 194     }
 195     if (cur == NULL)
 196         return(-1);
 197     cur = cur->children;
 198
 199     /*
 200      * Search the head
 201      */
 202     while (cur != NULL) {
 203         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 204             if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 205                 break;
 206             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 207                 head = cur->parent;
 208                 goto found_meta;
 209             }
 210         }
 211         cur = cur->next;
 212     }
 213     if (cur == NULL)
 214         return(-1);
 215 found_head:
 216     head = cur;
 217     if (cur->children == NULL)
 218         goto create;
 219     cur = cur->children;
 220
 221 found_meta:
 222     /*
 223      * Search and update all the remaining the meta elements carrying
 224      * encoding information
 225      */
 226     while (cur != NULL) {
 227         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 228             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 229                 xmlAttrPtr attr = cur->properties;
 230                 int http;
 231                 const xmlChar *value;
 232
 233                 content = NULL;
 234                 http = 0;
 235                 while (attr != NULL) {
 236                     if ((attr->children != NULL) &&
 237                         (attr->children->type == XML_TEXT_NODE) &&
 238                         (attr->children->next == NULL)) {
 239                         value = attr->children->content;
 240                         if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 241                          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 242                             http = 1;
 243                         else
 244                         {
 245                            if ((value != NULL) &&
 246                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 247                                content = value;
 248                         }
 249                         if ((http != 0) && (content != NULL))
 250                             break;
 251                     }
 252                     attr = attr->next;
 253                 }
 254                 if ((http != 0) && (content != NULL)) {
 255                     meta = cur;
 256                     break;
 257                 }
 258
 259             }
 260         }
 261         cur = cur->next;
 262     }
 263 create:
 264     if (meta == NULL) {
 265         if ((encoding != NULL) && (head != NULL)) {
 266             /*
 267              * Create a new Meta element with the right attributes
 268              */
 269
 270             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
 271             if (head->children == NULL)
 272                 xmlAddChild(head, meta);
 273             else
 274                 xmlAddPrevSibling(head->children, meta);
 275             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
 276             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 277         }
 278     } else {
 279         /* remove the meta tag if NULL is passed */
 280         if (encoding == NULL) {
 281             xmlUnlinkNode(meta);
 282             xmlFreeNode(meta);
 283         }
 284         /* change the document only if there is a real encoding change */
 285         else if (xmlStrcasestr(content, encoding) == NULL) {
 286             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 287         }
 288     }
 289
 290
 291     return(0);
 292 }
 293
 294 /**
 295  * booleanHTMLAttrs:
 296  *
 297  * These are the HTML attributes which will be output
 298  * in minimized form, i.e. <option selected="selected"> will be
 299  * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 300  *
 301  */
 302 static const char* const htmlBooleanAttrs[] = {
 303   "checked", "compact", "declare", "defer", "disabled", "ismap",
 304   "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
 305   "selected", NULL
 306 };
 307
 308
 309 /**
 310  * htmlIsBooleanAttr:
 311  * @name:  the name of the attribute to check
 312  *
 313  * Determine if a given attribute is a boolean attribute.
 314  *
 315  * returns: false if the attribute is not boolean, true otherwise.
 316  */
 317 int
 318 htmlIsBooleanAttr(const xmlChar *name)
 319 {
 320     int i = 0;
 321
 322     while (htmlBooleanAttrs[i] != NULL) {
 323         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
 324             return 1;
 325         i++;
 326     }
 327     return 0;
 328 }
 329
 330 #ifdef LIBXML_OUTPUT_ENABLED
 331 /*
 332  * private routine exported from xmlIO.c
 333  */
 334 xmlOutputBufferPtr
 335 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
 336 /************************************************************************
 337  *                                                                      *
 338  *                      Output error handlers                           *
 339  *                                                                      *
 340  ************************************************************************/
 341 /**
 342  * htmlSaveErrMemory:
 343  * @extra:  extra information
 344  *
 345  * Handle an out of memory condition
 346  */
 347 static void
 348 htmlSaveErrMemory(const char *extra)
 349 {
 350     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
 351 }
 352
 353 /**
 354  * htmlSaveErr:
 355  * @code:  the error number
 356  * @node:  the location of the error.
 357  * @extra:  extra information
 358  *
 359  * Handle an out of memory condition
 360  */
 361 static void
 362 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
 363 {
 364     const char *msg = NULL;
 365
 366     switch(code) {
 367         case XML_SAVE_NOT_UTF8:
 368             msg = "string is not in UTF-8\n";
 369             break;
 370         case XML_SAVE_CHAR_INVALID:
 371             msg = "invalid character value\n";
 372             break;
 373         case XML_SAVE_UNKNOWN_ENCODING:
 374             msg = "unknown encoding %s\n";
 375             break;
 376         case XML_SAVE_NO_DOCTYPE:
 377             msg = "HTML has no DOCTYPE\n";
 378             break;
 379         default:
 380             msg = "unexpected error number\n";
 381     }
 382     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
 383 }
 384
 385 /************************************************************************
 386  *                                                                      *
 387  *              Dumping HTML tree content to a simple buffer            *
 388  *                                                                      *
 389  ************************************************************************/
 390
 391 /**
 392  * htmlBufNodeDumpFormat:
 393  * @buf:  the xmlBufPtr output
 394  * @doc:  the document
 395  * @cur:  the current node
 396  * @format:  should formatting spaces been added
 397  *
 398  * Dump an HTML node, recursive behaviour,children are printed too.
 399  *
 400  * Returns the number of byte written or -1 in case of error
 401  */
 402 static size_t
 403 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
 404                    int format) {
 405     size_t use;
 406     int ret;
 407     xmlOutputBufferPtr outbuf;
 408
 409     if (cur == NULL) {
 410         return (-1);
 411     }
 412     if (buf == NULL) {
 413         return (-1);
 414     }
 415     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
 416     if (outbuf == NULL) {
 417         htmlSaveErrMemory("allocating HTML output buffer");
 418         return (-1);
 419     }
 420     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
 421     outbuf->buffer = buf;
 422     outbuf->encoder = NULL;
 423     outbuf->writecallback = NULL;
 424     outbuf->closecallback = NULL;
 425     outbuf->context = NULL;
 426     outbuf->written = 0;
 427
 428     use = xmlBufUse(buf);
 429     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
 430     xmlFree(outbuf);
 431     ret = xmlBufUse(buf) - use;
 432     return (ret);
 433 }
 434
 435 /**
 436  * htmlNodeDump:
 437  * @buf:  the HTML buffer output
 438  * @doc:  the document
 439  * @cur:  the current node
 440  *
 441  * Dump an HTML node, recursive behaviour,children are printed too,
 442  * and formatting returns are added.
 443  *
 444  * Returns the number of byte written or -1 in case of error
 445  */
 446 int
 447 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
 448     xmlBufPtr buffer;
 449     size_t ret;
 450
 451     if ((buf == NULL) || (cur == NULL))
 452         return(-1);
 453
 454     xmlInitParser();
 455     buffer = xmlBufFromBuffer(buf);
 456     if (buffer == NULL)
 457         return(-1);
 458
 459     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
 460
 461     xmlBufBackToBuffer(buffer);
 462
 463     if (ret > INT_MAX)
 464         return(-1);
 465     return((int) ret);
 466 }
 467
 468 /**
 469  * htmlNodeDumpFileFormat:
 470  * @out:  the FILE pointer
 471  * @doc:  the document
 472  * @cur:  the current node
 473  * @encoding: the document encoding
 474  * @format:  should formatting spaces been added
 475  *
 476  * Dump an HTML node, recursive behaviour,children are printed too.
 477  *
 478  * TODO: if encoding == NULL try to save in the doc encoding
 479  *
 480  * returns: the number of byte written or -1 in case of failure.
 481  */
 482 int
 483 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
 484                        xmlNodePtr cur, const char *encoding, int format) {
 485     xmlOutputBufferPtr buf;
 486     xmlCharEncodingHandlerPtr handler = NULL;
 487     int ret;
 488
 489     xmlInitParser();
 490
 491     if (encoding != NULL) {
 492         xmlCharEncoding enc;
 493
 494         enc = xmlParseCharEncoding(encoding);
 495         if (enc != XML_CHAR_ENCODING_UTF8) {
 496             handler = xmlFindCharEncodingHandler(encoding);
 497             if (handler == NULL)
 498                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 499         }
 500     } else {
 501         /*
 502          * Fallback to HTML or ASCII when the encoding is unspecified
 503          */
 504         if (handler == NULL)
 505             handler = xmlFindCharEncodingHandler("HTML");
 506         if (handler == NULL)
 507             handler = xmlFindCharEncodingHandler("ascii");
 508     }
 509
 510     /*
 511      * save the content to a temp buffer.
 512      */
 513     buf = xmlOutputBufferCreateFile(out, handler);
 514     if (buf == NULL) return(0);
 515
 516     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
 517
 518     ret = xmlOutputBufferClose(buf);
 519     return(ret);
 520 }
 521
 522 /**
 523  * htmlNodeDumpFile:
 524  * @out:  the FILE pointer
 525  * @doc:  the document
 526  * @cur:  the current node
 527  *
 528  * Dump an HTML node, recursive behaviour,children are printed too,
 529  * and formatting returns are added.
 530  */
 531 void
 532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
 533     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
 534 }
 535
 536 /**
 537  * htmlDocDumpMemoryFormat:
 538  * @cur:  the document
 539  * @mem:  OUT: the memory pointer
 540  * @size:  OUT: the memory length
 541  * @format:  should formatting spaces been added
 542  *
 543  * Dump an HTML document in memory and return the xmlChar * and it's size.
 544  * It's up to the caller to free the memory.
 545  */
 546 void
 547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
 548     xmlOutputBufferPtr buf;
 549     xmlCharEncodingHandlerPtr handler = NULL;
 550     const char *encoding;
 551
 552     xmlInitParser();
 553
 554     if ((mem == NULL) || (size == NULL))
 555         return;
 556     if (cur == NULL) {
 557         *mem = NULL;
 558         *size = 0;
 559         return;
 560     }
 561
 562     encoding = (const char *) htmlGetMetaEncoding(cur);
 563
 564     if (encoding != NULL) {
 565         xmlCharEncoding enc;
 566
 567         enc = xmlParseCharEncoding(encoding);
 568         if (enc != XML_CHAR_ENCODING_UTF8) {
 569             handler = xmlFindCharEncodingHandler(encoding);
 570             if (handler == NULL)
 571                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 572
 573         }
 574     } else {
 575         /*
 576          * Fallback to HTML or ASCII when the encoding is unspecified
 577          */
 578         if (handler == NULL)
 579             handler = xmlFindCharEncodingHandler("HTML");
 580         if (handler == NULL)
 581             handler = xmlFindCharEncodingHandler("ascii");
 582     }
 583
 584     buf = xmlAllocOutputBufferInternal(handler);
 585     if (buf == NULL) {
 586         *mem = NULL;
 587         *size = 0;
 588         return;
 589     }
 590
 591     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
 592
 593     xmlOutputBufferFlush(buf);
 594     if (buf->conv != NULL) {
 595         *size = xmlBufUse(buf->conv);
 596         *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
 597     } else {
 598         *size = xmlBufUse(buf->buffer);
 599         *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
 600     }
 601     (void)xmlOutputBufferClose(buf);
 602 }
 603
 604 /**
 605  * htmlDocDumpMemory:
 606  * @cur:  the document
 607  * @mem:  OUT: the memory pointer
 608  * @size:  OUT: the memory length
 609  *
 610  * Dump an HTML document in memory and return the xmlChar * and it's size.
 611  * It's up to the caller to free the memory.
 612  */
 613 void
 614 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
 615         htmlDocDumpMemoryFormat(cur, mem, size, 1);
 616 }
 617
 618
 619 /************************************************************************
 620  *                                                                      *
 621  *              Dumping HTML tree content to an I/O output buffer       *
 622  *                                                                      *
 623  ************************************************************************/
 624
 625 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
 626
 627 /**
 628  * htmlDtdDumpOutput:
 629  * @buf:  the HTML buffer output
 630  * @doc:  the document
 631  * @encoding:  the encoding string
 632  *
 633  * TODO: check whether encoding is needed
 634  *
 635  * Dump the HTML document DTD, if any.
 636  */
 637 static void
 638 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 639                   const char *encoding ATTRIBUTE_UNUSED) {
 640     xmlDtdPtr cur = doc->intSubset;
 641
 642     if (cur == NULL) {
 643         htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
 644         return;
 645     }
 646     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
 647     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 648     if (cur->ExternalID != NULL) {
 649         xmlOutputBufferWriteString(buf, " PUBLIC ");
 650         xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
 651         if (cur->SystemID != NULL) {
 652             xmlOutputBufferWriteString(buf, " ");
 653             xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 654         }
 655     } else if (cur->SystemID != NULL &&
 656                xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
 657         xmlOutputBufferWriteString(buf, " SYSTEM ");
 658         xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 659     }
 660     xmlOutputBufferWriteString(buf, ">\n");
 661 }
 662
 663 /**
 664  * htmlAttrDumpOutput:
 665  * @buf:  the HTML buffer output
 666  * @doc:  the document
 667  * @cur:  the attribute pointer
 668  *
 669  * Dump an HTML attribute
 670  */
 671 static void
 672 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
 673     xmlChar *value;
 674
 675     /*
 676      * The html output method should not escape a & character
 677      * occurring in an attribute value immediately followed by
 678      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
 679      * This is implemented in xmlEncodeEntitiesReentrant
 680      */
 681
 682     if (cur == NULL) {
 683         return;
 684     }
 685     xmlOutputBufferWriteString(buf, " ");
 686     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 687         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 688         xmlOutputBufferWriteString(buf, ":");
 689     }
 690     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 691     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
 692         value = xmlNodeListGetString(doc, cur->children, 0);
 693         if (value) {
 694             xmlOutputBufferWriteString(buf, "=");
 695             if ((cur->ns == NULL) && (cur->parent != NULL) &&
 696                 (cur->parent->ns == NULL) &&
 697                 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
 698                  (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
 699                  (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
 700                  ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
 701                   (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
 702                 xmlChar *escaped;
 703                 xmlChar *tmp = value;
 704
 705                 while (IS_BLANK_CH(*tmp)) tmp++;
 706
 707                 /*
 708                  * the < and > have already been escaped at the entity level
 709                  * And doing so here breaks server side includes
 710                  */
 711                 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
 712                 if (escaped != NULL) {
 713                     xmlBufWriteQuotedString(buf->buffer, escaped);
 714                     xmlFree(escaped);
 715                 } else {
 716                     xmlBufWriteQuotedString(buf->buffer, value);
 717                 }
 718             } else {
 719                 xmlBufWriteQuotedString(buf->buffer, value);
 720             }
 721             xmlFree(value);
 722         } else  {
 723             xmlOutputBufferWriteString(buf, "=\"\"");
 724         }
 725     }
 726 }
 727
 728 /**
 729  * htmlNodeDumpFormatOutput:
 730  * @buf:  the HTML buffer output
 731  * @doc:  the document
 732  * @cur:  the current node
 733  * @encoding:  the encoding string (unused)
 734  * @format:  should formatting spaces been added
 735  *
 736  * Dump an HTML node, recursive behaviour,children are printed too.
 737  */
 738 void
 739 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 740                          xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
 741                          int format) {
 742     xmlNodePtr root, parent;
 743     xmlAttrPtr attr;
 744     const htmlElemDesc * info;
 745
 746     xmlInitParser();
 747
 748     if ((cur == NULL) || (buf == NULL)) {
 749         return;
 750     }
 751
 752     root = cur;
 753     parent = cur->parent;
 754     while (1) {
 755         switch (cur->type) {
 756         case XML_HTML_DOCUMENT_NODE:
 757         case XML_DOCUMENT_NODE:
 758             if (((xmlDocPtr) cur)->intSubset != NULL) {
 759                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
 760             }
 761             if (cur->children != NULL) {
 762                 /* Always validate cur->parent when descending. */
 763                 if (cur->parent == parent) {
 764                     parent = cur;
 765                     cur = cur->children;
 766                     continue;
 767                 }
 768             } else {
 769                 xmlOutputBufferWriteString(buf, "\n");
 770             }
 771             break;
 772
 773         case XML_ELEMENT_NODE:
 774             /*
 775              * Some users like lxml are known to pass nodes with a corrupted
 776              * tree structure. Fall back to a recursive call to handle this
 777              * case.
 778              */
 779             if ((cur->parent != parent) && (cur->children != NULL)) {
 780                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
 781                 break;
 782             }
 783
 784             /*
 785              * Get specific HTML info for that node.
 786              */
 787             if (cur->ns == NULL)
 788                 info = htmlTagLookup(cur->name);
 789             else
 790                 info = NULL;
 791
 792             xmlOutputBufferWriteString(buf, "<");
 793             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 794                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 795                 xmlOutputBufferWriteString(buf, ":");
 796             }
 797             xmlOutputBufferWriteString(buf, (const char *)cur->name);
 798             if (cur->nsDef)
 799                 xmlNsListDumpOutput(buf, cur->nsDef);
 800             attr = cur->properties;
 801             while (attr != NULL) {
 802                 htmlAttrDumpOutput(buf, doc, attr);
 803                 attr = attr->next;
 804             }
 805
 806             if ((info != NULL) && (info->empty)) {
 807                 xmlOutputBufferWriteString(buf, ">");
 808             } else if (cur->children == NULL) {
 809                 if ((info != NULL) && (info->saveEndTag != 0) &&
 810                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
 811                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
 812                     xmlOutputBufferWriteString(buf, ">");
 813                 } else {
 814                     xmlOutputBufferWriteString(buf, "></");
 815                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 816                         xmlOutputBufferWriteString(buf,
 817                                 (const char *)cur->ns->prefix);
 818                         xmlOutputBufferWriteString(buf, ":");
 819                     }
 820                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 821                     xmlOutputBufferWriteString(buf, ">");
 822                 }
 823             } else {
 824                 xmlOutputBufferWriteString(buf, ">");
 825                 if ((format) && (info != NULL) && (!info->isinline) &&
 826                     (cur->children->type != HTML_TEXT_NODE) &&
 827                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
 828                     (cur->children != cur->last) &&
 829                     (cur->name != NULL) &&
 830                     (cur->name[0] != 'p')) /* p, pre, param */
 831                     xmlOutputBufferWriteString(buf, "\n");
 832                 parent = cur;
 833                 cur = cur->children;
 834                 continue;
 835             }
 836
 837             if ((format) && (cur->next != NULL) &&
 838                 (info != NULL) && (!info->isinline)) {
 839                 if ((cur->next->type != HTML_TEXT_NODE) &&
 840                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
 841                     (parent != NULL) &&
 842                     (parent->name != NULL) &&
 843                     (parent->name[0] != 'p')) /* p, pre, param */
 844                     xmlOutputBufferWriteString(buf, "\n");
 845             }
 846
 847             break;
 848
 849         case XML_ATTRIBUTE_NODE:
 850             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
 851             break;
 852
 853         case HTML_TEXT_NODE:
 854             if (cur->content == NULL)
 855                 break;
 856             if (((cur->name == (const xmlChar *)xmlStringText) ||
 857                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
 858                 ((parent == NULL) ||
 859                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
 860                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
 861                 xmlChar *buffer;
 862
 863                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
 864                 if (buffer != NULL) {
 865                     xmlOutputBufferWriteString(buf, (const char *)buffer);
 866                     xmlFree(buffer);
 867                 }
 868             } else {
 869                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 870             }
 871             break;
 872
 873         case HTML_COMMENT_NODE:
 874             if (cur->content != NULL) {
 875                 xmlOutputBufferWriteString(buf, "<!--");
 876                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 877                 xmlOutputBufferWriteString(buf, "-->");
 878             }
 879             break;
 880
 881         case HTML_PI_NODE:
 882             if (cur->name != NULL) {
 883                 xmlOutputBufferWriteString(buf, "<?");
 884                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
 885                 if (cur->content != NULL) {
 886                     xmlOutputBufferWriteString(buf, " ");
 887                     xmlOutputBufferWriteString(buf,
 888                             (const char *)cur->content);
 889                 }
 890                 xmlOutputBufferWriteString(buf, ">");
 891             }
 892             break;
 893
 894         case HTML_ENTITY_REF_NODE:
 895             xmlOutputBufferWriteString(buf, "&");
 896             xmlOutputBufferWriteString(buf, (const char *)cur->name);
 897             xmlOutputBufferWriteString(buf, ";");
 898             break;
 899
 900         case HTML_PRESERVE_NODE:
 901             if (cur->content != NULL) {
 902                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 903             }
 904             break;
 905
 906         default:
 907             break;
 908         }
 909
 910         while (1) {
 911             if (cur == root)
 912                 return;
 913             if (cur->next != NULL) {
 914                 cur = cur->next;
 915                 break;
 916             }
 917
 918             cur = parent;
 919             /* cur->parent was validated when descending. */
 920             parent = cur->parent;
 921
 922             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
 923                 (cur->type == XML_DOCUMENT_NODE)) {
 924                 xmlOutputBufferWriteString(buf, "\n");
 925             } else {
 926                 if ((format) && (cur->ns == NULL))
 927                     info = htmlTagLookup(cur->name);
 928                 else
 929                     info = NULL;
 930
 931                 if ((format) && (info != NULL) && (!info->isinline) &&
 932                     (cur->last->type != HTML_TEXT_NODE) &&
 933                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
 934                     (cur->children != cur->last) &&
 935                     (cur->name != NULL) &&
 936                     (cur->name[0] != 'p')) /* p, pre, param */
 937                     xmlOutputBufferWriteString(buf, "\n");
 938
 939                 xmlOutputBufferWriteString(buf, "</");
 940                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 941                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 942                     xmlOutputBufferWriteString(buf, ":");
 943                 }
 944                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
 945                 xmlOutputBufferWriteString(buf, ">");
 946
 947                 if ((format) && (info != NULL) && (!info->isinline) &&
 948                     (cur->next != NULL)) {
 949                     if ((cur->next->type != HTML_TEXT_NODE) &&
 950                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
 951                         (parent != NULL) &&
 952                         (parent->name != NULL) &&
 953                         (parent->name[0] != 'p')) /* p, pre, param */
 954                         xmlOutputBufferWriteString(buf, "\n");
 955                 }
 956             }
 957         }
 958     }
 959 }
 960
 961 /**
 962  * htmlNodeDumpOutput:
 963  * @buf:  the HTML buffer output
 964  * @doc:  the document
 965  * @cur:  the current node
 966  * @encoding:  the encoding string (unused)
 967  *
 968  * Dump an HTML node, recursive behaviour,children are printed too,
 969  * and formatting returns/spaces are added.
 970  */
 971 void
 972 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 973                    xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
 974     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
 975 }
 976
 977 /**
 978  * htmlDocContentDumpFormatOutput:
 979  * @buf:  the HTML buffer output
 980  * @cur:  the document
 981  * @encoding:  the encoding string (unused)
 982  * @format:  should formatting spaces been added
 983  *
 984  * Dump an HTML document.
 985  */
 986 void
 987 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
 988                                const char *encoding ATTRIBUTE_UNUSED,
 989                                int format) {
 990     int type = 0;
 991     if (cur) {
 992         type = cur->type;
 993         cur->type = XML_HTML_DOCUMENT_NODE;
 994     }
 995     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
 996     if (cur)
 997         cur->type = (xmlElementType) type;
 998 }
 999
1000 /**
1001  * htmlDocContentDumpOutput:
1002  * @buf:  the HTML buffer output
1003  * @cur:  the document
1004  * @encoding:  the encoding string (unused)
1005  *
1006  * Dump an HTML document. Formatting return/spaces are added.
1007  */
1008 void
1009 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1010                          const char *encoding ATTRIBUTE_UNUSED) {
1011     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1012 }
1013
1014 /************************************************************************
1015  *                                                                      *
1016  *              Saving functions front-ends                             *
1017  *                                                                      *
1018  ************************************************************************/
1019
1020 /**
1021  * htmlDocDump:
1022  * @f:  the FILE*
1023  * @cur:  the document
1024  *
1025  * Dump an HTML document to an open FILE.
1026  *
1027  * returns: the number of byte written or -1 in case of failure.
1028  */
1029 int
1030 htmlDocDump(FILE *f, xmlDocPtr cur) {
1031     xmlOutputBufferPtr buf;
1032     xmlCharEncodingHandlerPtr handler = NULL;
1033     const char *encoding;
1034     int ret;
1035
1036     xmlInitParser();
1037
1038     if ((cur == NULL) || (f == NULL)) {
1039         return(-1);
1040     }
1041
1042     encoding = (const char *) htmlGetMetaEncoding(cur);
1043
1044     if (encoding != NULL) {
1045         xmlCharEncoding enc;
1046
1047         enc = xmlParseCharEncoding(encoding);
1048         if (enc != XML_CHAR_ENCODING_UTF8) {
1049             handler = xmlFindCharEncodingHandler(encoding);
1050             if (handler == NULL)
1051                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1052         }
1053     } else {
1054         /*
1055          * Fallback to HTML or ASCII when the encoding is unspecified
1056          */
1057         if (handler == NULL)
1058             handler = xmlFindCharEncodingHandler("HTML");
1059         if (handler == NULL)
1060             handler = xmlFindCharEncodingHandler("ascii");
1061     }
1062
1063     buf = xmlOutputBufferCreateFile(f, handler);
1064     if (buf == NULL) return(-1);
1065     htmlDocContentDumpOutput(buf, cur, NULL);
1066
1067     ret = xmlOutputBufferClose(buf);
1068     return(ret);
1069 }
1070
1071 /**
1072  * htmlSaveFile:
1073  * @filename:  the filename (or URL)
1074  * @cur:  the document
1075  *
1076  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1077  * used.
1078  * returns: the number of byte written or -1 in case of failure.
1079  */
1080 int
1081 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1082     xmlOutputBufferPtr buf;
1083     xmlCharEncodingHandlerPtr handler = NULL;
1084     const char *encoding;
1085     int ret;
1086
1087     if ((cur == NULL) || (filename == NULL))
1088         return(-1);
1089
1090     xmlInitParser();
1091
1092     encoding = (const char *) htmlGetMetaEncoding(cur);
1093
1094     if (encoding != NULL) {
1095         xmlCharEncoding enc;
1096
1097         enc = xmlParseCharEncoding(encoding);
1098         if (enc != XML_CHAR_ENCODING_UTF8) {
1099             handler = xmlFindCharEncodingHandler(encoding);
1100             if (handler == NULL)
1101                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1102         }
1103     } else {
1104         /*
1105          * Fallback to HTML or ASCII when the encoding is unspecified
1106          */
1107         if (handler == NULL)
1108             handler = xmlFindCharEncodingHandler("HTML");
1109         if (handler == NULL)
1110             handler = xmlFindCharEncodingHandler("ascii");
1111     }
1112
1113     /*
1114      * save the content to a temp buffer.
1115      */
1116     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1117     if (buf == NULL) return(0);
1118
1119     htmlDocContentDumpOutput(buf, cur, NULL);
1120
1121     ret = xmlOutputBufferClose(buf);
1122     return(ret);
1123 }
1124
1125 /**
1126  * htmlSaveFileFormat:
1127  * @filename:  the filename
1128  * @cur:  the document
1129  * @format:  should formatting spaces been added
1130  * @encoding: the document encoding
1131  *
1132  * Dump an HTML document to a file using a given encoding.
1133  *
1134  * returns: the number of byte written or -1 in case of failure.
1135  */
1136 int
1137 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1138                    const char *encoding, int format) {
1139     xmlOutputBufferPtr buf;
1140     xmlCharEncodingHandlerPtr handler = NULL;
1141     int ret;
1142
1143     if ((cur == NULL) || (filename == NULL))
1144         return(-1);
1145
1146     xmlInitParser();
1147
1148     if (encoding != NULL) {
1149         xmlCharEncoding enc;
1150
1151         enc = xmlParseCharEncoding(encoding);
1152         if (enc != XML_CHAR_ENCODING_UTF8) {
1153             handler = xmlFindCharEncodingHandler(encoding);
1154             if (handler == NULL)
1155                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1156         }
1157         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1158     } else {
1159         htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1160
1161         /*
1162          * Fallback to HTML or ASCII when the encoding is unspecified
1163          */
1164         if (handler == NULL)
1165             handler = xmlFindCharEncodingHandler("HTML");
1166         if (handler == NULL)
1167             handler = xmlFindCharEncodingHandler("ascii");
1168     }
1169
1170     /*
1171      * save the content to a temp buffer.
1172      */
1173     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1174     if (buf == NULL) return(0);
1175
1176     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1177
1178     ret = xmlOutputBufferClose(buf);
1179     return(ret);
1180 }
1181
1182 /**
1183  * htmlSaveFileEnc:
1184  * @filename:  the filename
1185  * @cur:  the document
1186  * @encoding: the document encoding
1187  *
1188  * Dump an HTML document to a file using a given encoding
1189  * and formatting returns/spaces are added.
1190  *
1191  * returns: the number of byte written or -1 in case of failure.
1192  */
1193 int
1194 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1195     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1196 }
1197
1198 #endif /* LIBXML_OUTPUT_ENABLED */
1199
1200 #endif /* LIBXML_HTML_ENABLED */