dmd/html.c

   1
   2 // Copyright (c) 1999-2006 by Digital Mars
   3 // All Rights Reserved
   4 // written by Walter Bright
   5 // http://www.digitalmars.com
   6 // License for redistribution is by either the Artistic License
   7 // in artistic.txt, or the GNU General Public License in gnu.txt.
   8 // See the included readme.txt for details.
   9
  10 /* NOTE: This file has been patched from the original DMD distribution to
  11    work with the GDC compiler.
  12
  13    Modified by David Friedman, September 2004
  14    Modified by Thomas Kuehne, November 2004
  15 */
  16
  17 /* HTML parser
  18  */
  19
  20 #include <stdio.h>
  21 #include <string.h>
  22 #include <ctype.h>
  23 #include <stdarg.h>
  24 #include <errno.h>
  25 //#include <wchar.h>
  26
  27 #include "mars.h"
  28 #include "html.h"
  29
  30 #include <assert.h>
  31 #include "root.h"
  32 //#include "../mars/mars.h"
  33
  34 extern int HtmlNamedEntity(unsigned char *p, int length);
  35
  36 /**********************************
  37  * Determine if beginning of tag identifier
  38  * or a continuation of a tag identifier.
  39  */
  40
  41 inline int istagstart(int c)
  42 {
  43     return (isalpha(c) || c == '_' || c == '!');
  44 }
  45
  46 inline int istag(int c)
  47 {
  48     return (isalnum(c) || c == '_');
  49 }
  50
  51 /**
  52  * identify DOS, Linux, Mac, Next and Unicode line endings
  53  * 0 if this is no line seperator
  54  * >0 the length of the seperator
  55  * Note: input has to be UTF-8
  56  */
  57 static int isLineSeperator(const unsigned char* p){
  58         // Linux
  59         if( p[0]=='\n'){
  60                 return 1;
  61         }
  62
  63         // Mac & Dos
  64         if( p[0]=='\r'){
  65                 return (p[1]=='\n') ? 2 : 1;
  66         }
  67
  68         // Unicode (line || paragarph sep.)
  69         if( p[0]==0xE2 && p[1]==0x80 && (p[2]==0xA8 || p[2]==0xA9)){
  70                 return 3;
  71         }
  72
  73         // Next
  74         if( p[0]==0xC2 && p[1]==0x85){
  75                 return 2;
  76         }
  77
  78         return 0;
  79 }
  80
  81 /**********************************************
  82  */
  83
  84 Html::Html(const char *sourcename, unsigned char *base, unsigned length)
  85 {
  86     this->sourcename = sourcename;
  87     this->base = base;
  88     p = base;
  89     end = base + length;
  90     linnum = 1;
  91     dbuf = NULL;
  92     inCode = 0;
  93 }
  94
  95 /**********************************************
  96  * Print error & quit.
  97  */
  98
  99 void Html::error(const char *format, ...)
 100 {
 101     if (!global.gag)
 102     {
 103         fprintf(stderr, "%s:%d: HTML Error: ", sourcename, linnum);
 104
 105         va_list ap;
 106         va_start(ap, format);
 107         vfprintf(stderr, format, ap);
 108         va_end(ap);
 109
 110         fprintf(stderr, "\n");
 111         fflush(stderr);
 112     }
 113
 114     global.errors++;
 115     fatal();
 116 }
 117
 118 /**********************************************
 119  * Extract all the code from an HTML file,
 120  * concatenate it all together, and store in buf.
 121  */
 122
 123 void Html::extractCode(OutBuffer *buf)
 124 {
 125     //printf("Html::extractCode()\n");
 126     dbuf = buf;                 // save for other routines
 127     buf->reserve(end - p);
 128     inCode = 0;
 129     while (1)
 130     {
 131         //printf("p = %p, *p = x%x\n", p, *p);
 132         switch (*p)
 133         {
 134 #if 0 // strings are not recognized outside of tags
 135             case '"':
 136             case '\'':
 137                 skipString();
 138                 continue;
 139             */
 140 #endif
 141             case '<':
 142                 //-OLDOLDREMOVE//               if (p[1] == '!' && p[2] == '-' && p[3] == '-')
 143                 if (p[1] == '!' && isCommentStart())
 144                 {   // Comments start with <!--
 145                     //OLDOLDREMOVE//                p += 4;
 146                     scanComment();
 147                 }
 148                 //OLDOLDREMOVE//else if ((p[1] == '/' && istagstart(p[2])) ||
 149                 //OLDOLDREMOVE//         istagstart(p[1]))
 150                 /*OLDOLDMYCHANGES
 151                 {
 152                     skipTag();
 153                 }
 154                 else
 155                     p++;*/
 156                 else if(p[1] == '!' && isCDATAStart())
 157                 {
 158                     scanCDATA();
 159                 }
 160                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
 161                     skipTag();
 162                 else if (istagstart(*skipWhite(p + 1)))
 163                     skipTag();
 164                 else
 165                     goto Ldefault;
 166                 continue;
 167
 168             case 0:
 169             case 0x1a:
 170                 break;          // end of file
 171
 172             case '&':
 173                 if (inCode)
 174                 {   // Translate character entity into ascii for D parser
 175                     int c;
 176
 177                     c = charEntity();
 178                     buf->writeUTF8(c);
 179                 }
 180                 else
 181                     p++;
 182                 continue;
 183
 184                 /* all this handled by isLineSeparator
 185             case '\r':
 186                 if (p[1] == '\n')
 187                     goto Ldefault;
 188             case '\n':
 189                 linnum++;
 190                 // Always extract new lines, so that D lexer counts the
 191                 // lines right.
 192                 buf->writeByte(*p);
 193                 p++;
 194                 continue;
 195                 */
 196
 197             default:
 198             Ldefault:
 199                 int lineSepLength=isLineSeperator(p);
 200                 if( lineSepLength>0 ){
 201                         linnum++;
 202                         // Always extract new lines, so that the D lexer
 203                         // counts the lines right.
 204                         buf->writeByte('\n');   // BUG: wchar
 205                         p+=lineSepLength;
 206                         continue;
 207                 }
 208
 209                 if (inCode)
 210                     buf->writeByte(*p);
 211                 p++;
 212                 continue;
 213         }
 214         break;
 215     }
 216     buf->writeByte(0);                          // ending sentinel
 217     //printf("D code is: '%s'\n", (char *)buf->data);
 218 }
 219
 220 /***********************************************
 221  * Scan to end of <> tag.
 222  * Look for <code> and </code> tags to start/stop D processing.
 223  * Input:
 224  *      p is on opening '<' of tag; it's already verified that
 225  *      it's a tag by lookahead
 226  * Output:
 227  *      p is past closing '>' of tag
 228  */
 229
 230 void Html::skipTag()
 231 {
 232     enum TagState       // what parsing state we're in
 233     {
 234         TStagstart,     // start of tag name
 235         TStag,          // in a tag name
 236         TSrest,         // following tag name
 237     };
 238     enum TagState state = TStagstart;
 239     int inot;
 240     unsigned char *tagstart = NULL;
 241     int taglen = 0;
 242
 243     p++;
 244     inot = 0;
 245     if (*p == '/')
 246     {   inot = 1;
 247         p++;
 248     }
 249     while (1)
 250     {
 251         switch (*p)
 252         {
 253             case '>':           // found end of tag
 254                 p++;
 255                 break;
 256
 257             case '"':
 258             case '\'':
 259                 state = TSrest;
 260                 skipString();
 261                 continue;
 262
 263             case '<':
 264                 if (p[1] == '!' && isCommentStart())
 265                 {   // Comments start with <!--
 266                     //OLDOLD//p += 4;
 267                     scanComment();
 268                 }
 269                 //OLDOLD//else if ((p[1] == '/' && istagstart(p[2])) ||
 270                 //OLDOLD//       istagstart(p[1]))
 271                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
 272                 {   error("nested tag");
 273                     skipTag();
 274                 }
 275                 else if (istagstart(*skipWhite(p + 1)))
 276                 {   error("nested tag");
 277                     skipTag();
 278                 }
 279                 else
 280                     //CHECKCHECK//stillneeded?
 281                     p++;
 282                 // Treat comments as if they were whitespace
 283                 state = TSrest;
 284                 continue;
 285
 286             case 0:
 287             case 0x1a:
 288                 error("end of file before end of tag");
 289                 break;          // end of file
 290
 291                 /* all handled by isLineSeparator
 292             case '\r':
 293                 if (p[1] == '\n')
 294                     goto Ldefault;
 295             case '\n':
 296                 linnum++;
 297                 // Always extract new lines, so that code lexer counts the
 298                 // lines right.
 299                 dbuf->writeByte(*p);
 300                 state = TSrest;                 // end of tag
 301                 p++;
 302                 continue;
 303                 */
 304
 305             case ' ':
 306             case '\t':
 307             case '\f':
 308             case '\v':
 309                 if (state == TStagstart)
 310                 {   p++;
 311                     continue;
 312                 }
 313             default:
 314                 //    Ldefault:
 315                 int lineSepLength = isLineSeperator(p);
 316                 if( lineSepLength>0 ){
 317                         linnum++;
 318                         // Always extract new lines, so that code lexer counts
 319                         // the lines right.
 320                         dbuf->writeByte('\n');  // BUG: wchar
 321                         state = TSrest;
 322                         p+=lineSepLength;
 323                         continue;
 324                 }
 325                 switch (state)
 326                 {
 327                     case TStagstart:            // start of tag name
 328                         assert(istagstart(*p));
 329                         state = TStag;
 330                         tagstart = p;
 331                         taglen = 1;
 332                         break;
 333
 334                     case TStag:
 335                         if (istag(*p))
 336                         {   // Continuing tag name
 337                             taglen++;
 338                         }
 339                         else
 340                         {   // End of tag name
 341                             state = TSrest;
 342                         }
 343                         break;
 344
 345                     case TSrest:
 346                         break;
 347                 }
 348                 p++;
 349                 continue;
 350         }
 351         break;
 352     }
 353
 354     // See if we parsed a <code> or </code> tag
 355     if (taglen == 4 && memicmp((const char *)tagstart, "CODE", taglen) == 0
 356         && *(p - 2) != '/') // ignore "<code />" (XHTML)
 357     {
 358         if (inot)
 359         {   inCode--;
 360             if (inCode < 0)
 361                 inCode = 0;             // ignore extra </code>'s
 362         }
 363         else
 364             inCode++;
 365     }
 366 }
 367
 368 /***********************************************
 369  * Scan to end of attribute string.
 370  */
 371
 372 void Html::skipString()
 373 {
 374     int tc = *p;
 375
 376     while (1)
 377     {
 378         p++;
 379         switch (*p)
 380         {
 381             case '"':
 382             case '\'':
 383                 if (*p == tc)
 384                 {   p++;
 385                     break;
 386                 }
 387                 continue;
 388
 389                 /* all handled by isLineSeparator
 390             case '\r':
 391                 if (p[1] == '\n')
 392                     goto Ldefault;
 393             case '\n':
 394                 linnum++;
 395                 // Always extract new lines, so that D lexer counts the
 396                 // lines right.
 397                 dbuf->writeByte(*p);
 398                 continue;
 399                 */
 400
 401             case 0:
 402             case 0x1a:
 403             Leof:
 404                 error("end of file before closing %c of string", tc);
 405                 break;
 406
 407             default:
 408                 //          Ldefault:
 409                 int lineSepLength = isLineSeperator(p);
 410                 if( lineSepLength>0 ){
 411                         linnum++;
 412                         // Always extract new lines, so that D lexer counts
 413                         // the lines right.
 414                         dbuf->writeByte('\n');   // BUG: wchar
 415                         continue;
 416                 }
 417                 continue;
 418         }
 419         break;
 420     }
 421 }
 422
 423 /*********************************
 424  * If p points to any white space, skip it
 425  * and return pointer just past it.
 426  */
 427
 428 unsigned char *Html::skipWhite(unsigned char *q)
 429 {
 430     for (; 1; q++)
 431     {
 432         switch (*q)
 433         {
 434             case ' ':
 435             case '\t':
 436             case '\f':
 437             case '\v':
 438             case '\r':
 439             case '\n':
 440                 continue;
 441
 442             default:
 443                 break;
 444         }
 445         break;
 446     }
 447     return q;
 448 }
 449
 450 /***************************************************
 451  * Scan to end of comment.
 452  * Comments are defined any of a number of ways.
 453  * IE 5.0: <!-- followed by >
 454  * "HTML The Definitive Guide": <!-- text with at least one space in it -->
 455  * Netscape: <!-- --> comments nest
 456  * w3c: whitespace can appear between -- and > of comment close
 457  */
 458
 459 void Html::scanComment()
 460 {
 461     // Most of the complexity is dealing with the case that
 462     // an arbitrary amount of whitespace can appear between
 463     // the -- and the > of a comment close.
 464     int scangt = 0;
 465
 466     //printf("scanComment()\n");
 467     if (*p == '\n')
 468     {   linnum++;
 469         // Always extract new lines, so that D lexer counts the
 470         // lines right.
 471         dbuf->writeByte(*p);
 472     }
 473     while (1)
 474     {
 475         //scangt = 1;                   // IE 5.0 compatibility
 476         p++;
 477         switch (*p)
 478         {
 479             case '-':
 480                 if (p[1] == '-')
 481                 {
 482                     if (p[2] == '>')    // optimize for most common case
 483                     {
 484                         p += 3;
 485                         break;
 486                     }
 487                     p++;
 488                     scangt = 1;
 489                 }
 490                 else
 491                     scangt = 0;
 492                 continue;
 493
 494             case '>':
 495                 if (scangt)
 496                 {   // found -->
 497                     p++;
 498                     break;
 499                 }
 500                 continue;
 501
 502             case ' ':
 503             case '\t':
 504             case '\f':
 505             case '\v':
 506                 // skip white space
 507                 continue;
 508
 509                 /* all handled by isLineSeparator
 510             case '\r':
 511                 if (p[1] == '\n')
 512                     goto Ldefault;
 513             case '\n':
 514                 linnum++;               // remember to count lines
 515                 // Always extract new lines, so that D lexer counts the
 516                 // lines right.
 517                 dbuf->writeByte(*p);
 518                 continue;
 519                 */
 520
 521             case 0:
 522             case 0x1a:
 523                 error("end of file before closing --> of comment");
 524                 break;
 525
 526             default:
 527                 //    Ldefault:
 528                 int lineSepLength = isLineSeperator(p);
 529                 if( lineSepLength>0 ){
 530                         linnum++;       // remember to count lines
 531                         // Always extract new lines, so that D lexer counts
 532                         // the lines right.
 533                         dbuf->writeByte('\n');  // BUG: wchar
 534                         p+=lineSepLength-1;
 535                         continue;
 536                 }
 537                 scangt = 0;             // it's not -->
 538                 continue;
 539         }
 540         break;
 541     }
 542     //printf("*p = '%c'\n", *p);
 543 }
 544
 545  /********************************************
 546  * Determine if we are at the start of a comment.
 547  * Input:
 548  *      p is on the opening '<'
 549  * Returns:
 550  *      0 if not start of a comment
 551  *      1 if start of a comment, p is adjusted to point past --
 552  */
 553
 554 int Html::isCommentStart()
 555 #ifdef __DMC__
 556     __out(result)
 557     {
 558         if (result == 0)
 559             ;
 560         else if (result == 1)
 561         {
 562             assert(p[-2] == '-' && p[-1] == '-');
 563         }
 564         else
 565             assert(0);
 566     }
 567     __body
 568 #endif /* __DMC__ */
 569     {   unsigned char *s;
 570
 571         if (p[0] == '<' && p[1] == '!')
 572         {
 573             for (s = p + 2; 1; s++)
 574             {
 575                 switch (*s)
 576                 {
 577                     case ' ':
 578                     case '\t':
 579                     case '\r':
 580                     case '\f':
 581                     case '\v':
 582                         // skip white space, even though spec says no
 583                         // white space is allowed
 584                         continue;
 585
 586                     case '-':
 587                         if (s[1] == '-')
 588                         {
 589                             p = s + 2;
 590                             return 1;
 591                         }
 592                         goto No;
 593
 594                     default:
 595                         goto No;
 596                 }
 597             }
 598         }
 599     No:
 600         return 0;
 601     }
 602
 603 int Html::isCDATAStart()
 604 {
 605     const char * CDATA_START_MARKER = "<![CDATA[";
 606     size_t len = strlen(CDATA_START_MARKER);
 607
 608     if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
 609     {
 610         p += len;
 611         return 1;
 612     }
 613     else
 614     {
 615         return 0;
 616     }
 617 }
 618
 619 void Html::scanCDATA()
 620 {
 621     while(*p && *p != 0x1A)
 622     {
 623         int lineSepLength = isLineSeperator(p);
 624         if (lineSepLength>0)
 625         {
 626             /* Always extract new lines, so that D lexer counts the lines
 627              * right.
 628              */
 629             linnum++;
 630             dbuf->writeUTF8('\n');
 631             p += lineSepLength;
 632             continue;
 633         }
 634         else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
 635         {
 636             /* end of CDATA section */
 637             p += 3;
 638             return;
 639         }
 640         else if (inCode)
 641         {
 642             /* this CDATA section contains D code */
 643             dbuf->writeByte(*p);
 644         }
 645
 646         p++;
 647     }
 648 }
 649
 650 /********************************************
 651  * Convert an HTML character entity into a character.
 652  * Forms are:
 653  *      &name;          named entity
 654  *      &#ddd;          decimal
 655  *      &#xhhhh;        hex
 656  * Input:
 657  *      p is on the &
 658  */
 659
 660 int Html::charEntity()
 661 {   int c = 0;
 662     int v;
 663     int hex;
 664     unsigned char *pstart = p;
 665
 666     //printf("Html::charEntity('%c')\n", *p);
 667     if (p[1] == '#')
 668     {
 669         p++;
 670         if (p[1] == 'x' || p[1] == 'X')
 671         {   p++;
 672             hex = 1;
 673         }
 674         else
 675             hex = 0;
 676
 677         if (p[1] == ';')
 678             goto Linvalid;
 679         while (1)
 680         {
 681             p++;
 682             switch (*p)
 683             {
 684                 case 0:
 685                 case 0x1a:
 686                     error("end of file before end of character entity");
 687                     goto Lignore;
 688
 689                 case '\n':
 690                 case '\r':
 691                 case '<':       // tag start
 692                     // Termination is assumed
 693                     break;
 694
 695                 case ';':
 696                     // Termination is explicit
 697                     p++;
 698                     break;
 699
 700                 case '0': case '1': case '2': case '3': case '4':
 701                 case '5': case '6': case '7': case '8': case '9':
 702                     v = *p - '0';
 703                     goto Lvalue;
 704
 705                 case 'a': case 'b': case 'c':
 706                 case 'd': case 'e': case 'f':
 707                     if (!hex)
 708                         goto Linvalid;
 709                     v = (*p - 'a') + 10;
 710                     goto Lvalue;
 711
 712                 case 'A': case 'B': case 'C':
 713                 case 'D': case 'E': case 'F':
 714                     if (!hex)
 715                         goto Linvalid;
 716                     v = (*p - 'A') + 10;
 717                     goto Lvalue;
 718
 719                 Lvalue:
 720                     if (hex)
 721                         c = (c << 4) + v;
 722                     else
 723                         c = (c * 10) + v;
 724                     if (c > 0x10FFFF)
 725                     {
 726                         error("character entity out of range");
 727                         goto Lignore;
 728                     }
 729                     continue;
 730
 731                 default:
 732                 Linvalid:
 733                     error("invalid numeric character reference");
 734                     goto Lignore;
 735             }
 736             break;
 737         }
 738     }
 739     else
 740     {
 741         // It's a named entity; gather all characters until ;
 742         unsigned char *idstart = p + 1;
 743
 744         while (1)
 745         {
 746             p++;
 747             switch (*p)
 748             {
 749                 case 0:
 750                 case 0x1a:
 751                     error("end of file before end of character entity");
 752                     break;
 753
 754                 case '\n':
 755                 case '\r':
 756                 case '<':       // tag start
 757                     // Termination is assumed
 758                     c = HtmlNamedEntity(idstart, p - idstart);
 759                     if (c == -1)
 760                         goto Lignore;
 761                     break;
 762
 763                 case ';':
 764                     // Termination is explicit
 765                     c = HtmlNamedEntity(idstart, p - idstart);
 766                     if (c == -1)
 767                         goto Lignore;
 768                     p++;
 769                     break;
 770
 771                 default:
 772                     continue;
 773             }
 774             break;
 775         }
 776     }
 777
 778     // Kludge to convert non-breaking space to ascii space
 779     if (c == 160)
 780         c = 32;
 781     return c;
 782 Lignore:
 783     //printf("Lignore\n");
 784     p = pstart + 1;
 785     return '&';
 786 }
 787