2 // Copyright (c) 1999-2006 by Digital Mars
4 // written by Walter Bright
5 // http://www.digitalmars.com
6 // License for redistribution is by either the Artistic License
7 // in artistic.txt, or the GNU General Public License in gnu.txt.
8 // See the included readme.txt for details.
10 /* NOTE: This file has been patched from the original DMD distribution to
11 work with the GDC compiler.
13 Modified by David Friedman, September 2004
14 Modified by Thomas Kuehne, November 2004
32 //#include "../mars/mars.h"
34 extern int HtmlNamedEntity(unsigned char *p
, int length
);
/**********************************
 * Determine if beginning of tag identifier
 * or a continuation of a tag identifier.
 *
 * istagstart() answers the first question: it returns 1 when c may
 * begin a tag identifier (an alphabetic character, '_' or '!') and
 * 0 otherwise.
 *
 * Values outside 0..0xFF are rejected before isalpha() is consulted,
 * because the <ctype.h> classifiers have undefined behaviour for
 * arguments that are neither EOF nor representable as unsigned char.
 */

inline int istagstart(int c)
{
    if (c < 0 || c > 0xFF)
        return 0;               // not a single byte: cannot start a tag
    return (isalpha(c) || c == '_' || c == '!');
}
/**********************************
 * Determine if c can continue a tag identifier.
 *
 * Returns 1 when c is an alphanumeric character or '_', and 0
 * otherwise.
 *
 * Values outside 0..0xFF are rejected before isalnum() is consulted,
 * because the <ctype.h> classifiers have undefined behaviour for
 * arguments that are neither EOF nor representable as unsigned char.
 */

inline int istag(int c)
{
    if (c < 0 || c > 0xFF)
        return 0;               // not a single byte: cannot be part of a tag
    return (isalnum(c) || c == '_');
}
52 * identify DOS, Linux, Mac, Next and Unicode line endings
53 * 0 if this is no line separator
54 * >0 the length of the separator
55 * Note: input has to be UTF-8
57 static int isLineSeperator(const unsigned char* p
){
65 return (p
[1]=='\n') ? 2 : 1;
68 // Unicode (line || paragraph sep.)
69 if( p
[0]==0xE2 && p
[1]==0x80 && (p
[2]==0xA8 || p
[2]==0xA9)){
74 if( p
[0]==0xC2 && p
[1]==0x85){
81 /**********************************************
84 Html::Html(const char *sourcename
, unsigned char *base
, unsigned length
)
86 this->sourcename
= sourcename
;
95 /**********************************************
99 void Html::error(const char *format
, ...)
103 fprintf(stderr
, "%s:%d: HTML Error: ", sourcename
, linnum
);
106 va_start(ap
, format
);
107 vfprintf(stderr
, format
, ap
);
110 fprintf(stderr
, "\n");
118 /**********************************************
119 * Extract all the code from an HTML file,
120 * concatenate it all together, and store in buf.
123 void Html::extractCode(OutBuffer
*buf
)
125 //printf("Html::extractCode()\n");
126 dbuf
= buf
; // save for other routines
127 buf
->reserve(end
- p
);
131 //printf("p = %p, *p = x%x\n", p, *p);
134 #if 0 // strings are not recognized outside of tags
142 //-OLDOLDREMOVE// if (p[1] == '!' && p[2] == '-' && p[3] == '-')
143 if (p
[1] == '!' && isCommentStart())
144 { // Comments start with <!--
145 //OLDOLDREMOVE// p += 4;
148 //OLDOLDREMOVE//else if ((p[1] == '/' && istagstart(p[2])) ||
149 //OLDOLDREMOVE// istagstart(p[1]))
156 else if(p
[1] == '!' && isCDATAStart())
160 else if (p
[1] == '/' && istagstart(*skipWhite(p
+ 2)))
162 else if (istagstart(*skipWhite(p
+ 1)))
170 break; // end of file
174 { // Translate character entity into ascii for D parser
184 /* all this handled by isLineSeparator
190 // Always extract new lines, so that D lexer counts the
199 int lineSepLength
=isLineSeperator(p
);
200 if( lineSepLength
>0 ){
202 // Always extract new lines, so that the D lexer
203 // counts the lines right.
204 buf
->writeByte('\n'); // BUG: wchar
216 buf
->writeByte(0); // ending sentinel
217 //printf("D code is: '%s'\n", (char *)buf->data);
220 /***********************************************
221 * Scan to end of <> tag.
222 * Look for <code> and </code> tags to start/stop D processing.
224 * p is on opening '<' of tag; it's already verified that
225 * it's a tag by lookahead
227 * p is past closing '>' of tag
232 enum TagState
// what parsing state we're in
234 TStagstart
, // start of tag name
235 TStag
, // in a tag name
236 TSrest
, // following tag name
238 enum TagState state
= TStagstart
;
240 unsigned char *tagstart
= NULL
;
253 case '>': // found end of tag
264 if (p
[1] == '!' && isCommentStart())
265 { // Comments start with <!--
269 //OLDOLD//else if ((p[1] == '/' && istagstart(p[2])) ||
270 //OLDOLD// istagstart(p[1]))
271 else if (p
[1] == '/' && istagstart(*skipWhite(p
+ 2)))
272 { error("nested tag");
275 else if (istagstart(*skipWhite(p
+ 1)))
276 { error("nested tag");
280 //CHECKCHECK//stillneeded?
282 // Treat comments as if they were whitespace
288 error("end of file before end of tag");
289 break; // end of file
291 /* all handled by isLineSeparator
297 // Always extract new lines, so that code lexer counts the
300 state = TSrest; // end of tag
309 if (state
== TStagstart
)
315 int lineSepLength
= isLineSeperator(p
);
316 if( lineSepLength
>0 ){
318 // Always extract new lines, so that code lexer counts
320 dbuf
->writeByte('\n'); // BUG: wchar
327 case TStagstart
: // start of tag name
328 assert(istagstart(*p
));
336 { // Continuing tag name
354 // See if we parsed a <code> or </code> tag
355 if (taglen
== 4 && memicmp((const char *)tagstart
, "CODE", taglen
) == 0
356 && *(p
- 2) != '/') // ignore "<code />" (XHTML)
361 inCode
= 0; // ignore extra </code>'s
368 /***********************************************
369 * Scan to end of attribute string.
372 void Html::skipString()
389 /* all handled by isLineSeparator
395 // Always extract new lines, so that D lexer counts the
404 error("end of file before closing %c of string", tc
);
409 int lineSepLength
= isLineSeperator(p
);
410 if( lineSepLength
>0 ){
412 // Always extract new lines, so that D lexer counts
414 dbuf
->writeByte('\n'); // BUG: wchar
423 /*********************************
424 * If p points to any white space, skip it
425 * and return pointer just past it.
428 unsigned char *Html::skipWhite(unsigned char *q
)
450 /***************************************************
451 * Scan to end of comment.
452 * Comments are defined any of a number of ways.
453 * IE 5.0: <!-- followed by >
454 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
455 * Netscape: <!-- --> comments nest
456 * w3c: whitespace can appear between -- and > of comment close
459 void Html::scanComment()
461 // Most of the complexity is dealing with the case that
462 // an arbitrary amount of whitespace can appear between
463 // the -- and the > of a comment close.
466 //printf("scanComment()\n");
469 // Always extract new lines, so that D lexer counts the
475 //scangt = 1; // IE 5.0 compatibility
482 if (p
[2] == '>') // optimize for most common case
509 /* all handled by isLineSeparator
514 linnum++; // remember to count lines
515 // Always extract new lines, so that D lexer counts the
523 error("end of file before closing --> of comment");
528 int lineSepLength
= isLineSeperator(p
);
529 if( lineSepLength
>0 ){
530 linnum
++; // remember to count lines
531 // Always extract new lines, so that D lexer counts
533 dbuf
->writeByte('\n'); // BUG: wchar
537 scangt
= 0; // it's not -->
542 //printf("*p = '%c'\n", *p);
545 /********************************************
546 * Determine if we are at the start of a comment.
548 * p is on the opening '<'
550 * 0 if not start of a comment
551 * 1 if start of a comment, p is adjusted to point past --
554 int Html::isCommentStart()
560 else if (result
== 1)
562 assert(p
[-2] == '-' && p
[-1] == '-');
571 if (p
[0] == '<' && p
[1] == '!')
573 for (s
= p
+ 2; 1; s
++)
582 // skip white space, even though spec says no
583 // white space is allowed
603 int Html::isCDATAStart()
605 const char * CDATA_START_MARKER
= "<![CDATA[";
606 size_t len
= strlen(CDATA_START_MARKER
);
608 if (strncmp((char*)p
, CDATA_START_MARKER
, len
) == 0)
619 void Html::scanCDATA()
621 while(*p
&& *p
!= 0x1A)
623 int lineSepLength
= isLineSeperator(p
);
626 /* Always extract new lines, so that D lexer counts the lines
630 dbuf
->writeUTF8('\n');
634 else if (p
[0] == ']' && p
[1] == ']' && p
[2] == '>')
636 /* end of CDATA section */
642 /* this CDATA section contains D code */
650 /********************************************
651 * Convert an HTML character entity into a character.
653 * &name; named entity
660 int Html::charEntity()
664 unsigned char *pstart
= p
;
666 //printf("Html::charEntity('%c')\n", *p);
670 if (p
[1] == 'x' || p
[1] == 'X')
686 error("end of file before end of character entity");
691 case '<': // tag start
692 // Termination is assumed
696 // Termination is explicit
700 case '0': case '1': case '2': case '3': case '4':
701 case '5': case '6': case '7': case '8': case '9':
705 case 'a': case 'b': case 'c':
706 case 'd': case 'e': case 'f':
712 case 'A': case 'B': case 'C':
713 case 'D': case 'E': case 'F':
726 error("character entity out of range");
733 error("invalid numeric character reference");
741 // It's a named entity; gather all characters until ;
742 unsigned char *idstart
= p
+ 1;
751 error("end of file before end of character entity");
756 case '<': // tag start
757 // Termination is assumed
758 c
= HtmlNamedEntity(idstart
, p
- idstart
);
764 // Termination is explicit
765 c
= HtmlNamedEntity(idstart
, p
- idstart
);
778 // Kludge to convert non-breaking space to ascii space
783 //printf("Lignore\n");