Allow returning something of type void in a function that returns void
[delight/core.git] / dmd2 / html.c
blob38a49317728b39ada45297693b1361d5b62594c9
// Copyright (c) 1999-2006 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.
/* NOTE: This file has been patched from the original DMD distribution to
   work with the GDC compiler.

   Modified by David Friedman, September 2004
   Modified by Thomas Kuehne, November 2004
*/

/* HTML parser
 */
20 #include <stdio.h>
21 #include <string.h>
22 #include <ctype.h>
23 #include <stdarg.h>
24 #include <errno.h>
25 //#include <wchar.h>
27 #include "mars.h"
28 #include "html.h"
30 #include <assert.h>
31 #include "root.h"
32 //#include "../mars/mars.h"
34 extern int HtmlNamedEntity(unsigned char *p, int length);
/**********************************
 * Determine if c can begin a tag identifier:
 * a letter, '_' or '!'.
 * (istag() below tests for a continuation character.)
 */

inline int istagstart(int c)
{
    // The <ctype.h> classifiers are only defined for values representable
    // as unsigned char (or EOF), so narrow first to stay well-defined even
    // if a caller hands in a negative int.
    return (isalpha(static_cast<unsigned char>(c)) || c == '_' || c == '!');
}
/**********************************
 * Determine if c can continue a tag identifier:
 * a letter, a digit or '_'.
 */

inline int istag(int c)
{
    // Narrow to unsigned char first: isalnum() is undefined for other
    // negative values.
    return (isalnum(static_cast<unsigned char>(c)) || c == '_');
}
/**
 * Identify DOS, Linux, Mac, NeXT and Unicode line endings.
 *
 * Returns:
 *	0  if p is not on a line separator
 *	>0 the length in bytes of the separator p is on
 *
 * Note: input has to be UTF-8. Up to three bytes are inspected, but a
 * following byte is only read after the preceding one matched, so a
 * zero-terminated buffer is never read past its terminator.
 */
static int isLineSeperator(const unsigned char* p){
    // Linux
    if( p[0]=='\n'){
        return 1;
    }

    // Mac & Dos
    if( p[0]=='\r'){
        return (p[1]=='\n') ? 2 : 1;
    }

    // Unicode (line || paragraph sep.): U+2028 / U+2029
    if( p[0]==0xE2 && p[1]==0x80 && (p[2]==0xA8 || p[2]==0xA9)){
        return 3;
    }

    // NeXT: U+0085 (NEL)
    if( p[0]==0xC2 && p[1]==0x85){
        return 2;
    }

    return 0;
}
81 /**********************************************
84 Html::Html(const char *sourcename, unsigned char *base, unsigned length)
86 this->sourcename = sourcename;
87 this->base = base;
88 p = base;
89 end = base + length;
90 linnum = 1;
91 dbuf = NULL;
92 inCode = 0;
95 /**********************************************
96 * Print error & quit.
99 void Html::error(const char *format, ...)
101 if (!global.gag)
103 fprintf(stderr, "%s:%d: HTML Error: ", sourcename, linnum);
105 va_list ap;
106 va_start(ap, format);
107 vfprintf(stderr, format, ap);
108 va_end(ap);
110 fprintf(stderr, "\n");
111 fflush(stderr);
114 global.errors++;
115 fatal();
118 /**********************************************
119 * Extract all the code from an HTML file,
120 * concatenate it all together, and store in buf.
123 void Html::extractCode(OutBuffer *buf)
125 //printf("Html::extractCode()\n");
126 dbuf = buf; // save for other routines
127 buf->reserve(end - p);
128 inCode = 0;
129 while (1)
131 //printf("p = %p, *p = x%x\n", p, *p);
132 switch (*p)
134 #if 0 // strings are not recognized outside of tags
135 case '"':
136 case '\'':
137 skipString();
138 continue;
140 #endif
141 case '<':
142 //-OLDOLDREMOVE// if (p[1] == '!' && p[2] == '-' && p[3] == '-')
143 if (p[1] == '!' && isCommentStart())
144 { // Comments start with <!--
145 //OLDOLDREMOVE// p += 4;
146 scanComment();
148 //OLDOLDREMOVE//else if ((p[1] == '/' && istagstart(p[2])) ||
149 //OLDOLDREMOVE// istagstart(p[1]))
150 /*OLDOLDMYCHANGES
152 skipTag();
154 else
155 p++;*/
156 else if(p[1] == '!' && isCDATAStart())
158 scanCDATA();
160 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
161 skipTag();
162 else if (istagstart(*skipWhite(p + 1)))
163 skipTag();
164 else
165 goto Ldefault;
166 continue;
168 case 0:
169 case 0x1a:
170 break; // end of file
172 case '&':
173 if (inCode)
174 { // Translate character entity into ascii for D parser
175 int c;
177 c = charEntity();
178 buf->writeUTF8(c);
180 else
181 p++;
182 continue;
184 /* all this handled by isLineSeparator
185 case '\r':
186 if (p[1] == '\n')
187 goto Ldefault;
188 case '\n':
189 linnum++;
190 // Always extract new lines, so that D lexer counts the
191 // lines right.
192 buf->writeByte(*p);
193 p++;
194 continue;
197 default:
198 Ldefault:
199 int lineSepLength=isLineSeperator(p);
200 if( lineSepLength>0 ){
201 linnum++;
202 // Always extract new lines, so that the D lexer
203 // counts the lines right.
204 buf->writeByte('\n'); // BUG: wchar
205 p+=lineSepLength;
206 continue;
209 if (inCode)
210 buf->writeByte(*p);
211 p++;
212 continue;
214 break;
216 buf->writeByte(0); // ending sentinel
217 //printf("D code is: '%s'\n", (char *)buf->data);
220 /***********************************************
221 * Scan to end of <> tag.
222 * Look for <code> and </code> tags to start/stop D processing.
223 * Input:
224 * p is on opening '<' of tag; it's already verified that
225 * it's a tag by lookahead
226 * Output:
227 * p is past closing '>' of tag
230 void Html::skipTag()
232 enum TagState // what parsing state we're in
234 TStagstart, // start of tag name
235 TStag, // in a tag name
236 TSrest, // following tag name
238 enum TagState state = TStagstart;
239 int inot;
240 unsigned char *tagstart = NULL;
241 int taglen = 0;
243 p++;
244 inot = 0;
245 if (*p == '/')
246 { inot = 1;
247 p++;
249 while (1)
251 switch (*p)
253 case '>': // found end of tag
254 p++;
255 break;
257 case '"':
258 case '\'':
259 state = TSrest;
260 skipString();
261 continue;
263 case '<':
264 if (p[1] == '!' && isCommentStart())
265 { // Comments start with <!--
266 //OLDOLD//p += 4;
267 scanComment();
269 //OLDOLD//else if ((p[1] == '/' && istagstart(p[2])) ||
270 //OLDOLD// istagstart(p[1]))
271 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
272 { error("nested tag");
273 skipTag();
275 else if (istagstart(*skipWhite(p + 1)))
276 { error("nested tag");
277 skipTag();
279 else
280 //CHECKCHECK//stillneeded?
281 p++;
282 // Treat comments as if they were whitespace
283 state = TSrest;
284 continue;
286 case 0:
287 case 0x1a:
288 error("end of file before end of tag");
289 break; // end of file
291 /* all handled by isLineSeparator
292 case '\r':
293 if (p[1] == '\n')
294 goto Ldefault;
295 case '\n':
296 linnum++;
297 // Always extract new lines, so that code lexer counts the
298 // lines right.
299 dbuf->writeByte(*p);
300 state = TSrest; // end of tag
301 p++;
302 continue;
305 case ' ':
306 case '\t':
307 case '\f':
308 case '\v':
309 if (state == TStagstart)
310 { p++;
311 continue;
313 default:
314 // Ldefault:
315 int lineSepLength = isLineSeperator(p);
316 if( lineSepLength>0 ){
317 linnum++;
318 // Always extract new lines, so that code lexer counts
319 // the lines right.
320 dbuf->writeByte('\n'); // BUG: wchar
321 state = TSrest;
322 p+=lineSepLength;
323 continue;
325 switch (state)
327 case TStagstart: // start of tag name
328 assert(istagstart(*p));
329 state = TStag;
330 tagstart = p;
331 taglen = 1;
332 break;
334 case TStag:
335 if (istag(*p))
336 { // Continuing tag name
337 taglen++;
339 else
340 { // End of tag name
341 state = TSrest;
343 break;
345 case TSrest:
346 break;
348 p++;
349 continue;
351 break;
354 // See if we parsed a <code> or </code> tag
355 if (taglen == 4 && memicmp((const char *)tagstart, "CODE", taglen) == 0
356 && *(p - 2) != '/') // ignore "<code />" (XHTML)
358 if (inot)
359 { inCode--;
360 if (inCode < 0)
361 inCode = 0; // ignore extra </code>'s
363 else
364 inCode++;
368 /***********************************************
369 * Scan to end of attribute string.
372 void Html::skipString()
374 int tc = *p;
376 while (1)
378 p++;
379 switch (*p)
381 case '"':
382 case '\'':
383 if (*p == tc)
384 { p++;
385 break;
387 continue;
389 /* all handled by isLineSeparator
390 case '\r':
391 if (p[1] == '\n')
392 goto Ldefault;
393 case '\n':
394 linnum++;
395 // Always extract new lines, so that D lexer counts the
396 // lines right.
397 dbuf->writeByte(*p);
398 continue;
401 case 0:
402 case 0x1a:
403 Leof:
404 error("end of file before closing %c of string", tc);
405 break;
407 default:
408 // Ldefault:
409 int lineSepLength = isLineSeperator(p);
410 if( lineSepLength>0 ){
411 linnum++;
412 // Always extract new lines, so that D lexer counts
413 // the lines right.
414 dbuf->writeByte('\n'); // BUG: wchar
415 continue;
417 continue;
419 break;
423 /*********************************
424 * If p points to any white space, skip it
425 * and return pointer just past it.
428 unsigned char *Html::skipWhite(unsigned char *q)
430 for (; 1; q++)
432 switch (*q)
434 case ' ':
435 case '\t':
436 case '\f':
437 case '\v':
438 case '\r':
439 case '\n':
440 continue;
442 default:
443 break;
445 break;
447 return q;
450 /***************************************************
451 * Scan to end of comment.
452 * Comments are defined any of a number of ways.
453 * IE 5.0: <!-- followed by >
454 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
455 * Netscape: <!-- --> comments nest
456 * w3c: whitespace can appear between -- and > of comment close
459 void Html::scanComment()
461 // Most of the complexity is dealing with the case that
462 // an arbitrary amount of whitespace can appear between
463 // the -- and the > of a comment close.
464 int scangt = 0;
466 //printf("scanComment()\n");
467 if (*p == '\n')
468 { linnum++;
469 // Always extract new lines, so that D lexer counts the
470 // lines right.
471 dbuf->writeByte(*p);
473 while (1)
475 //scangt = 1; // IE 5.0 compatibility
476 p++;
477 switch (*p)
479 case '-':
480 if (p[1] == '-')
482 if (p[2] == '>') // optimize for most common case
484 p += 3;
485 break;
487 p++;
488 scangt = 1;
490 else
491 scangt = 0;
492 continue;
494 case '>':
495 if (scangt)
496 { // found -->
497 p++;
498 break;
500 continue;
502 case ' ':
503 case '\t':
504 case '\f':
505 case '\v':
506 // skip white space
507 continue;
509 /* all handled by isLineSeparator
510 case '\r':
511 if (p[1] == '\n')
512 goto Ldefault;
513 case '\n':
514 linnum++; // remember to count lines
515 // Always extract new lines, so that D lexer counts the
516 // lines right.
517 dbuf->writeByte(*p);
518 continue;
521 case 0:
522 case 0x1a:
523 error("end of file before closing --> of comment");
524 break;
526 default:
527 // Ldefault:
528 int lineSepLength = isLineSeperator(p);
529 if( lineSepLength>0 ){
530 linnum++; // remember to count lines
531 // Always extract new lines, so that D lexer counts
532 // the lines right.
533 dbuf->writeByte('\n'); // BUG: wchar
534 p+=lineSepLength-1;
535 continue;
537 scangt = 0; // it's not -->
538 continue;
540 break;
542 //printf("*p = '%c'\n", *p);
545 /********************************************
546 * Determine if we are at the start of a comment.
547 * Input:
548 * p is on the opening '<'
549 * Returns:
550 * 0 if not start of a comment
551 * 1 if start of a comment, p is adjusted to point past --
554 int Html::isCommentStart()
555 #ifdef __DMC__
556 __out(result)
558 if (result == 0)
560 else if (result == 1)
562 assert(p[-2] == '-' && p[-1] == '-');
564 else
565 assert(0);
567 __body
568 #endif /* __DMC__ */
569 { unsigned char *s;
571 if (p[0] == '<' && p[1] == '!')
573 for (s = p + 2; 1; s++)
575 switch (*s)
577 case ' ':
578 case '\t':
579 case '\r':
580 case '\f':
581 case '\v':
582 // skip white space, even though spec says no
583 // white space is allowed
584 continue;
586 case '-':
587 if (s[1] == '-')
589 p = s + 2;
590 return 1;
592 goto No;
594 default:
595 goto No;
600 return 0;
603 int Html::isCDATAStart()
605 const char * CDATA_START_MARKER = "<![CDATA[";
606 size_t len = strlen(CDATA_START_MARKER);
608 if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
610 p += len;
611 return 1;
613 else
615 return 0;
619 void Html::scanCDATA()
621 while(*p && *p != 0x1A)
623 int lineSepLength = isLineSeperator(p);
624 if (lineSepLength>0)
626 /* Always extract new lines, so that D lexer counts the lines
627 * right.
629 linnum++;
630 dbuf->writeUTF8('\n');
631 p += lineSepLength;
632 continue;
634 else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
636 /* end of CDATA section */
637 p += 3;
638 return;
640 else if (inCode)
642 /* this CDATA section contains D code */
643 dbuf->writeByte(*p);
646 p++;
650 /********************************************
651 * Convert an HTML character entity into a character.
652 * Forms are:
653 * &name; named entity
654 * &#ddd; decimal
655 * &#xhhhh; hex
656 * Input:
657 * p is on the &
660 int Html::charEntity()
661 { int c = 0;
662 int v;
663 int hex;
664 unsigned char *pstart = p;
666 //printf("Html::charEntity('%c')\n", *p);
667 if (p[1] == '#')
669 p++;
670 if (p[1] == 'x' || p[1] == 'X')
671 { p++;
672 hex = 1;
674 else
675 hex = 0;
677 if (p[1] == ';')
678 goto Linvalid;
679 while (1)
681 p++;
682 switch (*p)
684 case 0:
685 case 0x1a:
686 error("end of file before end of character entity");
687 goto Lignore;
689 case '\n':
690 case '\r':
691 case '<': // tag start
692 // Termination is assumed
693 break;
695 case ';':
696 // Termination is explicit
697 p++;
698 break;
700 case '0': case '1': case '2': case '3': case '4':
701 case '5': case '6': case '7': case '8': case '9':
702 v = *p - '0';
703 goto Lvalue;
705 case 'a': case 'b': case 'c':
706 case 'd': case 'e': case 'f':
707 if (!hex)
708 goto Linvalid;
709 v = (*p - 'a') + 10;
710 goto Lvalue;
712 case 'A': case 'B': case 'C':
713 case 'D': case 'E': case 'F':
714 if (!hex)
715 goto Linvalid;
716 v = (*p - 'A') + 10;
717 goto Lvalue;
719 Lvalue:
720 if (hex)
721 c = (c << 4) + v;
722 else
723 c = (c * 10) + v;
724 if (c > 0x10FFFF)
726 error("character entity out of range");
727 goto Lignore;
729 continue;
731 default:
732 Linvalid:
733 error("invalid numeric character reference");
734 goto Lignore;
736 break;
739 else
741 // It's a named entity; gather all characters until ;
742 unsigned char *idstart = p + 1;
744 while (1)
746 p++;
747 switch (*p)
749 case 0:
750 case 0x1a:
751 error("end of file before end of character entity");
752 break;
754 case '\n':
755 case '\r':
756 case '<': // tag start
757 // Termination is assumed
758 c = HtmlNamedEntity(idstart, p - idstart);
759 if (c == -1)
760 goto Lignore;
761 break;
763 case ';':
764 // Termination is explicit
765 c = HtmlNamedEntity(idstart, p - idstart);
766 if (c == -1)
767 goto Lignore;
768 p++;
769 break;
771 default:
772 continue;
774 break;
778 // Kludge to convert non-breaking space to ascii space
779 if (c == 160)
780 c = 32;
781 return c;
782 Lignore:
783 //printf("Lignore\n");
784 p = pstart + 1;
785 return '&';