2 // Compiler implementation of the D programming language
3 // Copyright (c) 1999-2008 by Digital Mars
5 // written by Walter Bright
6 // http://www.digitalmars.com
7 // License for redistribution is by either the Artistic License
8 // in artistic.txt, or the GNU General Public License in gnu.txt.
9 // See the included readme.txt for details.
/* NOTE: This file has been patched from the original DMD distribution to
   work with the GDC compiler.

   Modified by David Friedman, December 2006
 */
17 /* Lexical Analyzer */
41 #include "..\root\mem.h"
43 #include "../root/mem.h"
47 #include "stringtable.h"
51 #include "identifier.h"
#if _WIN32 && __DMC__
// from \dm\src\include\setlocal.h
// Lexer::inreal() saves this pointer, points it at "." while a floating
// literal is converted, and restores it afterwards.  Those uses are compiled
// only under this same condition, and __cdecl is specific to that toolchain,
// so the declaration carries the same guard.
// NOTE(review): presumably the C runtime's locale decimal-point string --
// confirm against setlocal.h.
extern "C" char * __cdecl __locale_decpoint;
#endif

// Looks up an HTML named character entity.  Lexer::escapeSequence() calls it
// for the "\&name;" escape, passing a pointer to the first character after
// '&' and the length of the name, and uses the result as the character value.
// NOTE(review): the value returned for an unknown name is defined by the
// implementation elsewhere -- confirm before relying on it.
extern int HtmlNamedEntity(unsigned char *p, int length);
// Unicode end-of-line code points that need more than one byte in UTF-8.
// The lexer compares the result of decodeUTF() against these wherever
// non-ASCII input may contain a line break.
#define LS 0x2028       // U+2028 LINE SEPARATOR
#define PS 0x2029       // U+2029 PARAGRAPH SEPARATOR
/********************************************
 * Do our own char maps
 *
 * cmtable holds one byte of CM* flags for every 8-bit character value, and
 * the predicates below test a single flag with one table lookup.  The table
 * is filled in by cmtable_init(), so the predicates only give meaningful
 * answers after that has run.
 */

static unsigned char cmtable[256];

const int CMoctal  = 0x1;       // octal digit, '0'..'7'
const int CMhex    = 0x2;       // hexadecimal digit
const int CMidchar = 0x4;       // alphanumeric or '_' (identifier character)

// Each predicate returns the raw flag bit (non-zero) when c belongs to the
// class and 0 otherwise; callers should test for truth, not compare with 1.
inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
79 static void cmtable_init()
81 for (unsigned c
= 0; c
< sizeof(cmtable
) / sizeof(cmtable
[0]); c
++)
83 if ('0' <= c
&& c
<= '7')
84 cmtable
[c
] |= CMoctal
;
85 if (isdigit(c
) || ('a' <= c
&& c
<= 'f') || ('A' <= c
&& c
<= 'F'))
87 if (isalnum(c
) || c
== '_')
88 cmtable
[c
] |= CMidchar
;
93 /************************* Token **********************************************/
95 char *Token::tochars
[TOKMAX
];
97 void *Token::operator new(size_t size
)
103 Lexer::freelist
= t
->next
;
107 return ::operator new(size
);
113 fprintf(stdmsg
, "%s\n", toChars());
117 char *Token::toChars()
119 static char buffer
[3 + 3 * sizeof(value
) + 1];
126 sprintf(buffer
,"%d",(d_int32
)int64value
);
128 sprintf(buffer
,"%d",int32value
);
137 sprintf(buffer
,"%uU",(d_uns32
)uns64value
);
139 sprintf(buffer
,"%uU",uns32value
);
144 sprintf(buffer
,"%"PRIdMAX
"L",int64value
);
148 sprintf(buffer
,"%"PRIuMAX
"UL",uns64value
);
155 float80value
.format(buffer
, sizeof(buffer
));
157 case TOKimaginary32v
:
158 case TOKimaginary64v
:
159 case TOKimaginary80v
:
160 float80value
.format(buffer
, sizeof(buffer
));
166 sprintf(buffer
,"%Lgf", float80value
);
170 sprintf(buffer
,"%Lg", float80value
);
174 sprintf(buffer
,"%LgL", float80value
);
177 case TOKimaginary32v
:
178 sprintf(buffer
,"%Lgfi", float80value
);
181 case TOKimaginary64v
:
182 sprintf(buffer
,"%Lgi", float80value
);
185 case TOKimaginary80v
:
186 sprintf(buffer
,"%LgLi", float80value
);
198 for (size_t i
= 0; i
< len
; )
201 utf_decodeChar((unsigned char *)ustring
, len
, &i
, &c
);
214 buf
.printf("\\x%02x", c
);
215 else if (c
<= 0xFFFF)
216 buf
.printf("\\u%04x", c
);
218 buf
.printf("\\U%08x", c
);
227 p
= (char *)buf
.extractData();
237 p
= ident
->toChars();
247 char *Token::toChars(enum TOK value
)
249 static char buffer
[3 + 3 * sizeof(value
) + 1];
253 { sprintf(buffer
,"TOK%d",value
);
259 /*************************** Lexer ********************************************/
261 Token
*Lexer::freelist
= NULL
;
262 StringTable
Lexer::stringtable
;
263 OutBuffer
Lexer::stringbuffer
;
265 Lexer::Lexer(Module
*mod
,
266 unsigned char *base
, unsigned begoffset
, unsigned endoffset
,
267 int doDocComment
, int commentToken
)
270 //printf("Lexer::Lexer(%p,%d)\n",base,length);
271 //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
272 memset(&token
,0,sizeof(token
));
274 this->end
= base
+ endoffset
;
275 p
= base
+ begoffset
;
277 this->doDocComment
= doDocComment
;
279 this->commentToken
= commentToken
;
282 /* If first line starts with '#!', ignore the line
285 if (p
[0] == '#' && p
[1] =='!')
289 { unsigned char c
= *p
;
308 { unsigned u
= decodeUTF();
309 if (u
== PS
|| u
== LS
)
322 void Lexer::error(const char *format
, ...)
324 if (mod
&& !global
.gag
)
326 char *p
= loc
.toChars();
328 fprintf(stdmsg
, "%s: ", p
);
332 va_start(ap
, format
);
333 vfprintf(stdmsg
, format
, ap
);
336 fprintf(stdmsg
, "\n");
339 if (global
.errors
>= 20) // moderate blizzard of cascading messages
345 void Lexer::error(Loc loc
, const char *format
, ...)
347 if (mod
&& !global
.gag
)
349 char *p
= loc
.toChars();
351 fprintf(stdmsg
, "%s: ", p
);
355 va_start(ap
, format
);
356 vfprintf(stdmsg
, format
, ap
);
359 fprintf(stdmsg
, "\n");
362 if (global
.errors
>= 20) // moderate blizzard of cascading messages
368 TOK
Lexer::nextToken()
374 memcpy(&token
,t
,sizeof(Token
));
386 Token
*Lexer::peek(Token
*ct
)
401 /*********************************
402 * tk is on the opening (.
403 * Look ahead and return token that is past the closing ).
406 Token
*Lexer::peekPastParen(Token
*tk
)
408 //printf("peekPastParen()\n");
433 if (--curlynest
>= 0)
452 /**********************************
453 * Determine if string is a valid Identifier.
454 * Placed here because of commonality with Lexer functionality.
459 int Lexer::isValidIdentifier(char *p
)
467 if (*p
>= '0' && *p
<= '9') // beware of isdigit() on signed chars
475 char *q
= utf_decodeChar((unsigned char *)p
, len
, &idx
, &dc
);
479 if (!((dc
>= 0x80 && isUniAlpha(dc
)) || isalnum(dc
) || dc
== '_'))
488 /****************************
489 * Turn next token in buffer into a token.
492 void Lexer::scan(Token
*t
)
494 unsigned lastLine
= loc
.linnum
;
497 t
->blockComment
= NULL
;
498 t
->lineComment
= NULL
;
502 //printf("p = %p, *p = '%c'\n",p,*p);
507 t
->value
= TOKeof
; // end of file
515 continue; // skip white space
519 if (*p
!= '\n') // if CR stands by itself
521 continue; // skip white space
526 continue; // skip white space
528 case '0': case '1': case '2': case '3': case '4':
529 case '5': case '6': case '7': case '8': case '9':
530 t
->value
= number(t
);
535 t
->value
= charConstant(t
, 0);
539 t
->value
= stringConstant(t
,0);
547 t
->value
= charConstant(t
, 1);
550 else if (p
[1] == '"')
553 t
->value
= stringConstant(t
, 1);
558 t
->value
= charConstant(t
,0);
566 t
->value
= wysiwygStringConstant(t
, *p
);
573 t
->value
= hexStringConstant(t
);
581 t
->value
= delimitedStringConstant(t
);
584 else if (p
[1] == '{')
587 t
->value
= tokenStringConstant(t
);
595 t
->value
= escapeStringConstant(t
,0);
598 case '\\': // escaped string literal
601 stringbuffer
.reset();
610 c
= escapeSequence();
611 stringbuffer
.writeUTF8(c
);
615 c
= escapeSequence();
616 stringbuffer
.writeByte(c
);
619 } while (*p
== '\\');
620 t
->len
= stringbuffer
.offset
;
621 stringbuffer
.writeByte(0);
622 t
->ustring
= (unsigned char *)mem
.malloc(stringbuffer
.offset
);
623 memcpy(t
->ustring
, stringbuffer
.data
, stringbuffer
.offset
);
625 t
->value
= TOKstring
;
632 case 'a': case 'b': case 'c': case 'd': case 'e':
633 case 'f': case 'g': case 'h': case 'i': case 'j':
634 case 'k': case 'm': case 'n': case 'o':
636 case 'p': /*case 'q': case 'r':*/ case 's': case 't':
638 case 'p': case 'q': /*case 'r':*/ case 's': case 't':
640 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
642 case 'A': case 'B': case 'C': case 'D': case 'E':
643 case 'F': case 'G': case 'H': case 'I': case 'J':
644 case 'K': case 'M': case 'N': case 'O':
645 case 'P': case 'Q': case 'R': case 'S': case 'T':
646 case 'U': case 'V': case 'W': case 'X': case 'Y':
657 } while (isidchar(c
) || (c
& 0x80 && isUniAlpha(decodeUTF())));
658 sv
= stringtable
.update((char *)t
->ptr
, p
- t
->ptr
);
659 id
= (Identifier
*) sv
->ptrvalue
;
661 { id
= new Identifier(sv
->lstring
.string
,TOKidentifier
);
665 t
->value
= (enum TOK
) id
->value
;
667 if (*t
->ptr
== '_') // if special identifier token
669 static char date
[11+1];
670 static char time
[8+1];
671 static char timestamp
[24+1];
673 if (!date
[0]) // lazy evaluation
680 sprintf(date
, "%.6s %.4s", p
+ 4, p
+ 20);
681 sprintf(time
, "%.8s", p
+ 11);
682 sprintf(timestamp
, "%.24s", p
);
686 if (mod
&& id
== Id::FILE)
688 t
->ustring
= (unsigned char *)(loc
.filename
? loc
.filename
: mod
->ident
->toChars());
691 else if (mod
&& id
== Id::LINE
)
693 t
->value
= TOKint64v
;
694 t
->uns64value
= loc
.linnum
;
700 t
->ustring
= (unsigned char *)date
;
703 else if (id
== Id::TIME
)
705 t
->ustring
= (unsigned char *)time
;
708 else if (id
== Id::VENDOR
)
711 t
->ustring
= (unsigned char *)"GDC";
713 t
->ustring
= (unsigned char *)"Digital Mars D";
717 else if (id
== Id::TIMESTAMP
)
719 t
->ustring
= (unsigned char *)timestamp
;
721 t
->value
= TOKstring
;
724 t
->len
= strlen((char *)t
->ustring
);
726 else if (id
== Id::VERSIONX
)
727 { unsigned major
= 0;
730 for (char *p
= global
.version
+ 1; 1; p
++)
734 minor
= minor
* 10 + c
- '0';
742 t
->value
= TOKint64v
;
743 t
->uns64value
= major
* 1000 + minor
;
746 else if (id
== Id::EOFX
)
749 // Advance scanner to end of file
750 while (!(*p
== 0 || *p
== 0x1A))
755 //printf("t->value = %d\n",t->value);
765 t
->value
= TOKdivass
;
774 { unsigned char c
= *p
;
793 error("unterminated /* */ comment");
800 { unsigned u
= decodeUTF();
801 if (u
== PS
|| u
== LS
)
810 if (p
[-2] == '*' && p
- 3 != t
->ptr
)
815 t
->value
= TOKcomment
;
818 else if (doDocComment
&& t
->ptr
[2] == '*' && p
- 4 != t
->ptr
)
819 { // if /** but not /**/
820 getDocComment(t
, lastLine
== linnum
);
824 case '/': // do // style comments
827 { unsigned char c
= *++p
;
843 t
->value
= TOKcomment
;
846 if (doDocComment
&& t
->ptr
[2] == '/')
847 getDocComment(t
, lastLine
== linnum
);
854 { unsigned u
= decodeUTF();
855 if (u
== PS
|| u
== LS
)
867 t
->value
= TOKcomment
;
870 if (doDocComment
&& t
->ptr
[2] == '/')
871 getDocComment(t
, lastLine
== linnum
);
884 { unsigned char c
= *p
;
919 error("unterminated /+ +/ comment");
926 { unsigned u
= decodeUTF();
927 if (u
== PS
|| u
== LS
)
937 t
->value
= TOKcomment
;
940 if (doDocComment
&& t
->ptr
[2] == '+' && p
- 4 != t
->ptr
)
941 { // if /++ but not /++/
942 getDocComment(t
, lastLine
== linnum
);
953 { /* Note that we don't allow ._1 and ._ as being
954 * valid floating point numbers.
957 t
->value
= inreal(t
);
959 else if (p
[0] == '.')
963 t
->value
= TOKdotdotdot
;
978 t
->value
= TOKandass
;
982 t
->value
= TOKandand
;
1006 t
->value
= TOKminass
;
1011 t
->value
= TOKarrow
;
1016 t
->value
= TOKminusminus
;
1026 t
->value
= TOKaddass
;
1030 t
->value
= TOKplusplus
;
1040 t
->value
= TOKle
; // <=
1046 t
->value
= TOKshlass
; // <<=
1049 t
->value
= TOKshl
; // <<
1055 t
->value
= TOKleg
; // <>=
1058 t
->value
= TOKlg
; // <>
1061 t
->value
= TOKlt
; // <
1068 t
->value
= TOKge
; // >=
1074 t
->value
= TOKshrass
; // >>=
1080 t
->value
= TOKushrass
; // >>>=
1083 t
->value
= TOKushr
; // >>>
1086 t
->value
= TOKshr
; // >>
1089 t
->value
= TOKgt
; // >
1096 if (*p
== '=' && global
.params
.Dversion
== 1)
1098 t
->value
= TOKnotidentity
; // !==
1101 t
->value
= TOKnotequal
; // !=
1109 t
->value
= TOKunord
; // !<>=
1112 t
->value
= TOKue
; // !<>
1116 t
->value
= TOKug
; // !<=
1119 t
->value
= TOKuge
; // !<
1125 t
->value
= TOKul
; // !>=
1128 t
->value
= TOKule
; // !>
1131 t
->value
= TOKnot
; // !
1138 if (*p
== '=' && global
.params
.Dversion
== 1)
1140 t
->value
= TOKidentity
; // ===
1143 t
->value
= TOKequal
; // ==
1146 t
->value
= TOKassign
; // =
1153 t
->value
= TOKcatass
; // ~=
1156 t
->value
= TOKtilde
; // ~
1159 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1161 SINGLE('(', TOKlparen
)
1162 SINGLE(')', TOKrparen
)
1163 SINGLE('[', TOKlbracket
)
1164 SINGLE(']', TOKrbracket
)
1165 SINGLE('{', TOKlcurly
)
1166 SINGLE('}', TOKrcurly
)
1167 SINGLE('?', TOKquestion
)
1168 SINGLE(',', TOKcomma
)
1169 SINGLE(';', TOKsemicolon
)
1170 SINGLE(':', TOKcolon
)
1171 SINGLE('$', TOKdollar
)
1175 #define DOUBLE(c1,tok1,c2,tok2) \
1186 DOUBLE('*', TOKmul
, '=', TOKmulass
)
1187 DOUBLE('%', TOKmod
, '=', TOKmodass
)
1188 DOUBLE('^', TOKxor
, '=', TOKxorass
)
1198 { unsigned char c
= *p
;
1201 { unsigned u
= decodeUTF();
1203 // Check for start of unicode identifier
1207 if (u
== PS
|| u
== LS
)
1215 error("unsupported char '%c'", c
);
1217 error("unsupported char 0x%02x", c
);
1225 /*******************************************
1226 * Parse escape sequence.
1229 unsigned Lexer::escapeSequence()
1245 case 'a': c
= 7; goto Lconsume
;
1246 case 'b': c
= 8; goto Lconsume
;
1247 case 'f': c
= 12; goto Lconsume
;
1248 case 'n': c
= 10; goto Lconsume
;
1249 case 'r': c
= 13; goto Lconsume
;
1250 case 't': c
= 9; goto Lconsume
;
1251 case 'v': c
= 11; goto Lconsume
;
1273 else if (islower(c
))
1282 { error("escape hex sequence has %d hex digits instead of %d", n
, ndigits
);
1286 if (ndigits
!= 2 && !utf_isValidDchar(v
))
1287 error("invalid UTF character \\U%08x", v
);
1291 error("undefined escape hex sequence \\%c\n",c
);
1294 case '&': // named character entity
1295 for (unsigned char *idstart
= ++p
; 1; p
++)
1300 c
= HtmlNamedEntity(idstart
, p
- idstart
);
1302 { error("unnamed character entity &%.*s;", (int)(p
- idstart
), idstart
);
1310 (p
!= idstart
+ 1 && isdigit(*p
)))
1312 error("unterminated named entity");
1320 case 0x1A: // end of file
1332 v
= v
* 8 + (c
- '0');
1334 } while (++n
< 3 && isoctal(c
));
1337 error("0%03o is larger than a byte", c
);
1340 error("undefined escape sequence \\%c\n",c
);
1346 /**************************************
1349 TOK
Lexer::wysiwygStringConstant(Token
*t
, int tc
)
1354 stringbuffer
.reset();
1367 c
= '\n'; // treat EndOfLine as \n character
1373 error("unterminated string constant starting at %s", start
.toChars());
1374 t
->ustring
= (unsigned char *)"";
1383 t
->len
= stringbuffer
.offset
;
1384 stringbuffer
.writeByte(0);
1385 t
->ustring
= (unsigned char *)mem
.malloc(stringbuffer
.offset
);
1386 memcpy(t
->ustring
, stringbuffer
.data
, stringbuffer
.offset
);
1395 unsigned u
= decodeUTF();
1397 if (u
== PS
|| u
== LS
)
1399 stringbuffer
.writeUTF8(u
);
1404 stringbuffer
.writeByte(c
);
1408 /**************************************
1413 TOK
Lexer::hexStringConstant(Token
*t
)
1420 stringbuffer
.reset();
1430 continue; // skip white space
1435 // Treat isolated '\r' as if it were a '\n'
1442 error("unterminated string constant starting at %s", start
.toChars());
1443 t
->ustring
= (unsigned char *)"";
1450 { error("odd number (%d) of hex characters in hex string", n
);
1451 stringbuffer
.writeByte(v
);
1453 t
->len
= stringbuffer
.offset
;
1454 stringbuffer
.writeByte(0);
1455 t
->ustring
= (unsigned char *)mem
.malloc(stringbuffer
.offset
);
1456 memcpy(t
->ustring
, stringbuffer
.data
, stringbuffer
.offset
);
1461 if (c
>= '0' && c
<= '9')
1463 else if (c
>= 'a' && c
<= 'f')
1465 else if (c
>= 'A' && c
<= 'F')
1469 unsigned u
= decodeUTF();
1471 if (u
== PS
|| u
== LS
)
1474 error("non-hex character \\u%x", u
);
1477 error("non-hex character '%c'", c
);
1480 stringbuffer
.writeByte(v
);
1492 /**************************************
1493 * Lex delimited strings:
1494 * q"(foo(xxx))" // "foo(xxx)"
1495 * q"[foo(]" // "foo("
1496 * q"/foo]/" // "foo]"
1504 TOK
Lexer::delimitedStringConstant(Token
*t
)
1507 unsigned delimleft
= 0;
1508 unsigned delimright
= 0;
1511 Identifier
*hereid
= NULL
;
1512 unsigned blankrol
= 0;
1513 unsigned startline
= 0;
1516 stringbuffer
.reset();
1520 //printf("c = '%c'\n", c);
1533 stringbuffer
.writeUTF8(c
);
1541 c
= '\n'; // treat EndOfLine as \n character
1553 if (c
== PS
|| c
== LS
)
1570 else if (isalpha(c
) || c
== '_' || (c
>= 0x80 && isUniAlpha(c
)))
1571 { // Start of identifier; must be a heredoc
1574 scan(&t
); // read in heredoc identifier
1575 if (t
.value
!= TOKidentifier
)
1576 { error("identifier expected for heredoc, not %s", t
.toChars());
1581 //printf("hereid = '%s'\n", hereid->toChars());
1594 { error("heredoc rest of line should be blank");
1602 else if (c
== delimright
)
1608 else if (c
== delimright
)
1610 if (startline
&& isalpha(c
))
1612 unsigned char *psave
= p
;
1614 scan(&t
); // read in possible heredoc identifier
1615 //printf("endid = '%s'\n", t.ident->toChars());
1616 if (t
.value
== TOKidentifier
&& t
.ident
->equals(hereid
))
1617 { /* should check that rest of line is blank
1623 stringbuffer
.writeUTF8(c
);
1632 error("delimited string must end in %c\"", delimright
);
1633 t
->len
= stringbuffer
.offset
;
1634 stringbuffer
.writeByte(0);
1635 t
->ustring
= (unsigned char *)mem
.malloc(stringbuffer
.offset
);
1636 memcpy(t
->ustring
, stringbuffer
.data
, stringbuffer
.offset
);
1641 error("unterminated string constant starting at %s", start
.toChars());
1642 t
->ustring
= (unsigned char *)"";
/**************************************
 * Lex token strings:
 *      q{ foo(xxx) }   // " foo(xxx) "
 *      q{{foo}"}"}     // "{foo}"}""
 */
1657 TOK
Lexer::tokenStringConstant(Token
*t
)
1661 unsigned char *pstart
= ++p
;
1687 t
->len
= p
- 1 - pstart
;
1688 t
->ustring
= (unsigned char *)mem
.malloc(t
->len
+ 1);
1689 memcpy(t
->ustring
, pstart
, t
->len
);
1690 t
->ustring
[t
->len
] = 0;
1695 error("unterminated token string constant starting at %s", start
.toChars());
1696 t
->ustring
= (unsigned char *)"";
1705 /**************************************
1708 TOK
Lexer::escapeStringConstant(Token
*t
, int wide
)
1713 stringbuffer
.reset();
1725 c
= escapeSequence();
1726 stringbuffer
.writeUTF8(c
);
1730 c
= escapeSequence();
1742 c
= '\n'; // treat EndOfLine as \n character
1747 t
->len
= stringbuffer
.offset
;
1748 stringbuffer
.writeByte(0);
1749 t
->ustring
= (unsigned char *)mem
.malloc(stringbuffer
.offset
);
1750 memcpy(t
->ustring
, stringbuffer
.data
, stringbuffer
.offset
);
1757 error("unterminated string constant starting at %s", start
.toChars());
1758 t
->ustring
= (unsigned char *)"";
1768 if (c
== LS
|| c
== PS
)
1773 stringbuffer
.writeUTF8(c
);
1778 stringbuffer
.writeByte(c
);
1782 /**************************************
1785 TOK
Lexer::charConstant(Token
*t
, int wide
)
1790 //printf("Lexer::charConstant\n");
1799 t
->uns64value
= escapeSequence();
1805 t
->uns64value
= escapeSequence();
1810 t
->uns64value
= escapeSequence();
1822 error("unterminated character constant");
1831 if (c
== LS
|| c
== PS
)
1833 if (c
< 0xD800 || (c
>= 0xE000 && c
< 0xFFFE))
1843 { error("unterminated character constant");
1850 /***************************************
1851 * Get postfix of string literal.
1854 void Lexer::stringPostfix(Token
*t
)
1871 /***************************************
1872 * Read \u or \U unicode sequence
1878 unsigned Lexer::wchar(unsigned u
)
1885 nchars
= (u
== 'U') ? 8 : 4;
1894 { error("\\%c sequence must be followed by %d hex characters", u
, nchars
);
1899 else if (islower(c
))
1910 /**************************************
1912 * If it's an integer, store it in tok.TKutok.Vlong.
1913 * integers can be decimal, octal or hex
1914 * Handle the suffixes U, UL, LU, L, etc.
1915 * If it's double, store it in tok.TKutok.Vdouble.
1921 TOK
Lexer::number(Token
*t
)
1923 // We use a state machine to collect numbers
1924 enum STATE
{ STATE_initial
, STATE_0
, STATE_decimal
, STATE_octal
, STATE_octale
,
1925 STATE_hex
, STATE_binary
, STATE_hex0
, STATE_binary0
,
1926 STATE_hexh
, STATE_error
};
1930 { FLAGS_decimal
= 1, // decimal
1931 FLAGS_unsigned
= 2, // u or U suffix
1932 FLAGS_long
= 4, // l or L suffix
1934 enum FLAGS flags
= FLAGS_decimal
;
1939 unsigned char *start
;
1942 //printf("Lexer::number()\n");
1943 state
= STATE_initial
;
1945 stringbuffer
.reset();
1952 case STATE_initial
: // opening state
1956 state
= STATE_decimal
;
1960 flags
= (FLAGS
) (flags
& ~FLAGS_decimal
);
1974 if (p
[1] == '.') // .. is a separate token
1987 state
= STATE_binary0
;
1990 case '0': case '1': case '2': case '3':
1991 case '4': case '5': case '6': case '7':
1992 state
= STATE_octal
;
1996 case '8': case '9': case 'A':
1997 case 'C': case 'D': case 'F':
1998 case 'a': case 'c': case 'd': case 'f':
2004 state
= STATE_octal
;
2018 case STATE_decimal
: // reading decimal number
2023 || c
== 'H' || c
== 'h'
2027 if (c
== '_') // ignore embedded _
2031 if (c
== '.' && p
[1] != '.')
2033 else if (c
== 'i' || c
== 'f' || c
== 'F' ||
2034 c
== 'e' || c
== 'E')
2036 real
: // It's a real number. Back up and rescan as a real
2040 else if (c
== 'L' && p
[1] == 'i')
2046 case STATE_hex0
: // reading hex number
2050 if (c
== '_') // ignore embedded _
2054 if (c
== '.' && p
[1] != '.')
2056 if (c
== 'P' || c
== 'p' || c
== 'i')
2058 if (state
== STATE_hex0
)
2059 error("Hex digit expected, not '%c'", c
);
2068 case STATE_hexh
: // parse numbers like 0FFh
2071 if (c
== 'H' || c
== 'h')
2079 // Check for something like 1E3 or 0E24
2080 if (memchr((char *)stringbuffer
.data
, 'E', stringbuffer
.offset
) ||
2081 memchr((char *)stringbuffer
.data
, 'e', stringbuffer
.offset
))
2083 error("Hex digit expected, not '%c'", c
);
2090 case STATE_octal
: // reading octal number
2091 case STATE_octale
: // reading octal number with non-octal digits
2096 || c
== 'H' || c
== 'h'
2100 if (c
== '_') // ignore embedded _
2104 if (c
== '.' && p
[1] != '.')
2110 state
= STATE_octale
;
2117 case STATE_binary0
: // starting binary number
2118 case STATE_binary
: // reading binary number
2119 if (c
!= '0' && c
!= '1')
2123 || c
== 'H' || c
== 'h'
2127 if (c
== '_') // ignore embedded _
2131 if (state
== STATE_binary0
)
2132 { error("binary digit expected");
2133 state
= STATE_error
;
2139 state
= STATE_binary
;
2142 case STATE_error
: // for error recovery
2143 if (!isdigit(c
)) // scan until non-digit
2150 stringbuffer
.writeByte(c
);
2154 stringbuffer
.writeByte(0); // terminate string
2155 if (state
== STATE_octale
)
2156 error("Octal digit expected");
2158 uinteger_t n
; // unsigned >=64 bit integer type
2160 if (stringbuffer
.offset
== 2 && (state
== STATE_decimal
|| state
== STATE_0
))
2161 n
= stringbuffer
.data
[0] - '0';
2164 // Convert string to integer
2167 n
= strtoull((char *)stringbuffer
.data
,NULL
,base
);
2168 if (errno
== ERANGE
)
2169 error("integer overflow");
2171 // Not everybody implements strtoull()
2172 char *p
= (char *)stringbuffer
.data
;
2177 if (p
[1] == 'x' || p
[1] == 'X')
2179 else if (p
[1] == 'b' || p
[1] == 'B')
2181 else if (isdigit(p
[1]))
2188 if (*p
>= '0' && *p
<= '9')
2190 else if (*p
>= 'a' && *p
<= 'z')
2192 else if (*p
>= 'A' && *p
<= 'Z')
2198 if (n
&& n
* r
+ d
<= n
)
2200 error ("integer overflow");
2208 if (sizeof(n
) > 8 &&
2209 n
> 0xFFFFFFFFFFFFFFFFULL
) // if n needs more than 64 bits
2210 error("integer overflow");
2213 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2224 if (1 || !global
.params
.useDeprecated
)
2225 error("'l' suffix is deprecated, use 'L' instead");
2231 error("unrecognized token");
2232 flags
= (FLAGS
) (flags
| f
);
2243 /* Octal or Hexadecimal constant.
2244 * First that fits: int, uint, long, ulong
2246 if (n
& 0x8000000000000000LL
)
2248 else if (n
& 0xFFFFFFFF00000000LL
)
2250 else if (n
& 0x80000000)
2257 /* First that fits: int, long, long long
2259 if (n
& 0x8000000000000000LL
)
2260 { error("signed integer overflow");
2263 else if (n
& 0xFFFFFFFF80000000LL
)
2269 case FLAGS_unsigned
:
2270 case FLAGS_decimal
| FLAGS_unsigned
:
2271 /* First that fits: uint, ulong
2273 if (n
& 0xFFFFFFFF00000000LL
)
2279 case FLAGS_decimal
| FLAGS_long
:
2280 if (n
& 0x8000000000000000LL
)
2281 { error("signed integer overflow");
2289 if (n
& 0x8000000000000000LL
)
2295 case FLAGS_unsigned
| FLAGS_long
:
2296 case FLAGS_decimal
| FLAGS_unsigned
| FLAGS_long
:
2302 printf("%x\n",flags
);
2310 /**************************************
2311 * Read in characters, converting them to real.
2313 * Exponent overflow not detected.
2314 * Too much requested precision is not detected.
2317 TOK
Lexer::inreal(Token
*t
)
2321 assert(*p
== '.' || isdigit(*p
));
2330 case TOKimaginary32v
:
2331 case TOKimaginary64v
:
2332 case TOKimaginary80v
:
2340 #endif /* __DMC__ */
2343 char hex
; // is this a hexadecimal-floating-constant?
2346 //printf("Lexer::inreal()\n");
2347 stringbuffer
.reset();
2353 // Get next char from input
2355 //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2360 case 0: // opening state
2371 if (c
== 'X' || c
== 'x')
2375 case 1: // digits to left of .
2376 case 3: // digits to right of .
2377 case 7: // continuing exponent digits
2378 if (!isdigit(c
) && !(hex
&& isxdigit(c
)))
2381 goto Lnext
; // ignore embedded '_'
2387 case 2: // no more digits to left of .
2392 case 4: // no more digits to right of .
2393 if ((c
== 'E' || c
== 'e') ||
2394 hex
&& (c
== 'P' || c
== 'p'))
2396 hex
= 0; // exponent is always decimal
2400 error("binary-exponent-part required");
2403 case 5: // looking immediately to right of E
2405 if (c
== '-' || c
== '+')
2407 case 6: // 1st exponent digit expected
2409 error("exponent expected");
2413 case 8: // past end of exponent digits
2418 stringbuffer
.writeByte(c
);
2423 stringbuffer
.writeByte(0);
2425 #if _WIN32 && __DMC__
2426 char *save
= __locale_decpoint
;
2427 __locale_decpoint
= ".";
2430 t
->float80value
= real_t::parse((char *)stringbuffer
.data
, real_t::LongDouble
);
2432 t
->float80value
= strtold((char *)stringbuffer
.data
, NULL
);
2440 real_t::parse((char *)stringbuffer
.data
, real_t::Float
);
2442 strtof((char *)stringbuffer
.data
, NULL
);
2444 result
= TOKfloat32v
;
2450 real_t::parse((char *)stringbuffer
.data
, real_t::Double
);
2452 strtod((char *)stringbuffer
.data
, NULL
);
2454 result
= TOKfloat64v
;
2458 if (!global
.params
.useDeprecated
)
2459 error("'l' suffix is deprecated, use 'L' instead");
2461 result
= TOKfloat80v
;
2465 if (*p
== 'i' || *p
== 'I')
2467 if (!global
.params
.useDeprecated
&& *p
== 'I')
2468 error("'I' suffix is deprecated, use 'i' instead");
2473 result
= TOKimaginary32v
;
2476 result
= TOKimaginary64v
;
2479 result
= TOKimaginary80v
;
2483 #if _WIN32 && __DMC__
2484 __locale_decpoint
= save
;
2486 if (errno
== ERANGE
)
2487 error("number is not representable");
2491 /*********************************************
2493 * Currently, the only pragma supported is:
2494 * #line linnum [filespec]
2497 void Lexer::pragma()
2501 char *filespec
= NULL
;
2502 Loc loc
= this->loc
;
2505 if (tok
.value
!= TOKidentifier
|| tok
.ident
!= Id::line
)
2509 if (tok
.value
== TOKint32v
|| tok
.value
== TOKint64v
)
2510 linnum
= tok
.uns64value
- 1;
2522 this->loc
.linnum
= linnum
;
2524 this->loc
.filename
= filespec
;
2540 continue; // skip white space
2543 if (mod
&& memcmp(p
, "__FILE__", 8) == 0)
2546 filespec
= mem
.strdup(loc
.filename
? loc
.filename
: mod
->ident
->toChars());
2553 stringbuffer
.reset();
2568 stringbuffer
.writeByte(0);
2569 filespec
= mem
.strdup((char *)stringbuffer
.data
);
2575 { unsigned u
= decodeUTF();
2576 if (u
== PS
|| u
== LS
)
2579 stringbuffer
.writeByte(c
);
2589 { unsigned u
= decodeUTF();
2590 if (u
== PS
|| u
== LS
)
2598 error(loc
, "#line integer [\"filespec\"]\\n expected");
2602 /********************************************
2603 * Decode UTF character.
2604 * Issue error messages for invalid sequences.
2605 * Return decoded character, advance p to last character in UTF sequence.
2608 unsigned Lexer::decodeUTF()
2612 unsigned char *s
= p
;
2620 // Check length of remaining string up to 6 UTF-8 characters
2621 for (len
= 1; len
< 6 && s
[len
]; len
++)
2625 msg
= utf_decodeChar(s
, len
, &idx
, &u
);
2635 /***************************************************
2636 * Parse doc comment embedded between t->ptr and p.
2637 * Remove trailing blanks and tabs from lines.
2638 * Replace all newlines with \n.
2639 * Remove leading comment character from each line.
2640 * Decide if it's a lineComment or a blockComment.
2641 * Append to previous one for this token.
2644 void Lexer::getDocComment(Token
*t
, unsigned lineComment
)
2647 unsigned char ct
= t
->ptr
[2];
2648 unsigned char *q
= t
->ptr
+ 3; // start of comment text
2651 unsigned char *qend
= p
;
2652 if (ct
== '*' || ct
== '+')
2655 /* Scan over initial row of ****'s or ++++'s or ////'s
2657 for (; q
< qend
; q
++)
2663 /* Remove trailing row of ****'s or ++++'s
2667 for (; q
< qend
; qend
--)
2674 for (; q
< qend
; q
++)
2676 unsigned char c
= *q
;
2682 if (linestart
&& c
== ct
)
2684 /* Trim preceding whitespace up to preceding \n
2686 while (buf
.offset
&& (buf
.data
[buf
.offset
- 1] == ' ' || buf
.data
[buf
.offset
- 1] == '\t'))
2698 continue; // skip the \r
2706 (q
[2] == 168 || q
[2] == 169))
2716 c
= '\n'; // replace all newlines with \n
2720 /* Trim trailing whitespace
2722 while (buf
.offset
&& (buf
.data
[buf
.offset
- 1] == ' ' || buf
.data
[buf
.offset
- 1] == '\t'))
2730 // Always end with a newline
2731 if (!buf
.offset
|| buf
.data
[buf
.offset
- 1] != '\n')
2732 buf
.writeByte('\n');
2736 // It's a line comment if the start of the doc comment comes
2737 // after other non-whitespace on the same line.
2738 unsigned char** dc
= (lineComment
&& anyToken
)
2742 // Combine with previous doc comment, if any
2744 *dc
= combineComments(*dc
, (unsigned char *)buf
.data
);
2746 *dc
= (unsigned char *)buf
.extractData();
2749 /********************************************
2750 * Combine two document comments into one.
2753 unsigned char *Lexer::combineComments(unsigned char *c1
, unsigned char *c2
)
2755 unsigned char *c
= c2
;
2760 { size_t len1
= strlen((char *)c1
);
2761 size_t len2
= strlen((char *)c2
);
2763 c
= (unsigned char *)mem
.malloc(len1
+ 1 + len2
+ 1);
2764 memcpy(c
, c1
, len1
);
2766 memcpy(c
+ len1
+ 1, c2
, len2
);
2767 c
[len1
+ 1 + len2
] = 0;
2773 /********************************************
2774 * Create an identifier in the string table.
2777 Identifier
*Lexer::idPool(const char *s
)
2779 size_t len
= strlen(s
);
2780 StringValue
*sv
= stringtable
.update(s
, len
);
2781 Identifier
*id
= (Identifier
*) sv
->ptrvalue
;
2784 id
= new Identifier(sv
->lstring
.string
, TOKidentifier
);
2790 /*********************************************
2791 * Create a unique identifier using the prefix s.
2794 Identifier
*Lexer::uniqueId(const char *s
, int num
)
2796 size_t slen
= strlen(s
);
2798 assert(slen
+ sizeof(num
) * 3 + 1 <= sizeof(buffer
));
2799 sprintf(buffer
, "%s%d", s
, num
);
2800 return idPool(buffer
);
2803 Identifier
*Lexer::uniqueId(const char *s
)
2806 return uniqueId(s
, ++num
);
2809 /****************************************
// Keyword table: each entry pairs a keyword spelling (read as `.name`) with
// its token value (read as `.value`).
// Token::isKeyword() scans this table linearly comparing `.value`, and
// Lexer::initKeywords() walks it to insert every spelling into `stringtable`
// and to record the spelling in Token::tochars[value].
// NOTE(review): the numbering of the entries below skips in places, so this
// listing may not show every entry of the real table -- verify against the
// full file before adding or removing keywords.
2817 static Keyword keywords
[] =
2821 { "this", TOKthis
},
2822 { "super", TOKsuper
},
2823 { "assert", TOKassert
},
2824 { "null", TOKnull
},
2825 { "true", TOKtrue
},
2826 { "false", TOKfalse
},
2827 { "cast", TOKcast
},
2829 { "delete", TOKdelete
},
2830 { "throw", TOKthrow
},
2831 { "module", TOKmodule
},
2832 { "pragma", TOKpragma
},
2833 { "typeof", TOKtypeof
},
2834 { "typeid", TOKtypeid
},
2836 { "template", TOKtemplate
},
// Built-in type names; the token identifiers encode the width
// (e.g. "byte" -> TOKint8, "ulong" -> TOKuns64, "real" -> TOKfloat80).
2838 { "void", TOKvoid
},
2839 { "byte", TOKint8
},
2840 { "ubyte", TOKuns8
},
2841 { "short", TOKint16
},
2842 { "ushort", TOKuns16
},
2843 { "int", TOKint32
},
2844 { "uint", TOKuns32
},
2845 { "long", TOKint64
},
2846 { "ulong", TOKuns64
},
2847 { "cent", TOKcent
, },
2848 { "ucent", TOKucent
, },
2849 { "float", TOKfloat32
},
2850 { "double", TOKfloat64
},
2851 { "real", TOKfloat80
},
2853 { "bool", TOKbool
},
2854 { "char", TOKchar
},
2855 { "wchar", TOKwchar
},
2856 { "dchar", TOKdchar
},
2858 { "ifloat", TOKimaginary32
},
2859 { "idouble", TOKimaginary64
},
2860 { "ireal", TOKimaginary80
},
2862 { "cfloat", TOKcomplex32
},
2863 { "cdouble", TOKcomplex64
},
2864 { "creal", TOKcomplex80
},
2866 { "delegate", TOKdelegate
},
2867 { "function", TOKfunction
},
2871 { "else", TOKelse
},
2872 { "while", TOKwhile
},
2875 { "switch", TOKswitch
},
2876 { "case", TOKcase
},
2877 { "default", TOKdefault
},
2878 { "break", TOKbreak
},
2879 { "continue", TOKcontinue
},
2880 { "synchronized", TOKsynchronized
},
2881 { "return", TOKreturn
},
2882 { "goto", TOKgoto
},
2884 { "catch", TOKcatch
},
2885 { "finally", TOKfinally
},
2886 { "with", TOKwith
},
2888 { "foreach", TOKforeach
},
2889 { "foreach_reverse", TOKforeach_reverse
},
2890 { "scope", TOKscope
},
2892 { "struct", TOKstruct
},
2893 { "class", TOKclass
},
2894 { "interface", TOKinterface
},
2895 { "union", TOKunion
},
2896 { "enum", TOKenum
},
2897 { "import", TOKimport
},
2898 { "mixin", TOKmixin
},
2899 { "static", TOKstatic
},
2900 { "final", TOKfinal
},
2901 { "const", TOKconst
},
2902 { "typedef", TOKtypedef
},
2903 { "alias", TOKalias
},
2904 { "override", TOKoverride
},
2905 { "abstract", TOKabstract
},
2906 { "volatile", TOKvolatile
},
2907 { "debug", TOKdebug
},
2908 { "deprecated", TOKdeprecated
},
2911 { "inout", TOKinout
},
2912 { "lazy", TOKlazy
},
2913 { "auto", TOKauto
},
2915 { "align", TOKalign
},
2916 { "extern", TOKextern
},
2917 { "private", TOKprivate
},
2918 { "package", TOKpackage
},
2919 { "protected", TOKprotected
},
2920 { "public", TOKpublic
},
2921 { "export", TOKexport
},
2923 { "body", TOKbody
},
2924 { "invariant", TOKinvariant
},
2925 { "unittest", TOKunittest
},
2926 { "version", TOKversion
},
// Disabled entry: "manifest" is commented out and so is not registered.
2927 //{ "manifest", TOKmanifest },
2931 { "macro", TOKmacro
},
2933 { "pure", TOKpure
},
2934 { "nothrow", TOKnothrow
},
// Double-underscore spellings; note that "__thread" maps to TOKtls.
2935 { "__thread", TOKtls
},
2936 { "__traits", TOKtraits
},
2937 { "__overloadset", TOKoverloadset
},
2938 { "__FILE__", TOKfile
},
2939 { "__LINE__", TOKline
},
// Tests whether this token is a keyword by scanning the `keywords` table
// linearly and comparing each entry's `.value` against this token's `value`.
// The element count is derived with sizeof(keywords) / sizeof(keywords[0]),
// so the loop always covers the whole table.
// NOTE(review): the return statements are not visible here -- presumably
// non-zero on the first match and 0 when no entry matches; confirm.
2943 int Token::isKeyword()
2945 for (unsigned u
= 0; u
< sizeof(keywords
) / sizeof(keywords
[0]); u
++)
2947 if (keywords
[u
].value
== value
)
2953 void Lexer::initKeywords()
2957 unsigned nkeywords
= sizeof(keywords
) / sizeof(keywords
[0]);
2959 if (global
.params
.Dversion
== 1)
2964 for (u
= 0; u
< nkeywords
; u
++)
2967 //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
2968 s
= keywords
[u
].name
;
2969 v
= keywords
[u
].value
;
2970 sv
= stringtable
.insert(s
, strlen(s
));
2971 sv
->ptrvalue
= (void *) new Identifier(sv
->lstring
.string
,v
);
2973 //printf("tochars[%d] = '%s'\n",v, s);
2974 Token::tochars
[v
] = s
;
2977 Token::tochars
[TOKeof
] = "EOF";
2978 Token::tochars
[TOKlcurly
] = "{";
2979 Token::tochars
[TOKrcurly
] = "}";
2980 Token::tochars
[TOKlparen
] = "(";
2981 Token::tochars
[TOKrparen
] = ")";
2982 Token::tochars
[TOKlbracket
] = "[";
2983 Token::tochars
[TOKrbracket
] = "]";
2984 Token::tochars
[TOKsemicolon
] = ";";
2985 Token::tochars
[TOKcolon
] = ":";
2986 Token::tochars
[TOKcomma
] = ",";
2987 Token::tochars
[TOKdot
] = ".";
2988 Token::tochars
[TOKxor
] = "^";
2989 Token::tochars
[TOKxorass
] = "^=";
2990 Token::tochars
[TOKassign
] = "=";
2991 Token::tochars
[TOKconstruct
] = "=";
2993 Token::tochars
[TOKblit
] = "=";
2995 Token::tochars
[TOKlt
] = "<";
2996 Token::tochars
[TOKgt
] = ">";
2997 Token::tochars
[TOKle
] = "<=";
2998 Token::tochars
[TOKge
] = ">=";
2999 Token::tochars
[TOKequal
] = "==";
3000 Token::tochars
[TOKnotequal
] = "!=";
3001 Token::tochars
[TOKnotidentity
] = "!is";
3002 Token::tochars
[TOKtobool
] = "!!";
3004 Token::tochars
[TOKunord
] = "!<>=";
3005 Token::tochars
[TOKue
] = "!<>";
3006 Token::tochars
[TOKlg
] = "<>";
3007 Token::tochars
[TOKleg
] = "<>=";
3008 Token::tochars
[TOKule
] = "!>";
3009 Token::tochars
[TOKul
] = "!>=";
3010 Token::tochars
[TOKuge
] = "!<";
3011 Token::tochars
[TOKug
] = "!<=";
3013 Token::tochars
[TOKnot
] = "!";
3014 Token::tochars
[TOKtobool
] = "!!";
3015 Token::tochars
[TOKshl
] = "<<";
3016 Token::tochars
[TOKshr
] = ">>";
3017 Token::tochars
[TOKushr
] = ">>>";
3018 Token::tochars
[TOKadd
] = "+";
3019 Token::tochars
[TOKmin
] = "-";
3020 Token::tochars
[TOKmul
] = "*";
3021 Token::tochars
[TOKdiv
] = "/";
3022 Token::tochars
[TOKmod
] = "%";
3023 Token::tochars
[TOKslice
] = "..";
3024 Token::tochars
[TOKdotdotdot
] = "...";
3025 Token::tochars
[TOKand
] = "&";
3026 Token::tochars
[TOKandand
] = "&&";
3027 Token::tochars
[TOKor
] = "|";
3028 Token::tochars
[TOKoror
] = "||";
3029 Token::tochars
[TOKarray
] = "[]";
3030 Token::tochars
[TOKindex
] = "[i]";
3031 Token::tochars
[TOKaddress
] = "&";
3032 Token::tochars
[TOKstar
] = "*";
3033 Token::tochars
[TOKtilde
] = "~";
3034 Token::tochars
[TOKdollar
] = "$";
3035 Token::tochars
[TOKcast
] = "cast";
3036 Token::tochars
[TOKplusplus
] = "++";
3037 Token::tochars
[TOKminusminus
] = "--";
3038 Token::tochars
[TOKtype
] = "type";
3039 Token::tochars
[TOKquestion
] = "?";
3040 Token::tochars
[TOKneg
] = "-";
3041 Token::tochars
[TOKuadd
] = "+";
3042 Token::tochars
[TOKvar
] = "var";
3043 Token::tochars
[TOKaddass
] = "+=";
3044 Token::tochars
[TOKminass
] = "-=";
3045 Token::tochars
[TOKmulass
] = "*=";
3046 Token::tochars
[TOKdivass
] = "/=";
3047 Token::tochars
[TOKmodass
] = "%=";
3048 Token::tochars
[TOKshlass
] = "<<=";
3049 Token::tochars
[TOKshrass
] = ">>=";
3050 Token::tochars
[TOKushrass
] = ">>>=";
3051 Token::tochars
[TOKandass
] = "&=";
3052 Token::tochars
[TOKorass
] = "|=";
3053 Token::tochars
[TOKcatass
] = "~=";
3054 Token::tochars
[TOKcat
] = "~";
3055 Token::tochars
[TOKcall
] = "call";
3056 Token::tochars
[TOKidentity
] = "is";
3057 Token::tochars
[TOKnotidentity
] = "!is";
3059 Token::tochars
[TOKorass
] = "|=";
3060 Token::tochars
[TOKidentifier
] = "identifier";
3063 Token::tochars
[TOKdotexp
] = "dotexp";
3064 Token::tochars
[TOKdotti
] = "dotti";
3065 Token::tochars
[TOKdotvar
] = "dotvar";
3066 Token::tochars
[TOKdottype
] = "dottype";
3067 Token::tochars
[TOKsymoff
] = "symoff";
3068 Token::tochars
[TOKtypedot
] = "typedot";
3069 Token::tochars
[TOKarraylength
] = "arraylength";
3070 Token::tochars
[TOKarrayliteral
] = "arrayliteral";
3071 Token::tochars
[TOKassocarrayliteral
] = "assocarrayliteral";
3072 Token::tochars
[TOKstructliteral
] = "structliteral";
3073 Token::tochars
[TOKstring
] = "string";
3074 Token::tochars
[TOKdsymbol
] = "symbol";
3075 Token::tochars
[TOKtuple
] = "tuple";
3076 Token::tochars
[TOKdeclaration
] = "declaration";
3077 Token::tochars
[TOKdottd
] = "dottd";
3078 Token::tochars
[TOKon_scope_exit
] = "scope(exit)";