2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
16 import core
.stdc
.ctype
;
17 import core
.stdc
.stdio
;
18 import core
.stdc
.string
;
23 import dmd
.identifier
;
25 import dmd
.root
.array
;
26 import dmd
.root
.ctfloat
;
27 import dmd
.common
.outbuffer
;
40 /***********************************************************
41 * Values to use for various magic identifiers
45 uint versionNumber
; /// __VERSION__
46 const(char)[] date
; /// __DATE__
47 const(char)[] time
; /// __TIME__
48 const(char)[] vendor
; /// __VENDOR__
49 const(char)[] timestamp
; /// __TIMESTAMP__
51 bool previewIn
; /// `in` means `[ref] scope const`, accepts rvalues
52 bool ddocOutput
; /// collect embedded documentation comments
53 bool masm
; /// use MASM inline asm syntax
56 /***********************************************************
60 private __gshared OutBuffer stringbuffer
;
62 Loc scanloc
; // for error messages
63 Loc prevloc
; // location of token before current
65 const(char)* p
; // current character
70 bool Ccompile
; /// true if compiling ImportC
72 // The following are valid only if (Ccompile == true)
73 ubyte boolsize
; /// size of a C _Bool, default 1
74 ubyte shortsize
; /// size of a C short, default 2
75 ubyte intsize
; /// size of a C int, default 4
76 ubyte longsize
; /// size of C long, 4 or 8
77 ubyte long_longsize
; /// size of a C long long, default 8
78 ubyte long_doublesize
; /// size of C long double, 8 or D real.sizeof
79 ubyte wchar_tsize
; /// size of C wchar_t, 2 or 4
81 ErrorSink eSink
; /// send error messages through this interface
82 CompileEnv compileEnv
; /// environment
86 const(char)* base
; // pointer to start of buffer
87 const(char)* end
; // pointer to last element of buffer
88 const(char)* line
; // start of current line
90 bool doDocComment
; // collect doc comment information
91 bool anyToken
; // seen at least one token
92 bool commentToken
; // comments are TOK.comment's
93 bool tokenizeNewlines
; // newlines are turned into TOK.endOfLine's
95 bool whitespaceToken
; // tokenize whitespaces (only for DMDLIB)
97 int inTokenStringConstant
; // can be larger than 1 when in nested q{} strings
98 int lastDocLine
; // last line of previous doc comment
100 Token
* tokenFreelist
;
105 /*********************
106 * Creates a Lexer for the source code base[begoffset..endoffset+1].
107 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
110 * filename = used for error messages
111 * base = source code, must be terminated by a null (0) or EOF (0x1A) character
112 * begoffset = starting offset into base[]
113 * endoffset = the last offset to read into base[]
114 * doDocComment = handle documentation comments
115 * commentToken = comments become TOK.comment's
116 * errorSink = where error messages go, must not be null
117 * compileEnv = version, vendor, date, time, etc.
119 this(const(char)* filename
, const(char)* base
, size_t begoffset
,
120 size_t endoffset
, bool doDocComment
, bool commentToken
,
122 const CompileEnv
* compileEnv
) scope
124 scanloc
= Loc(filename
, 1, 1);
125 // debug printf("Lexer::Lexer(%p)\n", base);
126 // debug printf("lexer.filename = %s\n", filename);
129 this.end
= base
+ endoffset
;
130 p
= base
+ begoffset
;
132 this.doDocComment
= doDocComment
;
133 this.commentToken
= commentToken
;
134 this.tokenizeNewlines
= false;
135 this.inTokenStringConstant
= 0;
136 this.lastDocLine
= 0;
137 this.eSink
= errorSink
;
140 this.compileEnv
= *compileEnv
;
143 this.compileEnv
.versionNumber
= 1;
144 this.compileEnv
.vendor
= "DLF";
147 /* If first line starts with '#!', ignore the line
149 if (p
&& p
[0] == '#' && p
[1] == '!')
165 // Note: We do allow malformed UTF-8 on shebang line.
166 // It could have a meaning if the native system
167 // encoding is not Unicode. See test compilable/test13512.d
168 // for example encoded in KOI-8.
169 // We also allow bidirectional control characters.
170 // We do not execute the shebang line, so it can't be used
171 // to conceal code. It is up to the shell to sanitize it.
180 /***********************
181 * Alternative entry point for DMDLIB, adds `whitespaceToken`
183 this(const(char)* filename
, const(char)* base
, size_t begoffset
, size_t endoffset
,
184 bool doDocComment
, bool commentToken
, bool whitespaceToken
,
185 ErrorSink errorSink
, const CompileEnv
* compileEnv
= null
188 this(filename
, base
, begoffset
, endoffset
, doDocComment
, commentToken
, errorSink
, compileEnv
);
189 this.whitespaceToken
= whitespaceToken
;
193 * Used for unittests for a mock Lexer
195 this(ErrorSink errorSink
) scope @safe { assert(errorSink
); this.eSink
= errorSink
; }
197 /**************************************
198 * Reset lexer to lex #define's
200 final void resetDefineLines(const(char)[] slice
)
203 end
= base
+ slice
.length
;
207 tokenizeNewlines
= true;
208 inTokenStringConstant
= 0;
210 scanloc
= Loc("#defines", 1, 1);
213 /**********************************
214 * Set up for next #define line.
215 * p should be at start of next line.
217 final void nextDefineLine()
219 tokenizeNewlines
= true;
226 final bool empty() const pure @property @nogc @safe
228 return front() == TOK
.endOfFile
;
231 final TOK
front() const pure @property @nogc @safe
236 final void popFront()
241 /// Returns: a newly allocated `Token`.
242 Token
* allocateToken() pure nothrow @safe
246 Token
* t
= tokenFreelist
;
247 tokenFreelist
= t
.next
;
254 /// Frees the given token by returning it to the freelist.
255 private void releaseToken(Token
* token
) pure nothrow @nogc @safe
259 token
.next
= tokenFreelist
;
260 tokenFreelist
= token
;
263 final TOK
nextToken()
268 Token
* t
= token
.next
;
269 memcpy(&token
, t
, Token
.sizeof
);
276 //printf(token.toChars());
280 /***********************
281 * Look ahead at next token's value.
285 return peek(&token
).value
;
288 /***********************
289 * Look 2 tokens ahead at value.
291 final TOK
peekNext2()
293 Token
* t
= peek(&token
);
294 return peek(t
).value
;
297 /****************************
298 * Turn next token in buffer into a token.
300 * t = the token to set the resulting Token to
302 final void scan(Token
* t
)
304 const lastLine
= scanloc
.linnum
;
306 t
.blockComment
= null;
307 t
.lineComment
= null;
312 //printf("p = %p, *p = '%c'\n",p,*p);
318 t
.value
= TOK
.endOfFile
; // end of file
319 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
322 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
323 while ((cast(size_t
)p
) % uint.sizeof
)
326 goto LendSkipFourSpaces
;
329 while (*(cast(uint*)p
) == 0x20202020) // ' ' == 0x20
331 // Skip over any remaining space on the line.
339 t
.value
= TOK
.whitespace
;
343 continue; // skip white space
352 t
.value
= TOK
.whitespace
;
356 continue; // skip white space
359 if (*p
!= '\n') // if CR stands by itself
362 if (tokenizeNewlines
)
364 t
.value
= TOK
.endOfLine
;
365 tokenizeNewlines
= false;
373 t
.value
= TOK
.whitespace
;
377 continue; // skip white space
381 if (tokenizeNewlines
)
383 t
.value
= TOK
.endOfLine
;
384 tokenizeNewlines
= false;
391 t
.value
= TOK
.whitespace
;
395 continue; // skip white space
398 if (Ccompile
&& (p
[1] == '\r' || p
[1] == '\n'))
400 ++p
; // ignore \ followed by new line, like VC does
406 if (!isZeroSecond(p
[1])) // if numeric literal does not continue
410 t
.value
= TOK
.int32Literal
;
415 case '1': .. case '9':
416 if (!isDigitSecond(p
[1])) // if numeric literal does not continue
418 t
.unsvalue
= *p
- '0';
420 t
.value
= TOK
.int32Literal
;
428 if (issinglechar(p
[1]) && p
[2] == '\'')
430 t
.unsvalue
= p
[1]; // simple one character literal
431 t
.value
= TOK
.charLiteral
;
436 clexerCharConstant(*t
, 0);
440 t
.value
= charConstant(t
);
449 if (p
[1] == '\'') // C wide character constant
452 if (c
== 'L') // convert L to u or U
453 c
= (wchar_tsize
== 4) ?
'u' : 'U';
455 clexerCharConstant(*t
, c
);
458 else if (p
[1] == '\"') // C wide string literal
462 escapeStringConstant(t
);
463 t
.postfix
= c
== 'L' ?
(wchar_tsize
== 2 ?
'w' : 'd') :
468 else if (p
[1] == '8' && p
[2] == '\"') // C UTF-8 string literal
471 escapeStringConstant(t
);
477 if (Ccompile || p
[1] != '"')
484 wysiwygStringConstant(t
);
490 t
.value
= hexStringConstant(t
);
498 delimitedStringConstant(t
);
501 else if (p
[1] == '{')
504 tokenStringConstant(t
);
510 escapeStringConstant(t
);
528 /*case 'q': case 'r':*/
574 const u
= decodeUTF();
577 error(t
.loc
, "char 0x%04x not allowed in identifier", u
);
582 Identifier id
= Identifier
.idPool((cast(char*)t
.ptr
)[0 .. p
- t
.ptr
], false);
584 t
.value
= cast(TOK
)id
.getValue();
588 /* Different keywords for C and D
592 if (t
.value
!= TOK
.identifier
)
594 t
.value
= Ckeywords
[t
.value
]; // filter out D keywords
597 else if (t
.value
>= FirstCKeyword
)
598 t
.value
= TOK
.identifier
; // filter out C keywords
600 else if (*t
.ptr
== '_') // if special identifier token
602 void toToken(const(char)[] s
)
604 t
.value
= TOK
.string_
;
606 t
.len
= cast(uint)s
.length
;
611 toToken(compileEnv
.date
);
612 else if (id
== Id
.TIME
)
613 toToken(compileEnv
.time
);
614 else if (id
== Id
.VENDOR
)
615 toToken(compileEnv
.vendor
);
616 else if (id
== Id
.TIMESTAMP
)
617 toToken(compileEnv
.timestamp
);
618 else if (id
== Id
.VERSIONX
)
620 t
.value
= TOK
.int64Literal
;
621 t
.unsvalue
= compileEnv
.versionNumber
;
623 else if (id
== Id
.EOFX
)
625 t
.value
= TOK
.endOfFile
;
626 // Advance scanner to end of file
627 while (!(*p
== 0 ||
*p
== 0x1A))
631 //printf("t.value = %d\n",t.value);
640 t
.value
= TOK
.divAssign
;
665 error(t
.loc
, "unterminated /* */ comment");
668 t
.value
= TOK
.endOfFile
;
673 const u
= decodeUTF();
674 if (u
== PS || u
== LS
)
683 if (p
[-2] == '*' && p
- 3 != t
.ptr
)
689 t
.value
= TOK
.comment
;
692 else if (doDocComment
&& t
.ptr
[2] == '*' && p
- 4 != t
.ptr
)
694 // if /** but not /**/
695 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
696 lastDocLine
= scanloc
.linnum
;
699 case '/': // do // style comments
718 t
.value
= TOK
.comment
;
721 if (doDocComment
&& t
.ptr
[2] == '/')
723 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
724 lastDocLine
= scanloc
.linnum
;
728 t
.value
= TOK
.endOfFile
;
733 const u
= decodeUTF();
734 if (u
== PS || u
== LS
)
750 t
.value
= TOK
.comment
;
753 if (doDocComment
&& t
.ptr
[2] == '/')
755 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
756 lastDocLine
= scanloc
.linnum
;
801 error(t
.loc
, "unterminated /+ +/ comment");
804 t
.value
= TOK
.endOfFile
;
809 uint u
= decodeUTF();
810 if (u
== PS || u
== LS
)
821 t
.value
= TOK
.comment
;
824 if (doDocComment
&& t
.ptr
[2] == '+' && p
- 4 != t
.ptr
)
826 // if /++ but not /++/
827 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
828 lastDocLine
= scanloc
.linnum
;
842 /* Note that we don't allow ._1 and ._ as being
843 * valid floating point numbers.
848 else if (p
[0] == '.')
853 t
.value
= TOK
.dotDotDot
;
869 t
.value
= TOK
.andAssign
;
874 t
.value
= TOK
.andAnd
;
884 t
.value
= TOK
.orAssign
;
899 t
.value
= TOK
.minAssign
;
904 t
.value
= TOK
.minusMinus
;
919 t
.value
= TOK
.addAssign
;
924 t
.value
= TOK
.plusPlus
;
934 t
.value
= TOK
.lessOrEqual
; // <=
942 t
.value
= TOK
.leftShiftAssign
; // <<=
945 t
.value
= TOK
.leftShift
; // <<
947 else if (*p
== ':' && Ccompile
)
950 t
.value
= TOK
.leftBracket
; // <:
952 else if (*p
== '%' && Ccompile
)
955 t
.value
= TOK
.leftCurly
; // <%
958 t
.value
= TOK
.lessThan
; // <
965 t
.value
= TOK
.greaterOrEqual
; // >=
973 t
.value
= TOK
.rightShiftAssign
; // >>=
981 t
.value
= TOK
.unsignedRightShiftAssign
; // >>>=
984 t
.value
= TOK
.unsignedRightShift
; // >>>
987 t
.value
= TOK
.rightShift
; // >>
990 t
.value
= TOK
.greaterThan
; // >
997 t
.value
= TOK
.notEqual
; // !=
1000 t
.value
= TOK
.not; // !
1007 t
.value
= TOK
.equal
; // ==
1012 t
.value
= TOK
.goesTo
; // =>
1015 t
.value
= TOK
.assign
; // =
1022 t
.value
= TOK
.concatenateAssign
; // ~=
1025 t
.value
= TOK
.tilde
; // ~
1035 t
.value
= TOK
.powAssign
; // ^^=
1038 t
.value
= TOK
.pow
; // ^^
1043 t
.value
= TOK
.xorAssign
; // ^=
1046 t
.value
= TOK
.xor; // ^
1050 t
.value
= TOK
.leftParenthesis
;
1054 t
.value
= TOK
.rightParenthesis
;
1058 t
.value
= TOK
.leftBracket
;
1062 t
.value
= TOK
.rightBracket
;
1066 t
.value
= TOK
.leftCurly
;
1070 t
.value
= TOK
.rightCurly
;
1074 t
.value
= TOK
.question
;
1078 t
.value
= TOK
.comma
;
1082 t
.value
= TOK
.semicolon
;
1089 t
.value
= TOK
.colonColon
;
1091 else if (*p
== '>' && Ccompile
)
1094 t
.value
= TOK
.rightBracket
;
1097 t
.value
= TOK
.colon
;
1101 t
.value
= TOK
.dollar
;
1112 t
.value
= TOK
.mulAssign
;
1122 t
.value
= TOK
.modAssign
;
1124 else if (*p
== '>' && Ccompile
)
1127 t
.value
= TOK
.rightCurly
;
1129 else if (*p
== ':' && Ccompile
)
1131 goto case '#'; // %: means #
1138 // https://issues.dlang.org/show_bug.cgi?id=22825
1139 // Special token sequences are terminated by newlines,
1140 // and should not be skipped over.
1141 this.tokenizeNewlines
= true;
1143 if (parseSpecialTokenSequence())
1145 t
.value
= TOK
.pound
;
1154 // Check for start of unicode identifier
1157 if (c
== PS || c
== LS
)
1161 if (tokenizeNewlines
)
1163 t
.value
= TOK
.endOfLine
;
1164 tokenizeNewlines
= false;
1170 if (c
< 0x80 && isprint(c
))
1171 error(t
.loc
, "character '%c' is not a valid token", c
);
1173 error(t
.loc
, "character 0x%02x is not a valid token", c
);
1182 final Token
* peek(Token
* ct
)
1189 t
= allocateToken();
1196 /*********************************
1197 * tk is on the opening (.
1198 * Look ahead and return token that is past the closing ).
1200 final Token
* peekPastParen(Token
* tk
)
1202 //printf("peekPastParen()\n");
1211 case TOK
.leftParenthesis
:
1214 case TOK
.rightParenthesis
:
1223 case TOK
.rightCurly
:
1224 if (--curlynest
>= 0)
1240 /*******************************************
1241 * Parse escape sequence.
1243 private uint escapeSequence(out dchar c2
)
1245 return Lexer
.escapeSequence(token
.loc
, p
, Ccompile
, c2
);
1249 * Parse the given string literal escape sequence into a single character.
1250 * D https://dlang.org/spec/lex.html#escape_sequences
1253 * loc = location to use for error messages
1254 * sequence = pointer to string with escape sequence to parse. Updated to
1255 * point past the end of the escape sequence
1256 * Ccompile = true for compile C11 escape sequences
1257 * c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
1259 * the escape sequence as a single character
1261 private dchar escapeSequence(const ref Loc loc
, ref const(char)* sequence
, bool Ccompile
, out dchar c2
)
1263 const(char)* p
= sequence
; // cache sequence reference on stack
1264 scope(exit
) sequence
= p
;
1309 if (ishex(cast(char)c
))
1313 if (Ccompile
&& ndigits
== 2)
1315 /* C11 6.4.4.4-7 one to infinity hex digits
1319 if (isdigit(cast(char)c
))
1321 else if (islower(c
))
1327 } while (ishex(cast(char)c
));
1333 if (isdigit(cast(char)c
))
1335 else if (islower(c
))
1343 if (!ishex(cast(char)c
))
1345 error(loc
, "escape hex sequence has %d hex digits instead of %d", n
, ndigits
);
1349 if (ndigits
!= 2 && !utf_isValidDchar(v
))
1351 error(loc
, "invalid UTF character \\U%08x", v
);
1352 v
= '?'; // recover with valid UTF character
1359 error(loc
, "undefined escape hex sequence \\%c%c", sequence
[0], c
);
1367 // named character entity
1368 for (const idstart
= ++p
; 1; p
++)
1373 auto entity
= HtmlNamedEntity(idstart
[0 .. p
- idstart
]);
1375 if (entity
== entity
.init
)
1377 error(loc
, "unnamed character entity &%.*s;", cast(int)(p
- idstart
), idstart
);
1380 if (entity
[1] != entity
.init
[1])
1386 if (isalpha(*p
) ||
(p
!= idstart
&& isdigit(*p
)))
1388 error(loc
, "unterminated named entity &%.*s;", cast(int)(p
- idstart
+ 1), idstart
);
1401 if (isoctal(cast(char)c
))
1407 v
= v
* 8 + (c
- '0');
1410 while (++n
< 3 && isoctal(cast(char)c
));
1413 error(loc
, "escape octal sequence \\%03o is larger than \\377", c
);
1417 error(loc
, "undefined escape sequence \\%c", c
);
1426 Lex a wysiwyg string. `p` must be pointing to the first character before the
1427 contents of the string literal. The character pointed to by `p` will be used as
1428 the terminating character (i.e. backtick or double-quote).
1430 result = pointer to the token that accepts the result
1432 private void wysiwygStringConstant(Token
* result
)
1434 result
.value
= TOK
.string_
;
1436 auto terminator
= p
[0];
1438 stringbuffer
.setsize(0);
1451 c
= '\n'; // treat EndOfLine as \n character
1456 error("unterminated string constant starting at %s", start
.toChars());
1458 // rewind `p` so it points to the EOF character
1462 if (c
== terminator
)
1464 result
.setString(stringbuffer
);
1465 stringPostfix(result
);
1471 const u
= decodeUTF();
1473 if (u
== PS || u
== LS
)
1475 stringbuffer
.writeUTF8(u
);
1480 stringbuffer
.writeByte(c
);
1484 /**************************************
1488 final TOK
hexStringConstant(Token
* t
)
1492 uint v
= ~0; // dead assignment, needed to suppress warning
1494 stringbuffer
.setsize(0);
1504 continue; // skip white space
1507 continue; // ignore '\r' if followed by '\n'
1508 // Treat isolated '\r' as if it were a '\n'
1515 error("unterminated string constant starting at %s", start
.toChars());
1517 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1519 return TOK
.hexadecimalString
;
1523 error("odd number (%d) of hex characters in hex string", n
);
1524 stringbuffer
.writeByte(v
);
1526 t
.setString(stringbuffer
);
1529 return TOK
.hexadecimalString
;
1531 if (c
>= '0' && c
<= '9')
1533 else if (c
>= 'a' && c
<= 'f')
1535 else if (c
>= 'A' && c
<= 'F')
1540 const u
= decodeUTF();
1542 if (u
== PS || u
== LS
)
1545 error("non-hex character \\u%04x in hex string", u
);
1548 error("non-hex character '%c' in hex string", c
);
1552 stringbuffer
.writeByte(v
);
1560 assert(0); // see bug 15731
1564 Lex a delimited string. Some examples of delimited strings are:
1566 q"(foo(xxx))" // "foo(xxx)"
1567 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1573 It is assumed that `p` points to the opening double-quote '"'.
1575 result = pointer to the token that accepts the result
1577 private void delimitedStringConstant(Token
* result
)
1579 result
.value
= TOK
.string_
;
1581 dchar delimleft
= 0;
1582 dchar delimright
= 0;
1584 uint nestcount
= ~0; // dead assignment, needed to suppress warning
1585 Identifier hereid
= null;
1589 stringbuffer
.setsize(0);
1594 //printf("c = '%c'\n", c);
1608 stringbuffer
.writeUTF8(c
);
1615 c
= '\n'; // treat EndOfLine as \n character
1619 error("unterminated delimited string constant starting at %s", start
.toChars());
1621 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1630 if (c
== PS || c
== LS
)
1648 else if (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
)))
1650 // Start of identifier; must be a heredoc
1653 scan(&tok
); // read in heredoc identifier
1654 if (tok
.value
!= TOK
.identifier
)
1656 error("identifier expected for heredoc, not %s", tok
.toChars());
1662 //printf("hereid = '%s'\n", hereid.toChars());
1672 error("delimiter cannot be whitespace");
1679 error("heredoc rest of line should be blank");
1687 else if (c
== delimright
)
1694 else if (c
== delimright
)
1696 if (startline
&& (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
))) && hereid
)
1701 scan(&tok
); // read in possible heredoc identifier
1702 //printf("endid = '%s'\n", tok.ident.toChars());
1703 if (tok
.value
== TOK
.identifier
&& tok
.ident
is hereid
)
1705 /* should check that rest of line is blank
1711 stringbuffer
.writeUTF8(c
);
1719 error("delimited string must end in `%s\"`", hereid
.toChars());
1720 else if (isspace(delimright
))
1721 error("delimited string must end in `\"`");
1723 error(token
.loc
, "delimited string must end in `%c\"`", delimright
);
1724 result
.setString(stringbuffer
);
1725 stringPostfix(result
);
1729 Lex a token string. Some examples of token strings are:
1731 q{ foo(xxx) } // " foo(xxx) "
1732 q{foo$(LPAREN)} // "foo$(LPAREN)"
1733 q{{foo}"}"} // "{foo}"}""
1735 It is assumed that `p` points to the opening curly-brace.
1737 result = pointer to the token that accepts the result
1739 private void tokenStringConstant(Token
* result
)
1741 result
.value
= TOK
.string_
;
1744 const start
= loc();
1746 inTokenStringConstant
++;
1747 scope(exit
) inTokenStringConstant
--;
1757 case TOK
.rightCurly
:
1760 result
.setString(pstart
, p
- 1 - pstart
);
1761 stringPostfix(result
);
1766 error("unterminated token string constant starting at %s", start
.toChars());
1776 Scan a quoted string while building the processed string value by
1777 handling escape sequences. The result is returned in the given `t` token.
1778 This function assumes that `p` currently points to the opening quote
1781 t = the token to set the resulting string to
1783 * D https://dlang.org/spec/lex.html#double_quoted_strings
1786 private void escapeStringConstant(Token
* t
)
1788 t
.value
= TOK
.string_
;
1790 const start
= loc();
1791 const tc
= *p
++; // opening quote
1792 stringbuffer
.setsize(0);
1806 c
= escapeSequence(c2
);
1807 stringbuffer
.writeUTF8(c
);
1808 if (c2
!= dchar.init
)
1809 stringbuffer
.writeUTF8(c2
);
1813 c
= escapeSequence(c2
);
1814 stringbuffer
.writeUTF8(c
);
1817 c
= escapeSequence(c2
);
1829 c
= '\n'; // treat EndOfLine as \n character
1838 t
.setString(stringbuffer
);
1844 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1847 error("unterminated string constant starting at %s", start
.toChars());
1855 if (c
== LS || c
== PS
)
1863 stringbuffer
.writeUTF8(c
);
1868 stringbuffer
.writeByte(c
);
1872 /**************************************
1874 * https://dlang.org/spec/lex.html#characterliteral
1876 private TOK
charConstant(Token
* t
)
1878 TOK tk
= TOK
.charLiteral
;
1879 //printf("Lexer::charConstant\n");
1889 tk
= TOK
.wcharLiteral
;
1893 tk
= TOK
.dcharLiteral
;
1896 t
.unsvalue
= escapeSequence(c2
);
1899 error("html entity requires 2 code units, use a string instead of a character");
1913 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1917 error("unterminated character constant");
1926 if (c
== LS || c
== PS
)
1928 if (c
< 0xD800 ||
(c
>= 0xE000 && c
< 0xFFFE))
1929 tk
= TOK
.wcharLiteral
;
1931 tk
= TOK
.dcharLiteral
;
1938 while (*p
!= '\'' && *p
!= 0x1A && *p
!= 0 && *p
!= '\n' &&
1939 *p
!= '\r' && *p
!= ';' && *p
!= ')' && *p
!= ']' && *p
!= '}')
1945 if (c
== LS || c
== PS
)
1956 error("character constant has multiple characters");
1960 error("unterminated character constant");
1968 /***************************************
1969 * Lex C character constant.
1970 * Parser is on the opening quote.
1972 * t = token to fill in
1973 * prefix = one of `u`, `U` or 0.
1977 private void clexerCharConstant(ref Token t
, char prefix
)
1979 escapeStringConstant(&t
);
1980 const(char)[] str = t
.ustring
[0 .. t
.len
];
1981 const n
= str.length
;
1985 error(loc
, "empty character constant");
1986 t
.value
= TOK
.semicolon
;
1994 if (n
== 1) // fast case
1999 error(loc
, "max number of chars in character literal is 4, had %d",
2004 (cast(char*)&u
)[n
- 1 - i
] = c
;
2013 string msg
= utf_decodeChar(str, idx
, d1
);
2015 error(loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
2018 error(loc
, "x%x does not fit in 16 bits", d1
);
2020 t
.value
= TOK
.wcharLiteral
; // C11 6.4.4.4-9
2026 auto msg
= utf_decodeChar(str, idx
, d
);
2028 error(loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
2030 error(loc
, "max number of chars in 32 bit character literal is 1, had %d",
2031 cast(int)((n
+ 3) >> 2));
2033 t
.value
= TOK
.dcharLiteral
; // C11 6.4.4.4-9
2039 t
.value
= n
== 1 ? TOK
.charLiteral
: TOK
.int32Literal
;
2043 /***************************************
2044 * Get postfix of string literal.
2046 private void stringPostfix(Token
* t
) pure @nogc
2062 /**************************************
2064 * If it's an integer, store it in tok.TKutok.Vlong.
2065 * integers can be decimal, octal or hex
2066 * Handle the suffixes U, UL, LU, L, etc.
2067 * If it's double, store it in tok.TKutok.Vdouble.
2072 private TOK
number(Token
* t
)
2076 ulong n
= 0; // unsigned >=64 bit integer type
2079 bool overflow
= false;
2080 bool anyBinaryDigitsNoSingleUS
= false;
2081 bool anyHexDigitsNoSingleUS
= false;
2082 char errorDigit
= 0;
2103 errorDigit
= cast(char) c
;
2118 goto Ldone
; // if ".."
2119 if (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80)
2121 if (Ccompile
&& (p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
2122 goto Lreal
; // if `0.f` or `0.L`
2123 goto Ldone
; // if ".identifier" or ".unicode"
2125 goto Lreal
; // '.' is part of current token
2132 error("embedded `_` not allowed");
2177 if (c
== 'e' || c
== 'E' || c
== 'f' || c
== 'F')
2191 goto Ldone
; // if ".."
2192 if (base
<= 10 && n
> 0 && (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2194 if (Ccompile
&& base
== 10 &&
2195 (p
[1] == 'e' || p
[1] == 'E' || p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
2196 goto Lreal
; // if `1.e6` or `1.f` or `1.L`
2197 goto Ldone
; // if ".identifier" or ".unicode"
2199 if (base
== 16 && (!ishex(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2200 goto Ldone
; // if ".identifier" or ".unicode"
2202 goto Ldone
; // if ".identifier" or ".unicode"
2203 goto Lreal
; // otherwise as part of a floating point literal
2223 // got a digit here, set any necessary flags, check for errors
2224 anyHexDigitsNoSingleUS
= true;
2225 anyBinaryDigitsNoSingleUS
= true;
2226 if (!errorDigit
&& d
>= base
)
2228 errorDigit
= cast(char) c
;
2230 // Avoid expensive overflow check if we aren't at risk of overflow
2231 if (n
<= 0x0FFF_FFFF_FFFF_FFFFUL
)
2235 import core
.checkedint
: mulu
, addu
;
2237 n
= mulu(n
, base
, overflow
);
2238 n
= addu(n
, d
, overflow
);
2244 error(token
.loc
, "%s digit expected, not `%c`", base
== 2 ?
"binary".ptr
:
2245 base
== 8 ?
"octal".ptr
:
2246 "decimal".ptr
, errorDigit
);
2249 if (overflow
&& !err
)
2251 error("integer overflow");
2254 if ((base
== 2 && !anyBinaryDigitsNoSingleUS
) ||
2255 (base
== 16 && !anyHexDigitsNoSingleUS
))
2256 error(token
.loc
, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p
- start
), start
, 2, start
);
2261 return cnumber(base
, n
);
2266 decimal
= 1, // decimal
2267 unsigned
= 2, // u or U suffix
2268 long_
= 4, // L suffix
2271 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.none
;
2272 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2285 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2291 if ((flags
& f
) && !err
)
2293 error("repeated integer suffix `%c`", p
[-1]);
2296 flags
= cast(FLAGS
)(flags | f
);
2303 if (base
== 8 && n
>= 8)
2306 // can't translate invalid octal value, just show a generic message
2307 error("octal literals larger than 7 are no longer supported");
2309 error(token
.loc
, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
2310 n
, cast(int)(p
- psuffix
), psuffix
, n
, cast(int)(p
- psuffix
), psuffix
);
2316 /* Octal or Hexadecimal constant.
2317 * First that fits: int, uint, long, ulong
2319 if (n
& 0x8000000000000000L
)
2320 result
= TOK
.uns64Literal
;
2321 else if (n
& 0xFFFFFFFF00000000L
)
2322 result
= TOK
.int64Literal
;
2323 else if (n
& 0x80000000)
2324 result
= TOK
.uns32Literal
;
2326 result
= TOK
.int32Literal
;
2329 /* First that fits: int, long, long long
2331 if (n
& 0x8000000000000000L
)
2333 result
= TOK
.uns64Literal
;
2335 else if (n
& 0xFFFFFFFF80000000L
)
2336 result
= TOK
.int64Literal
;
2338 result
= TOK
.int32Literal
;
2340 case FLAGS
.unsigned
:
2341 case FLAGS
.decimal | FLAGS
.unsigned
:
2342 /* First that fits: uint, ulong
2344 if (n
& 0xFFFFFFFF00000000L
)
2345 result
= TOK
.uns64Literal
;
2347 result
= TOK
.uns32Literal
;
2349 case FLAGS
.decimal | FLAGS
.long_
:
2350 if (n
& 0x8000000000000000L
)
2354 error("signed integer overflow");
2357 result
= TOK
.uns64Literal
;
2360 result
= TOK
.int64Literal
;
2363 if (n
& 0x8000000000000000L
)
2364 result
= TOK
.uns64Literal
;
2366 result
= TOK
.int64Literal
;
2368 case FLAGS
.unsigned | FLAGS
.long_
:
2369 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2370 result
= TOK
.uns64Literal
;
2375 printf("%x\n", flags
);
2382 /**************************************
2383 * Lex C integer-suffix
2385 * base = number base
2386 * n = raw integer value
2390 private TOK
cnumber(int base
, ulong n
)
2393 * Parse trailing suffixes:
2400 octalhex
= 1, // octal or hexadecimal
2401 decimal
= 2, // decimal
2402 unsigned
= 4, // u or U suffix
2403 long_
= 8, // l or L suffix
2404 llong
= 0x10, // ll or LL
2406 // Microsoft extensions
2412 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.octalhex
;
2431 f
= FLAGS
.long_ | FLAGS
.llong
;
2443 else if (p
[1] == '1' && p
[2] == '6')
2448 else if (p
[1] == '3' && p
[2] == '2')
2453 else if (p
[1] == '6' && p
[2] == '4')
2460 if (p
[1] >= '0' && p
[1] <= '9' && !err
)
2462 error("invalid integer suffix");
2471 if ((flags
& f
) && !err
)
2473 error("duplicate integer suffixes");
2476 flags
= cast(FLAGS
)(flags | f
);
2479 TOK result
= TOK
.int32Literal
; // default
2482 /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2483 * this code deviates from C by picking D int, uint, long, or ulong instead
2486 case FLAGS
.octalhex
:
2487 /* Octal or Hexadecimal constant.
2488 * First that fits: int, unsigned, long, unsigned long,
2489 * long long, unsigned long long
2491 if (n
& 0x8000000000000000L
)
2492 result
= TOK
.uns64Literal
; // unsigned long
2493 else if (n
& 0xFFFFFFFF00000000L
)
2494 result
= TOK
.int64Literal
; // long
2495 else if (n
& 0x80000000)
2496 result
= TOK
.uns32Literal
;
2498 result
= TOK
.int32Literal
;
2502 /* First that fits: int, long, long long
2504 if (n
& 0x8000000000000000L
)
2505 result
= TOK
.uns64Literal
; // unsigned long
2506 else if (n
& 0xFFFFFFFF80000000L
)
2507 result
= TOK
.int64Literal
; // long
2509 result
= TOK
.int32Literal
;
2512 case FLAGS
.octalhex | FLAGS
.unsigned
:
2513 case FLAGS
.decimal | FLAGS
.unsigned
:
2514 /* First that fits: unsigned, unsigned long, unsigned long long
2516 if (n
& 0xFFFFFFFF00000000L
)
2517 result
= TOK
.uns64Literal
; // unsigned long
2519 result
= TOK
.uns32Literal
;
2522 case FLAGS
.decimal | FLAGS
.long_
:
2523 /* First that fits: long, long long
2525 if (longsize
== 4 || long_longsize
== 4)
2527 if (n
& 0xFFFFFFFF_80000000L)
2528 result
= TOK
.int64Literal
;
2530 result
= TOK
.int32Literal
; // long
2534 result
= TOK
.int64Literal
; // long
2538 case FLAGS
.octalhex | FLAGS
.long_
:
2539 /* First that fits: long, unsigned long, long long,
2540 * unsigned long long
2542 if (longsize
== 4 || long_longsize
== 4)
2544 if (n
& 0x8000000000000000L
)
2545 result
= TOK
.uns64Literal
;
2546 else if (n
& 0xFFFFFFFF00000000L
)
2547 result
= TOK
.int64Literal
;
2548 else if (n
& 0x80000000)
2549 result
= TOK
.uns32Literal
; // unsigned long
2551 result
= TOK
.int32Literal
; // long
2555 if (n
& 0x80000000_00000000L)
2556 result
= TOK
.uns64Literal
; // unsigned long
2558 result
= TOK
.int64Literal
; // long
2562 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.long_
:
2563 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2564 /* First that fits: unsigned long, unsigned long long
2566 if (longsize
== 4 || long_longsize
== 4)
2568 if (n
& 0xFFFFFFFF00000000L
)
2569 result
= TOK
.uns64Literal
;
2571 result
= TOK
.uns32Literal
; // unsigned long
2575 result
= TOK
.uns64Literal
; // unsigned long
2579 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.llong
:
2580 /* First that fits: long long, unsigned long long
2582 if (n
& 0x8000000000000000L
)
2583 result
= TOK
.uns64Literal
;
2585 result
= TOK
.int64Literal
;
2588 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.llong
:
2591 result
= TOK
.int64Literal
;
2594 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2595 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2596 result
= TOK
.uns64Literal
;
2599 case FLAGS
.octalhex | FLAGS
.i8
:
2600 case FLAGS
.octalhex | FLAGS
.i16
:
2601 case FLAGS
.octalhex | FLAGS
.i32
:
2602 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i8
:
2603 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i16
:
2604 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i32
:
2605 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i8
:
2606 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i16
:
2607 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i32
:
2608 result
= TOK
.uns32Literal
;
2611 case FLAGS
.decimal | FLAGS
.i8
:
2612 case FLAGS
.decimal | FLAGS
.i16
:
2613 case FLAGS
.decimal | FLAGS
.i32
:
2614 result
= TOK
.int32Literal
;
2617 case FLAGS
.octalhex | FLAGS
.i64
:
2618 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i64
:
2619 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i64
:
2620 result
= TOK
.uns64Literal
;
2623 case FLAGS
.decimal | FLAGS
.i64
:
2624 result
= TOK
.int64Literal
;
2628 debug printf("%x\n",flags
);
2634 /**************************************
2635 * Read in characters, converting them to real.
2637 * Exponent overflow not detected.
2638 * Too much requested precision is not detected.
2640 private TOK
inreal(Token
* t
)
2642 //printf("Lexer::inreal()\n");
2645 assert(*p
== '.' ||
isdigit(*p
));
2647 bool isWellformedString
= true;
2648 stringbuffer
.setsize(0);
2656 if (c
== 'x' || c
== 'X')
2662 // Digits to left of '.'
2670 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2677 // Digits to right of '.'
2680 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2687 if (c
== 'e' || c
== 'E' ||
(hex
&& (c
== 'p' || c
== 'P')))
2690 if (c
== '-' || c
== '+')
2694 bool anyexp
= false;
2706 error("embedded `_` in numeric literals not allowed");
2712 error("missing exponent");
2713 isWellformedString
= false;
2720 error("exponent required for hex float");
2721 isWellformedString
= false;
2727 stringbuffer
.writeByte(*pstart
);
2730 stringbuffer
.writeByte(0);
2731 auto sbufptr
= cast(const(char)*)stringbuffer
[].ptr
;
2733 bool isOutOfRange
= false;
2734 t
.floatvalue
= (isWellformedString ? CTFloat
.parse(sbufptr
, isOutOfRange
) : CTFloat
.zero
);
2736 bool imaginary
= false;
2737 if (*p
== 'i' && Ccompile
)
2747 if (isWellformedString
&& !isOutOfRange
)
2748 isOutOfRange
= Port
.isFloat32LiteralOutOfRange(sbufptr
);
2749 result
= TOK
.float32Literal
;
2753 if (isWellformedString
&& !isOutOfRange
)
2754 isOutOfRange
= Port
.isFloat64LiteralOutOfRange(sbufptr
);
2755 result
= TOK
.float64Literal
;
2759 error("use 'L' suffix instead of 'l'");
2763 if (Ccompile
&& long_doublesize
== 8)
2765 result
= TOK
.float80Literal
;
2769 if ((*p
== 'i' ||
*p
== 'I') && !Ccompile
)
2772 error("use 'i' suffix instead of 'I'");
2781 case TOK
.float32Literal
:
2782 result
= TOK
.imaginary32Literal
;
2784 case TOK
.float64Literal
:
2785 result
= TOK
.imaginary64Literal
;
2787 case TOK
.float80Literal
:
2788 result
= TOK
.imaginary80Literal
;
2794 const isLong
= (result
== TOK
.float80Literal || result
== TOK
.imaginary80Literal
);
2795 if (isOutOfRange
&& !isLong
&& (!Ccompile || hex
))
2797 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2799 const char* suffix
= result
== TOK
.float32Literal ?
"f" : result
== TOK
.float80Literal ?
"L" : "";
2800 const char* type
= [TOK
.float32Literal
: "`float`".ptr
,
2801 TOK
.float64Literal
: "`double`".ptr
,
2802 TOK
.float80Literal
: "`real` for the current target".ptr
][result
];
2803 error(scanloc
, "number `%s%s` is not representable as a %s", sbufptr
, suffix
, type
);
2804 const char* extra
= result
== TOK
.float64Literal ?
"`real` literals can be written using the `L` suffix. " : "";
2805 eSink
.errorSupplemental(scanloc
, "%shttps://dlang.org/spec/lex.html#floatliteral", extra
);
2811 case TOK
.float32Literal
:
2812 case TOK
.float64Literal
:
2813 case TOK
.float80Literal
:
2814 case TOK
.imaginary32Literal
:
2815 case TOK
.imaginary64Literal
:
2816 case TOK
.imaginary80Literal
:
// Compute the current source location: updates scanloc's column (1-based,
// from the start of the current line) and byte offset (from `base`).
// NOTE(review): the extraction dropped this method's trailing lines --
// presumably `return scanloc;` -- TODO confirm against the original file.
2825 final Loc
loc() @nogc
2827 scanloc
.charnum
= cast(ushort)(1 + p
- line
);
2829 scanloc
.fileOffset
= cast(uint)(p
- base
);
/// Report a formatted error through the configured sink, attributed to the
/// current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void error(T...)(const(char)* format, T args)
{
    eSink.error(token.loc, format, args);
}
/// Report a formatted error through the configured sink at an explicit
/// source location.
/// Params:
///    loc    = location to attribute the diagnostic to
///    format = printf-style format string
///    args   = arguments consumed by `format`
void error(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.error(loc, format, args);
}
/// Emit a deprecation diagnostic through the configured sink at an explicit
/// source location.
/// Params:
///    loc    = location to attribute the diagnostic to
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecation(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.deprecation(loc, format, args);
}
/// Emit a deprecation diagnostic through the configured sink, attributed to
/// the current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecation(T...)(const(char)* format, T args)
{
    eSink.deprecation(token.loc, format, args);
}
/// Emit a supplemental note attached to a prior deprecation, attributed to
/// the current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecationSupplemental(T...)(const(char)* format, T args)
{
    eSink.deprecationSupplemental(token.loc, format, args);
}
2858 /***************************************
2859 * Parse special token sequence:
2861 * true if the special token sequence was handled
2863 * https://dlang.org/spec/lex.html#special-token-sequence
// Handles `#line` (and rejects other C preprocessor directives). Token
// strings (q{...}) may not contain `#` directives; for identifiers this is
// currently only a deprecation (see @@@DEPRECATED_2.103@@@ below).
// NOTE(review): damaged extraction -- the token scan (`n`, `locx` setup),
// braces and return statements between the surviving lines were dropped.
2865 bool parseSpecialTokenSequence()
2869 if (n
.value
== TOK
.identifier
)
// `#line ...` -> delegate to poundLine (non-linemarker form).
2871 if (n
.ident
== Id
.line
)
2873 poundLine(n
, false);
2879 // @@@DEPRECATED_2.103@@@
2880 // Turn into an error in 2.113
2881 if (inTokenStringConstant
)
2882 deprecation(locx
, "token string requires valid D tokens, not `#%s`", n
.ident
.toChars());
2884 error(locx
, "C preprocessor directive `#%s` is not supported", n
.ident
.toChars());
// `#if` gets a dedicated hint pointing at `version`/`static if`.
2887 else if (n
.value
== TOK
.if_
)
2890 if (inTokenStringConstant
)
2891 error(locx
, "token string requires valid D tokens, not `#if`");
2893 error(locx
, "C preprocessor directive `#if` is not supported, use `version` or `static if`");
2898 /*********************************************
2899 * Parse line/file preprocessor directive:
2900 * #line linnum [filespec]
2901 * Allow __LINE__ for linnum, and __FILE__ for filespec.
2902 * Accept linemarker format:
2903 * # linnum [filespec] {flags}
2904 * There can be zero or more flags, which are one of the digits 1..4, and
2905 * must be in ascending order. The flags are ignored.
2907 * tok = token we're on, which is linnum of linemarker
2908 * linemarker = true if line marker format and lexer is on linnum
2910 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
// NOTE(review): damaged extraction -- the scanning loop, switch head, braces
// and several statements between the surviving lines were dropped.
2912 final void poundLine(ref Token tok
, bool linemarker
)
2914 auto linnum
= this.scanloc
.linnum
;
2915 const(char)* filespec
= null;
// First argument: the new line number (integer literal or __LINE__).
2920 if (tok
.value
== TOK
.int32Literal || tok
.value
== TOK
.int64Literal
)
// Range-check: the stored unsigned value must round-trip through int.
2922 const lin
= cast(int)(tok
.unsvalue
);
2923 if (lin
!= tok
.unsvalue
)
2925 error(tok
.loc
, "line number `%lld` out of range", cast(ulong)tok
.unsvalue
);
2932 else if (tok
.value
== TOK
.line
) // #line __LINE__
2937 error(tok
.loc
, "positive integer argument expected following `#line`");
2938 if (tok
.value
!= TOK
.endOfLine
)
// Apply the new line number/filename -- but never from inside a token string,
// which must not alter the lexer's notion of the real source location.
2949 if (!inTokenStringConstant
)
2951 this.scanloc
.linnum
= linnum
;
2953 this.scanloc
.filename
= filespec
;
2957 if (filespec || flags
)
// Keep our own copy: scanloc.filename may point into transient memory.
2959 filespec
= mem
.xstrdup(scanloc
.filename
);
2962 if (filespec || flags
)
// Optional filespec must be a plain double-quoted string without postfix.
2964 if (tok
.ptr
[0] != '"' || tok
.postfix
!= 0)
2966 filespec
= tok
.ustring
;
2968 case TOK
.int32Literal
:
// GNU linemarker flags 1..4 are accepted and ignored.
2971 if (linemarker
&& tok
.unsvalue
>= 1 && tok
.unsvalue
<= 4)
2973 flags
= true; // linemarker flags seen
// Anything else on the directive line is an error, phrased by context.
2982 if (filespec
is null)
2983 error(tok
.loc
, "invalid filename for `#line` directive");
2984 else if (linemarker
)
2985 error(tok
.loc
, "invalid flag for line marker directive");
2987 error(tok
.loc
, "found `%s` when expecting new line following `#line` directive", tok
.toChars());
2988 if (tok
.value
!= TOK
.endOfLine
)
2992 /***************************************
2993 * Scan forward to start of next line.
2995 * defines = send characters to `defines`
// NOTE(review): damaged extraction -- the character-dispatch loop and its
// cases were dropped; only a few statements survive below.
2997 final void skipToNextLine(OutBuffer
* defines
= null)
3005 return; // do not advance p
// When collecting #define bodies, copy raw bytes through unchanged.
3019 defines
.writeByte(*p
); // don't care about Unicode line endings for C
// Multi-byte sequences: also treat U+2028/U+2029 as line terminators.
3022 const u
= decodeUTF();
3023 if (u
== PS || u
== LS
)
3035 tokenizeNewlines
= false;
3038 /********************************************
3039 * Decode UTF character.
3040 * Issue error messages for invalid sequences.
3041 * Return decoded character, advance p to last character in UTF sequence.
// Thin wrapper over decodeUTFpure(): forwards any failure message to the
// error sink at the current token's location.
// NOTE(review): damaged extraction -- the `msg` declaration, the `if (msg)`
// guard and the `return result;` line were dropped by the extraction.
3043 private uint decodeUTF()
3046 auto result
= decodeUTFpure(msg
);
// %.*s prints the non-NUL-terminated message slice.
3049 error(token
.loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
3053 /********************************************
3054 * Same as above, but the potential error message is stored to the
3055 * msg parameter instead of being issued.
// NOTE(review): damaged extraction -- declarations of `s`, `len`, `idx` and
// `u`, plus the return statement, were dropped from this view.
3057 private pure uint decodeUTFpure(out string msg
)
3061 // Check length of remaining string up to 4 UTF-8 characters
// Stop early at an embedded NUL (source is NUL-terminated).
3063 for (len
= 1; len
< 4 && s
[len
]; len
++)
3068 msg
= utf_decodeChar(s
[0 .. len
], idx
, u
);
// Reject Unicode bidi control characters (Trojan Source mitigation).
3070 if (!msg
&& isBidiControl(u
))
3071 msg
= "Bidirectional control characters are disallowed for security reasons.";
3075 /***************************************************
3076 * Parse doc comment embedded between t.ptr and p.
3077 * Remove trailing blanks and tabs from lines.
3078 * Replace all newlines with \n.
3079 * Remove leading comment character from each line.
3080 * Decide if it's a lineComment or a blockComment.
3081 * Append to previous one for this token.
3083 * If newParagraph is true, an extra newline will be
3084 * added between adjoining doc comments.
// NOTE(review): damaged extraction -- the canonicalization loop's switch,
// braces, `buf` declaration and several statements were dropped; the
// surviving lines below are NOT contiguous.
3086 private void getDocComment(Token
* t
, uint lineComment
, bool newParagraph
) pure
3088 /* ct tells us which kind of comment it is: '/', '*', or '+'
3090 const ct
= t
.ptr
[2];
3091 /* Start of comment text skips over / * *, / + +, or / / /
3093 const(char)* q
= t
.ptr
+ 3; // start of comment text
3094 const(char)* qend
= p
;
3095 if (ct
== '*' || ct
== '+')
3097 /* Scan over initial row of ****'s or ++++'s or ////'s
3099 for (; q
< qend
; q
++)
3104 /* Remove leading spaces until start of the comment
3109 while (q
< qend
&& (*q
== ' ' ||
*q
== '\t'))
3117 if (q
< qend
&& *q
== '\n')
3121 else if (*q
== '\n')
3127 /* Remove trailing row of ****'s or ++++'s
3131 for (; q
< qend
; qend
--)
3137 /* Comment is now [q .. qend].
3138 * Canonicalize it into buf[].
// Local helper: strip trailing spaces/tabs from what's in buf so far.
3142 void trimTrailingWhitespace()
3145 auto len
= s
.length
;
3146 while (len
&& (s
[len
- 1] == ' ' || s
[len
- 1] == '\t'))
// Main canonicalization pass over the comment body.
3151 for (; q
< qend
; q
++)
// Drop the leading comment character ('*', '+', '/') at line start.
3158 if (linestart
&& c
== ct
)
3161 /* Trim preceding whitespace up to preceding \n
3163 trimTrailingWhitespace();
3172 continue; // skip the \r
// UTF-8 encodings of U+2028 (E2 80 A8) / U+2029 (E2 80 A9) line separators.
3178 if (q
[1] == 128 && (q
[2] == 168 || q
[2] == 169))
3187 c
= '\n'; // replace all newlines with \n
3191 /* Trim trailing whitespace
3193 trimTrailingWhitespace();
3198 /* Trim trailing whitespace (if the last line does not have newline)
3200 trimTrailingWhitespace();
3202 // Always end with a newline
3204 if (s
.length
== 0 || s
[$ - 1] != '\n')
3205 buf
.writeByte('\n');
3207 // It's a line comment if the start of the doc comment comes
3208 // after other non-whitespace on the same line.
3209 auto dc
= (lineComment
&& anyToken
) ?
&t
.lineComment
: &t
.blockComment
;
3210 // Combine with previous doc comment, if any
3213 auto p
= combineComments(*dc
, buf
[], newParagraph
);
3214 *dc
= p ? p
[0 .. strlen(p
)] : null;
3217 *dc
= buf
.extractSlice(true);
3220 /********************************************
3221 * Combine two document comments into one,
3222 * separated by an extra newline if newParagraph is true.
// Allocates a NUL-terminated buffer holding c1 (newline-terminated) followed
// by c2, with an extra '\n' separator when newParagraph is set.
// NOTE(review): damaged extraction -- the early-return lines for empty
// c1/c2 and the final `return p;` were dropped from this view.
3224 static const(char)* combineComments(const(char)[] c1
, const(char)[] c2
, bool newParagraph
) pure
3226 //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
3227 const(int) newParagraphSize
= newParagraph ?
1 : 0; // Size of the combining '\n'
// Ensure c1 ends in a newline before appending c2.
3233 int insertNewLine
= 0;
3234 if (c1
.length
&& c1
[$ - 1] != '\n')
3236 const retSize
= c1
.length
+ insertNewLine
+ newParagraphSize
+ c2
.length
;
// +1 for the trailing NUL terminator.
3237 auto p
= cast(char*)mem
.xmalloc_noscan(retSize
+ 1);
3238 p
[0 .. c1
.length
] = c1
[];
3240 p
[c1
.length
] = '\n';
3242 p
[c1
.length
+ insertNewLine
] = '\n';
3243 p
[retSize
- c2
.length
.. retSize
] = c2
[];
3248 /**************************
3249 * `p` should be at start of next line
// Bump the line counter for diagnostics.
// NOTE(review): the extraction appears to have dropped trailing statements
// of this method (presumably resetting the `line` pointer to `p`) -- TODO
// confirm against the original file.
3251 private void endOfLine() @nogc @safe
3253 scanloc
.linnum
= scanloc
.linnum
+ 1;
3257 /****************************
3258 * Print the tokens from the current `token` to the end,
3259 * while not advancing the parser forward.
3260 * Useful for debugging.
// NOTE(review): damaged extraction -- the loop head, the `tk` declaration
// and the loop-advance/break lines were dropped from this view.
3262 void printRestOfTokens()
3267 printf("%s ", (*tk
).toChars());
// Stop at end of file or end of line.
3268 if (tk
.value
== TOK
.endOfFile || tk
.value
== TOK
.endOfLine
)
3277 /******************************* Private *****************************************/
// Unicode line terminators the lexer recognizes in addition to '\n'/'\r'.
3281 private enum LS
= 0x2028; // UTF line separator
3282 private enum PS
= 0x2029; // UTF paragraph separator
3284 /********************************************
3285 * Do our own char maps
// Compile-time-built table classifying each of the 256 byte values with the
// CM* bit flags; queried by isoctal/ishex/isidchar/etc. below.
// NOTE(review): damaged extraction -- the table declaration, switch heads,
// most case labels and the returning of the table were dropped; surviving
// lines are NOT contiguous.
3287 private static immutable cmtable
= ()
3290 foreach (const c
; 0 .. table
.length
)
// '0'..'7' are octal digits.
3292 if ('0' <= c
&& c
<= '7')
3293 table
[c
] |
= CMoctal
;
// Identifier characters: ASCII alphanumerics plus underscore.
3296 if (c_isalnum(c
) || c
== '_')
3297 table
[c
] |
= CMidchar
;
3303 table
[c
] |
= CMzerosecond
;
3306 case '0': .. case '9':
3315 table
[c
] |
= CMzerosecond | CMdigitsecond
;
3333 table
[c
] |
= CMsinglechar
;
// Bit flags stored in cmtable entries.
// NOTE(review): the CMoctal/CMhex declarations (lower bit values) were
// dropped by the extraction; only these four survive in this view.
3344 enum CMidchar
= 0x4;
3345 enum CMzerosecond
= 0x8;
3346 enum CMdigitsecond
= 0x10;
3347 enum CMsinglechar
= 0x20;
/// Table-driven test: is `c` an octal digit ('0'..'7')?
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}
/// Table-driven test: is `c` a hexadecimal digit?
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}
/// Table-driven test: may `c` appear in an identifier (alphanumeric or '_')?
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}
/// Table-driven test: may `c` be the second character of a literal that
/// begins with '0'?
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}
/// Table-driven test: may `c` be the second character of a literal that
/// begins with a digit '1'..'9'?
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}
/// Table-driven test: is `c` a single-character token by itself?
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}
/// Locale-independent hex-digit test ('0'-'9', 'a'-'f', 'A'-'F') usable in
/// CTFE, unlike the C library's isxdigit.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
/// Locale-independent alphanumeric test ('0'-'9', 'a'-'z', 'A'-'Z') usable
/// in CTFE, unlike the C library's isalnum.
private bool c_isalnum(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
3394 /******************************* Unittest *****************************************/
// Unit test: escapeSequence() happy path -- each escape decodes to the
// expected character and the scan pointer advances over the whole sequence.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces and
// several `test(...)` invocations were dropped from this view.
3398 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3400 ErrorSink errorSink
= new ErrorSinkStderr
;
// Helper: lex `sequence` as an escape and compare against `expected`.
3402 void test(T
)(string sequence
, T expected
, bool Ccompile
= false)
3404 auto p
= cast(const(char)*)sequence
.ptr
;
3406 Lexer lexer
= new Lexer(errorSink
);
3407 assert(expected
== lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
, c2
));
// p must have been advanced past the entire escape sequence.
3408 assert(p
== sequence
.ptr
+ sequence
.length
);
3433 test(`357`, '\357');
3435 test(`u1234`, '\u1234');
3436 test(`uf0e4`, '\uf0e4');
3438 test(`U0001f603`, '\U0001f603');
3440 test(`&quot;`, '"');
// Unit test: escapeSequence() error paths -- each malformed escape must
// produce the exact expected diagnostic, recovery value and scan length.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces, the
// ErrorSinkTest field declarations (expected/gotError) and `va_list ap;`
// were dropped from this view.
3447 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
// Sink that asserts each reported error matches the expected message.
3449 static class ErrorSinkTest
: ErrorSinkNull
3455 import core
.stdc
.stdio
;
3456 import core
.stdc
.stdarg
;
3461 void error(const ref Loc loc
, const(char)* format
, ...)
3464 char[100] buffer
= void;
3466 va_start(ap
, format
);
3467 auto actual
= buffer
[0 .. vsnprintf(buffer
.ptr
, buffer
.length
, format
, ap
)];
3469 assert(expected
== actual
);
3473 ErrorSinkTest errorSink
= new ErrorSinkTest
;
// Helper: lex `sequence`, require an error, check value and chars consumed.
3475 void test(string sequence
, string expectedError
, dchar expectedReturnValue
, uint expectedScanLength
, bool Ccompile
= false)
3477 errorSink
.expected
= expectedError
;
3478 errorSink
.gotError
= false;
3479 auto p
= cast(const(char)*)sequence
.ptr
;
3480 Lexer lexer
= new Lexer(errorSink
);
3482 auto actualReturnValue
= lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
, c2
);
3483 assert(errorSink
.gotError
);
3484 assert(expectedReturnValue
== actualReturnValue
);
3486 auto actualScanLength
= p
- sequence
.ptr
;
3487 assert(expectedScanLength
== actualScanLength
);
3490 test("c", `undefined escape sequence \c`, 'c', 1);
3491 test("!", `undefined escape sequence \!`, '!', 1);
3492 test("&quot;", `undefined escape sequence \&`, '&', 1, true);
3494 test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
3496 test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2);
3497 test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3);
3498 test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
3500 test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2);
3501 test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3);
3502 test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4);
3503 test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5);
3504 test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6);
3505 test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7);
3506 test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
3508 test("ud800" , `invalid UTF character \U0000d800`, '?', 5);
3509 test("udfff" , `invalid UTF character \U0000dfff`, '?', 5);
3510 test("U00110000", `invalid UTF character \U00110000`, '?', 9);
3512 test("xg0" , `undefined escape hex sequence \xg`, 'g', 2);
3513 test("ug000" , `undefined escape hex sequence \ug`, 'g', 2);
3514 test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
3516 test("&BAD;", `unnamed character entity &BAD;` , '?', 5);
3517 test("&quot", `unterminated named entity &quot;`, '?', 5);
3518 test("&quot", `unterminated named entity &quot;`, '?', 5);
3520 test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
// Unit test: smoke-test the full Lexer on the source "int" -- one int32
// token, then endOfFile forever after.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces and
// the `TOK tok;` declaration were dropped from this view.
3525 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3526 /* Not much here, just trying things out.
3528 string text
= "int"; // We rely on the implicit null-terminator
3529 ErrorSink errorSink
= new ErrorSinkStderr
;
3530 scope Lexer lex1
= new Lexer(null, text
.ptr
, 0, text
.length
, false, false, errorSink
, null);
3532 tok
= lex1
.nextToken();
3533 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
3534 assert(tok
== TOK
.int32
);
// Once exhausted, nextToken() keeps returning endOfFile (idempotent).
3535 tok
= lex1
.nextToken();
3536 assert(tok
== TOK
.endOfFile
);
3537 tok
= lex1
.nextToken();
3538 assert(tok
== TOK
.endOfFile
);
3539 tok
= lex1
.nextToken();
3540 assert(tok
== TOK
.endOfFile
);
3545 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3547 // We don't want to see Lexer error output during these tests.
3548 ErrorSink errorSink
= new ErrorSinkNull
;
3550 // Test malformed input: even malformed input should end in a TOK.endOfFile.
3551 static immutable char[][] testcases
=
3552 [ // Testcase must end with 0 or 0x1A.
3553 [0], // not malformed, but pathological
3556 ['{', '{', 'q', '{', 0],
3564 foreach (testcase
; testcases
)
3566 scope Lexer lex2
= new Lexer(null, testcase
.ptr
, 0, testcase
.length
-1, false, false, errorSink
, null);
3567 TOK tok
= lex2
.nextToken();
3568 size_t iterations
= 1;
3569 while ((tok
!= TOK
.endOfFile
) && (iterations
++ < testcase
.length
))
3571 tok
= lex2
.nextToken();
3573 assert(tok
== TOK
.endOfFile
);
3574 tok
= lex2
.nextToken();
3575 assert(tok
== TOK
.endOfFile
);