d: Merge upstream dmd, druntime f1a045928e
[official-gcc.git] / gcc / d / dmd / lexer.d
blob2c6a59505698777b1948851b51fedd2b5e757a62
1 /**
2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
14 module dmd.lexer;
16 import core.stdc.ctype;
17 import core.stdc.stdio;
18 import core.stdc.string;
20 import dmd.entity;
21 import dmd.errorsink;
22 import dmd.id;
23 import dmd.identifier;
24 import dmd.location;
25 import dmd.root.array;
26 import dmd.root.ctfloat;
27 import dmd.common.outbuffer;
28 import dmd.root.port;
29 import dmd.root.rmem;
30 import dmd.root.utf;
31 import dmd.tokens;
// The attribute label below applies nothrow to the declarations that follow it.
33 nothrow:
// When building with the DMDLIB version identifier, LocOffset is defined as well.
35 version (DMDLIB)
37 version = LocOffset;
40 /***********************************************************
41 * Values to use for various magic identifiers
// The five leading fields hold the values the lexer substitutes for the magic
// identifier named in each trailing comment; the three bools are option flags.
43 struct CompileEnv
45 uint versionNumber; /// __VERSION__
46 const(char)[] date; /// __DATE__
47 const(char)[] time; /// __TIME__
48 const(char)[] vendor; /// __VENDOR__
49 const(char)[] timestamp; /// __TIMESTAMP__
51 bool previewIn; /// `in` means `[ref] scope const`, accepts rvalues
52 bool ddocOutput; /// collect embedded documentation comments
53 bool masm; /// use MASM inline asm syntax
56 /***********************************************************
// Lexer state: the public cursor and current token, the ImportC type sizes,
// the error sink and environment, followed by the private buffer bounds and flags.
58 class Lexer
// Scratch buffer shared by every Lexer instance (declared __gshared); the string
// literal lexers reset it with setsize(0) before collecting characters.
60 private __gshared OutBuffer stringbuffer;
62 Loc scanloc; // for error messages
63 Loc prevloc; // location of token before current
65 const(char)* p; // current character
67 Token token;
69 // For ImportC
70 bool Ccompile; /// true if compiling ImportC
72 // The following are valid only if (Ccompile == true)
73 ubyte boolsize; /// size of a C _Bool, default 1
74 ubyte shortsize; /// size of a C short, default 2
75 ubyte intsize; /// size of a C int, default 4
76 ubyte longsize; /// size of C long, 4 or 8
77 ubyte long_longsize; /// size of a C long long, default 8
78 ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
79 ubyte wchar_tsize; /// size of C wchar_t, 2 or 4
81 ErrorSink eSink; /// send error messages through this interface
82 CompileEnv compileEnv; /// environment
84 private
86 const(char)* base; // pointer to start of buffer
87 const(char)* end; // pointer to last element of buffer
88 const(char)* line; // start of current line
90 bool doDocComment; // collect doc comment information
91 bool anyToken; // seen at least one token
92 bool commentToken; // comments are TOK.comment's
93 bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's
95 bool whitespaceToken; // tokenize whitespaces (only for DMDLIB)
97 int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
98 int lastDocLine; // last line of previous doc comment
// Head of a singly linked list, chained through Token.next, of tokens handed back
// by releaseToken and reused by allocateToken.
100 Token* tokenFreelist;
103 nothrow:
/*********************
 * Constructs a Lexer over the source text base[begoffset .. endoffset + 1].
 *
 * The byte at base[endoffset] has to be a terminator: either null (0) or
 * EOF (0x1A). If the text starts with `#!`, that entire first line is skipped.
 *
 * Params:
 *  filename = file name reported in error messages
 *  base = source text, terminated by a null (0) or EOF (0x1A) character
 *  begoffset = offset into base[] where lexing starts
 *  endoffset = last offset of base[] that may be read
 *  doDocComment = handle documentation comments
 *  commentToken = comments are returned as TOK.comment tokens
 *  errorSink = receiver of error messages, must not be null
 *  compileEnv = version, vendor, date, time, etc.; when null, the version
 *               number defaults to 1 and the vendor to "DLF"
 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken,
    ErrorSink errorSink,
    const CompileEnv* compileEnv) scope
{
    scanloc = Loc(filename, 1, 1);
    token = Token.init;

    // Buffer bounds and the starting cursor position.
    this.base = base;
    this.end = base + endoffset;
    p = base + begoffset;
    line = p;

    // Lexing options and per-run state.
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    this.eSink = errorSink;
    assert(errorSink);

    if (compileEnv is null)
    {
        this.compileEnv.versionNumber = 1;
        this.compileEnv.vendor = "DLF";
    }
    else
    {
        this.compileEnv = *compileEnv;
    }

    // If the first line starts with '#!', ignore the line.
    if (p !is null && p[0] == '#' && p[1] == '!')
    {
        p += 2;
        for (;;)
        {
            const char ch = *p;
            if (ch == 0 || ch == 0x1A)
                break;          // leave the cursor on the terminator
            p++;
            if (ch == '\n')
                break;          // newline consumed, the shebang line is over
            // Every other byte is skipped without validation. Malformed UTF-8
            // is allowed on the shebang line: it could have a meaning if the
            // native system encoding is not Unicode (see test
            // compilable/test13512.d, encoded in KOI-8). Bidirectional control
            // characters are allowed too. The shebang line is not executed, so
            // it cannot be used to conceal code; it is up to the shell to
            // sanitize it.
        }
        endOfLine();
    }
}
/***********************
 * Second entry point, intended for DMDLIB: identical to the primary
 * constructor, plus a `whitespaceToken` switch that is stored after the
 * primary constructor has run.
 */
this(
    const(char)* filename,
    const(char)* base,
    size_t begoffset,
    size_t endoffset,
    bool doDocComment,
    bool commentToken,
    bool whitespaceToken,
    ErrorSink errorSink,
    const CompileEnv* compileEnv = null)
{
    // Delegate all common setup, then record the extra flag.
    this(filename, base, begoffset, endoffset,
         doDocComment, commentToken, errorSink, compileEnv);
    this.whitespaceToken = whitespaceToken;
}
/******************
 * Builds a mock Lexer for unittests: only the error sink is set.
 * Params:
 *  errorSink = receiver of error messages, must not be null
 */
this(ErrorSink errorSink) scope @safe
{
    assert(errorSink);
    this.eSink = errorSink;
}
/**************************************
 * Re-targets the lexer at a buffer of #define lines.
 *
 * The cursor and line start are moved to the beginning of `slice`, newline
 * tokenization is switched on, and the scan location restarts at line 1,
 * column 1 of the pseudo file "#defines".
 * Params:
 *  slice = text to lex; the character just past its end must be 0
 */
final void resetDefineLines(const(char)[] slice)
{
    const(char)* first = slice.ptr;
    const(char)* last = first + slice.length;

    base = first;
    end = last;
    assert(*end == 0);

    p = first;
    line = first;

    tokenizeNewlines = true;
    inTokenStringConstant = 0;
    lastDocLine = 0;
    scanloc = Loc("#defines", 1, 1);
}
/**********************************
 * Prepares for lexing the next #define line by switching newline
 * tokenization back on. `p` is expected to be at the start of that line.
 */
final void nextDefineLine()
{
    this.tokenizeNewlines = true;
}
/***************
 * Range interface
 */

/// Returns: true once the current token is TOK.endOfFile.
final bool empty() const pure @property @nogc @safe
{
    const TOK current = token.value;
    return current == TOK.endOfFile;
}
/// Returns: the kind of the current token.
final TOK front() const pure @property @nogc @safe
{
    const TOK current = token.value;
    return current;
}
/// Advances to the next token; the returned token kind is not used.
final void popFront()
{
    cast(void) nextToken();
}
/// Returns: a `Token` taken from the freelist when one is available (with its
/// `next` link cleared), otherwise a newly allocated `Token`.
Token* allocateToken() pure nothrow @safe
{
    Token* recycled = tokenFreelist;
    if (recycled is null)
        return new Token();

    // Unlink the head of the freelist and hand it out.
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
/// Returns the given token to the freelist by pushing it on the front.
/// When `mem.isGCEnabled` is set, the token is reset to `Token.init` first.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }

    // Push on the front of the freelist.
    token.next = tokenFreelist;
    tokenFreelist = token;
}
/// Advances to the next token and returns its kind.
///
/// The location of the token being left is saved in `prevloc`. If a token was
/// already scanned ahead (linked through `token.next`), it is copied into
/// `token` and released to the freelist; otherwise a new token is scanned.
final TOK nextToken()
{
    prevloc = token.loc;

    Token* queued = token.next;
    if (queued is null)
    {
        scan(&token);
    }
    else
    {
        memcpy(&token, queued, Token.sizeof);
        releaseToken(queued);
    }
    return token.value;
}
/***********************
 * Returns: the kind of the token that follows the current one, without
 * consuming anything.
 */
final TOK peekNext()
{
    Token* ahead = peek(&token);
    return ahead.value;
}
/***********************
 * Returns: the kind of the token two positions after the current one,
 * without consuming anything.
 */
final TOK peekNext2()
{
    Token* first = peek(&token);
    Token* second = peek(first);
    return second.value;
}
297 /****************************
298 * Turn next token in buffer into a token.
299 * Params:
300 * t = the token to set the resulting Token to
// Overview: starting at p, fills in t and returns once a token is recognized.
// White space, newlines and comments make the loop go around again, unless
// tokenizeNewlines, commentToken or (under DMDLIB) whitespaceToken request
// that they be returned as tokens.
302 final void scan(Token* t)
// Line number on entry; compared with the start line of a comment when
// getDocComment is called further down.
304 const lastLine = scanloc.linnum;
305 Loc startLoc;
306 t.blockComment = null;
307 t.lineComment = null;
309 while (1)
// Each iteration records where the candidate token starts.
311 t.ptr = p;
312 //printf("p = %p, *p = '%c'\n",p,*p);
313 t.loc = loc();
314 switch (*p)
316 case 0:
317 case 0x1A:
318 t.value = TOK.endOfFile; // end of file
319 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
320 return;
321 case ' ':
322 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
323 while ((cast(size_t)p) % uint.sizeof)
325 if (*p != ' ')
326 goto LendSkipFourSpaces;
327 p++;
329 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
330 p += 4;
331 // Skip over any remaining space on the line.
332 while (*p == ' ')
333 p++;
334 LendSkipFourSpaces:
335 version (DMDLIB)
337 if (whitespaceToken)
339 t.value = TOK.whitespace;
340 return;
343 continue; // skip white space
344 case '\t':
345 case '\v':
346 case '\f':
347 p++;
348 version (DMDLIB)
350 if (whitespaceToken)
352 t.value = TOK.whitespace;
353 return;
356 continue; // skip white space
357 case '\r':
358 p++;
359 if (*p != '\n') // if CR stands by itself
361 endOfLine();
362 if (tokenizeNewlines)
364 t.value = TOK.endOfLine;
365 tokenizeNewlines = false;
366 return;
369 version (DMDLIB)
371 if (whitespaceToken)
373 t.value = TOK.whitespace;
374 return;
377 continue; // skip white space
378 case '\n':
379 p++;
380 endOfLine();
// A newline becomes a token only while tokenizeNewlines is set, and the flag
// is cleared as soon as one TOK.endOfLine has been produced.
381 if (tokenizeNewlines)
383 t.value = TOK.endOfLine;
384 tokenizeNewlines = false;
385 return;
387 version (DMDLIB)
389 if (whitespaceToken)
391 t.value = TOK.whitespace;
392 return;
395 continue; // skip white space
397 case '\\':
// Only when Ccompile is set: a backslash directly before CR or LF is dropped
// here and the line break itself is handled by the next loop iteration.
398 if (Ccompile && (p[1] == '\r' || p[1] == '\n'))
400 ++p; // ignore \ followed by new line, like VC does
401 continue;
403 goto default;
405 case '0':
// Fast path for a lone 0; anything longer is handed to number().
406 if (!isZeroSecond(p[1])) // if numeric literal does not continue
408 ++p;
409 t.unsvalue = 0;
410 t.value = TOK.int32Literal;
411 return;
413 goto Lnumber;
415 case '1': .. case '9':
// Fast path for a single decimal digit; anything longer is handed to number().
416 if (!isDigitSecond(p[1])) // if numeric literal does not continue
418 t.unsvalue = *p - '0';
419 ++p;
420 t.value = TOK.int32Literal;
421 return;
423 Lnumber:
424 t.value = number(t);
425 return;
427 case '\'':
428 if (issinglechar(p[1]) && p[2] == '\'')
430 t.unsvalue = p[1]; // simple one character literal
431 t.value = TOK.charLiteral;
432 p += 3;
434 else if (Ccompile)
436 clexerCharConstant(*t, 0);
438 else
440 t.value = charConstant(t);
442 return;
444 case 'u':
445 case 'U':
446 case 'L':
// In D mode these letters simply start an identifier. With Ccompile set they
// may instead prefix a wide character constant or a wide / UTF-8 string literal.
447 if (!Ccompile)
448 goto case_ident;
449 if (p[1] == '\'') // C wide character constant
451 char c = *p;
// NOTE(review): for an L prefix this picks u when wchar_tsize is 4 and U otherwise,
// while the string literal branch just below maps wchar_tsize 2 to the w postfix.
// The two mappings look opposite to each other; confirm against clexerCharConstant.
452 if (c == 'L') // convert L to u or U
453 c = (wchar_tsize == 4) ? 'u' : 'U';
454 ++p;
455 clexerCharConstant(*t, c);
456 return;
458 else if (p[1] == '\"') // C wide string literal
460 const c = *p;
461 ++p;
462 escapeStringConstant(t);
463 t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
464 c == 'u' ? 'w' :
465 'd';
466 return;
468 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
470 p += 2;
471 escapeStringConstant(t);
472 return;
474 goto case_ident;
476 case 'r':
// An r immediately followed by a double quote starts a wysiwyg string (D only);
// otherwise it starts an identifier.
477 if (Ccompile || p[1] != '"')
478 goto case_ident;
479 p++;
480 goto case '`';
481 case '`':
482 if (Ccompile)
483 goto default;
484 wysiwygStringConstant(t);
485 return;
486 case 'x':
// An x immediately followed by a double quote starts a hex string.
487 if (p[1] != '"')
488 goto case_ident;
489 p++;
490 t.value = hexStringConstant(t);
491 return;
492 case 'q':
// A q followed by a double quote starts a delimited string, a q followed by an
// opening curly brace starts a token string (D only).
493 if (Ccompile)
494 goto case_ident;
495 if (p[1] == '"')
497 p++;
498 delimitedStringConstant(t);
499 return;
501 else if (p[1] == '{')
503 p++;
504 tokenStringConstant(t);
505 return;
507 else
508 goto case_ident;
509 case '"':
510 escapeStringConstant(t);
511 return;
512 case 'a':
513 case 'b':
514 case 'c':
515 case 'd':
516 case 'e':
517 case 'f':
518 case 'g':
519 case 'h':
520 case 'i':
521 case 'j':
522 case 'k':
523 case 'l':
524 case 'm':
525 case 'n':
526 case 'o':
527 case 'p':
528 /*case 'q': case 'r':*/
529 case 's':
530 case 't':
531 //case 'u':
532 case 'v':
533 case 'w':
534 /*case 'x':*/
535 case 'y':
536 case 'z':
537 case 'A':
538 case 'B':
539 case 'C':
540 case 'D':
541 case 'E':
542 case 'F':
543 case 'G':
544 case 'H':
545 case 'I':
546 case 'J':
547 case 'K':
548 //case 'L':
549 case 'M':
550 case 'N':
551 case 'O':
552 case 'P':
553 case 'Q':
554 case 'R':
555 case 'S':
556 case 'T':
557 //case 'U':
558 case 'V':
559 case 'W':
560 case 'X':
561 case 'Y':
562 case 'Z':
563 case '_':
564 case_ident:
// Identifier or keyword: consume identifier characters; a byte with the high
// bit set is decoded and accepted only when isUniAlpha says so, otherwise an
// error is reported and p is put back to the start of that sequence.
566 while (1)
568 const c = *++p;
569 if (isidchar(c))
570 continue;
571 else if (c & 0x80)
573 const s = p;
574 const u = decodeUTF();
575 if (isUniAlpha(u))
576 continue;
577 error(t.loc, "char 0x%04x not allowed in identifier", u);
578 p = s;
580 break;
// Intern the spelling; the token kind comes from the value stored with the identifier.
582 Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
583 t.ident = id;
584 t.value = cast(TOK)id.getValue();
586 anyToken = 1;
588 /* Different keywords for C and D
590 if (Ccompile)
592 if (t.value != TOK.identifier)
594 t.value = Ckeywords[t.value]; // filter out D keywords
597 else if (t.value >= FirstCKeyword)
598 t.value = TOK.identifier; // filter out C keywords
600 else if (*t.ptr == '_') // if special identifier token
// Local helper: turns the token into a string literal holding s.
602 void toToken(const(char)[] s)
604 t.value = TOK.string_;
605 t.ustring = s.ptr;
606 t.len = cast(uint)s.length;
607 t.postfix = 0;
610 if (id == Id.DATE)
611 toToken(compileEnv.date);
612 else if (id == Id.TIME)
613 toToken(compileEnv.time);
614 else if (id == Id.VENDOR)
615 toToken(compileEnv.vendor);
616 else if (id == Id.TIMESTAMP)
617 toToken(compileEnv.timestamp);
618 else if (id == Id.VERSIONX)
620 t.value = TOK.int64Literal;
621 t.unsvalue = compileEnv.versionNumber;
623 else if (id == Id.EOFX)
625 t.value = TOK.endOfFile;
626 // Advance scanner to end of file
627 while (!(*p == 0 || *p == 0x1A))
628 p++;
631 //printf("t.value = %d\n",t.value);
632 return;
634 case '/':
// A slash starts a divide-assign operator, one of three comment forms, or a
// plain divide operator.
635 p++;
636 switch (*p)
638 case '=':
639 p++;
640 t.value = TOK.divAssign;
641 return;
642 case '*':
643 p++;
644 startLoc = loc();
645 while (1)
647 while (1)
649 const c = *p;
650 switch (c)
652 case '/':
653 break;
654 case '\n':
655 endOfLine();
656 p++;
657 continue;
658 case '\r':
659 p++;
660 if (*p != '\n')
661 endOfLine();
662 continue;
663 case 0:
664 case 0x1A:
665 error(t.loc, "unterminated /* */ comment");
666 p = end;
667 t.loc = loc();
668 t.value = TOK.endOfFile;
669 return;
670 default:
671 if (c & 0x80)
673 const u = decodeUTF();
674 if (u == PS || u == LS)
675 endOfLine();
677 p++;
678 continue;
680 break;
682 p++;
// The slash just consumed closes the comment only when the character before it
// is a star, and that star is not the one belonging to the opening pair.
683 if (p[-2] == '*' && p - 3 != t.ptr)
684 break;
686 if (commentToken)
688 t.loc = startLoc;
689 t.value = TOK.comment;
690 return;
692 else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
694 // if /** but not /**/
695 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
696 lastDocLine = scanloc.linnum;
698 continue;
699 case '/': // do // style comments
700 startLoc = loc();
701 while (1)
703 const c = *++p;
704 switch (c)
706 case '\n':
707 break;
708 case '\r':
709 if (p[1] == '\n')
710 p++;
711 break;
712 case 0:
713 case 0x1A:
// The line comment runs into the end of the buffer: return it as a comment
// token if requested, otherwise collect any doc comment and report end of file.
714 if (commentToken)
716 p = end;
717 t.loc = startLoc;
718 t.value = TOK.comment;
719 return;
721 if (doDocComment && t.ptr[2] == '/')
723 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
724 lastDocLine = scanloc.linnum;
726 p = end;
727 t.loc = loc();
728 t.value = TOK.endOfFile;
729 return;
730 default:
731 if (c & 0x80)
733 const u = decodeUTF();
734 if (u == PS || u == LS)
735 break;
737 continue;
739 break;
741 if (commentToken)
// Under DMDLIB the line terminator is left unconsumed here; otherwise it is
// consumed and the line bookkeeping is updated before the comment token is returned.
743 version (DMDLIB) {}
744 else
746 p++;
747 endOfLine();
749 t.loc = startLoc;
750 t.value = TOK.comment;
751 return;
753 if (doDocComment && t.ptr[2] == '/')
755 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
756 lastDocLine = scanloc.linnum;
758 p++;
759 endOfLine();
760 continue;
761 case '+':
// Nesting comments exist only in D mode; with Ccompile set the slash falls
// through to the plain divide operator below.
762 if (!Ccompile)
764 int nest;
765 startLoc = loc();
766 p++;
767 nest = 1;
768 while (1)
770 char c = *p;
771 switch (c)
773 case '/':
774 p++;
775 if (*p == '+')
777 p++;
778 nest++;
780 continue;
781 case '+':
782 p++;
783 if (*p == '/')
785 p++;
786 if (--nest == 0)
787 break;
789 continue;
790 case '\r':
791 p++;
792 if (*p != '\n')
793 endOfLine();
794 continue;
795 case '\n':
796 endOfLine();
797 p++;
798 continue;
799 case 0:
800 case 0x1A:
801 error(t.loc, "unterminated /+ +/ comment");
802 p = end;
803 t.loc = loc();
804 t.value = TOK.endOfFile;
805 return;
806 default:
807 if (c & 0x80)
809 uint u = decodeUTF();
810 if (u == PS || u == LS)
811 endOfLine();
813 p++;
814 continue;
816 break;
818 if (commentToken)
820 t.loc = startLoc;
821 t.value = TOK.comment;
822 return;
824 if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
826 // if /++ but not /++/
827 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
828 lastDocLine = scanloc.linnum;
830 continue;
832 break;
833 default:
834 break;
836 t.value = TOK.div;
837 return;
838 case '.':
839 p++;
840 if (isdigit(*p))
842 /* Note that we don't allow ._1 and ._ as being
843 * valid floating point numbers.
845 p--;
846 t.value = inreal(t);
848 else if (p[0] == '.')
850 if (p[1] == '.')
852 p += 2;
853 t.value = TOK.dotDotDot;
855 else
857 p++;
858 t.value = TOK.slice;
861 else
862 t.value = TOK.dot;
863 return;
// The operator cases below all follow one pattern: consume the first character,
// then look at the next one or two characters to pick the longest operator.
864 case '&':
865 p++;
866 if (*p == '=')
868 p++;
869 t.value = TOK.andAssign;
871 else if (*p == '&')
873 p++;
874 t.value = TOK.andAnd;
876 else
877 t.value = TOK.and;
878 return;
879 case '|':
880 p++;
881 if (*p == '=')
883 p++;
884 t.value = TOK.orAssign;
886 else if (*p == '|')
888 p++;
889 t.value = TOK.orOr;
891 else
892 t.value = TOK.or;
893 return;
894 case '-':
895 p++;
896 if (*p == '=')
898 p++;
899 t.value = TOK.minAssign;
901 else if (*p == '-')
903 p++;
904 t.value = TOK.minusMinus;
906 else if (*p == '>')
908 ++p;
909 t.value = TOK.arrow;
911 else
912 t.value = TOK.min;
913 return;
914 case '+':
915 p++;
916 if (*p == '=')
918 p++;
919 t.value = TOK.addAssign;
921 else if (*p == '+')
923 p++;
924 t.value = TOK.plusPlus;
926 else
927 t.value = TOK.add;
928 return;
929 case '<':
930 p++;
931 if (*p == '=')
933 p++;
934 t.value = TOK.lessOrEqual; // <=
936 else if (*p == '<')
938 p++;
939 if (*p == '=')
941 p++;
942 t.value = TOK.leftShiftAssign; // <<=
944 else
945 t.value = TOK.leftShift; // <<
947 else if (*p == ':' && Ccompile)
949 ++p;
950 t.value = TOK.leftBracket; // <:
952 else if (*p == '%' && Ccompile)
954 ++p;
955 t.value = TOK.leftCurly; // <%
957 else
958 t.value = TOK.lessThan; // <
959 return;
960 case '>':
961 p++;
962 if (*p == '=')
964 p++;
965 t.value = TOK.greaterOrEqual; // >=
967 else if (*p == '>')
969 p++;
970 if (*p == '=')
972 p++;
973 t.value = TOK.rightShiftAssign; // >>=
975 else if (*p == '>')
977 p++;
978 if (*p == '=')
980 p++;
981 t.value = TOK.unsignedRightShiftAssign; // >>>=
983 else
984 t.value = TOK.unsignedRightShift; // >>>
986 else
987 t.value = TOK.rightShift; // >>
989 else
990 t.value = TOK.greaterThan; // >
991 return;
992 case '!':
993 p++;
994 if (*p == '=')
996 p++;
997 t.value = TOK.notEqual; // !=
999 else
1000 t.value = TOK.not; // !
1001 return;
1002 case '=':
1003 p++;
1004 if (*p == '=')
1006 p++;
1007 t.value = TOK.equal; // ==
1009 else if (*p == '>')
1011 p++;
1012 t.value = TOK.goesTo; // =>
1014 else
1015 t.value = TOK.assign; // =
1016 return;
1017 case '~':
1018 p++;
1019 if (*p == '=')
1021 p++;
1022 t.value = TOK.concatenateAssign; // ~=
1024 else
1025 t.value = TOK.tilde; // ~
1026 return;
1027 case '^':
1028 p++;
1029 if (*p == '^')
1031 p++;
1032 if (*p == '=')
1034 p++;
1035 t.value = TOK.powAssign; // ^^=
1037 else
1038 t.value = TOK.pow; // ^^
1040 else if (*p == '=')
1042 p++;
1043 t.value = TOK.xorAssign; // ^=
1045 else
1046 t.value = TOK.xor; // ^
1047 return;
1048 case '(':
1049 p++;
1050 t.value = TOK.leftParenthesis;
1051 return;
1052 case ')':
1053 p++;
1054 t.value = TOK.rightParenthesis;
1055 return;
1056 case '[':
1057 p++;
1058 t.value = TOK.leftBracket;
1059 return;
1060 case ']':
1061 p++;
1062 t.value = TOK.rightBracket;
1063 return;
1064 case '{':
1065 p++;
1066 t.value = TOK.leftCurly;
1067 return;
1068 case '}':
1069 p++;
1070 t.value = TOK.rightCurly;
1071 return;
1072 case '?':
1073 p++;
1074 t.value = TOK.question;
1075 return;
1076 case ',':
1077 p++;
1078 t.value = TOK.comma;
1079 return;
1080 case ';':
1081 p++;
1082 t.value = TOK.semicolon;
1083 return;
1084 case ':':
1085 p++;
1086 if (*p == ':')
1088 ++p;
1089 t.value = TOK.colonColon;
1091 else if (*p == '>' && Ccompile)
1093 ++p;
1094 t.value = TOK.rightBracket;
1096 else
1097 t.value = TOK.colon;
1098 return;
1099 case '$':
1100 p++;
1101 t.value = TOK.dollar;
1102 return;
1103 case '@':
1104 p++;
1105 t.value = TOK.at;
1106 return;
1107 case '*':
1108 p++;
1109 if (*p == '=')
1111 p++;
1112 t.value = TOK.mulAssign;
1114 else
1115 t.value = TOK.mul;
1116 return;
1117 case '%':
1118 p++;
1119 if (*p == '=')
1121 p++;
1122 t.value = TOK.modAssign;
1124 else if (*p == '>' && Ccompile)
1126 ++p;
1127 t.value = TOK.rightCurly;
1129 else if (*p == ':' && Ccompile)
1131 goto case '#'; // %: means #
1133 else
1134 t.value = TOK.mod;
1135 return;
1136 case '#':
1138 // https://issues.dlang.org/show_bug.cgi?id=22825
1139 // Special token sequences are terminated by newlines,
1140 // and should not be skipped over.
1141 this.tokenizeNewlines = true;
1142 p++;
// When parseSpecialTokenSequence reports true, scanning simply resumes;
// otherwise the character is returned as a TOK.pound token.
1143 if (parseSpecialTokenSequence())
1144 continue;
1145 t.value = TOK.pound;
1146 return;
1148 default:
// Anything else: a non-ASCII sequence may start an identifier or be a Unicode
// line or paragraph separator; every other character is reported as an error
// and skipped.
1150 dchar c = *p;
1151 if (c & 0x80)
1153 c = decodeUTF();
1154 // Check for start of unicode identifier
1155 if (isUniAlpha(c))
1156 goto case_ident;
1157 if (c == PS || c == LS)
1159 endOfLine();
1160 p++;
1161 if (tokenizeNewlines)
1163 t.value = TOK.endOfLine;
1164 tokenizeNewlines = false;
1165 return;
1167 continue;
1170 if (c < 0x80 && isprint(c))
1171 error(t.loc, "character '%c' is not a valid token", c);
1172 else
1173 error(t.loc, "character 0x%02x is not a valid token", c);
1174 p++;
1175 continue;
1176 // assert(0);
/// Returns the token that follows `ct`. If it was not scanned yet, a token is
/// allocated, scanned, and linked to `ct` through `ct.next`, so repeated calls
/// return the same token.
final Token* peek(Token* ct)
{
    if (Token* cached = ct.next)
        return cached;

    Token* fresh = allocateToken();
    scan(fresh);
    ct.next = fresh;
    return fresh;
}
/*********************************
 * tk is on the opening (.
 * Looks ahead and returns the token that is past the matching closing ).
 *
 * The look-ahead also stops, returning the token it stopped on, at end of
 * file, at a `}` that has no matching `{` inside the parentheses, and at a
 * `;` that is not enclosed in curly braces.
 */
final Token* peekPastParen(Token* tk)
{
    int openParens = 1;
    int openCurlies = 0;
    for (;;)
    {
        tk = peek(tk);
        const TOK kind = tk.value;

        if (kind == TOK.leftParenthesis)
        {
            ++openParens;
        }
        else if (kind == TOK.rightParenthesis)
        {
            // The matching parenthesis: answer with the token after it.
            if (--openParens == 0)
                return peek(tk);
        }
        else if (kind == TOK.leftCurly)
        {
            ++openCurlies;
        }
        else if (kind == TOK.rightCurly)
        {
            if (--openCurlies < 0)
                return tk;
        }
        else if (kind == TOK.semicolon)
        {
            if (openCurlies == 0)
                return tk;
        }
        else if (kind == TOK.endOfFile)
        {
            return tk;
        }
    }
}
/*******************************************
 * Parses the escape sequence at the current position `p`, reporting errors
 * at the location of the current token and advancing `p` past the sequence.
 * Params:
 *  c2 = receives the second `dchar` of a two-code-unit html entity
 * Returns:
 *  the escape sequence as a single character
 */
private uint escapeSequence(out dchar c2)
{
    const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile, c2);
    return decoded;
}
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 *
 * The end-of-file sentinel (0 or 0x1A) is never consumed: when the sequence
 * is cut short by the end of the buffer, `sequence` is left pointing at the
 * sentinel.
 *
 * Params:
 *  loc = location to use for error messages
 *  sequence = pointer to string with escape sequence to parse. Updated to
 *             point past the end of the escape sequence
 *  Ccompile = true for compile C11 escape sequences
 *  c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
 * Returns:
 *  the escape sequence as a single character
 */
private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;

    uint c = *p;
    int ndigits;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // Exactly ndigits hex digits are required.
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            // Skip the offending character, but never step over the
            // end-of-file sentinel: doing so would move the cursor past the
            // end of the buffer. This matches the `case 0: case 0x1A:` branch
            // below, which also leaves the cursor on the sentinel.
            if (c != 0 && c != 0x1A)
                p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
                c = entity[0];
                if (entity == entity.init)
                {
                    error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                if (entity[1] != entity.init[1])
                    c2 = entity[1];

                p++;
                break;
            default:
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // Up to three octal digits.
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
1426 Lex a wysiwyg string. `p` must be pointing to the first character before the
1427 contents of the string literal. The character pointed to by `p` will be used as
1428 the terminating character (i.e. backtick or double-quote).
1429 Params:
1430 result = pointer to the token that accepts the result
// The literal text is collected in the shared stringbuffer. Line breaks are
// stored as a single newline character, whether written as LF, CR or CR LF.
1432 private void wysiwygStringConstant(Token* result)
1434 result.value = TOK.string_;
1435 Loc start = loc();
1436 auto terminator = p[0];
1437 p++;
1438 stringbuffer.setsize(0);
1439 while (1)
1441 dchar c = p[0];
1442 p++;
1443 switch (c)
1445 case '\n':
1446 endOfLine();
1447 break;
1448 case '\r':
// A CR directly before LF is dropped; the LF is handled on the next iteration.
1449 if (p[0] == '\n')
1450 continue; // ignore
1451 c = '\n'; // treat EndOfLine as \n character
1452 endOfLine();
1453 break;
1454 case 0:
1455 case 0x1A:
1456 error("unterminated string constant starting at %s", start.toChars());
1457 result.setString();
1458 // rewind `p` so it points to the EOF character
1459 p--;
1460 return;
1461 default:
1462 if (c == terminator)
1464 result.setString(stringbuffer);
1465 stringPostfix(result);
1466 return;
1468 else if (c & 0x80)
// Step back onto the lead byte for decodeUTF, then advance once more afterwards.
1470 p--;
1471 const u = decodeUTF();
1472 p++;
1473 if (u == PS || u == LS)
1474 endOfLine();
1475 stringbuffer.writeUTF8(u);
1476 continue;
1478 break;
// Reached for every character that left the switch through break.
1480 stringbuffer.writeByte(c);
1484 /**************************************
1485 * Lex hex strings:
1486 * x"0A ae 34FE BD"
// On entry p is on the opening double quote. Bytes are collected in the shared
// stringbuffer; the returned token kind is always TOK.hexadecimalString.
1488 final TOK hexStringConstant(Token* t)
1490 Loc start = loc();
// n counts the hex digits seen so far; while n is odd, v holds the pending high nibble.
1491 uint n = 0;
1492 uint v = ~0; // dead assignment, needed to suppress warning
1493 p++;
1494 stringbuffer.setsize(0);
1495 while (1)
1497 dchar c = *p++;
1498 switch (c)
1500 case ' ':
1501 case '\t':
1502 case '\v':
1503 case '\f':
1504 continue; // skip white space
1505 case '\r':
1506 if (*p == '\n')
1507 continue; // ignore '\r' if followed by '\n'
1508 // Treat isolated '\r' as if it were a '\n'
1509 goto case '\n';
1510 case '\n':
1511 endOfLine();
1512 continue;
1513 case 0:
1514 case 0x1A:
1515 error("unterminated string constant starting at %s", start.toChars());
1516 t.setString();
1517 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1518 p--;
1519 return TOK.hexadecimalString;
1520 case '"':
1521 if (n & 1)
// An unpaired trailing digit is reported and then written out as a byte of its own.
1523 error("odd number (%d) of hex characters in hex string", n);
1524 stringbuffer.writeByte(v);
1526 t.setString(stringbuffer);
1527 t.postfix = 'h';
1528 stringPostfix(t);
1529 return TOK.hexadecimalString;
1530 default:
1531 if (c >= '0' && c <= '9')
1532 c -= '0';
1533 else if (c >= 'a' && c <= 'f')
1534 c -= 'a' - 10;
1535 else if (c >= 'A' && c <= 'F')
1536 c -= 'A' - 10;
1537 else if (c & 0x80)
1539 p--;
1540 const u = decodeUTF();
1541 p++;
1542 if (u == PS || u == LS)
1543 endOfLine();
1544 else
1545 error("non-hex character \\u%04x in hex string", u);
1547 else
1548 error("non-hex character '%c' in hex string", c);
// NOTE(review): the two error branches above do not leave the switch, so the
// unconverted value of c is still folded into v and counted in n below.
1549 if (n & 1)
1551 v = (v << 4) | c;
1552 stringbuffer.writeByte(v);
1554 else
1555 v = c;
1556 n++;
1557 break;
1560 assert(0); // see bug 15731
1564 Lex a delimited string. Some examples of delimited strings are:
1566 q"(foo(xxx))" // "foo(xxx)"
1567 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1568 q"/foo]/" // "foo]"
1569 q"HERE
1571 HERE" // "foo\n"
1573 It is assumed that `p` points to the opening double-quote '"'.
1574 Params:
1575 result = pointer to the token that accepts the result
// The first character after the quote selects the mode: one of the four opening
// brackets gives a nesting delimiter pair, an identifier start gives a heredoc,
// and any other character is used as its own closing delimiter.
1577 private void delimitedStringConstant(Token* result)
1579 result.value = TOK.string_;
1580 Loc start = loc();
1581 dchar delimleft = 0;
1582 dchar delimright = 0;
// nest stays 1 only for the bracket delimiters, in which case nestcount tracks
// how many delimiter pairs are currently open.
1583 uint nest = 1;
1584 uint nestcount = ~0; // dead assignment, needed to suppress warning
// hereid is the heredoc identifier, if any; blankrol is set while the rest of
// the line after that identifier is expected to be blank; startline is set
// while no character of the current line has been stored yet.
1585 Identifier hereid = null;
1586 uint blankrol = 0;
1587 uint startline = 0;
1588 p++;
1589 stringbuffer.setsize(0);
1590 while (1)
1592 const s = p;
1593 dchar c = *p++;
1594 //printf("c = '%c'\n", c);
1595 switch (c)
1597 case '\n':
1598 Lnextline:
1599 endOfLine();
1600 startline = 1;
1601 if (blankrol)
1603 blankrol = 0;
1604 continue;
1606 if (hereid)
1608 stringbuffer.writeUTF8(c);
1609 continue;
1611 break;
1612 case '\r':
1613 if (*p == '\n')
1614 continue; // ignore
1615 c = '\n'; // treat EndOfLine as \n character
1616 goto Lnextline;
1617 case 0:
1618 case 0x1A:
1619 error("unterminated delimited string constant starting at %s", start.toChars());
1620 result.setString();
1621 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1622 p--;
1623 return;
1624 default:
1625 if (c & 0x80)
1627 p--;
1628 c = decodeUTF();
1629 p++;
1630 if (c == PS || c == LS)
1631 goto Lnextline;
1633 break;
// First character of the literal: work out the delimiter mode.
1635 if (delimleft == 0)
1637 delimleft = c;
1638 nest = 1;
1639 nestcount = 1;
1640 if (c == '(')
1641 delimright = ')';
1642 else if (c == '{')
1643 delimright = '}';
1644 else if (c == '[')
1645 delimright = ']';
1646 else if (c == '<')
1647 delimright = '>';
1648 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1650 // Start of identifier; must be a heredoc
1651 Token tok;
1652 p = s;
1653 scan(&tok); // read in heredoc identifier
1654 if (tok.value != TOK.identifier)
1656 error("identifier expected for heredoc, not %s", tok.toChars());
1657 delimright = c;
1659 else
1661 hereid = tok.ident;
1662 //printf("hereid = '%s'\n", hereid.toChars());
1663 blankrol = 1;
1665 nest = 0;
1667 else
1669 delimright = c;
1670 nest = 0;
1671 if (isspace(c))
1672 error("delimiter cannot be whitespace");
1675 else
1677 if (blankrol)
1679 error("heredoc rest of line should be blank");
1680 blankrol = 0;
1681 continue;
1683 if (nest == 1)
1685 if (c == delimleft)
1686 nestcount++;
1687 else if (c == delimright)
1689 nestcount--;
1690 if (nestcount == 0)
1691 goto Ldone;
1694 else if (c == delimright)
1695 goto Ldone;
// At the start of a line inside a heredoc, an identifier equal to hereid ends
// the literal; otherwise p is restored and the character is stored as text.
1696 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
1698 Token tok;
1699 auto psave = p;
1700 p = s;
1701 scan(&tok); // read in possible heredoc identifier
1702 //printf("endid = '%s'\n", tok.ident.toChars());
1703 if (tok.value == TOK.identifier && tok.ident is hereid)
1705 /* should check that rest of line is blank
1707 goto Ldone;
1709 p = psave;
1711 stringbuffer.writeUTF8(c);
1712 startline = 0;
1715 Ldone:
// The closing delimiter must be followed directly by a double quote.
1716 if (*p == '"')
1717 p++;
1718 else if (hereid)
1719 error("delimited string must end in `%s\"`", hereid.toChars());
1720 else if (isspace(delimright))
1721 error("delimited string must end in `\"`");
1722 else
// NOTE(review): this call passes token.loc, while the sibling error calls in
// this function pass no location; confirm the difference is intended.
1723 error(token.loc, "delimited string must end in `%c\"`", delimright);
1724 result.setString(stringbuffer);
1725 stringPostfix(result);
/**
Lex a token string. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
It is assumed that `p` points to the opening curly-brace.
The string value is the raw source text between the outermost braces;
`inTokenStringConstant` is incremented for the duration of the scan.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    const start = loc();
    const pstart = ++p;     // first character after the opening `{`
    uint depth = 1;         // number of currently open curly braces
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;
    for (;;)
    {
        Token tok;
        scan(&tok);
        if (tok.value == TOK.leftCurly)
        {
            depth++;
        }
        else if (tok.value == TOK.rightCurly)
        {
            if (--depth == 0)
            {
                // `p` is just past the closing `}`; exclude that brace from the text
                result.setString(pstart, p - 1 - pstart);
                stringPostfix(result);
                return;
            }
        }
        else if (tok.value == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", start.toChars());
            result.setString();
            return;
        }
    }
}
/**
Scan a quoted string while building the processed string value by
handling escape sequences. The result is returned in the given `t` token.
This function assumes that `p` currently points to the opening quote
of the string. The string ends at the next unescaped occurrence of that
same quote character.
Params:
    t = the token to set the resulting string to
 * References:
 *   D https://dlang.org/spec/lex.html#double_quoted_strings
 *   ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const start = loc();
    const quote = *p++; // opening quote
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar c = *p++;
        dchar second;   // second code point, written only for `\&` escapes
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case '&':
                if (Ccompile)
                    goto default;

                c = escapeSequence(second);
                stringbuffer.writeUTF8(c);
                if (second != dchar.init)
                    stringbuffer.writeUTF8(second);
                continue;
            case 'u':
            case 'U':
                c = escapeSequence(second);
                stringbuffer.writeUTF8(c);
                continue;
            default:
                // Any other escape is stored as a single byte below
                c = escapeSequence(second);
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            if (c != quote)
                goto default;
            // Matching closing quote: finish the token
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                // Multi-byte UTF-8 sequence
                p--;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                p++;
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
/**************************************
 * Lex a D character literal. `p` is on the opening `'` on entry.
 * On success `t.unsvalue` holds the character value; on any error an
 * error is reported and `t.unsvalue` is set to `'?'`.
 * Params:
 *  t = token whose `unsvalue` receives the character
 * Returns:
 *  `TOK.charLiteral`, `TOK.wcharLiteral` or `TOK.dcharLiteral`
 * Reference:
 *  https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK kind = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    p++;
    dchar c = *p++;
    dchar second;   // set by escapeSequence() when an entity expands to two code points
    switch (c)
    {
    case '\\':
        switch (*p)
        {
        case 'u':
            kind = TOK.wcharLiteral;
            goto default;
        case 'U':
        case '&':
            kind = TOK.dcharLiteral;
            goto default;
        default:
            t.unsvalue = escapeSequence(second);
            if (second != second.init)
            {
                error("html entity requires 2 code units, use a string instead of a character");
                t.unsvalue = '?';
            }
            break;
        }
        break;
    case '\n':
    L1:
        endOfLine();
        goto case;
    case '\r':
        goto case '\'';
    case 0:
    case 0x1A:
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        goto case;
    case '\'':
        error("unterminated character constant");
        t.unsvalue = '?';
        return kind;
    default:
        if (c & 0x80)
        {
            // Multi-byte UTF-8 sequence
            p--;
            c = decodeUTF();
            p++;
            if (c == LS || c == PS)
                goto L1;
            if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                kind = TOK.wcharLiteral;
            else
                kind = TOK.dcharLiteral;
        }
        t.unsvalue = c;
        break;
    }

    if (*p == '\'')
    {
        p++;
        return kind;
    }

    // No closing quote right after the character: skip ahead to a plausible
    // end of the literal so that only one error is reported for it.
    while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
           *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
    {
        if (*p & 0x80)
        {
            const seqStart = p;
            c = decodeUTF();
            if (c == LS || c == PS)
            {
                p = seqStart;
                break;
            }
        }
        p++;
    }

    if (*p == '\'')
    {
        error("character constant has multiple characters");
        p++;
    }
    else
        error("unterminated character constant");
    t.unsvalue = '?';
    return kind;
}
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 * The constant is scanned with `escapeStringConstant` and the resulting
 * bytes are then converted to a value according to `prefix`.
 * Params:
 *      t = token to fill in
 *      prefix = one of `u`, `U` or 0.
 * Reference:
 *      C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
        case 0:
            if (n == 1) // fast case
            {
                u = str[0];
            }
            else if (n > 4)
                error(loc, "max number of chars in character literal is 4, had %d",
                    cast(int)n);
            else
            {
                /* Multi-character constant: the first character ends up in the
                 * most significant used byte, the last in the least significant.
                 * Computed arithmetically so the value does not depend on the
                 * byte order of the host the compiler runs on.
                 */
                foreach (c; str)
                    u = (u << 8) | c;
            }
            break;

        case 'u':
            dchar d1;
            size_t idx;
            // Decode every code point; the last one decoded is the value used
            while (idx < n)
            {
                string msg = utf_decodeChar(str, idx, d1);
                if (msg)
                    error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            }
            if (d1 >= 0x1_0000)
                error(loc, "x%x does not fit in 16 bits", d1);
            t.unsvalue = d1;
            t.value = TOK.wcharLiteral; // C11 6.4.4.4-9
            return;

        case 'U':
            dchar d;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d);
            if (msg)
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                    cast(int)((n + 3) >> 2));
            t.unsvalue = d;
            t.value = TOK.dcharLiteral; // C11 6.4.4.4-9
            return;

        default:
            assert(0);
    }
    // Unprefixed constant: a single character is a char literal, several form an int
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
/***************************************
 * Get postfix of string literal.
 * If `*p` is one of `c`, `w` or `d` it is stored in `t.postfix` and
 * consumed; otherwise `t.postfix` is set to 0 and `p` is left unchanged.
 * Params:
 *  t = string token whose `postfix` field is set
 */
private void stringPostfix(Token* t) pure @nogc
{
    const ch = *p;
    if (ch == 'c' || ch == 'w' || ch == 'd')
    {
        t.postfix = ch;
        p++;
    }
    else
    {
        t.postfix = 0;
    }
}
/**************************************
 * Read in a number.
 * Integers can be decimal, octal, binary or hex; the value is stored in
 * `t.unsvalue`. Handles the suffixes U, UL, LU, L, etc.
 * If the text turns out to be a floating point literal, scanning restarts
 * from the beginning of the number in `inreal`. In ImportC mode the
 * integer suffix is handled by `cnumber`.
 * Params:
 *  t = token to fill in
 * Returns:
 *  the kind of literal that was lexed
 */
private TOK number(Token* t)
{
    int base = 10;
    const start = p;
    ulong n = 0; // unsigned >=64 bit integer type
    int digit;
    bool hadError = false;
    bool overflow = false;
    bool anyBinaryDigitsNoSingleUS = false;
    bool anyHexDigitsNoSingleUS = false;
    char errorDigit = 0;    // first digit that is invalid for `base`, or 0
    dchar c = *p;
    if (c == '0')
    {
        // A leading 0 selects the base from the character that follows it
        ++p;
        c = *p;
        switch (c)
        {
        case '0': .. case '7':
            base = 8;
            break;

        case '8':
        case '9':
            errorDigit = cast(char) c;
            base = 8;
            break;
        case 'x':
        case 'X':
            ++p;
            base = 16;
            break;
        case 'b':
        case 'B':
            ++p;
            base = 2;
            break;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
            {
                if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal;  // if `0.f` or `0.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            goto Lreal; // '.' is part of current token
        case 'i':
        case 'f':
        case 'F':
            goto Lreal;
        case '_':
            if (Ccompile)
                error("embedded `_` not allowed");
            ++p;
            base = 8;
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            break;
        default:
            break;
        }
    }
    for (;;)
    {
        c = *p;
        switch (c)
        {
        case '0': .. case '9':
            ++p;
            digit = c - '0';
            break;
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            ++p;
            if (base != 16)
            {
                // Outside hex, e/E/f/F mark a floating point literal
                if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                    goto Lreal;
            }
            if (c >= 'a')
                digit = c + 10 - 'a';
            else
                digit = c + 10 - 'A';
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            goto Ldone;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
            {
                if (Ccompile && base == 10 &&
                    (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal;  // if `1.e6` or `1.f` or `1.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                goto Ldone; // if ".identifier" or ".unicode"
            if (base == 2)
                goto Ldone; // if ".identifier" or ".unicode"
            goto Lreal; // otherwise as part of a floating point literal

        case 'i':
            if (Ccompile)
                goto Ldone;
            goto Lreal;

        case 'p':
        case 'P':
        Lreal:
            // Rewind to the start of the number and lex it as floating point
            p = start;
            return inreal(t);
        case '_':
            if (Ccompile)
                goto default;
            ++p;
            continue;
        default:
            goto Ldone;
        }
        // got a digit here, set any necessary flags, check for errors
        anyHexDigitsNoSingleUS = true;
        anyBinaryDigitsNoSingleUS = true;
        if (!errorDigit && digit >= base)
        {
            errorDigit = cast(char) c;
        }
        // Avoid expensive overflow check if we aren't at risk of overflow
        if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
            n = n * base + digit;
        else
        {
            import core.checkedint : mulu, addu;

            n = mulu(n, base, overflow);
            n = addu(n, digit, overflow);
        }
    }
Ldone:
    if (errorDigit)
    {
        error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                         base == 8 ? "octal".ptr :
                                                         "decimal".ptr, errorDigit);
        hadError = true;
    }
    if (overflow && !hadError)
    {
        error("integer overflow");
        hadError = true;
    }
    // `0b` or `0x` with no digits after the prefix
    if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
        (base == 16 && !anyHexDigitsNoSingleUS))
        error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

    t.unsvalue = n;

    if (Ccompile)
        return cnumber(base, n);

    enum FLAGS : int
    {
        none = 0,
        decimal = 1, // decimal
        unsigned = 2, // u or U suffix
        long_ = 4, // L suffix
    }

    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
    // Parse trailing 'u', 'U', 'l' or 'L' in any combination
    const psuffix = p;
    for (;;)
    {
        FLAGS suffixFlag;
        switch (*p)
        {
        case 'U':
        case 'u':
            suffixFlag = FLAGS.unsigned;
            goto L1;
        case 'l':
            suffixFlag = FLAGS.long_;
            error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
            goto L1;
        case 'L':
            suffixFlag = FLAGS.long_;
        L1:
            p++;
            if ((flags & suffixFlag) && !hadError)
            {
                error("repeated integer suffix `%c`", p[-1]);
                hadError = true;
            }
            flags = cast(FLAGS)(flags | suffixFlag);
            continue;
        default:
            break;
        }
        break;
    }
    if (base == 8 && n >= 8)
    {
        if (hadError)
            // can't translate invalid octal value, just show a generic message
            error("octal literals larger than 7 are no longer supported");
        else
            error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
    }
    TOK result;
    switch (flags)
    {
    case FLAGS.none:
        /* Octal or Hexadecimal constant.
         * First that fits: int, uint, long, ulong
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else if (n & 0xFFFFFFFF00000000L)
            result = TOK.int64Literal;
        else if (n & 0x80000000)
            result = TOK.uns32Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        if (n & 0x8000000000000000L)
        {
            result = TOK.uns64Literal;
        }
        else if (n & 0xFFFFFFFF80000000L)
            result = TOK.int64Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: uint, ulong
         */
        if (n & 0xFFFFFFFF00000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.uns32Literal;
        break;
    case FLAGS.decimal | FLAGS.long_:
        if (n & 0x8000000000000000L)
        {
            if (!hadError)
            {
                error("signed integer overflow");
                hadError = true;
            }
            result = TOK.uns64Literal;
        }
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.long_:
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        result = TOK.uns64Literal;
        break;
    default:
        debug
        {
            printf("%x\n", flags);
        }
        assert(0);
    }
    return result;
}
/**************************************
 * Lex C integer-suffix
 * `p` is on the first character after the digits; recognized suffix
 * characters are consumed.
 * Params:
 *      base = number base
 *      n = raw integer value
 * Returns:
 *      token value
 */
private TOK cnumber(int base, ulong n)
{
    /* C11 6.4.4.1
     * Parse trailing suffixes:
     *   u or U
     *   l or L
     *   ll or LL
     */
    enum FLAGS : uint
    {
        octalhex = 1, // octal or hexadecimal
        decimal = 2, // decimal
        unsigned = 4, // u or U suffix
        long_ = 8, // l or L suffix
        llong = 0x10, // ll or LL

        // Microsoft extensions
        i8 = 0x20,
        i16 = 0x40,
        i32 = 0x80,
        i64 = 0x100,
    }
    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
    bool hadError;
Lsuffixes:
    for (;;)
    {
        FLAGS suffixFlag;
        const cs = *p;
        switch (cs)
        {
            case 'U':
            case 'u':
                suffixFlag = FLAGS.unsigned;
                break;

            case 'l':
            case 'L':
                suffixFlag = FLAGS.long_;
                if (cs == p[1])
                {
                    // `ll` or `LL` (both letters in the same case)
                    suffixFlag = FLAGS.long_ | FLAGS.llong;
                    ++p;
                }
                break;

            case 'i':
            case 'I':
                if (p[1] == '8')
                {
                    suffixFlag = FLAGS.i8;
                    ++p;
                }
                else if (p[1] == '1' && p[2] == '6')
                {
                    suffixFlag = FLAGS.i16;
                    p += 2;
                }
                else if (p[1] == '3' && p[2] == '2')
                {
                    suffixFlag = FLAGS.i32;
                    p += 2;
                }
                else if (p[1] == '6' && p[2] == '4')
                {
                    suffixFlag = FLAGS.i64;
                    p += 2;
                }
                else
                    break Lsuffixes;
                // A further digit after the width is not a valid suffix
                if (p[1] >= '0' && p[1] <= '9' && !hadError)
                {
                    error("invalid integer suffix");
                    hadError = true;
                }
                break;

            default:
                break Lsuffixes;
        }
        ++p;
        if ((flags & suffixFlag) && !hadError)
        {
            error("duplicate integer suffixes");
            hadError = true;
        }
        flags = cast(FLAGS)(flags | suffixFlag);
    }

    TOK result = TOK.int32Literal;     // default
    switch (flags)
    {
        /* Since D doesn't have a variable sized `long` or `unsigned long` type,
         * this code deviates from C by picking D int, uint, long, or ulong instead
         */

        case FLAGS.octalhex:
            /* Octal or Hexadecimal constant.
             * First that fits: int, unsigned, long, unsigned long,
             * long long, unsigned long long
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;      // unsigned long
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;      // long
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;

        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;      // unsigned long
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;      // long
            else
                result = TOK.int32Literal;
            break;

        case FLAGS.octalhex | FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: unsigned, unsigned long, unsigned long long
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;      // unsigned long
            else
                result = TOK.uns32Literal;
            break;

        case FLAGS.decimal | FLAGS.long_:
            /* First that fits: long, long long
             */
            if (longsize == 4 || long_longsize == 4)
            {
                if (n & 0xFFFFFFFF_80000000L)
                    result = TOK.int64Literal;
                else
                    result = TOK.int32Literal;  // long
            }
            else
            {
                result = TOK.int64Literal;      // long
            }
            break;

        case FLAGS.octalhex | FLAGS.long_:
            /* First that fits: long, unsigned long, long long,
             * unsigned long long
             */
            if (longsize == 4 || long_longsize == 4)
            {
                if (n & 0x8000000000000000L)
                    result = TOK.uns64Literal;
                else if (n & 0xFFFFFFFF00000000L)
                    result = TOK.int64Literal;
                else if (n & 0x80000000)
                    result = TOK.uns32Literal;      // unsigned long
                else
                    result = TOK.int32Literal;      // long
            }
            else
            {
                if (n & 0x80000000_00000000L)
                    result = TOK.uns64Literal;      // unsigned long
                else
                    result = TOK.int64Literal;      // long
            }
            break;

        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
            /* First that fits: unsigned long, unsigned long long
             */
            if (longsize == 4 || long_longsize == 4)
            {
                if (n & 0xFFFFFFFF00000000L)
                    result = TOK.uns64Literal;
                else
                    result = TOK.uns32Literal;      // unsigned long
            }
            else
            {
                result = TOK.uns64Literal;  // unsigned long
            }
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
            /* First that fits: long long, unsigned long long
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;

        case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
            /* long long
             */
            result = TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
        case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
            result = TOK.uns64Literal;
            break;

        case FLAGS.octalhex | FLAGS.i8:
        case FLAGS.octalhex | FLAGS.i16:
        case FLAGS.octalhex | FLAGS.i32:
        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i8:
        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i16:
        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i32:
        case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i8:
        case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i16:
        case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i32:
            result = TOK.uns32Literal;
            break;

        case FLAGS.decimal | FLAGS.i8:
        case FLAGS.decimal | FLAGS.i16:
        case FLAGS.decimal | FLAGS.i32:
            result = TOK.int32Literal;
            break;

        case FLAGS.octalhex | FLAGS.i64:
        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i64:
        case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i64:
            result = TOK.uns64Literal;
            break;

        case FLAGS.decimal | FLAGS.i64:
            result = TOK.int64Literal;
            break;

        default:
            debug printf("%x\n",flags);
            assert(0);
    }
    return result;
}
/**************************************
 * Read in characters, converting them to real.
 * `p` is on the first character of the literal (a digit or `.`). The
 * literal text, with `_` separators removed, is copied to `stringbuffer`
 * and parsed with `CTFloat.parse`; the value is stored in `t.floatvalue`.
 * Params:
 *  t = token to fill in
 * Returns:
 *  one of the float or imaginary literal token kinds
 * Bugs:
 *  Exponent overflow not detected.
 *  Too much requested precision is not detected.
 */
private TOK inreal(Token* t)
{
    //printf("Lexer::inreal()\n");
    debug
    {
        assert(*p == '.' || isdigit(*p));
    }
    bool isWellformedString = true;
    stringbuffer.setsize(0);
    auto pstart = p;
    bool hex = false;
    dchar c = *p++;
    // Leading '0x'
    if (c == '0')
    {
        c = *p++;
        if (c == 'x' || c == 'X')
        {
            hex = true;
            c = *p++;
        }
    }
    // Digits to left of '.'
    while (1)
    {
        if (c == '.')
        {
            c = *p++;
            break;
        }
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    // Digits to right of '.'
    while (1)
    {
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
    {
        c = *p++;
        if (c == '-' || c == '+')
        {
            c = *p++;
        }
        bool anyexp = false;
        while (1)
        {
            if (isdigit(c))
            {
                anyexp = true;
                c = *p++;
                continue;
            }
            if (c == '_')
            {
                if (Ccompile)
                    error("embedded `_` in numeric literals not allowed");
                c = *p++;
                continue;
            }
            if (!anyexp)
            {
                error("missing exponent");
                isWellformedString = false;
            }
            break;
        }
    }
    else if (hex)
    {
        error("exponent required for hex float");
        isWellformedString = false;
    }
    // `c` was read one past the literal text; step back onto it
    --p;
    // Copy the literal into stringbuffer without the `_` separators
    while (pstart < p)
    {
        if (*pstart != '_')
            stringbuffer.writeByte(*pstart);
        ++pstart;
    }
    stringbuffer.writeByte(0);
    auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
    TOK result;
    bool isOutOfRange = false;
    t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero);

    // In ImportC mode an `i` directly after the digits marks an imaginary literal
    bool imaginary = false;
    if (*p == 'i' && Ccompile)
    {
        ++p;
        imaginary = true;
    }

    switch (*p)
    {
    case 'F':
    case 'f':
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
        result = TOK.float32Literal;
        p++;
        break;
    default:
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
        result = TOK.float64Literal;
        break;
    case 'l':
        if (!Ccompile)
            error("use 'L' suffix instead of 'l'");
        goto case 'L';
    case 'L':
        ++p;
        if (Ccompile && long_doublesize == 8)
            goto default;
        result = TOK.float80Literal;
        break;
    }

    // In D the imaginary `i` follows the size suffix
    if ((*p == 'i' || *p == 'I') && !Ccompile)
    {
        if (*p == 'I')
            error("use 'i' suffix instead of 'I'");
        p++;
        imaginary = true;
    }

    if (imaginary)
    {
        switch (result)
        {
        case TOK.float32Literal:
            result = TOK.imaginary32Literal;
            break;
        case TOK.float64Literal:
            result = TOK.imaginary64Literal;
            break;
        case TOK.float80Literal:
            result = TOK.imaginary80Literal;
            break;
        default:
            break;
        }
    }
    const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
    if (isOutOfRange && !isLong && (!Ccompile || hex))
    {
        /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
         */
        /* `result` is a 32 or 64 bit literal here, real or imaginary. Select the
         * texts explicitly for all four kinds: a lookup table keyed only by the
         * non-imaginary kinds would fail on e.g. `1e999i`.
         */
        const isSingle = (result == TOK.float32Literal || result == TOK.imaginary32Literal);
        const char* suffix = isSingle ? (imaginary ? (Ccompile ? "if" : "fi") : "f")
                                      : (imaginary ? "i" : "");
        const char* type = isSingle ? (imaginary ? "`ifloat`" : "`float`")
                                    : (imaginary ? "`idouble`" : "`double`");
        error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type);
        const char* extra = isSingle ? "" : "`real` literals can be written using the `L` suffix. ";
        eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra);
    }
    debug
    {
        switch (result)
        {
        case TOK.float32Literal:
        case TOK.float64Literal:
        case TOK.float80Literal:
        case TOK.imaginary32Literal:
        case TOK.imaginary64Literal:
        case TOK.imaginary80Literal:
            break;
        default:
            assert(0);
        }
    }
    return result;
}
/// Updates `scanloc` with the column of `p` within the current line
/// (1-based, truncated to `ushort`) and, under `version (LocOffset)`,
/// the offset of `p` from `base`; returns the updated `scanloc`.
final Loc loc() @nogc
{
    const column = 1 + p - line;
    scanloc.charnum = cast(ushort) column;
    version (LocOffset)
    {
        scanloc.fileOffset = cast(uint)(p - base);
    }
    return scanloc;
}
/// Reports an error at the location of the current token (`token.loc`)
/// by forwarding the format string and arguments to `eSink`.
void error(T...)(const(char)* format, T args)
{
    eSink.error(token.loc, format, args);
}
/// Reports an error at the explicitly given location `loc`
/// by forwarding the format string and arguments to `eSink`.
void error(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.error(loc, format, args);
}
/// Reports a deprecation at the explicitly given location `loc`
/// by forwarding the format string and arguments to `eSink`.
void deprecation(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.deprecation(loc, format, args);
}
/// Reports a deprecation at the location of the current token (`token.loc`)
/// by forwarding the format string and arguments to `eSink`.
void deprecation(T...)(const(char)* format, T args)
{
    eSink.deprecation(token.loc, format, args);
}
/// Reports supplemental deprecation text at the location of the current
/// token (`token.loc`) by forwarding to `eSink`.
void deprecationSupplemental(T...)(const(char)* format, T args)
{
    eSink.deprecationSupplemental(token.loc, format, args);
}
/***************************************
 * Parse special token sequence:
 * Scans the token after the `#`. `#line` is handed to `poundLine`;
 * any other identifier, or `if`, is diagnosed.
 * Returns:
 *      true if the special token sequence was handled
 * References:
 *      https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token n;
    scan(&n);

    if (n.value == TOK.identifier && n.ident == Id.line)
    {
        poundLine(n, false);
        return true;
    }

    if (n.value == TOK.identifier)
    {
        const locx = loc();
        // @@@DEPRECATED_2.103@@@
        // Turn into an error in 2.113
        if (inTokenStringConstant)
            deprecation(locx, "token string requires valid D tokens, not `#%s`", n.ident.toChars());
        else
            error(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
    }
    else if (n.value == TOK.if_)
    {
        const locx = loc();
        if (inTokenStringConstant)
            error(locx, "token string requires valid D tokens, not `#if`");
        else
            error(locx, "C preprocessor directive `#if` is not supported, use `version` or `static if`");
    }
    return false;
}
/*********************************************
 * Parse line/file preprocessor directive:
 *    #line linnum [filespec]
 * Allow __LINE__ for linnum, and __FILE__ for filespec.
 * Accept linemarker format:
 *    # linnum [filespec] {flags}
 * There can be zero or more flags, which are one of the digits 1..4, and
 * must be in ascending order. The flags are ignored.
 * The new line number and file name are stored in `scanloc` when the end
 * of the directive line is reached, unless lexing inside a token string.
 * Params:
 *      tok = token we're on, which is linnum of linemarker
 *      linemarker = true if line marker format and lexer is on linnum
 * References:
 *      linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
 */
final void poundLine(ref Token tok, bool linemarker)
{
    auto linnum = this.scanloc.linnum;
    const(char)* filespec = null;
    bool flags;     // true once a linemarker flag has been seen

    if (!linemarker)
        scan(&tok);

    if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
    {
        // The value must survive a round trip through `int`
        const lin = cast(int)(tok.unsvalue);
        if (lin != tok.unsvalue)
        {
            error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
            skipToNextLine();
            return;
        }
        linnum = lin;
    }
    else if (tok.value != TOK.line) // TOK.line is `#line __LINE__`: keep the current line number
    {
        error(tok.loc, "positive integer argument expected following `#line`");
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
        return;
    }

    for (;;)
    {
        scan(&tok);
        switch (tok.value)
        {
        case TOK.endOfFile:
        case TOK.endOfLine:
            if (!inTokenStringConstant)
            {
                this.scanloc.linnum = linnum;
                if (filespec)
                    this.scanloc.filename = filespec;
            }
            return;
        case TOK.file:
            if (filespec || flags)
                goto Lerr;
            filespec = mem.xstrdup(scanloc.filename);
            continue;
        case TOK.string_:
            if (filespec || flags)
                goto Lerr;
            // Only a plain double-quoted string without postfix is accepted
            if (tok.ptr[0] != '"' || tok.postfix != 0)
                goto Lerr;
            filespec = tok.ustring;
            continue;
        case TOK.int32Literal:
            if (!filespec)
                goto Lerr;
            if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
            {
                flags = true;   // linemarker flags seen
                continue;
            }
            goto Lerr;
        default:
            goto Lerr;
        }
    }
Lerr:
    if (filespec is null)
        error(tok.loc, "invalid filename for `#line` directive");
    else if (linemarker)
        error(tok.loc, "invalid flag for line marker directive");
    else if (!Ccompile)
        error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
    if (tok.value != TOK.endOfLine)
        skipToNextLine();
}
/***************************************
 * Scan forward to start of next line.
 * Stops without advancing at end of file (0 or 0x1A). After a line
 * ending has been consumed, `endOfLine()` is called and
 * `tokenizeNewlines` is cleared.
 * Params:
 *    defines = send characters to `defines`
 */
final void skipToNextLine(OutBuffer* defines = null)
{
    for (;;)
    {
        switch (*p)
        {
        case 0:
        case 0x1A:
            return; // do not advance p

        case '\n':
            ++p;
            break;

        case '\r':
            ++p;
            if (p[0] == '\n')
                ++p;
            break;

        default:
            if (defines)
            {
                defines.writeByte(*p); // don't care about Unicode line endings for C
            }
            else if (*p & 0x80)
            {
                // A Unicode line or paragraph separator also ends the line
                const u = decodeUTF();
                if (u == PS || u == LS)
                {
                    ++p;
                    break;
                }
            }
            ++p;
            continue;
        }
        break;
    }
    endOfLine();
    tokenizeNewlines = false;
}
/********************************************
 * Decode UTF character.
 * Issue error messages for invalid sequences.
 * Return decoded character, advance p to last character in UTF sequence.
 */
private uint decodeUTF()
{
    string msg;
    const decoded = decodeUTFpure(msg);

    if (msg)
    {
        error(token.loc, "%.*s", cast(int)msg.length, msg.ptr);
    }
    return decoded;
}
/********************************************
 * Decode the UTF-8 sequence starting at `p` and advance `p` to the last
 * character of that sequence. Any error message is stored to the
 * msg parameter instead of being issued.
 * Params:
 *  msg = set to an error message for an invalid sequence or for a
 *        bidirectional control character; left null otherwise
 * Returns:
 *  the decoded character
 */
private pure uint decodeUTFpure(out string msg)
{
    const seq = p;
    assert(*seq & 0x80);
    // Check length of remaining string up to 4 UTF-8 characters
    size_t len = 1;
    while (len < 4 && seq[len])
        len++;

    size_t idx = 0;
    dchar u;
    msg = utf_decodeChar(seq[0 .. len], idx, u);
    // `idx` is the number of bytes consumed; leave `p` on the last of them
    p += idx - 1;
    if (!msg && isBidiControl(u))
        msg = "Bidirectional control characters are disallowed for security reasons.";
    return u;
}
/***************************************************
 * Parse doc comment embedded between t.ptr and p.
 * Remove trailing blanks and tabs from lines.
 * Replace all newlines with \n.
 * Remove leading comment character from each line.
 * Decide if it's a lineComment or a blockComment.
 * Append to previous one for this token.
 *
 * If newParagraph is true, an extra newline will be
 * added between adjoining doc comments.
 * Params:
 *  t = token whose `lineComment` or `blockComment` receives the text
 *  lineComment = non-zero if the comment may be a line comment
 *  newParagraph = passed on to `combineComments`
 */
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
    /* ct tells us which kind of comment it is: '/', '*', or '+'
     */
    const ct = t.ptr[2];
    /* Start of comment text skips over / * *, / + +, or / / /
     */
    const(char)* q = t.ptr + 3; // start of comment text
    const(char)* qend = p;
    if (ct == '*' || ct == '+')
        qend -= 2;  // exclude the closing two-character comment terminator
    /* Scan over initial row of ****'s or ++++'s or ////'s
     */
    while (q < qend && *q == ct)
        q++;
    /* Remove leading spaces until start of the comment
     */
    int linestart = 0;
    if (ct == '/')
    {
        while (q < qend && (*q == ' ' || *q == '\t'))
            ++q;
    }
    else if (q < qend)
    {
        // Block comment: skip one line ending right after the opening row
        if (*q == '\r')
        {
            ++q;
            if (q < qend && *q == '\n')
                ++q;
            linestart = 1;
        }
        else if (*q == '\n')
        {
            ++q;
            linestart = 1;
        }
    }
    /* Remove trailing row of ****'s or ++++'s
     */
    if (ct != '/')
    {
        while (q < qend && qend[-1] == ct)
            qend--;
    }
    /* Comment is now [q .. qend].
     * Canonicalize it into buf[].
     */
    OutBuffer buf;

    // Drops blanks and tabs from the end of what has been written to buf so far
    void trimTrailingWhitespace()
    {
        const written = buf[];
        auto len = written.length;
        while (len && (written[len - 1] == ' ' || written[len - 1] == '\t'))
            --len;
        buf.setsize(len);
    }

    for (; q < qend; q++)
    {
        char c = *q;
        switch (c)
        {
        case '*':
        case '+':
            if (linestart && c == ct)
            {
                // Leading comment character of a continuation line: drop it
                linestart = 0;
                /* Trim preceding whitespace up to preceding \n
                 */
                trimTrailingWhitespace();
                continue;
            }
            break;
        case ' ':
        case '\t':
            break;
        case '\r':
            if (q[1] == '\n')
                continue; // skip the \r
            goto Lnewline;
        default:
            if (c == 226)
            {
                // If LS or PS
                if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                {
                    q += 2;
                    goto Lnewline;
                }
            }
            linestart = 0;
            break;
        Lnewline:
            c = '\n'; // replace all newlines with \n
            goto case;
        case '\n':
            linestart = 1;
            /* Trim trailing whitespace
             */
            trimTrailingWhitespace();
            break;
        }
        buf.writeByte(c);
    }
    /* Trim trailing whitespace (if the last line does not have newline)
     */
    trimTrailingWhitespace();

    // Always end with a newline
    const text = buf[];
    if (text.length == 0 || text[$ - 1] != '\n')
        buf.writeByte('\n');

    // It's a line comment if the start of the doc comment comes
    // after other non-whitespace on the same line.
    auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
    // Combine with previous doc comment, if any
    if (*dc)
    {
        auto combined = combineComments(*dc, buf[], newParagraph);
        *dc = combined ? combined[0 .. strlen(combined)] : null;
    }
    else
        *dc = buf.extractSlice(true);
}
/********************************************
 * Combine two document comments into one,
 * separated by an extra newline if newParagraph is true.
 *
 * Params:
 *  c1 = first comment
 *  c2 = second comment
 *  newParagraph = if true, insert an extra '\n' between the two comments
 * Returns:
 *  `c2.ptr` if `c1` is null and `c1.ptr` if `c2` is null (no copy is made);
 *  otherwise a newly allocated, 0-terminated buffer holding `c1`,
 *  a '\n' if `c1` is non-empty and does not end with one,
 *  the optional paragraph '\n', then `c2`.
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    // Separator bytes to insert between the two comments
    const size_t lineBreak = (c1.length && c1[$ - 1] != '\n') ? 1 : 0;
    const size_t paragraphBreak = newParagraph ? 1 : 0;

    const total = c1.length + lineBreak + paragraphBreak + c2.length;
    auto dst = cast(char*)mem.xmalloc_noscan(total + 1); // +1 for the terminating 0

    size_t at = 0;
    dst[at .. at + c1.length] = c1[];
    at += c1.length;
    if (lineBreak)
        dst[at++] = '\n';
    if (paragraphBreak)
        dst[at++] = '\n';
    dst[at .. at + c2.length] = c2[];
    dst[total] = 0;
    return dst;
}
/**************************
 * Register the start of a new source line: remember `p` as the
 * beginning of the line and bump the line number in `scanloc`.
 * `p` should be at start of next line
 */
private void endOfLine() @nogc @safe
{
    line = p;
    scanloc.linnum = scanloc.linnum + 1;
}
/****************************
 * Print the tokens from the current `token` up to and including the
 * first end-of-file or end-of-line token, followed by a newline.
 * Tokens are reached with `peek`, so the parser is not advanced.
 * Useful for debugging.
 */
void printRestOfTokens()
{
    for (auto tk = &token; ; tk = peek(tk))
    {
        printf("%s ", (*tk).toChars());
        const atEnd = tk.value == TOK.endOfFile || tk.value == TOK.endOfLine;
        if (atEnd)
            break;
    }
    printf("\n");
}
3277 /******************************* Private *****************************************/
3279 private:
// Code points of the two Unicode line terminators
private enum : int
{
    LS = 0x2028, // Unicode line separator
    PS = 0x2029, // Unicode paragraph separator
}
/********************************************
 * Do our own char maps.
 *
 * `cmtable[c]` is the bitwise OR of the `CM*` flags that apply to byte `c`.
 * The table is filled in by a function literal evaluated at compile time.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        ubyte flags;

        if (c >= '0' && c <= '7')
            flags |= CMoctal;
        if (c_isxdigit(c))
            flags |= CMhex;
        if (c == '_' || c_isalnum(c))
            flags |= CMidchar;

        // NOTE(review): going by the flag names, these mark characters allowed
        // as the second character of a number starting with '0' (zerosecond)
        // or with another digit (digitsecond) - confirm against the number lexer.
        switch (c)
        {
            case 'x', 'X', 'b', 'B':
                flags |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e', 'E', 'f', 'F', 'l', 'L', 'p', 'P', 'u', 'U', 'i', '.', '_':
                flags |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: any 7-bit character except backslash, line breaks,
        // the two end-of-input markers (0 and 0x1A) and the single quote.
        const excluded = c == '\\' || c == '\n' || c == '\r'
                      || c == 0 || c == 0x1A || c == '\'';
        if (!excluded && !(c & 0x80))
            flags |= CMsinglechar;

        table[c] = flags;
    }
    return table;
}();
// Bit flags stored in the entries of `cmtable`
private
{
    enum CMoctal       = 1 << 0; // 0x01
    enum CMhex         = 1 << 1; // 0x02
    enum CMidchar      = 1 << 2; // 0x04
    enum CMzerosecond  = 1 << 3; // 0x08
    enum CMdigitsecond = 1 << 4; // 0x10
    enum CMsinglechar  = 1 << 5; // 0x20
}
/// Returns: true if `cmtable` has the `CMoctal` flag set for `c`
private bool isoctal(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMoctal);
}
/// Returns: true if `cmtable` has the `CMhex` flag set for `c`
private bool ishex(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMhex);
}
/// Returns: true if `cmtable` has the `CMidchar` flag set for `c`
private bool isidchar(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMidchar);
}
/// Returns: true if `cmtable` has the `CMzerosecond` flag set for `c`
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMzerosecond);
}
/// Returns: true if `cmtable` has the `CMdigitsecond` flag set for `c`
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMdigitsecond);
}
/// Returns: true if `cmtable` has the `CMsinglechar` flag set for `c`
private bool issinglechar(const char c) pure @nogc @safe
{
    return cast(bool)(cmtable[c] & CMsinglechar);
}
/// Returns: true if `c` is an ASCII hexadecimal digit (0-9, a-f, A-F)
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;
        default:
            return false;
    }
}
/// Returns: true if `c` is an ASCII letter or decimal digit
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;
        default:
            return false;
    }
}
3394 /******************************* Unittest *****************************************/
// Valid escape sequences: each must decode to the expected value and
// consume the whole input.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    // Decode `sequence` (the text following the backslash) and check both the
    // returned value and that the scan pointer stopped at the end of the input.
    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        const(char)* cursor = sequence.ptr;
        dchar second;
        auto lexer = new Lexer(errorSink);
        const decoded = lexer.escapeSequence(Loc.initial, cursor, Ccompile, second);
        assert(expected == decoded);
        assert(cursor == sequence.ptr + sequence.length);
    }

    // Simple character escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // Two-digit hex escapes
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // Octal escapes
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // Four- and eight-digit Unicode escapes
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
// Malformed escape sequences: each must report exactly the expected error,
// return the expected value and consume the expected number of characters.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // Error sink that formats each reported error and compares it
    // with the message the current test case expects.
    static class ErrorSinkTest : ErrorSinkNull
    {
        nothrow:
        extern (C++):
        override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] buffer = void;
            va_list ap;
            va_start(ap, format);
            const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
            va_end(ap);

            // vsnprintf returns a negative value on error and the untruncated
            // length when the output did not fit, so the result cannot be used
            // directly as a slice bound. Clamp it to what is in the buffer.
            size_t length = 0;
            if (written > 0)
            {
                length = cast(size_t) written;
                if (length >= buffer.length)
                    length = buffer.length - 1; // output was truncated
            }
            auto actual = buffer[0 .. length];
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    // Decode `sequence` (the text following the backslash) and check that an
    // error was reported, which value came back and how far the scan advanced.
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2;
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2);
        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
// Smoke test: lexing "int" yields TOK.int32 and then TOK.endOfFile
// on every further call.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    string text = "int"; // We rely on the implicit null-terminator
    ErrorSink errorSink = new ErrorSinkStderr;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, false, false, errorSink, null);

    TOK tok = lex1.nextToken();
    assert(tok == TOK.int32);

    // Once the end of the input is reached, the lexer must stay there
    foreach (_; 0 .. 3)
    {
        tok = lex1.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
// Malformed input: whatever the bytes are, lexing must reach TOK.endOfFile
// within a bounded number of calls and then stay there.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // We don't want to see Lexer error output during these tests.
    ErrorSink errorSink = new ErrorSinkNull;

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        // The terminator byte is not counted in the length given to the lexer
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, false, false, errorSink, null);
        TOK tok = lex2.nextToken();

        // Allow at most one nextToken() call per input byte so a lexer
        // that never reaches the end cannot hang the test.
        size_t calls = 1;
        while (tok != TOK.endOfFile && calls < testcase.length)
        {
            ++calls;
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);

        // A further call must still report end of file
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}