2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
16 import core
.stdc
.ctype
;
17 import core
.stdc
.stdio
;
18 import core
.stdc
.string
;
23 import dmd
.identifier
;
25 import dmd
.root
.array
;
26 import dmd
.root
.ctfloat
;
27 import dmd
.common
.outbuffer
;
40 /***********************************************************
41 * Values to use for various magic identifiers
45 uint versionNumber
; /// __VERSION__
46 const(char)[] date
; /// __DATE__
47 const(char)[] time
; /// __TIME__
48 const(char)[] vendor
; /// __VENDOR__
49 const(char)[] timestamp
; /// __TIMESTAMP__
51 bool previewIn
; /// `in` means `[ref] scope const`, accepts rvalues
52 bool ddocOutput
; /// collect embedded documentation comments
53 bool masm
; /// use MASM inline asm syntax
56 /***********************************************************
60 private __gshared OutBuffer stringbuffer
;
62 Loc scanloc
; // for error messages
63 Loc prevloc
; // location of token before current
65 const(char)* p
; // current character
70 bool Ccompile
; /// true if compiling ImportC
72 // The following are valid only if (Ccompile == true)
73 ubyte boolsize
; /// size of a C _Bool, default 1
74 ubyte shortsize
; /// size of a C short, default 2
75 ubyte intsize
; /// size of a C int, default 4
76 ubyte longsize
; /// size of C long, 4 or 8
77 ubyte long_longsize
; /// size of a C long long, default 8
78 ubyte long_doublesize
; /// size of C long double, 8 or D real.sizeof
79 ubyte wchar_tsize
; /// size of C wchar_t, 2 or 4
81 ErrorSink eSink
; /// send error messages through this interface
82 CompileEnv compileEnv
; /// environment
86 const(char)* base
; // pointer to start of buffer
87 const(char)* end
; // pointer to last element of buffer
88 const(char)* line
; // start of current line
90 bool doDocComment
; // collect doc comment information
91 bool anyToken
; // seen at least one token
92 bool commentToken
; // comments are TOK.comment's
93 bool tokenizeNewlines
; // newlines are turned into TOK.endOfLine's
95 bool whitespaceToken
; // tokenize whitespaces (only for DMDLIB)
97 int inTokenStringConstant
; // can be larger than 1 when in nested q{} strings
98 int lastDocLine
; // last line of previous doc comment
100 Token
* tokenFreelist
;
105 /*********************
106 * Creates a Lexer for the source code base[begoffset..endoffset+1].
107 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
110 * filename = used for error messages
111 * base = source code, must be terminated by a null (0) or EOF (0x1A) character
112 * begoffset = starting offset into base[]
113 * endoffset = the last offset to read into base[]
114 * doDocComment = handle documentation comments
115 * commentToken = comments become TOK.comment's
116 * errorSink = where error messages go, must not be null
117 * compileEnv = version, vendor, date, time, etc.
119 this(const(char)* filename
, const(char)* base
, size_t begoffset
,
120 size_t endoffset
, bool doDocComment
, bool commentToken
,
122 const CompileEnv
* compileEnv
) scope
124 scanloc
= Loc(filename
, 1, 1);
125 // debug printf("Lexer::Lexer(%p)\n", base);
126 // debug printf("lexer.filename = %s\n", filename);
129 this.end
= base
+ endoffset
;
130 p
= base
+ begoffset
;
132 this.doDocComment
= doDocComment
;
133 this.commentToken
= commentToken
;
134 this.tokenizeNewlines
= false;
135 this.inTokenStringConstant
= 0;
136 this.lastDocLine
= 0;
137 this.eSink
= errorSink
;
140 this.compileEnv
= *compileEnv
;
143 this.compileEnv
.versionNumber
= 1;
144 this.compileEnv
.vendor
= "DLF";
147 /* If first line starts with '#!', ignore the line
149 if (p
&& p
[0] == '#' && p
[1] == '!')
165 // Note: We do allow malformed UTF-8 on shebang line.
166 // It could have a meaning if the native system
167 // encoding is not Unicode. See test compilable/test13512.d
168 // for example encoded in KOI-8.
169 // We also allow bidirectional control characters.
170 // We do not execute the shebang line, so it can't be used
171 // to conceal code. It is up to the shell to sanitize it.
180 /***********************
181 * Alternative entry point for DMDLIB, adds `whitespaceToken`
183 this(const(char)* filename
, const(char)* base
, size_t begoffset
, size_t endoffset
,
184 bool doDocComment
, bool commentToken
, bool whitespaceToken
,
185 ErrorSink errorSink
, const CompileEnv
* compileEnv
= null
188 this(filename
, base
, begoffset
, endoffset
, doDocComment
, commentToken
, errorSink
, compileEnv
);
189 this.whitespaceToken
= whitespaceToken
;
193 * Used for unittests for a mock Lexer
195 this(ErrorSink errorSink
) scope @safe { assert(errorSink
); this.eSink
= errorSink
; }
197 /**************************************
198 * Reset lexer to lex #define's
200 final void resetDefineLines(const(char)[] slice
)
203 end
= base
+ slice
.length
;
207 tokenizeNewlines
= true;
208 inTokenStringConstant
= 0;
210 scanloc
= Loc("#defines", 1, 1);
213 /**********************************
214 * Set up for next #define line.
215 * p should be at start of next line.
217 final void nextDefineLine()
219 tokenizeNewlines
= true;
226 final bool empty() const pure @property @nogc @safe
228 return front() == TOK
.endOfFile
;
231 final TOK
front() const pure @property @nogc @safe
236 final void popFront()
241 /// Returns: a newly allocated `Token`.
242 Token
* allocateToken() pure nothrow @safe
246 Token
* t
= tokenFreelist
;
247 tokenFreelist
= t
.next
;
254 /// Frees the given token by returning it to the freelist.
255 private void releaseToken(Token
* token
) pure nothrow @nogc @safe
259 token
.next
= tokenFreelist
;
260 tokenFreelist
= token
;
263 final TOK
nextToken()
268 Token
* t
= token
.next
;
269 memcpy(&token
, t
, Token
.sizeof
);
276 //printf(token.toChars());
280 /***********************
281 * Look ahead at next token's value.
285 return peek(&token
).value
;
288 /***********************
289 * Look 2 tokens ahead at value.
291 final TOK
peekNext2()
293 Token
* t
= peek(&token
);
294 return peek(t
).value
;
297 /****************************
298 * Turn next token in buffer into a token.
300 * t = the token to set the resulting Token to
302 final void scan(Token
* t
)
304 const lastLine
= scanloc
.linnum
;
306 t
.blockComment
= null;
307 t
.lineComment
= null;
312 //printf("p = %p, *p = '%c'\n",p,*p);
318 t
.value
= TOK
.endOfFile
; // end of file
319 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
322 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
323 while ((cast(size_t
)p
) % uint.sizeof
)
326 goto LendSkipFourSpaces
;
329 while (*(cast(uint*)p
) == 0x20202020) // ' ' == 0x20
331 // Skip over any remaining space on the line.
339 t
.value
= TOK
.whitespace
;
343 continue; // skip white space
352 t
.value
= TOK
.whitespace
;
356 continue; // skip white space
359 if (*p
!= '\n') // if CR stands by itself
362 if (tokenizeNewlines
)
364 t
.value
= TOK
.endOfLine
;
365 tokenizeNewlines
= false;
373 t
.value
= TOK
.whitespace
;
377 continue; // skip white space
381 if (tokenizeNewlines
)
383 t
.value
= TOK
.endOfLine
;
384 tokenizeNewlines
= false;
391 t
.value
= TOK
.whitespace
;
395 continue; // skip white space
398 if (Ccompile
&& (p
[1] == '\r' || p
[1] == '\n'))
400 ++p
; // ignore \ followed by new line, like VC does
406 if (!isZeroSecond(p
[1])) // if numeric literal does not continue
410 t
.value
= TOK
.int32Literal
;
415 case '1': .. case '9':
416 if (!isDigitSecond(p
[1])) // if numeric literal does not continue
418 t
.unsvalue
= *p
- '0';
420 t
.value
= TOK
.int32Literal
;
428 if (issinglechar(p
[1]) && p
[2] == '\'')
430 t
.unsvalue
= p
[1]; // simple one character literal
431 t
.value
= TOK
.charLiteral
;
436 clexerCharConstant(*t
, 0);
440 t
.value
= charConstant(t
);
449 if (p
[1] == '\'') // C wide character constant
452 if (c
== 'L') // convert L to u or U
453 c
= (wchar_tsize
== 4) ?
'u' : 'U';
455 clexerCharConstant(*t
, c
);
458 else if (p
[1] == '\"') // C wide string literal
462 escapeStringConstant(t
);
463 t
.postfix
= c
== 'L' ?
(wchar_tsize
== 2 ?
'w' : 'd') :
468 else if (p
[1] == '8' && p
[2] == '\"') // C UTF-8 string literal
471 escapeStringConstant(t
);
477 if (Ccompile || p
[1] != '"')
484 wysiwygStringConstant(t
);
490 t
.value
= hexStringConstant(t
);
498 delimitedStringConstant(t
);
501 else if (p
[1] == '{')
504 tokenStringConstant(t
);
510 escapeStringConstant(t
);
528 /*case 'q': case 'r':*/
574 const u
= decodeUTF();
577 error(t
.loc
, "char 0x%04x not allowed in identifier", u
);
582 Identifier id
= Identifier
.idPool((cast(char*)t
.ptr
)[0 .. p
- t
.ptr
], false);
584 t
.value
= cast(TOK
)id
.getValue();
588 /* Different keywords for C and D
592 if (t
.value
!= TOK
.identifier
)
594 t
.value
= Ckeywords
[t
.value
]; // filter out D keywords
597 else if (t
.value
>= FirstCKeyword
)
598 t
.value
= TOK
.identifier
; // filter out C keywords
600 else if (*t
.ptr
== '_') // if special identifier token
602 void toToken(const(char)[] s
)
604 t
.value
= TOK
.string_
;
606 t
.len
= cast(uint)s
.length
;
611 toToken(compileEnv
.date
);
612 else if (id
== Id
.TIME
)
613 toToken(compileEnv
.time
);
614 else if (id
== Id
.VENDOR
)
615 toToken(compileEnv
.vendor
);
616 else if (id
== Id
.TIMESTAMP
)
617 toToken(compileEnv
.timestamp
);
618 else if (id
== Id
.VERSIONX
)
620 t
.value
= TOK
.int64Literal
;
621 t
.unsvalue
= compileEnv
.versionNumber
;
623 else if (id
== Id
.EOFX
)
625 t
.value
= TOK
.endOfFile
;
626 // Advance scanner to end of file
627 while (!(*p
== 0 ||
*p
== 0x1A))
631 //printf("t.value = %d\n",t.value);
640 t
.value
= TOK
.divAssign
;
665 error(t
.loc
, "unterminated /* */ comment");
668 t
.value
= TOK
.endOfFile
;
673 const u
= decodeUTF();
674 if (u
== PS || u
== LS
)
683 if (p
[-2] == '*' && p
- 3 != t
.ptr
)
689 t
.value
= TOK
.comment
;
692 else if (doDocComment
&& t
.ptr
[2] == '*' && p
- 4 != t
.ptr
)
694 // if /** but not /**/
695 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
696 lastDocLine
= scanloc
.linnum
;
699 case '/': // do // style comments
718 t
.value
= TOK
.comment
;
721 if (doDocComment
&& t
.ptr
[2] == '/')
723 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
724 lastDocLine
= scanloc
.linnum
;
728 t
.value
= TOK
.endOfFile
;
733 const u
= decodeUTF();
734 if (u
== PS || u
== LS
)
750 t
.value
= TOK
.comment
;
753 if (doDocComment
&& t
.ptr
[2] == '/')
755 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
756 lastDocLine
= scanloc
.linnum
;
801 error(t
.loc
, "unterminated /+ +/ comment");
804 t
.value
= TOK
.endOfFile
;
809 uint u
= decodeUTF();
810 if (u
== PS || u
== LS
)
821 t
.value
= TOK
.comment
;
824 if (doDocComment
&& t
.ptr
[2] == '+' && p
- 4 != t
.ptr
)
826 // if /++ but not /++/
827 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
828 lastDocLine
= scanloc
.linnum
;
842 /* Note that we don't allow ._1 and ._ as being
843 * valid floating point numbers.
848 else if (p
[0] == '.')
853 t
.value
= TOK
.dotDotDot
;
869 t
.value
= TOK
.andAssign
;
874 t
.value
= TOK
.andAnd
;
884 t
.value
= TOK
.orAssign
;
899 t
.value
= TOK
.minAssign
;
904 t
.value
= TOK
.minusMinus
;
919 t
.value
= TOK
.addAssign
;
924 t
.value
= TOK
.plusPlus
;
934 t
.value
= TOK
.lessOrEqual
; // <=
942 t
.value
= TOK
.leftShiftAssign
; // <<=
945 t
.value
= TOK
.leftShift
; // <<
947 else if (*p
== ':' && Ccompile
)
950 t
.value
= TOK
.leftBracket
; // <:
952 else if (*p
== '%' && Ccompile
)
955 t
.value
= TOK
.leftCurly
; // <%
958 t
.value
= TOK
.lessThan
; // <
965 t
.value
= TOK
.greaterOrEqual
; // >=
973 t
.value
= TOK
.rightShiftAssign
; // >>=
981 t
.value
= TOK
.unsignedRightShiftAssign
; // >>>=
984 t
.value
= TOK
.unsignedRightShift
; // >>>
987 t
.value
= TOK
.rightShift
; // >>
990 t
.value
= TOK
.greaterThan
; // >
997 t
.value
= TOK
.notEqual
; // !=
1000 t
.value
= TOK
.not; // !
1007 t
.value
= TOK
.equal
; // ==
1012 t
.value
= TOK
.goesTo
; // =>
1015 t
.value
= TOK
.assign
; // =
1022 t
.value
= TOK
.concatenateAssign
; // ~=
1025 t
.value
= TOK
.tilde
; // ~
1035 t
.value
= TOK
.powAssign
; // ^^=
1038 t
.value
= TOK
.pow
; // ^^
1043 t
.value
= TOK
.xorAssign
; // ^=
1046 t
.value
= TOK
.xor; // ^
1050 t
.value
= TOK
.leftParenthesis
;
1054 t
.value
= TOK
.rightParenthesis
;
1058 t
.value
= TOK
.leftBracket
;
1062 t
.value
= TOK
.rightBracket
;
1066 t
.value
= TOK
.leftCurly
;
1070 t
.value
= TOK
.rightCurly
;
1074 t
.value
= TOK
.question
;
1078 t
.value
= TOK
.comma
;
1082 t
.value
= TOK
.semicolon
;
1089 t
.value
= TOK
.colonColon
;
1091 else if (*p
== '>' && Ccompile
)
1094 t
.value
= TOK
.rightBracket
;
1097 t
.value
= TOK
.colon
;
1101 t
.value
= TOK
.dollar
;
1112 t
.value
= TOK
.mulAssign
;
1122 t
.value
= TOK
.modAssign
;
1124 else if (*p
== '>' && Ccompile
)
1127 t
.value
= TOK
.rightCurly
;
1129 else if (*p
== ':' && Ccompile
)
1131 goto case '#'; // %: means #
1138 // https://issues.dlang.org/show_bug.cgi?id=22825
1139 // Special token sequences are terminated by newlines,
1140 // and should not be skipped over.
1141 this.tokenizeNewlines
= true;
1143 if (parseSpecialTokenSequence())
1145 t
.value
= TOK
.pound
;
1154 // Check for start of unicode identifier
1157 if (c
== PS || c
== LS
)
1161 if (tokenizeNewlines
)
1163 t
.value
= TOK
.endOfLine
;
1164 tokenizeNewlines
= false;
1170 if (c
< 0x80 && isprint(c
))
1171 error(t
.loc
, "character '%c' is not a valid token", c
);
1173 error(t
.loc
, "character 0x%02x is not a valid token", c
);
1182 final Token
* peek(Token
* ct
)
1189 t
= allocateToken();
1196 /*********************************
1197 * tk is on the opening (.
1198 * Look ahead and return token that is past the closing ).
1200 final Token
* peekPastParen(Token
* tk
)
1202 //printf("peekPastParen()\n");
1211 case TOK
.leftParenthesis
:
1214 case TOK
.rightParenthesis
:
1223 case TOK
.rightCurly
:
1224 if (--curlynest
>= 0)
1240 /*******************************************
1241 * Parse escape sequence.
1243 private uint escapeSequence(out dchar c2
)
1245 return Lexer
.escapeSequence(token
.loc
, p
, Ccompile
, c2
);
1249 * Parse the given string literal escape sequence into a single character.
1250 * D https://dlang.org/spec/lex.html#escape_sequences
1253 * loc = location to use for error messages
1254 * sequence = pointer to string with escape sequence to parse. Updated to
1255 * point past the end of the escape sequence
1256 * Ccompile = true for compile C11 escape sequences
1257 * c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
1259 * the escape sequence as a single character
1261 private dchar escapeSequence(const ref Loc loc
, ref const(char)* sequence
, bool Ccompile
, out dchar c2
)
1263 const(char)* p
= sequence
; // cache sequence reference on stack
1264 scope(exit
) sequence
= p
;
1309 if (ishex(cast(char)c
))
1313 if (Ccompile
&& ndigits
== 2)
1315 /* C11 6.4.4.4-7 one to infinity hex digits
1319 if (isdigit(cast(char)c
))
1321 else if (islower(c
))
1327 } while (ishex(cast(char)c
));
1333 if (isdigit(cast(char)c
))
1335 else if (islower(c
))
1343 if (!ishex(cast(char)c
))
1345 error(loc
, "escape hex sequence has %d hex digits instead of %d", n
, ndigits
);
1349 if (ndigits
!= 2 && !utf_isValidDchar(v
))
1351 error(loc
, "invalid UTF character \\U%08x", v
);
1352 v
= '?'; // recover with valid UTF character
1359 error(loc
, "undefined escape hex sequence \\%c%c", sequence
[0], c
);
1367 // named character entity
1368 for (const idstart
= ++p
; 1; p
++)
1373 auto entity
= HtmlNamedEntity(idstart
[0 .. p
- idstart
]);
1375 if (entity
== entity
.init
)
1377 error(loc
, "unnamed character entity &%.*s;", cast(int)(p
- idstart
), idstart
);
1380 if (entity
[1] != entity
.init
[1])
1386 if (isalpha(*p
) ||
(p
!= idstart
&& isdigit(*p
)))
1388 error(loc
, "unterminated named entity &%.*s;", cast(int)(p
- idstart
+ 1), idstart
);
1401 if (isoctal(cast(char)c
))
1407 v
= v
* 8 + (c
- '0');
1410 while (++n
< 3 && isoctal(cast(char)c
));
1413 error(loc
, "escape octal sequence \\%03o is larger than \\377", c
);
1417 error(loc
, "undefined escape sequence \\%c", c
);
1426 Lex a wysiwyg string. `p` must be pointing to the first character before the
1427 contents of the string literal. The character pointed to by `p` will be used as
1428 the terminating character (i.e. backtick or double-quote).
1430 result = pointer to the token that accepts the result
1432 private void wysiwygStringConstant(Token
* result
)
1434 result
.value
= TOK
.string_
;
1436 auto terminator
= p
[0];
1438 stringbuffer
.setsize(0);
1451 c
= '\n'; // treat EndOfLine as \n character
1456 error("unterminated string constant starting at %s", start
.toChars());
1458 // rewind `p` so it points to the EOF character
1462 if (c
== terminator
)
1464 result
.setString(stringbuffer
);
1465 stringPostfix(result
);
1471 const u
= decodeUTF();
1473 if (u
== PS || u
== LS
)
1475 stringbuffer
.writeUTF8(u
);
1480 stringbuffer
.writeByte(c
);
1484 /**************************************
1488 final TOK
hexStringConstant(Token
* t
)
1492 uint v
= ~0; // dead assignment, needed to suppress warning
1494 stringbuffer
.setsize(0);
1504 continue; // skip white space
1507 continue; // ignore '\r' if followed by '\n'
1508 // Treat isolated '\r' as if it were a '\n'
1515 error("unterminated string constant starting at %s", start
.toChars());
1517 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1519 return TOK
.hexadecimalString
;
1523 error("odd number (%d) of hex characters in hex string", n
);
1524 stringbuffer
.writeByte(v
);
1526 t
.setString(stringbuffer
);
1529 return TOK
.hexadecimalString
;
1531 if (c
>= '0' && c
<= '9')
1533 else if (c
>= 'a' && c
<= 'f')
1535 else if (c
>= 'A' && c
<= 'F')
1540 const u
= decodeUTF();
1542 if (u
== PS || u
== LS
)
1545 error("non-hex character \\u%04x in hex string", u
);
1548 error("non-hex character '%c' in hex string", c
);
1552 stringbuffer
.writeByte(v
);
1560 assert(0); // see bug 15731
1564 Lex a delimited string. Some examples of delimited strings are:
1566 q"(foo(xxx))" // "foo(xxx)"
1567 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1573 It is assumed that `p` points to the opening double-quote '"'.
1575 result = pointer to the token that accepts the result
1577 private void delimitedStringConstant(Token
* result
)
1579 result
.value
= TOK
.string_
;
1581 dchar delimleft
= 0;
1582 dchar delimright
= 0;
1584 uint nestcount
= ~0; // dead assignment, needed to suppress warning
1585 Identifier hereid
= null;
1589 stringbuffer
.setsize(0);
1594 //printf("c = '%c'\n", c);
1608 stringbuffer
.writeUTF8(c
);
1615 c
= '\n'; // treat EndOfLine as \n character
1619 error("unterminated delimited string constant starting at %s", start
.toChars());
1621 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1630 if (c
== PS || c
== LS
)
1648 else if (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
)))
1650 // Start of identifier; must be a heredoc
1653 scan(&tok
); // read in heredoc identifier
1654 if (tok
.value
!= TOK
.identifier
)
1656 error("identifier expected for heredoc, not %s", tok
.toChars());
1662 //printf("hereid = '%s'\n", hereid.toChars());
1672 error("delimiter cannot be whitespace");
1679 error("heredoc rest of line should be blank");
1687 else if (c
== delimright
)
1694 else if (c
== delimright
)
1696 if (startline
&& (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
))) && hereid
)
1701 scan(&tok
); // read in possible heredoc identifier
1702 //printf("endid = '%s'\n", tok.ident.toChars());
1703 if (tok
.value
== TOK
.identifier
&& tok
.ident
is hereid
)
1705 /* should check that rest of line is blank
1711 stringbuffer
.writeUTF8(c
);
1719 error("delimited string must end in `%s\"`", hereid
.toChars());
1720 else if (isspace(delimright
))
1721 error("delimited string must end in `\"`");
1723 error(token
.loc
, "delimited string must end in `%c\"`", delimright
);
1724 result
.setString(stringbuffer
);
1725 stringPostfix(result
);
1729 Lex a token string. Some examples of token strings are:
1731 q{ foo(xxx) } // " foo(xxx) "
1732 q{foo$(LPAREN)} // "foo$(LPAREN)"
1733 q{{foo}"}"} // "{foo}"}""
1735 It is assumed that `p` points to the opening curly-brace.
1737 result = pointer to the token that accepts the result
1739 private void tokenStringConstant(Token
* result
)
1741 result
.value
= TOK
.string_
;
1744 const start
= loc();
1746 inTokenStringConstant
++;
1747 scope(exit
) inTokenStringConstant
--;
1757 case TOK
.rightCurly
:
1760 result
.setString(pstart
, p
- 1 - pstart
);
1761 stringPostfix(result
);
1766 error("unterminated token string constant starting at %s", start
.toChars());
1776 Scan a quoted string while building the processed string value by
1777 handling escape sequences. The result is returned in the given `t` token.
1778 This function assumes that `p` currently points to the opening quote
1781 t = the token to set the resulting string to
1783 * D https://dlang.org/spec/lex.html#double_quoted_strings
1786 private void escapeStringConstant(Token
* t
)
1788 t
.value
= TOK
.string_
;
1790 const start
= loc();
1791 const tc
= *p
++; // opening quote
1792 stringbuffer
.setsize(0);
1806 c
= escapeSequence(c2
);
1807 stringbuffer
.writeUTF8(c
);
1808 if (c2
!= dchar.init
)
1809 stringbuffer
.writeUTF8(c2
);
1813 c
= escapeSequence(c2
);
1814 stringbuffer
.writeUTF8(c
);
1817 c
= escapeSequence(c2
);
1829 c
= '\n'; // treat EndOfLine as \n character
1838 t
.setString(stringbuffer
);
1844 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1847 error("unterminated string constant starting at %s", start
.toChars());
1855 if (c
== LS || c
== PS
)
1863 stringbuffer
.writeUTF8(c
);
1868 stringbuffer
.writeByte(c
);
1872 /**************************************
1874 * https://dlang.org/spec/lex.html#characterliteral
1876 private TOK
charConstant(Token
* t
)
1878 TOK tk
= TOK
.charLiteral
;
1879 //printf("Lexer::charConstant\n");
1889 tk
= TOK
.wcharLiteral
;
1893 tk
= TOK
.dcharLiteral
;
1896 t
.unsvalue
= escapeSequence(c2
);
1899 error("html entity requires 2 code units, use a string instead of a character");
1913 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1917 error("unterminated character constant");
1926 if (c
== LS || c
== PS
)
1928 if (c
< 0xD800 ||
(c
>= 0xE000 && c
< 0xFFFE))
1929 tk
= TOK
.wcharLiteral
;
1931 tk
= TOK
.dcharLiteral
;
1938 while (*p
!= '\'' && *p
!= 0x1A && *p
!= 0 && *p
!= '\n' &&
1939 *p
!= '\r' && *p
!= ';' && *p
!= ')' && *p
!= ']' && *p
!= '}')
1945 if (c
== LS || c
== PS
)
1956 error("character constant has multiple characters");
1960 error("unterminated character constant");
1968 /***************************************
1969 * Lex C character constant.
1970 * Parser is on the opening quote.
1972 * t = token to fill in
1973 * prefix = one of `u`, `U` or 0.
1977 private void clexerCharConstant(ref Token t
, char prefix
)
1979 escapeStringConstant(&t
);
1980 const(char)[] str = t
.ustring
[0 .. t
.len
];
1981 const n
= str.length
;
1985 error(loc
, "empty character constant");
1986 t
.value
= TOK
.semicolon
;
1994 if (n
== 1) // fast case
1999 error(loc
, "max number of chars in character literal is 4, had %d",
2004 (cast(char*)&u
)[n
- 1 - i
] = c
;
2013 string msg
= utf_decodeChar(str, idx
, d1
);
2015 error(loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
2018 error(loc
, "x%x does not fit in 16 bits", d1
);
2020 t
.value
= TOK
.wcharLiteral
; // C11 6.4.4.4-9
2026 auto msg
= utf_decodeChar(str, idx
, d
);
2028 error(loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
2030 error(loc
, "max number of chars in 32 bit character literal is 1, had %d",
2031 cast(int)((n
+ 3) >> 2));
2033 t
.value
= TOK
.dcharLiteral
; // C11 6.4.4.4-9
2039 t
.value
= n
== 1 ? TOK
.charLiteral
: TOK
.int32Literal
;
2043 /***************************************
2044 * Get postfix of string literal.
2046 private void stringPostfix(Token
* t
) pure @nogc
2062 /**************************************
2064 * If it's an integer, store it in tok.TKutok.Vlong.
2065 * integers can be decimal, octal or hex
2066 * Handle the suffixes U, UL, LU, L, etc.
2067 * If it's double, store it in tok.TKutok.Vdouble.
2072 private TOK
number(Token
* t
)
2076 ulong n
= 0; // unsigned >=64 bit integer type
2079 bool overflow
= false;
2080 bool anyBinaryDigitsNoSingleUS
= false;
2081 bool anyHexDigitsNoSingleUS
= false;
2082 char errorDigit
= 0;
2103 errorDigit
= cast(char) c
;
2118 goto Ldone
; // if ".."
2119 if (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80)
2121 if (Ccompile
&& (p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
2122 goto Lreal
; // if `0.f` or `0.L`
2123 goto Ldone
; // if ".identifier" or ".unicode"
2125 goto Lreal
; // '.' is part of current token
2132 error("embedded `_` not allowed");
2177 if (c
== 'e' || c
== 'E' || c
== 'f' || c
== 'F')
2191 goto Ldone
; // if ".."
2192 if (base
<= 10 && n
> 0 && (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2194 if (Ccompile
&& base
== 10 &&
2195 (p
[1] == 'e' || p
[1] == 'E' || p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
2196 goto Lreal
; // if `1.e6` or `1.f` or `1.L`
2197 goto Ldone
; // if ".identifier" or ".unicode"
2199 if (base
== 16 && (!ishex(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2200 goto Ldone
; // if ".identifier" or ".unicode"
2202 goto Ldone
; // if ".identifier" or ".unicode"
2203 goto Lreal
; // otherwise as part of a floating point literal
2223 // got a digit here, set any necessary flags, check for errors
2224 anyHexDigitsNoSingleUS
= true;
2225 anyBinaryDigitsNoSingleUS
= true;
2226 if (!errorDigit
&& d
>= base
)
2228 errorDigit
= cast(char) c
;
2230 // Avoid expensive overflow check if we aren't at risk of overflow
2231 if (n
<= 0x0FFF_FFFF_FFFF_FFFFUL
)
2235 import core
.checkedint
: mulu
, addu
;
2237 n
= mulu(n
, base
, overflow
);
2238 n
= addu(n
, d
, overflow
);
2244 error(token
.loc
, "%s digit expected, not `%c`", base
== 2 ?
"binary".ptr
:
2245 base
== 8 ?
"octal".ptr
:
2246 "decimal".ptr
, errorDigit
);
2249 if (overflow
&& !err
)
2251 error("integer overflow");
2254 if ((base
== 2 && !anyBinaryDigitsNoSingleUS
) ||
2255 (base
== 16 && !anyHexDigitsNoSingleUS
))
2256 error(token
.loc
, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p
- start
), start
, 2, start
);
2261 return cnumber(base
, n
);
2266 decimal
= 1, // decimal
2267 unsigned
= 2, // u or U suffix
2268 long_
= 4, // L suffix
2271 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.none
;
2272 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2285 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2291 if ((flags
& f
) && !err
)
2293 error("repeated integer suffix `%c`", p
[-1]);
2296 flags
= cast(FLAGS
)(flags | f
);
2303 if (base
== 8 && n
>= 8)
2306 // can't translate invalid octal value, just show a generic message
2307 error("octal literals larger than 7 are no longer supported");
2309 error(token
.loc
, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
2310 n
, cast(int)(p
- psuffix
), psuffix
, n
, cast(int)(p
- psuffix
), psuffix
);
2316 /* Octal or Hexadecimal constant.
2317 * First that fits: int, uint, long, ulong
2319 if (n
& 0x8000000000000000L
)
2320 result
= TOK
.uns64Literal
;
2321 else if (n
& 0xFFFFFFFF00000000L
)
2322 result
= TOK
.int64Literal
;
2323 else if (n
& 0x80000000)
2324 result
= TOK
.uns32Literal
;
2326 result
= TOK
.int32Literal
;
2329 /* First that fits: int, long, long long
2331 if (n
& 0x8000000000000000L
)
2333 result
= TOK
.uns64Literal
;
2335 else if (n
& 0xFFFFFFFF80000000L
)
2336 result
= TOK
.int64Literal
;
2338 result
= TOK
.int32Literal
;
2340 case FLAGS
.unsigned
:
2341 case FLAGS
.decimal | FLAGS
.unsigned
:
2342 /* First that fits: uint, ulong
2344 if (n
& 0xFFFFFFFF00000000L
)
2345 result
= TOK
.uns64Literal
;
2347 result
= TOK
.uns32Literal
;
2349 case FLAGS
.decimal | FLAGS
.long_
:
2350 if (n
& 0x8000000000000000L
)
2354 error("signed integer overflow");
2357 result
= TOK
.uns64Literal
;
2360 result
= TOK
.int64Literal
;
2363 if (n
& 0x8000000000000000L
)
2364 result
= TOK
.uns64Literal
;
2366 result
= TOK
.int64Literal
;
2368 case FLAGS
.unsigned | FLAGS
.long_
:
2369 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2370 result
= TOK
.uns64Literal
;
2375 printf("%x\n", flags
);
2382 /**************************************
2383 * Lex C integer-suffix
2385 * base = number base
2386 * n = raw integer value
2390 private TOK
cnumber(int base
, ulong n
)
2393 * Parse trailing suffixes:
2400 octalhex
= 1, // octal or hexadecimal
2401 decimal
= 2, // decimal
2402 unsigned
= 4, // u or U suffix
2403 long_
= 8, // l or L suffix
2404 llong
= 0x10, // ll or LL
2406 // Microsoft extensions
2412 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.octalhex
;
2431 f
= FLAGS
.long_ | FLAGS
.llong
;
2443 else if (p
[1] == '1' && p
[2] == '6')
2448 else if (p
[1] == '3' && p
[2] == '2')
2453 else if (p
[1] == '6' && p
[2] == '4')
2460 if (p
[1] >= '0' && p
[1] <= '9' && !err
)
2462 error("invalid integer suffix");
2471 if ((flags
& f
) && !err
)
2473 error("duplicate integer suffixes");
2476 flags
= cast(FLAGS
)(flags | f
);
2479 TOK result
= TOK
.int32Literal
; // default
2482 /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2483 * this code deviates from C by picking D int, uint, long, or ulong instead
2486 case FLAGS
.octalhex
:
2487 /* Octal or Hexadecimal constant.
2488 * First that fits: int, unsigned, long, unsigned long,
2489 * long long, unsigned long long
2491 if (n
& 0x8000000000000000L
)
2492 result
= TOK
.uns64Literal
; // unsigned long
2493 else if (n
& 0xFFFFFFFF00000000L
)
2494 result
= TOK
.int64Literal
; // long
2495 else if (n
& 0x80000000)
2496 result
= TOK
.uns32Literal
;
2498 result
= TOK
.int32Literal
;
2502 /* First that fits: int, long, long long
2504 if (n
& 0x8000000000000000L
)
2505 result
= TOK
.uns64Literal
; // unsigned long
2506 else if (n
& 0xFFFFFFFF80000000L
)
2507 result
= TOK
.int64Literal
; // long
2509 result
= TOK
.int32Literal
;
2512 case FLAGS
.octalhex | FLAGS
.unsigned
:
2513 case FLAGS
.decimal | FLAGS
.unsigned
:
2514 /* First that fits: unsigned, unsigned long, unsigned long long
2516 if (n
& 0xFFFFFFFF00000000L
)
2517 result
= TOK
.uns64Literal
; // unsigned long
2519 result
= TOK
.uns32Literal
;
2522 case FLAGS
.decimal | FLAGS
.long_
:
2523 /* First that fits: long, long long
2525 if (longsize
== 4 || long_longsize
== 4)
2527 if (n
& 0xFFFFFFFF_80000000L)
2528 result
= TOK
.int64Literal
;
2530 result
= TOK
.int32Literal
; // long
2534 result
= TOK
.int64Literal
; // long
2538 case FLAGS
.octalhex | FLAGS
.long_
:
2539 /* First that fits: long, unsigned long, long long,
2540 * unsigned long long
2542 if (longsize
== 4 || long_longsize
== 4)
2544 if (n
& 0x8000000000000000L
)
2545 result
= TOK
.uns64Literal
;
2546 else if (n
& 0xFFFFFFFF00000000L
)
2547 result
= TOK
.int64Literal
;
2548 else if (n
& 0x80000000)
2549 result
= TOK
.uns32Literal
; // unsigned long
2551 result
= TOK
.int32Literal
; // long
2555 if (n
& 0x80000000_00000000L)
2556 result
= TOK
.uns64Literal
; // unsigned long
2558 result
= TOK
.int64Literal
; // long
2562 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.long_
:
2563 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2564 /* First that fits: unsigned long, unsigned long long
2566 if (longsize
== 4 || long_longsize
== 4)
2568 if (n
& 0xFFFFFFFF00000000L
)
2569 result
= TOK
.uns64Literal
;
2571 result
= TOK
.uns32Literal
; // unsigned long
2575 result
= TOK
.uns64Literal
; // unsigned long
2579 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.llong
:
2580 /* First that fits: long long, unsigned long long
2582 if (n
& 0x8000000000000000L
)
2583 result
= TOK
.uns64Literal
;
2585 result
= TOK
.int64Literal
;
2588 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.llong
:
2591 result
= TOK
.int64Literal
;
2594 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2595 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2596 result
= TOK
.uns64Literal
;
2599 case FLAGS
.octalhex | FLAGS
.i8
:
2600 case FLAGS
.octalhex | FLAGS
.i16
:
2601 case FLAGS
.octalhex | FLAGS
.i32
:
2602 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i8
:
2603 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i16
:
2604 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i32
:
2605 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i8
:
2606 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i16
:
2607 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i32
:
2608 result
= TOK
.uns32Literal
;
2611 case FLAGS
.decimal | FLAGS
.i8
:
2612 case FLAGS
.decimal | FLAGS
.i16
:
2613 case FLAGS
.decimal | FLAGS
.i32
:
2614 result
= TOK
.int32Literal
;
2617 case FLAGS
.octalhex | FLAGS
.i64
:
2618 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.i64
:
2619 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.i64
:
2620 result
= TOK
.uns64Literal
;
2623 case FLAGS
.decimal | FLAGS
.i64
:
2624 result
= TOK
.int64Literal
;
2628 debug printf("%x\n",flags
);
2634 /**************************************
2635 * Read in characters, converting them to real.
2637 * Exponent overflow not detected.
2638 * Too much requested precision is not detected.
2640 private TOK
inreal(Token
* t
)
2642 //printf("Lexer::inreal()\n");
2645 assert(*p
== '.' ||
isdigit(*p
));
2647 bool isWellformedString
= true;
2648 stringbuffer
.setsize(0);
2656 if (c
== 'x' || c
== 'X')
2662 // Digits to left of '.'
2670 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2677 // Digits to right of '.'
2680 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2687 if (c
== 'e' || c
== 'E' ||
(hex
&& (c
== 'p' || c
== 'P')))
2690 if (c
== '-' || c
== '+')
2694 bool anyexp
= false;
2706 error("embedded `_` in numeric literals not allowed");
2712 error("missing exponent");
2713 isWellformedString
= false;
2720 error("exponent required for hex float");
2721 isWellformedString
= false;
2727 stringbuffer
.writeByte(*pstart
);
2730 stringbuffer
.writeByte(0);
2731 auto sbufptr
= cast(const(char)*)stringbuffer
[].ptr
;
2733 bool isOutOfRange
= false;
2734 t
.floatvalue
= (isWellformedString ? CTFloat
.parse(sbufptr
, isOutOfRange
) : CTFloat
.zero
);
2736 bool imaginary
= false;
2737 if (*p
== 'i' && Ccompile
)
2747 if (isWellformedString
&& !isOutOfRange
)
2748 isOutOfRange
= Port
.isFloat32LiteralOutOfRange(sbufptr
);
2749 result
= TOK
.float32Literal
;
2753 if (isWellformedString
&& !isOutOfRange
)
2754 isOutOfRange
= Port
.isFloat64LiteralOutOfRange(sbufptr
);
2755 result
= TOK
.float64Literal
;
2759 error("use 'L' suffix instead of 'l'");
2763 if (Ccompile
&& long_doublesize
== 8)
2765 result
= TOK
.float80Literal
;
2769 if ((*p
== 'i' ||
*p
== 'I') && !Ccompile
)
2772 error("use 'i' suffix instead of 'I'");
2781 case TOK
.float32Literal
:
2782 result
= TOK
.imaginary32Literal
;
2784 case TOK
.float64Literal
:
2785 result
= TOK
.imaginary64Literal
;
2787 case TOK
.float80Literal
:
2788 result
= TOK
.imaginary80Literal
;
2794 const isLong
= (result
== TOK
.float80Literal || result
== TOK
.imaginary80Literal
);
2795 if (isOutOfRange
&& !isLong
&& (!Ccompile || hex
))
2797 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2799 const char* suffix
= result
== TOK
.float32Literal ?
"f" : result
== TOK
.float80Literal ?
"L" : "";
2800 const char* type
= [TOK
.float32Literal
: "`float`".ptr
,
2801 TOK
.float64Literal
: "`double`".ptr
,
2802 TOK
.float80Literal
: "`real` for the current target".ptr
][result
];
2803 error(scanloc
, "number `%s%s` is not representable as a %s", sbufptr
, suffix
, type
);
2804 const char* extra
= result
== TOK
.float64Literal ?
"`real` literals can be written using the `L` suffix. " : "";
2805 eSink
.errorSupplemental(scanloc
, "%shttps://dlang.org/spec/lex.html#floatliteral", extra
);
2811 case TOK
.float32Literal
:
2812 case TOK
.float64Literal
:
2813 case TOK
.float80Literal
:
2814 case TOK
.imaginary32Literal
:
2815 case TOK
.imaginary64Literal
:
2816 case TOK
.imaginary80Literal
:
// Compute the current source location: updates scanloc's column (1-based,
// from the start of the current line) and byte offset (from `base`).
// NOTE(review): the extraction dropped this method's trailing lines --
// presumably `return scanloc;` -- TODO confirm against the original file.
2825 final Loc
loc() @nogc
2827 scanloc
.charnum
= cast(ushort)(1 + p
- line
);
2829 scanloc
.fileOffset
= cast(uint)(p
- base
);
/// Report a formatted error through the configured sink, attributed to the
/// current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void error(T...)(const(char)* format, T args)
{
    eSink.error(token.loc, format, args);
}
/// Report a formatted error through the configured sink at an explicit
/// source location.
/// Params:
///    loc    = location to attribute the diagnostic to
///    format = printf-style format string
///    args   = arguments consumed by `format`
void error(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.error(loc, format, args);
}
/// Emit a deprecation diagnostic through the configured sink at an explicit
/// source location.
/// Params:
///    loc    = location to attribute the diagnostic to
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecation(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.deprecation(loc, format, args);
}
/// Emit a deprecation diagnostic through the configured sink, attributed to
/// the current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecation(T...)(const(char)* format, T args)
{
    eSink.deprecation(token.loc, format, args);
}
/// Emit a supplemental note attached to a prior deprecation, attributed to
/// the current token's location.
/// Params:
///    format = printf-style format string
///    args   = arguments consumed by `format`
void deprecationSupplemental(T...)(const(char)* format, T args)
{
    eSink.deprecationSupplemental(token.loc, format, args);
}
2858 /***************************************
2859 * Parse special token sequence:
2861 * true if the special token sequence was handled
2863 * https://dlang.org/spec/lex.html#special-token-sequence
// Handles `#line` (and rejects other C preprocessor directives). Token
// strings (q{...}) may not contain `#` directives; for identifiers this is
// currently only a deprecation (see @@@DEPRECATED_2.103@@@ below).
// NOTE(review): damaged extraction -- the token scan (`n`, `locx` setup),
// braces and return statements between the surviving lines were dropped.
2865 bool parseSpecialTokenSequence()
2869 if (n
.value
== TOK
.identifier
)
// `#line ...` -> delegate to poundLine (non-linemarker form).
2871 if (n
.ident
== Id
.line
)
2873 poundLine(n
, false);
2879 // @@@DEPRECATED_2.103@@@
2880 // Turn into an error in 2.113
2881 if (inTokenStringConstant
)
2882 deprecation(locx
, "token string requires valid D tokens, not `#%s`", n
.ident
.toChars());
2884 error(locx
, "C preprocessor directive `#%s` is not supported", n
.ident
.toChars());
// `#if` gets a dedicated hint pointing at `version`/`static if`.
2887 else if (n
.value
== TOK
.if_
)
2890 if (inTokenStringConstant
)
2891 error(locx
, "token string requires valid D tokens, not `#if`");
2893 error(locx
, "C preprocessor directive `#if` is not supported, use `version` or `static if`");
2898 /*********************************************
2899 * Parse line/file preprocessor directive:
2900 * #line linnum [filespec]
2901 * Allow __LINE__ for linnum, and __FILE__ for filespec.
2902 * Accept linemarker format:
2903 * # linnum [filespec] {flags}
2904 * There can be zero or more flags, which are one of the digits 1..4, and
2905 * must be in ascending order. The flags are ignored.
2907 * tok = token we're on, which is linnum of linemarker
2908 * linemarker = true if line marker format and lexer is on linnum
2910 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
// NOTE(review): damaged extraction -- the scanning loop, switch head, braces
// and several statements between the surviving lines were dropped.
2912 final void poundLine(ref Token tok
, bool linemarker
)
2914 auto linnum
= this.scanloc
.linnum
;
2915 const(char)* filespec
= null;
// First argument: the new line number (integer literal or __LINE__).
2920 if (tok
.value
== TOK
.int32Literal || tok
.value
== TOK
.int64Literal
)
// Range-check: the stored unsigned value must round-trip through int.
2922 const lin
= cast(int)(tok
.unsvalue
);
2923 if (lin
!= tok
.unsvalue
)
2925 error(tok
.loc
, "line number `%lld` out of range", cast(ulong)tok
.unsvalue
);
2932 else if (tok
.value
== TOK
.line
) // #line __LINE__
2937 error(tok
.loc
, "positive integer argument expected following `#line`");
2938 if (tok
.value
!= TOK
.endOfLine
)
// Apply the new line number/filename -- but never from inside a token string,
// which must not alter the lexer's notion of the real source location.
2949 if (!inTokenStringConstant
)
2951 this.scanloc
.linnum
= linnum
;
2953 this.scanloc
.filename
= filespec
;
2957 if (filespec || flags
)
// Keep our own copy: scanloc.filename may point into transient memory.
2959 filespec
= mem
.xstrdup(scanloc
.filename
);
2962 if (filespec || flags
)
// Optional filespec must be a plain double-quoted string without postfix.
2964 if (tok
.ptr
[0] != '"' || tok
.postfix
!= 0)
2966 filespec
= tok
.ustring
;
2968 case TOK
.int32Literal
:
// GNU linemarker flags 1..4 are accepted and ignored.
2971 if (linemarker
&& tok
.unsvalue
>= 1 && tok
.unsvalue
<= 4)
2973 flags
= true; // linemarker flags seen
// Anything else on the directive line is an error, phrased by context.
2982 if (filespec
is null)
2983 error(tok
.loc
, "invalid filename for `#line` directive");
2984 else if (linemarker
)
2985 error(tok
.loc
, "invalid flag for line marker directive");
2987 error(tok
.loc
, "found `%s` when expecting new line following `#line` directive", tok
.toChars());
2988 if (tok
.value
!= TOK
.endOfLine
)
2992 /***************************************
2993 * Scan forward to start of next line.
2995 * defines = send characters to `defines`
// NOTE(review): damaged extraction -- the character-dispatch loop and its
// cases were dropped; only a few statements survive below.
2997 final void skipToNextLine(OutBuffer
* defines
= null)
3005 return; // do not advance p
// When collecting #define bodies, copy raw bytes through unchanged.
3019 defines
.writeByte(*p
); // don't care about Unicode line endings for C
// Multi-byte sequences: also treat U+2028/U+2029 as line terminators.
3022 const u
= decodeUTF();
3023 if (u
== PS || u
== LS
)
3035 tokenizeNewlines
= false;
3038 /********************************************
3039 * Decode UTF character.
3040 * Issue error messages for invalid sequences.
3041 * Return decoded character, advance p to last character in UTF sequence.
// Thin wrapper over decodeUTFpure(): forwards any failure message to the
// error sink at the current token's location.
// NOTE(review): damaged extraction -- the `msg` declaration, the `if (msg)`
// guard and the `return result;` line were dropped by the extraction.
3043 private uint decodeUTF()
3046 auto result
= decodeUTFpure(msg
);
// %.*s prints the non-NUL-terminated message slice.
3049 error(token
.loc
, "%.*s", cast(int)msg
.length
, msg
.ptr
);
3053 /********************************************
3054 * Same as above, but the potential error message is stored to the
3055 * msg parameter instead of being issued.
// NOTE(review): damaged extraction -- declarations of `s`, `len`, `idx` and
// `u`, plus the return statement, were dropped from this view.
3057 private pure uint decodeUTFpure(out string msg
)
3061 // Check length of remaining string up to 4 UTF-8 characters
// Stop early at an embedded NUL (source is NUL-terminated).
3063 for (len
= 1; len
< 4 && s
[len
]; len
++)
3068 msg
= utf_decodeChar(s
[0 .. len
], idx
, u
);
// Reject Unicode bidi control characters (Trojan Source mitigation).
3070 if (!msg
&& isBidiControl(u
))
3071 msg
= "Bidirectional control characters are disallowed for security reasons.";
3075 /***************************************************
3076 * Parse doc comment embedded between t.ptr and p.
3077 * Remove trailing blanks and tabs from lines.
3078 * Replace all newlines with \n.
3079 * Remove leading comment character from each line.
3080 * Decide if it's a lineComment or a blockComment.
3081 * Append to previous one for this token.
3083 * If newParagraph is true, an extra newline will be
3084 * added between adjoining doc comments.
// NOTE(review): damaged extraction -- the canonicalization loop's switch,
// braces, `buf` declaration and several statements were dropped; the
// surviving lines below are NOT contiguous.
3086 private void getDocComment(Token
* t
, uint lineComment
, bool newParagraph
) pure
3088 /* ct tells us which kind of comment it is: '/', '*', or '+'
3090 const ct
= t
.ptr
[2];
3091 /* Start of comment text skips over / * *, / + +, or / / /
3093 const(char)* q
= t
.ptr
+ 3; // start of comment text
3094 const(char)* qend
= p
;
3095 if (ct
== '*' || ct
== '+')
3097 /* Scan over initial row of ****'s or ++++'s or ////'s
3099 for (; q
< qend
; q
++)
3104 /* Remove leading spaces until start of the comment
3109 while (q
< qend
&& (*q
== ' ' ||
*q
== '\t'))
3117 if (q
< qend
&& *q
== '\n')
3121 else if (*q
== '\n')
3127 /* Remove trailing row of ****'s or ++++'s
3131 for (; q
< qend
; qend
--)
3137 /* Comment is now [q .. qend].
3138 * Canonicalize it into buf[].
// Local helper: strip trailing spaces/tabs from what's in buf so far.
3142 void trimTrailingWhitespace()
3145 auto len
= s
.length
;
3146 while (len
&& (s
[len
- 1] == ' ' || s
[len
- 1] == '\t'))
// Main canonicalization pass over the comment body.
3151 for (; q
< qend
; q
++)
// Drop the leading comment character ('*', '+', '/') at line start.
3158 if (linestart
&& c
== ct
)
3161 /* Trim preceding whitespace up to preceding \n
3163 trimTrailingWhitespace();
3172 continue; // skip the \r
// UTF-8 encodings of U+2028 (E2 80 A8) / U+2029 (E2 80 A9) line separators.
3178 if (q
[1] == 128 && (q
[2] == 168 || q
[2] == 169))
3187 c
= '\n'; // replace all newlines with \n
3191 /* Trim trailing whitespace
3193 trimTrailingWhitespace();
3198 /* Trim trailing whitespace (if the last line does not have newline)
3200 trimTrailingWhitespace();
3202 // Always end with a newline
3204 if (s
.length
== 0 || s
[$ - 1] != '\n')
3205 buf
.writeByte('\n');
3207 // It's a line comment if the start of the doc comment comes
3208 // after other non-whitespace on the same line.
3209 auto dc
= (lineComment
&& anyToken
) ?
&t
.lineComment
: &t
.blockComment
;
3210 // Combine with previous doc comment, if any
3213 auto p
= combineComments(*dc
, buf
[], newParagraph
);
3214 *dc
= p ? p
[0 .. strlen(p
)] : null;
3217 *dc
= buf
.extractSlice(true);
3220 /********************************************
3221 * Combine two document comments into one,
3222 * separated by an extra newline if newParagraph is true.
// Allocates a NUL-terminated buffer holding c1 (newline-terminated) followed
// by c2, with an extra '\n' separator when newParagraph is set.
// NOTE(review): damaged extraction -- the early-return lines for empty
// c1/c2 and the final `return p;` were dropped from this view.
3224 static const(char)* combineComments(const(char)[] c1
, const(char)[] c2
, bool newParagraph
) pure
3226 //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
3227 const(int) newParagraphSize
= newParagraph ?
1 : 0; // Size of the combining '\n'
// Ensure c1 ends in a newline before appending c2.
3233 int insertNewLine
= 0;
3234 if (c1
.length
&& c1
[$ - 1] != '\n')
3236 const retSize
= c1
.length
+ insertNewLine
+ newParagraphSize
+ c2
.length
;
// +1 for the trailing NUL terminator.
3237 auto p
= cast(char*)mem
.xmalloc_noscan(retSize
+ 1);
3238 p
[0 .. c1
.length
] = c1
[];
3240 p
[c1
.length
] = '\n';
3242 p
[c1
.length
+ insertNewLine
] = '\n';
3243 p
[retSize
- c2
.length
.. retSize
] = c2
[];
3248 /**************************
3249 * `p` should be at start of next line
// Bump the line counter for diagnostics.
// NOTE(review): the extraction appears to have dropped trailing statements
// of this method (presumably resetting the `line` pointer to `p`) -- TODO
// confirm against the original file.
3251 private void endOfLine() @nogc @safe
3253 scanloc
.linnum
= scanloc
.linnum
+ 1;
3257 /****************************
3258 * Print the tokens from the current `token` to the end,
3259 * while not advancing the parser forward.
3260 * Useful for debugging.
// NOTE(review): damaged extraction -- the loop head, the `tk` declaration
// and the loop-advance/break lines were dropped from this view.
3262 void printRestOfTokens()
3267 printf("%s ", (*tk
).toChars());
// Stop at end of file or end of line.
3268 if (tk
.value
== TOK
.endOfFile || tk
.value
== TOK
.endOfLine
)
3277 /******************************* Private *****************************************/
// Unicode line terminators the lexer recognizes in addition to '\n'/'\r'.
3281 private enum LS
= 0x2028; // UTF line separator
3282 private enum PS
= 0x2029; // UTF paragraph separator
3284 /********************************************
3285 * Do our own char maps
// Compile-time-built table classifying each of the 256 byte values with the
// CM* bit flags; queried by isoctal/ishex/isidchar/etc. below.
// NOTE(review): damaged extraction -- the table declaration, switch heads,
// most case labels and the returning of the table were dropped; surviving
// lines are NOT contiguous.
3287 private static immutable cmtable
= ()
3290 foreach (const c
; 0 .. table
.length
)
// '0'..'7' are octal digits.
3292 if ('0' <= c
&& c
<= '7')
3293 table
[c
] |
= CMoctal
;
// Identifier characters: ASCII alphanumerics plus underscore.
3296 if (c_isalnum(c
) || c
== '_')
3297 table
[c
] |
= CMidchar
;
3303 table
[c
] |
= CMzerosecond
;
3306 case '0': .. case '9':
3315 table
[c
] |
= CMzerosecond | CMdigitsecond
;
3333 table
[c
] |
= CMsinglechar
;
// Bit flags stored in cmtable entries.
// NOTE(review): the CMoctal/CMhex declarations (lower bit values) were
// dropped by the extraction; only these four survive in this view.
3344 enum CMidchar
= 0x4;
3345 enum CMzerosecond
= 0x8;
3346 enum CMdigitsecond
= 0x10;
3347 enum CMsinglechar
= 0x20;
/// Table-driven test: is `c` an octal digit ('0'..'7')?
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}
/// Table-driven test: is `c` a hexadecimal digit?
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}
/// Table-driven test: may `c` appear in an identifier (alphanumeric or '_')?
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}
/// Table-driven test: may `c` be the second character of a literal that
/// begins with '0'?
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}
/// Table-driven test: may `c` be the second character of a literal that
/// begins with a digit '1'..'9'?
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}
/// Table-driven test: is `c` a single-character token by itself?
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}
/// Locale-independent hex-digit test ('0'-'9', 'a'-'f', 'A'-'F') usable in
/// CTFE, unlike the C library's isxdigit.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
/// Locale-independent alphanumeric test ('0'-'9', 'a'-'z', 'A'-'Z') usable
/// in CTFE, unlike the C library's isalnum.
private bool c_isalnum(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
3394 /******************************* Unittest *****************************************/
// Unit test: escapeSequence() happy path -- each escape decodes to the
// expected character and the scan pointer advances over the whole sequence.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces and
// several `test(...)` invocations were dropped from this view.
3398 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3400 ErrorSink errorSink
= new ErrorSinkStderr
;
// Helper: lex `sequence` as an escape and compare against `expected`.
3402 void test(T
)(string sequence
, T expected
, bool Ccompile
= false)
3404 auto p
= cast(const(char)*)sequence
.ptr
;
3406 Lexer lexer
= new Lexer(errorSink
);
3407 assert(expected
== lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
, c2
));
// p must have been advanced past the entire escape sequence.
3408 assert(p
== sequence
.ptr
+ sequence
.length
);
3433 test(`357`, '\357');
3435 test(`u1234`, '\u1234');
3436 test(`uf0e4`, '\uf0e4');
3438 test(`U0001f603`, '\U0001f603');
3440 test(`&quot;`, '"');
// Unit test: escapeSequence() error paths -- each malformed escape must
// produce the exact expected diagnostic, recovery value and scan length.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces, the
// ErrorSinkTest field declarations (expected/gotError) and `va_list ap;`
// were dropped from this view.
3447 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
// Sink that asserts each reported error matches the expected message.
3449 static class ErrorSinkTest
: ErrorSinkNull
3455 import core
.stdc
.stdio
;
3456 import core
.stdc
.stdarg
;
3461 void error(const ref Loc loc
, const(char)* format
, ...)
3464 char[100] buffer
= void;
3466 va_start(ap
, format
);
3467 auto actual
= buffer
[0 .. vsnprintf(buffer
.ptr
, buffer
.length
, format
, ap
)];
3469 assert(expected
== actual
);
3473 ErrorSinkTest errorSink
= new ErrorSinkTest
;
// Helper: lex `sequence`, require an error, check value and chars consumed.
3475 void test(string sequence
, string expectedError
, dchar expectedReturnValue
, uint expectedScanLength
, bool Ccompile
= false)
3477 errorSink
.expected
= expectedError
;
3478 errorSink
.gotError
= false;
3479 auto p
= cast(const(char)*)sequence
.ptr
;
3480 Lexer lexer
= new Lexer(errorSink
);
3482 auto actualReturnValue
= lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
, c2
);
3483 assert(errorSink
.gotError
);
3484 assert(expectedReturnValue
== actualReturnValue
);
3486 auto actualScanLength
= p
- sequence
.ptr
;
3487 assert(expectedScanLength
== actualScanLength
);
3490 test("c", `undefined escape sequence \c`, 'c', 1);
3491 test("!", `undefined escape sequence \!`, '!', 1);
3492 test("&quot;", `undefined escape sequence \&`, '&', 1, true);
3494 test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
3496 test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2);
3497 test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3);
3498 test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
3500 test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2);
3501 test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3);
3502 test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4);
3503 test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5);
3504 test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6);
3505 test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7);
3506 test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
3508 test("ud800" , `invalid UTF character \U0000d800`, '?', 5);
3509 test("udfff" , `invalid UTF character \U0000dfff`, '?', 5);
3510 test("U00110000", `invalid UTF character \U00110000`, '?', 9);
3512 test("xg0" , `undefined escape hex sequence \xg`, 'g', 2);
3513 test("ug000" , `undefined escape hex sequence \ug`, 'g', 2);
3514 test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
3516 test("&BAD;", `unnamed character entity &BAD;` , '?', 5);
3517 test("&quot", `unterminated named entity &quot;`, '?', 5);
3518 test("&quot", `unterminated named entity &quot;`, '?', 5);
3520 test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
// Unit test: smoke-test the full Lexer on the source "int" -- one int32
// token, then endOfFile forever after.
// NOTE(review): damaged extraction -- the `unittest` keyword, braces and
// the `TOK tok;` declaration were dropped from this view.
3525 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3526 /* Not much here, just trying things out.
3528 string text
= "int"; // We rely on the implicit null-terminator
3529 ErrorSink errorSink
= new ErrorSinkStderr
;
3530 scope Lexer lex1
= new Lexer(null, text
.ptr
, 0, text
.length
, false, false, errorSink
, null);
3532 tok
= lex1
.nextToken();
3533 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
3534 assert(tok
== TOK
.int32
);
// Once exhausted, nextToken() keeps returning endOfFile (idempotent).
3535 tok
= lex1
.nextToken();
3536 assert(tok
== TOK
.endOfFile
);
3537 tok
= lex1
.nextToken();
3538 assert(tok
== TOK
.endOfFile
);
3539 tok
= lex1
.nextToken();
3540 assert(tok
== TOK
.endOfFile
);
3545 fprintf(stderr
, "Lexer.unittest %d\n", __LINE__
);
3547 // We don't want to see Lexer error output during these tests.
3548 ErrorSink errorSink
= new ErrorSinkNull
;
3550 // Test malformed input: even malformed input should end in a TOK.endOfFile.
3551 static immutable char[][] testcases
=
3552 [ // Testcase must end with 0 or 0x1A.
3553 [0], // not malformed, but pathological
3556 ['{', '{', 'q', '{', 0],
3564 foreach (testcase
; testcases
)
3566 scope Lexer lex2
= new Lexer(null, testcase
.ptr
, 0, testcase
.length
-1, false, false, errorSink
, null);
3567 TOK tok
= lex2
.nextToken();
3568 size_t iterations
= 1;
3569 while ((tok
!= TOK
.endOfFile
) && (iterations
++ < testcase
.length
))
3571 tok
= lex2
.nextToken();
3573 assert(tok
== TOK
.endOfFile
);
3574 tok
= lex2
.nextToken();
3575 assert(tok
== TOK
.endOfFile
);