dmd2/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  65 /********************************************
  66  * Do our own char maps
  67  */
  68
  69 static unsigned char cmtable[256];
  70
  71 const int CMoctal =     0x1;
  72 const int CMhex =       0x2;
  73 const int CMidchar =    0x4;
  74
  75 inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
  76 inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
  77 inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
  78
  79 static void cmtable_init()
  80 {
  81     for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++)
  82     {
  83         if ('0' <= c && c <= '7')
  84             cmtable[c] |= CMoctal;
  85         if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
  86             cmtable[c] |= CMhex;
  87         if (isalnum(c) || c == '_')
  88             cmtable[c] |= CMidchar;
  89     }
  90 }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 110 #ifdef DEBUG
 111 void Token::print()
 112 {
 113     fprintf(stdmsg, "%s\n", toChars());
 114 }
 115 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken, bool dltSyntax)
 268     : loc(mod, 1), dltSyntax(dltSyntax)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     this->nesting = 0;
 281     this->indent = 0;
 282     this->atStartOfLine = 1;
 283     this->incLineno = 0;
 284     //initKeywords();
 285
 286     /* If first line starts with '#!', ignore the line
 287      */
 288
 289     if (p[0] == '#' && p[1] =='!')
 290     {
 291         p += 2;
 292         while (1)
 293         {   unsigned char c = *p;
 294             switch (c)
 295             {
 296                 case '\n':
 297                     p++;
 298                     break;
 299
 300                 case '\r':
 301                     p++;
 302                     if (*p == '\n')
 303                         p++;
 304                     break;
 305
 306                 case 0:
 307                 case 0x1A:
 308                     break;
 309
 310                 default:
 311                     if (c & 0x80)
 312                     {   unsigned u = decodeUTF();
 313                         if (u == PS || u == LS)
 314                             break;
 315                     }
 316                     p++;
 317                     continue;
 318             }
 319             break;
 320         }
 321         loc.linnum = 2;
 322     }
 323 }
 324
 325
 326 void Lexer::error(const char *format, ...)
 327 {
 328     if (mod && !global.gag)
 329     {
 330         char *p = loc.toChars();
 331         if (*p)
 332             fprintf(stdmsg, "%s: ", p);
 333         mem.free(p);
 334
 335         va_list ap;
 336         va_start(ap, format);
 337         vfprintf(stdmsg, format, ap);
 338         va_end(ap);
 339
 340         fprintf(stdmsg, "\n");
 341         fflush(stdmsg);
 342
 343         if (global.errors >= 20)        // moderate blizzard of cascading messages
 344             fatal();
 345     }
 346     global.errors++;
 347 }
 348
 349 void Lexer::error(Loc loc, const char *format, ...)
 350 {
 351     if (mod && !global.gag)
 352     {
 353         char *p = loc.toChars();
 354         if (*p)
 355             fprintf(stdmsg, "%s: ", p);
 356         mem.free(p);
 357
 358         va_list ap;
 359         va_start(ap, format);
 360         vfprintf(stdmsg, format, ap);
 361         va_end(ap);
 362
 363         fprintf(stdmsg, "\n");
 364         fflush(stdmsg);
 365
 366         if (global.errors >= 20)        // moderate blizzard of cascading messages
 367             fatal();
 368     }
 369     global.errors++;
 370 }
 371
 372 TOK Lexer::nextToken()
 373 {   Token *t;
 374
 375     if (token.next)
 376     {
 377         t = token.next;
 378         memcpy(&token,t,sizeof(Token));
 379         t->next = freelist;
 380         freelist = t;
 381     }
 382     else
 383     {
 384         scan(&token);
 385     }
 386     //token.print();
 387     return token.value;
 388 }
 389
 390 Token *Lexer::peek(Token *ct)
 391 {   Token *t;
 392
 393     if (ct->next)
 394         t = ct->next;
 395     else
 396     {
 397         t = new Token();
 398         scan(t);
 399         t->next = NULL;
 400         ct->next = t;
 401     }
 402     return t;
 403 }
 404
 405 /*********************************
 406  * tk is on the opening (.
 407  * Look ahead and return token that is past the closing ).
 408  */
 409
 410 Token *Lexer::peekPastParen(Token *tk)
 411 {
 412     //printf("peekPastParen()\n");
 413     int parens = 1;
 414     int curlynest = 0;
 415     while (1)
 416     {
 417         tk = peek(tk);
 418         //tk->print();
 419         switch (tk->value)
 420         {
 421             case TOKlparen:
 422                 parens++;
 423                 continue;
 424
 425             case TOKrparen:
 426                 --parens;
 427                 if (parens)
 428                     continue;
 429                 tk = peek(tk);
 430                 break;
 431
 432             case TOKlcurly:
 433                 curlynest++;
 434                 continue;
 435
 436             case TOKrcurly:
 437                 if (--curlynest >= 0)
 438                     continue;
 439                 break;
 440
 441             case TOKsemicolon:
 442                 if (curlynest)
 443                     continue;
 444                 break;
 445
 446             case TOKeof:
 447                 break;
 448
 449             default:
 450                 continue;
 451         }
 452         return tk;
 453     }
 454 }
 455
 456 /**********************************
 457  * Determine if string is a valid Identifier.
 458  * Placed here because of commonality with Lexer functionality.
 459  * Returns:
 460  *      0       invalid
 461  */
 462
 463 int Lexer::isValidIdentifier(char *p)
 464 {
 465     size_t len;
 466     size_t idx;
 467
 468     if (!p || !*p)
 469         goto Linvalid;
 470
 471     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 472         goto Linvalid;
 473
 474     len = strlen(p);
 475     idx = 0;
 476     while (p[idx])
 477     {   dchar_t dc;
 478
 479         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 480         if (q)
 481             goto Linvalid;
 482
 483         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 484             goto Linvalid;
 485     }
 486     return 1;
 487
 488 Linvalid:
 489     return 0;
 490 }
 491
 492 /****************************
 493  * Turn next token in buffer into a token.
 494  */
 495
 496 void Lexer::scan(Token *t)
 497 {
 498     unsigned lastLine = loc.linnum;
 499     unsigned linnum;
 500
 501     // Delayed line-number updating
 502     if (incLineno)
 503     {
 504         assert(incLineno == 1);
 505         incLineno = 0;
 506         loc.linnum++;
 507     }
 508
 509     t->blockComment = NULL;
 510     t->lineComment = NULL;
 511     while (1)
 512     {
 513         t->ptr = p;
 514
 515         if (dltSyntax && atStartOfLine) {
 516                 // Check indent
 517                 int i;
 518                 for (i = 0; p[i] == '\t'; i++) {
 519                 }
 520                 if (p[i] == ' ') {
 521                     error("Whitespace error: use tabs to indent!");
 522                 }
 523                 if (p[i] == '#') {
 524                     p += i;
 525                     atStartOfLine = 0;
 526                 } else if (p[i] != '\n' && p[i] != '\r') {
 527                     if (p[i] == '\0')
 528                         i = 0;                  // End-of-file always has no indent
 529                     if (i > indent) {
 530                         error("unexpected indentation (expected %d tabs, not %d)",
 531                                 indent, i);
 532                     } else if (i < indent) {
 533                         indent -= 1;
 534                         t->value = TOKrcurly;
 535                         return;
 536                     }
 537                     atStartOfLine = 0;
 538                 } /* else ignore blank line */
 539         }
 540
 541         //printf("p = %p, *p = '%c'\n",p,*p);
 542         switch (*p)
 543         {
 544             case 0:
 545             case 0x1A:
 546                 t->value = TOKeof;                      // end of file
 547                 return;
 548
 549             case ' ':
 550             case '\t':
 551             case '\v':
 552             case '\f':
 553                 p++;
 554                 continue;                       // skip white space
 555
 556             case '\r':
 557                 if (p[1] == '\n') {             // if CRLF
 558                     p++;
 559                     continue;
 560                 }
 561                 // fall-through
 562             case '\n':
 563                 p++;
 564                 if (dltSyntax)
 565                 {
 566                     // Delay incrementing the line number until after sending
 567                     // the TOKendline, for better error messages
 568                     assert(!incLineno);
 569                     incLineno++;
 570
 571                     if (!nesting)
 572                     {
 573                         atStartOfLine = 1;
 574                         t->value = TOKendline;
 575                         return;
 576                     }
 577                 }
 578                 else
 579                     loc.linnum++;
 580                 continue;                       // Ignore newlines inside brackets
 581             case '0':   case '1':   case '2':   case '3':   case '4':
 582             case '5':   case '6':   case '7':   case '8':   case '9':
 583                 t->value = number(t);
 584                 return;
 585
 586 #if CSTRINGS
 587             case '\'':
 588                 t->value = charConstant(t, 0);
 589                 return;
 590
 591             case '"':
 592                 t->value = stringConstant(t,0);
 593                 return;
 594
 595             case 'l':
 596             case 'L':
 597                 if (p[1] == '\'')
 598                 {
 599                     p++;
 600                     t->value = charConstant(t, 1);
 601                     return;
 602                 }
 603                 else if (p[1] == '"')
 604                 {
 605                     p++;
 606                     t->value = stringConstant(t, 1);
 607                     return;
 608                 }
 609 #else
 610             case '\'':
 611                 t->value = charConstant(t,0);
 612                 return;
 613
 614             case 'r':
 615                 if (p[1] != '"')
 616                     goto case_ident;
 617                 p++;
 618             case '`':
 619                 t->value = wysiwygStringConstant(t, *p);
 620                 return;
 621
 622             case 'x':
 623                 if (p[1] != '"')
 624                     goto case_ident;
 625                 p++;
 626                 t->value = hexStringConstant(t);
 627                 return;
 628
 629 #if V2
 630             case 'q':
 631                 if (p[1] == '"')
 632                 {
 633                     p++;
 634                     t->value = delimitedStringConstant(t);
 635                     return;
 636                 }
 637                 else if (p[1] == '{')
 638                 {
 639                     p++;
 640                     t->value = tokenStringConstant(t);
 641                     return;
 642                 }
 643                 else
 644                     goto case_ident;
 645 #endif
 646
 647             case '"':
 648                 t->value = escapeStringConstant(t,0);
 649                 return;
 650
 651             case '\\':                  // escaped string literal
 652             {   unsigned c;
 653
 654                 stringbuffer.reset();
 655                 do
 656                 {
 657                     p++;
 658                     switch (*p)
 659                     {
 660                         case 'u':
 661                         case 'U':
 662                         case '&':
 663                             c = escapeSequence();
 664                             stringbuffer.writeUTF8(c);
 665                             break;
 666
 667                         default:
 668                             c = escapeSequence();
 669                             stringbuffer.writeByte(c);
 670                             break;
 671                     }
 672                 } while (*p == '\\');
 673                 t->len = stringbuffer.offset;
 674                 stringbuffer.writeByte(0);
 675                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 676                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 677                 t->postfix = 0;
 678                 t->value = TOKstring;
 679                 return;
 680             }
 681
 682             case 'l':
 683             case 'L':
 684 #endif
 685             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 686             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 687             case 'k':               case 'm':   case 'n':   case 'o':
 688 #if V2
 689             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 690 #else
 691             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 692 #endif
 693             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 694             case 'z':
 695             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 696             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 697             case 'K':               case 'M':   case 'N':   case 'O':
 698             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 699             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 700             case 'Z':
 701             case '_':
 702             case_ident:
 703             {   unsigned char c;
 704                 StringValue *sv;
 705                 Identifier *id;
 706
 707                 do
 708                 {
 709                     c = *++p;
 710                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 711                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 712                 id = (Identifier *) sv->ptrvalue;
 713                 if (!id)
 714                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 715                     sv->ptrvalue = id;
 716                 }
 717                 t->ident = id;
 718                 t->value = (enum TOK) id->value;
 719                 anyToken = 1;
 720                 if (*t->ptr == '_')     // if special identifier token
 721                 {
 722                     static char date[11+1];
 723                     static char time[8+1];
 724                     static char timestamp[24+1];
 725
 726                     if (!date[0])       // lazy evaluation
 727                     {   time_t t;
 728                         char *p;
 729
 730                         ::time(&t);
 731                         p = ctime(&t);
 732                         assert(p);
 733                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 734                         sprintf(time, "%.8s", p + 11);
 735                         sprintf(timestamp, "%.24s", p);
 736                     }
 737
 738 #if !V2
 739                     if (mod && id == Id::FILE)
 740                     {
 741                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 742                         goto Lstring;
 743                     }
 744                     else if (mod && id == Id::LINE)
 745                     {
 746                         t->value = TOKint64v;
 747                         t->uns64value = loc.linnum;
 748                     }
 749                     else
 750 #endif
 751                     if (id == Id::DATE)
 752                     {
 753                         t->ustring = (unsigned char *)date;
 754                         goto Lstring;
 755                     }
 756                     else if (id == Id::TIME)
 757                     {
 758                         t->ustring = (unsigned char *)time;
 759                         goto Lstring;
 760                     }
 761                     else if (id == Id::VENDOR)
 762                     {
 763 #ifdef IN_GCC
 764                         t->ustring = (unsigned char *)"GDC";
 765 #else
 766                         t->ustring = (unsigned char *)"Digital Mars D";
 767 #endif
 768                         goto Lstring;
 769                     }
 770                     else if (id == Id::TIMESTAMP)
 771                     {
 772                         t->ustring = (unsigned char *)timestamp;
 773                      Lstring:
 774                         t->value = TOKstring;
 775                      Llen:
 776                         t->postfix = 0;
 777                         t->len = strlen((char *)t->ustring);
 778                     }
 779                     else if (id == Id::VERSIONX)
 780                     {   unsigned major = 0;
 781                         unsigned minor = 0;
 782
 783                         for (char *p = global.version + 1; 1; p++)
 784                         {
 785                             char c = *p;
 786                             if (isdigit(c))
 787                                 minor = minor * 10 + c - '0';
 788                             else if (c == '.')
 789                             {   major = minor;
 790                                 minor = 0;
 791                             }
 792                             else
 793                                 break;
 794                         }
 795                         t->value = TOKint64v;
 796                         t->uns64value = major * 1000 + minor;
 797                     }
 798 #if V2
 799                     else if (id == Id::EOFX)
 800                     {
 801                         t->value = TOKeof;
 802                         // Advance scanner to end of file
 803                         while (!(*p == 0 || *p == 0x1A))
 804                             p++;
 805                     }
 806 #endif
 807                 }
 808                 //printf("t->value = %d\n",t->value);
 809                 return;
 810             }
 811
 812             case '/':
 813                 p++;
 814                 switch (*p)
 815                 {
 816                     case '=':
 817                         p++;
 818                         t->value = TOKdivass;
 819                         return;
 820
 821                     case '*':
 822                         p++;
 823                         linnum = loc.linnum;
 824                         while (1)
 825                         {
 826                             while (1)
 827                             {   unsigned char c = *p;
 828                                 switch (c)
 829                                 {
 830                                     case '/':
 831                                         break;
 832
 833                                     case '\n':
 834                                         loc.linnum++;
 835                                         p++;
 836                                         continue;
 837
 838                                     case '\r':
 839                                         p++;
 840                                         if (*p != '\n')
 841                                             loc.linnum++;
 842                                         continue;
 843
 844                                     case 0:
 845                                     case 0x1A:
 846                                         error("unterminated /* */ comment");
 847                                         p = end;
 848                                         t->value = TOKeof;
 849                                         return;
 850
 851                                     default:
 852                                         if (c & 0x80)
 853                                         {   unsigned u = decodeUTF();
 854                                             if (u == PS || u == LS)
 855                                                 loc.linnum++;
 856                                         }
 857                                         p++;
 858                                         continue;
 859                                 }
 860                                 break;
 861                             }
 862                             p++;
 863                             if (p[-2] == '*' && p - 3 != t->ptr)
 864                                 break;
 865                         }
 866                         if (commentToken)
 867                         {
 868                             t->value = TOKcomment;
 869                             return;
 870                         }
 871                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 872                         {   // if /** but not /**/
 873                             getDocComment(t, lastLine == linnum);
 874                         }
 875                         continue;
 876
 877                     case '/':           // do // style comments
 878                         linnum = loc.linnum;
 879                         while (1)
 880                         {   unsigned char c = *++p;
 881                             switch (c)
 882                             {
 883                                 case '\n':
 884                                     break;
 885
 886                                 case '\r':
 887                                     if (p[1] == '\n')
 888                                         p++;
 889                                     break;
 890
 891                                 case 0:
 892                                 case 0x1A:
 893                                     if (commentToken)
 894                                     {
 895                                         p = end;
 896                                         t->value = TOKcomment;
 897                                         return;
 898                                     }
 899                                     if (doDocComment && t->ptr[2] == '/')
 900                                         getDocComment(t, lastLine == linnum);
 901                                     p = end;
 902                                     t->value = TOKeof;
 903                                     return;
 904
 905                                 default:
 906                                     if (c & 0x80)
 907                                     {   unsigned u = decodeUTF();
 908                                         if (u == PS || u == LS)
 909                                             break;
 910                                     }
 911                                     continue;
 912                             }
 913                             break;
 914                         }
 915
 916                         if (commentToken)
 917                         {
 918                             p++;
 919                             loc.linnum++;
 920                             t->value = TOKcomment;
 921                             return;
 922                         }
 923                         if (doDocComment && t->ptr[2] == '/')
 924                             getDocComment(t, lastLine == linnum);
 925
 926                         p++;
 927                         loc.linnum++;
 928                         continue;
 929
 930                     case '+':
 931                     {   int nest;
 932
 933                         linnum = loc.linnum;
 934                         p++;
 935                         nest = 1;
 936                         while (1)
 937                         {   unsigned char c = *p;
 938                             switch (c)
 939                             {
 940                                 case '/':
 941                                     p++;
 942                                     if (*p == '+')
 943                                     {
 944                                         p++;
 945                                         nest++;
 946                                     }
 947                                     continue;
 948
 949                                 case '+':
 950                                     p++;
 951                                     if (*p == '/')
 952                                     {
 953                                         p++;
 954                                         if (--nest == 0)
 955                                             break;
 956                                     }
 957                                     continue;
 958
 959                                 case '\r':
 960                                     p++;
 961                                     if (*p != '\n')
 962                                         loc.linnum++;
 963                                     continue;
 964
 965                                 case '\n':
 966                                     loc.linnum++;
 967                                     p++;
 968                                     continue;
 969
 970                                 case 0:
 971                                 case 0x1A:
 972                                     error("unterminated /+ +/ comment");
 973                                     p = end;
 974                                     t->value = TOKeof;
 975                                     return;
 976
 977                                 default:
 978                                     if (c & 0x80)
 979                                     {   unsigned u = decodeUTF();
 980                                         if (u == PS || u == LS)
 981                                             loc.linnum++;
 982                                     }
 983                                     p++;
 984                                     continue;
 985                             }
 986                             break;
 987                         }
 988                         if (commentToken)
 989                         {
 990                             t->value = TOKcomment;
 991                             return;
 992                         }
 993                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
 994                         {   // if /++ but not /++/
 995                             getDocComment(t, lastLine == linnum);
 996                         }
 997                         continue;
 998                     }
 999                 }
1000                 t->value = TOKdiv;
1001                 return;
1002
1003             case '.':
1004                 p++;
1005                 if (isdigit(*p))
1006                 {   /* Note that we don't allow ._1 and ._ as being
1007                      * valid floating point numbers.
1008                      */
1009                     p--;
1010                     t->value = inreal(t);
1011                 }
1012                 else if (p[0] == '.')
1013                 {
1014                     if (p[1] == '.')
1015                     {   p += 2;
1016                         t->value = TOKdotdotdot;
1017                     }
1018                     else
1019                     {   p++;
1020                         t->value = TOKslice;
1021                     }
1022                 }
1023                 else
1024                     t->value = TOKdot;
1025                 return;
1026
1027             case '&':
1028                 p++;
1029                 if (*p == '=')
1030                 {   p++;
1031                     t->value = TOKandass;
1032                 }
1033                 else if (*p == '&')
1034                 {   p++;
1035                     t->value = TOKandand;
1036                     if (dltSyntax)
1037                         error("Use 'and' instead of '&&'");
1038                 }
1039                 else
1040                     t->value = TOKand;
1041                 return;
1042
1043             case '|':
1044                 p++;
1045                 if (*p == '=')
1046                 {   p++;
1047                     t->value = TOKorass;
1048                 }
1049                 else if (*p == '|')
1050                 {   p++;
1051                     t->value = TOKoror;
1052                     if (dltSyntax)
1053                         error("Use 'or' instead of '||'");
1054                 }
1055                 else
1056                     t->value = TOKor;
1057                 return;
1058
1059             case '-':
1060                 p++;
1061                 if (*p == '=')
1062                 {   p++;
1063                     t->value = TOKminass;
1064                 }
1065 #if 0
1066                 else if (*p == '>')
1067                 {   p++;
1068                     t->value = TOKarrow;
1069                 }
1070 #endif
1071                 else if (*p == '-')
1072                 {   p++;
1073                     t->value = TOKminusminus;
1074                 }
1075                 else
1076                     t->value = TOKmin;
1077                 return;
1078
1079             case '+':
1080                 p++;
1081                 if (*p == '=')
1082                 {   p++;
1083                     t->value = TOKaddass;
1084                 }
1085                 else if (*p == '+')
1086                 {   p++;
1087                     t->value = TOKplusplus;
1088                 }
1089                 else
1090                     t->value = TOKadd;
1091                 return;
1092
1093             case '<':
1094                 p++;
1095                 if (*p == '=')
1096                 {   p++;
1097                     t->value = TOKle;                   // <=
1098                 }
1099                 else if (*p == '<')
1100                 {   p++;
1101                     if (*p == '=')
1102                     {   p++;
1103                         t->value = TOKshlass;           // <<=
1104                     }
1105                     else
1106                         t->value = TOKshl;              // <<
1107                 }
1108                 else if (*p == '>')
1109                 {   p++;
1110                     if (*p == '=')
1111                     {   p++;
1112                         t->value = TOKleg;              // <>=
1113                     }
1114                     else
1115                         t->value = TOKlg;               // <>
1116                 }
1117                 else
1118                     t->value = TOKlt;                   // <
1119                 return;
1120
1121             case '>':
1122                 p++;
1123                 if (*p == '=')
1124                 {   p++;
1125                     t->value = TOKge;                   // >=
1126                 }
1127                 else if (*p == '>')
1128                 {   p++;
1129                     if (*p == '=')
1130                     {   p++;
1131                         t->value = TOKshrass;           // >>=
1132                     }
1133                     else if (*p == '>')
1134                     {   p++;
1135                         if (*p == '=')
1136                         {   p++;
1137                             t->value = TOKushrass;      // >>>=
1138                         }
1139                         else
1140                             t->value = TOKushr;         // >>>
1141                     }
1142                     else
1143                         t->value = TOKshr;              // >>
1144                 }
1145                 else
1146                     t->value = TOKgt;                   // >
1147                 return;
1148
1149             case '!':
1150                 p++;
1151                 if (*p == '=')
1152                 {   p++;
1153                     if (*p == '=' && global.params.Dversion == 1)
1154                     {   p++;
1155                         t->value = TOKnotidentity;      // !==
1156                     }
1157                     else
1158                         t->value = TOKnotequal;         // !=
1159                 }
1160                 else if (*p == '<')
1161                 {   p++;
1162                     if (*p == '>')
1163                     {   p++;
1164                         if (*p == '=')
1165                         {   p++;
1166                             t->value = TOKunord; // !<>=
1167                         }
1168                         else
1169                             t->value = TOKue;   // !<>
1170                     }
1171                     else if (*p == '=')
1172                     {   p++;
1173                         t->value = TOKug;       // !<=
1174                     }
1175                     else
1176                         t->value = TOKuge;      // !<
1177                 }
1178                 else if (*p == '>')
1179                 {   p++;
1180                     if (*p == '=')
1181                     {   p++;
1182                         t->value = TOKul;       // !>=
1183                     }
1184                     else
1185                         t->value = TOKule;      // !>
1186                 }
1187                 else
1188                     t->value = TOKnot;          // !
1189                 return;
1190
1191             case '=':
1192                 p++;
1193                 if (*p == '=')
1194                 {   p++;
1195                     if (*p == '=' && global.params.Dversion == 1)
1196                     {   p++;
1197                         t->value = TOKidentity;         // ===
1198                     }
1199                     else
1200                         t->value = TOKequal;            // ==
1201                 }
1202                 else
1203                     t->value = TOKassign;               // =
1204                 return;
1205
1206             case '~':
1207                 p++;
1208                 if (*p == '=')
1209                 {   p++;
1210                     t->value = TOKcatass;               // ~=
1211                 }
1212                 else
1213                     t->value = TOKtilde;                // ~
1214                 return;
1215
1216 #define NESTED(cin,tokin,cout,tokout) \
1217             case cin: nesting++; p++; t->value = tokin; return;\
1218             case cout: if (nesting == 0) {error("Unexpected '%c'", cout);} else {nesting--;} p++; t->value = tokout; return;
1219
1220             NESTED('(', TOKlparen, ')', TOKrparen)
1221             NESTED('[', TOKlbracket, ']', TOKrbracket)
1222             NESTED('{', TOKlcurly, '}', TOKrcurly)
1223 #undef NESTED
1224
1225 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1226             SINGLE('?', TOKquestion)
1227             SINGLE(',', TOKcomma)
1228             SINGLE(';', TOKsemicolon)
1229             SINGLE('$', TOKdollar)
1230             SINGLE('@', TOKat)
1231
1232 #undef SINGLE
1233
1234             case ':':
1235                 p++;
1236                 if (!nesting)
1237                         indent += 1;
1238                 t->value = TOKcolon;
1239                 return;
1240
1241 #define DOUBLE(c1,tok1,c2,tok2)         \
1242             case c1:                    \
1243                 p++;                    \
1244                 if (*p == c2)           \
1245                 {   p++;                \
1246                     t->value = tok2;    \
1247                 }                       \
1248                 else                    \
1249                     t->value = tok1;    \
1250                 return;
1251
1252             DOUBLE('*', TOKmul, '=', TOKmulass)
1253             DOUBLE('%', TOKmod, '=', TOKmodass)
1254             DOUBLE('^', TOKxor, '=', TOKxorass)
1255
1256 #undef DOUBLE
1257
1258             case '#':           // do # style comments and pragmas
1259                 if (dltSyntax)
1260                 {
1261                     do { p++; } while (*p != '\n');
1262                 }
1263                 else
1264                 {
1265                     p++;
1266                     pragma();
1267                 }
1268                 continue;
1269
1270             default:
1271             {   unsigned char c = *p;
1272
1273                 if (c & 0x80)
1274                 {   unsigned u = decodeUTF();
1275
1276                     // Check for start of unicode identifier
1277                     if (isUniAlpha(u))
1278                         goto case_ident;
1279
1280                     if (u == PS || u == LS)
1281                     {
1282                         loc.linnum++;
1283                         p++;
1284                         continue;
1285                     }
1286                 }
1287                 if (isprint(c))
1288                     error("unsupported char '%c'", c);
1289                 else
1290                     error("unsupported char 0x%02x", c);
1291                 p++;
1292                 continue;
1293             }
1294         }
1295     }
1296 }
1297
1298 /*******************************************
1299  * Parse escape sequence.
1300  */
1301
1302 unsigned Lexer::escapeSequence()
1303 {   unsigned c;
1304     int n;
1305     int ndigits;
1306
1307     c = *p;
1308     switch (c)
1309     {
1310         case '\'':
1311         case '"':
1312         case '?':
1313         case '\\':
1314         Lconsume:
1315                 p++;
1316                 break;
1317
1318         case 'a':       c = 7;          goto Lconsume;
1319         case 'b':       c = 8;          goto Lconsume;
1320         case 'f':       c = 12;         goto Lconsume;
1321         case 'n':       c = 10;         goto Lconsume;
1322         case 'r':       c = 13;         goto Lconsume;
1323         case 't':       c = 9;          goto Lconsume;
1324         case 'v':       c = 11;         goto Lconsume;
1325
1326         case 'u':
1327                 ndigits = 4;
1328                 goto Lhex;
1329         case 'U':
1330                 ndigits = 8;
1331                 goto Lhex;
1332         case 'x':
1333                 ndigits = 2;
1334         Lhex:
1335                 p++;
1336                 c = *p;
1337                 if (ishex(c))
1338                 {   unsigned v;
1339
1340                     n = 0;
1341                     v = 0;
1342                     while (1)
1343                     {
1344                         if (isdigit(c))
1345                             c -= '0';
1346                         else if (islower(c))
1347                             c -= 'a' - 10;
1348                         else
1349                             c -= 'A' - 10;
1350                         v = v * 16 + c;
1351                         c = *++p;
1352                         if (++n == ndigits)
1353                             break;
1354                         if (!ishex(c))
1355                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1356                             break;
1357                         }
1358                     }
1359                     if (ndigits != 2 && !utf_isValidDchar(v))
1360                         error("invalid UTF character \\U%08x", v);
1361                     c = v;
1362                 }
1363                 else
1364                     error("undefined escape hex sequence \\%c\n",c);
1365                 break;
1366
1367         case '&':                       // named character entity
1368                 for (unsigned char *idstart = ++p; 1; p++)
1369                 {
1370                     switch (*p)
1371                     {
1372                         case ';':
1373                             c = HtmlNamedEntity(idstart, p - idstart);
1374                             if (c == ~0)
1375                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1376                                 c = ' ';
1377                             }
1378                             p++;
1379                             break;
1380
1381                         default:
1382                             if (isalpha(*p) ||
1383                                 (p != idstart + 1 && isdigit(*p)))
1384                                 continue;
1385                             error("unterminated named entity");
1386                             break;
1387                     }
1388                     break;
1389                 }
1390                 break;
1391
1392         case 0:
1393         case 0x1A:                      // end of file
1394                 c = '\\';
1395                 break;
1396
1397         default:
1398                 if (isoctal(c))
1399                 {   unsigned v;
1400
1401                     n = 0;
1402                     v = 0;
1403                     do
1404                     {
1405                         v = v * 8 + (c - '0');
1406                         c = *++p;
1407                     } while (++n < 3 && isoctal(c));
1408                     c = v;
1409                     if (c > 0xFF)
1410                         error("0%03o is larger than a byte", c);
1411                 }
1412                 else
1413                     error("undefined escape sequence \\%c\n",c);
1414                 break;
1415     }
1416     return c;
1417 }
1418
1419 /**************************************
1420  */
1421
1422 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1423 {   unsigned c;
1424     Loc start = loc;
1425
1426     p++;
1427     stringbuffer.reset();
1428     while (1)
1429     {
1430         c = *p++;
1431         switch (c)
1432         {
1433             case '\n':
1434                 loc.linnum++;
1435                 break;
1436
1437             case '\r':
1438                 if (*p == '\n')
1439                     continue;   // ignore
1440                 c = '\n';       // treat EndOfLine as \n character
1441                 loc.linnum++;
1442                 break;
1443
1444             case 0:
1445             case 0x1A:
1446                 error("unterminated string constant starting at %s", start.toChars());
1447                 t->ustring = (unsigned char *)"";
1448                 t->len = 0;
1449                 t->postfix = 0;
1450                 return TOKstring;
1451
1452             case '"':
1453             case '`':
1454                 if (c == tc)
1455                 {
1456                     t->len = stringbuffer.offset;
1457                     stringbuffer.writeByte(0);
1458                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1459                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1460                     stringPostfix(t);
1461                     return TOKstring;
1462                 }
1463                 break;
1464
1465             default:
1466                 if (c & 0x80)
1467                 {   p--;
1468                     unsigned u = decodeUTF();
1469                     p++;
1470                     if (u == PS || u == LS)
1471                         loc.linnum++;
1472                     stringbuffer.writeUTF8(u);
1473                     continue;
1474                 }
1475                 break;
1476         }
1477         stringbuffer.writeByte(c);
1478     }
1479 }
1480
1481 /**************************************
1482  * Lex hex strings:
1483  *      x"0A ae 34FE BD"
1484  */
1485
1486 TOK Lexer::hexStringConstant(Token *t)
1487 {   unsigned c;
1488     Loc start = loc;
1489     unsigned n = 0;
1490     unsigned v;
1491
1492     p++;
1493     stringbuffer.reset();
1494     while (1)
1495     {
1496         c = *p++;
1497         switch (c)
1498         {
1499             case ' ':
1500             case '\t':
1501             case '\v':
1502             case '\f':
1503                 continue;                       // skip white space
1504
1505             case '\r':
1506                 if (*p == '\n')
1507                     continue;                   // ignore
1508                 // Treat isolated '\r' as if it were a '\n'
1509             case '\n':
1510                 loc.linnum++;
1511                 continue;
1512
1513             case 0:
1514             case 0x1A:
1515                 error("unterminated string constant starting at %s", start.toChars());
1516                 t->ustring = (unsigned char *)"";
1517                 t->len = 0;
1518                 t->postfix = 0;
1519                 return TOKstring;
1520
1521             case '"':
1522                 if (n & 1)
1523                 {   error("odd number (%d) of hex characters in hex string", n);
1524                     stringbuffer.writeByte(v);
1525                 }
1526                 t->len = stringbuffer.offset;
1527                 stringbuffer.writeByte(0);
1528                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1529                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1530                 stringPostfix(t);
1531                 return TOKstring;
1532
1533             default:
1534                 if (c >= '0' && c <= '9')
1535                     c -= '0';
1536                 else if (c >= 'a' && c <= 'f')
1537                     c -= 'a' - 10;
1538                 else if (c >= 'A' && c <= 'F')
1539                     c -= 'A' - 10;
1540                 else if (c & 0x80)
1541                 {   p--;
1542                     unsigned u = decodeUTF();
1543                     p++;
1544                     if (u == PS || u == LS)
1545                         loc.linnum++;
1546                     else
1547                         error("non-hex character \\u%x", u);
1548                 }
1549                 else
1550                     error("non-hex character '%c'", c);
1551                 if (n & 1)
1552                 {   v = (v << 4) | c;
1553                     stringbuffer.writeByte(v);
1554                 }
1555                 else
1556                     v = c;
1557                 n++;
1558                 break;
1559         }
1560     }
1561 }
1562
1563
1564 #if V2
1565 /**************************************
1566  * Lex delimited strings:
1567  *      q"(foo(xxx))"   // "foo(xxx)"
1568  *      q"[foo(]"       // "foo("
1569  *      q"/foo]/"       // "foo]"
1570  *      q"HERE
1571  *      foo
1572  *      HERE"           // "foo\n"
1573  * Input:
1574  *      p is on the "
1575  */
1576
1577 TOK Lexer::delimitedStringConstant(Token *t)
1578 {   unsigned c;
1579     Loc start = loc;
1580     unsigned delimleft = 0;
1581     unsigned delimright = 0;
1582     unsigned nest = 1;
1583     unsigned nestcount;
1584     Identifier *hereid = NULL;
1585     unsigned blankrol = 0;
1586     unsigned startline = 0;
1587
1588     p++;
1589     stringbuffer.reset();
1590     while (1)
1591     {
1592         c = *p++;
1593         //printf("c = '%c'\n", c);
1594         switch (c)
1595         {
1596             case '\n':
1597             Lnextline:
1598                 loc.linnum++;
1599                 startline = 1;
1600                 if (blankrol)
1601                 {   blankrol = 0;
1602                     continue;
1603                 }
1604                 if (hereid)
1605                 {
1606                     stringbuffer.writeUTF8(c);
1607                     continue;
1608                 }
1609                 break;
1610
1611             case '\r':
1612                 if (*p == '\n')
1613                     continue;   // ignore
1614                 c = '\n';       // treat EndOfLine as \n character
1615                 goto Lnextline;
1616
1617             case 0:
1618             case 0x1A:
1619                 goto Lerror;
1620
1621             default:
1622                 if (c & 0x80)
1623                 {   p--;
1624                     c = decodeUTF();
1625                     p++;
1626                     if (c == PS || c == LS)
1627                         goto Lnextline;
1628                 }
1629                 break;
1630         }
1631         if (delimleft == 0)
1632         {   delimleft = c;
1633             nest = 1;
1634             nestcount = 1;
1635             if (c == '(')
1636                 delimright = ')';
1637             else if (c == '{')
1638                 delimright = '}';
1639             else if (c == '[')
1640                 delimright = ']';
1641             else if (c == '<')
1642                 delimright = '>';
1643             else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1644             {   // Start of identifier; must be a heredoc
1645                 Token t;
1646                 p--;
1647                 scan(&t);               // read in heredoc identifier
1648                 if (t.value != TOKidentifier)
1649                 {   error("identifier expected for heredoc, not %s", t.toChars());
1650                     delimright = c;
1651                 }
1652                 else
1653                 {   hereid = t.ident;
1654                     //printf("hereid = '%s'\n", hereid->toChars());
1655                     blankrol = 1;
1656                 }
1657                 nest = 0;
1658             }
1659             else
1660             {   delimright = c;
1661                 nest = 0;
1662             }
1663         }
1664         else
1665         {
1666             if (blankrol)
1667             {   error("heredoc rest of line should be blank");
1668                 blankrol = 0;
1669                 continue;
1670             }
1671             if (nest == 1)
1672             {
1673                 if (c == delimleft)
1674                     nestcount++;
1675                 else if (c == delimright)
1676                 {   nestcount--;
1677                     if (nestcount == 0)
1678                         goto Ldone;
1679                 }
1680             }
1681             else if (c == delimright)
1682                 goto Ldone;
1683             if (startline && isalpha(c))
1684             {   Token t;
1685                 unsigned char *psave = p;
1686                 p--;
1687                 scan(&t);               // read in possible heredoc identifier
1688                 //printf("endid = '%s'\n", t.ident->toChars());
1689                 if (t.value == TOKidentifier && t.ident->equals(hereid))
1690                 {   /* should check that rest of line is blank
1691                      */
1692                     goto Ldone;
1693                 }
1694                 p = psave;
1695             }
1696             stringbuffer.writeUTF8(c);
1697             startline = 0;
1698         }
1699     }
1700
1701 Ldone:
1702     if (*p == '"')
1703         p++;
1704     else
1705         error("delimited string must end in %c\"", delimright);
1706     t->len = stringbuffer.offset;
1707     stringbuffer.writeByte(0);
1708     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1709     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1710     stringPostfix(t);
1711     return TOKstring;
1712
1713 Lerror:
1714     error("unterminated string constant starting at %s", start.toChars());
1715     t->ustring = (unsigned char *)"";
1716     t->len = 0;
1717     t->postfix = 0;
1718     return TOKstring;
1719 }
1720
1721 /**************************************
1722  * Lex delimited strings:
1723  *      q{ foo(xxx) } // " foo(xxx) "
1724  *      q{foo(}       // "foo("
1725  *      q{{foo}"}"}   // "{foo}"}""
1726  * Input:
1727  *      p is on the q
1728  */
1729
1730 TOK Lexer::tokenStringConstant(Token *t)
1731 {
1732     unsigned nest = 1;
1733     Loc start = loc;
1734     unsigned char *pstart = ++p;
1735
1736     while (1)
1737     {   Token tok;
1738
1739         scan(&tok);
1740         switch (tok.value)
1741         {
1742             case TOKlcurly:
1743                 nest++;
1744                 continue;
1745
1746             case TOKrcurly:
1747                 if (--nest == 0)
1748                     goto Ldone;
1749                 continue;
1750
1751             case TOKeof:
1752                 goto Lerror;
1753
1754             default:
1755                 continue;
1756         }
1757     }
1758
1759 Ldone:
1760     t->len = p - 1 - pstart;
1761     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1762     memcpy(t->ustring, pstart, t->len);
1763     t->ustring[t->len] = 0;
1764     stringPostfix(t);
1765     return TOKstring;
1766
1767 Lerror:
1768     error("unterminated token string constant starting at %s", start.toChars());
1769     t->ustring = (unsigned char *)"";
1770     t->len = 0;
1771     t->postfix = 0;
1772     return TOKstring;
1773 }
1774
1775 #endif
1776
1777
1778 /**************************************
1779  */
1780
1781 TOK Lexer::escapeStringConstant(Token *t, int wide)
1782 {   unsigned c;
1783     Loc start = loc;
1784
1785     p++;
1786     stringbuffer.reset();
1787     while (1)
1788     {
1789         c = *p++;
1790         switch (c)
1791         {
1792             case '\\':
1793                 switch (*p)
1794                 {
1795                     case 'u':
1796                     case 'U':
1797                     case '&':
1798                         c = escapeSequence();
1799                         stringbuffer.writeUTF8(c);
1800                         continue;
1801
1802                     default:
1803                         c = escapeSequence();
1804                         break;
1805                 }
1806                 break;
1807
1808             case '\n':
1809                 loc.linnum++;
1810                 break;
1811
1812             case '\r':
1813                 if (*p == '\n')
1814                     continue;   // ignore
1815                 c = '\n';       // treat EndOfLine as \n character
1816                 loc.linnum++;
1817                 break;
1818
1819             case '"':
1820                 t->len = stringbuffer.offset;
1821                 stringbuffer.writeByte(0);
1822                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1823                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1824                 stringPostfix(t);
1825                 return TOKstring;
1826
1827             case 0:
1828             case 0x1A:
1829                 p--;
1830                 error("unterminated string constant starting at %s", start.toChars());
1831                 t->ustring = (unsigned char *)"";
1832                 t->len = 0;
1833                 t->postfix = 0;
1834                 return TOKstring;
1835
1836             default:
1837                 if (c & 0x80)
1838                 {
1839                     p--;
1840                     c = decodeUTF();
1841                     if (c == LS || c == PS)
1842                     {   c = '\n';
1843                         loc.linnum++;
1844                     }
1845                     p++;
1846                     stringbuffer.writeUTF8(c);
1847                     continue;
1848                 }
1849                 break;
1850         }
1851         stringbuffer.writeByte(c);
1852     }
1853 }
1854
1855 /**************************************
1856  */
1857
1858 TOK Lexer::charConstant(Token *t, int wide)
1859 {
1860     unsigned c;
1861     TOK tk = TOKcharv;
1862
1863     //printf("Lexer::charConstant\n");
1864     p++;
1865     c = *p++;
1866     switch (c)
1867     {
1868         case '\\':
1869             switch (*p)
1870             {
1871                 case 'u':
1872                     t->uns64value = escapeSequence();
1873                     tk = TOKwcharv;
1874                     break;
1875
1876                 case 'U':
1877                 case '&':
1878                     t->uns64value = escapeSequence();
1879                     tk = TOKdcharv;
1880                     break;
1881
1882                 default:
1883                     t->uns64value = escapeSequence();
1884                     break;
1885             }
1886             break;
1887
1888         case '\n':
1889         L1:
1890             loc.linnum++;
1891         case '\r':
1892         case 0:
1893         case 0x1A:
1894         case '\'':
1895             error("unterminated character constant");
1896             return tk;
1897
1898         default:
1899             if (c & 0x80)
1900             {
1901                 p--;
1902                 c = decodeUTF();
1903                 p++;
1904                 if (c == LS || c == PS)
1905                     goto L1;
1906                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1907                     tk = TOKwcharv;
1908                 else
1909                     tk = TOKdcharv;
1910             }
1911             t->uns64value = c;
1912             break;
1913     }
1914
1915     if (*p != '\'')
1916     {   error("unterminated character constant");
1917         return tk;
1918     }
1919     p++;
1920     return tk;
1921 }
1922
1923 /***************************************
1924  * Get postfix of string literal.
1925  */
1926
1927 void Lexer::stringPostfix(Token *t)
1928 {
1929     switch (*p)
1930     {
1931         case 'c':
1932         case 'w':
1933         case 'd':
1934             t->postfix = *p;
1935             p++;
1936             break;
1937
1938         default:
1939             t->postfix = 0;
1940             break;
1941     }
1942 }
1943
/***************************************
 * Read \u or \U unicode sequence.
 * (Compiled out by the #if 0 below.)
 * Input:
 *      u       'u' or 'U'; 'U' expects 8 hex digits, anything else 4
 * Returns:
 *      the value accumulated from the hex digits read
 */

#if 0
unsigned Lexer::wchar(unsigned u)
{
    const unsigned ndigits = (u == 'U') ? 8 : 4;
    unsigned result = 0;

    for (unsigned count = 0; ; count++)
    {
        ++p;
        if (count == ndigits)
            break;

        unsigned char ch = *p;
        if (!ishex(ch))
        {
            error("\\%c sequence must be followed by %d hex characters", u, ndigits);
            break;
        }

        // Convert the hex digit character to its numeric value
        unsigned char digit;
        if (isdigit(ch))
            digit = ch - '0';
        else if (islower(ch))
            digit = ch - ('a' - 10);
        else
            digit = ch - ('A' - 10);

        result = (result << 4) | digit;
    }
    return result;
}
#endif
1982
1983 /**************************************
1984  * Read in a number.
1985  * If it's an integer, store it in tok.TKutok.Vlong.
1986  *      integers can be decimal, octal or hex
1987  *      Handle the suffixes U, UL, LU, L, etc.
1988  * If it's double, store it in tok.TKutok.Vdouble.
1989  * Returns:
1990  *      TKnum
1991  *      TKdouble,...
1992  */
1993
1994 TOK Lexer::number(Token *t)
1995 {
1996     // We use a state machine to collect numbers
1997     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1998         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1999         STATE_hexh, STATE_error };
2000     enum STATE state;
2001
2002     enum FLAGS
2003     {   FLAGS_decimal  = 1,             // decimal
2004         FLAGS_unsigned = 2,             // u or U suffix
2005         FLAGS_long     = 4,             // l or L suffix
2006     };
2007     enum FLAGS flags = FLAGS_decimal;
2008
2009     int i;
2010     int base;
2011     unsigned c;
2012     unsigned char *start;
2013     TOK result;
2014
2015     //printf("Lexer::number()\n");
2016     state = STATE_initial;
2017     base = 0;
2018     stringbuffer.reset();
2019     start = p;
2020     while (1)
2021     {
2022         c = *p;
2023         switch (state)
2024         {
2025             case STATE_initial:         // opening state
2026                 if (c == '0')
2027                     state = STATE_0;
2028                 else
2029                     state = STATE_decimal;
2030                 break;
2031
2032             case STATE_0:
2033                 flags = (FLAGS) (flags & ~FLAGS_decimal);
2034                 switch (c)
2035                 {
2036 #if ZEROH
2037                     case 'H':                   // 0h
2038                     case 'h':
2039                         goto hexh;
2040 #endif
2041                     case 'X':
2042                     case 'x':
2043                         state = STATE_hex0;
2044                         break;
2045
2046                     case '.':
2047                         if (p[1] == '.')        // .. is a separate token
2048                             goto done;
2049                     case 'i':
2050                     case 'f':
2051                     case 'F':
2052                         goto real;
2053 #if ZEROH
2054                     case 'E':
2055                     case 'e':
2056                         goto case_hex;
2057 #endif
2058                     case 'B':
2059                     case 'b':
2060                         state = STATE_binary0;
2061                         break;
2062
2063                     case '0': case '1': case '2': case '3':
2064                     case '4': case '5': case '6': case '7':
2065                         state = STATE_octal;
2066                         break;
2067
2068 #if ZEROH
2069                     case '8': case '9': case 'A':
2070                     case 'C': case 'D': case 'F':
2071                     case 'a': case 'c': case 'd': case 'f':
2072                     case_hex:
2073                         state = STATE_hexh;
2074                         break;
2075 #endif
2076                     case '_':
2077                         state = STATE_octal;
2078                         p++;
2079                         continue;
2080
2081                     case 'L':
2082                         if (p[1] == 'i')
2083                             goto real;
2084                         goto done;
2085
2086                     default:
2087                         goto done;
2088                 }
2089                 break;
2090
2091             case STATE_decimal:         // reading decimal number
2092                 if (!isdigit(c))
2093                 {
2094 #if ZEROH
2095                     if (ishex(c)
2096                         || c == 'H' || c == 'h'
2097                        )
2098                         goto hexh;
2099 #endif
2100                     if (c == '_')               // ignore embedded _
2101                     {   p++;
2102                         continue;
2103                     }
2104                     if (c == '.' && p[1] != '.')
2105                         goto real;
2106                     else if (c == 'i' || c == 'f' || c == 'F' ||
2107                              c == 'e' || c == 'E')
2108                     {
2109             real:       // It's a real number. Back up and rescan as a real
2110                         p = start;
2111                         return inreal(t);
2112                     }
2113                     else if (c == 'L' && p[1] == 'i')
2114                         goto real;
2115                     goto done;
2116                 }
2117                 break;
2118
2119             case STATE_hex0:            // reading hex number
2120             case STATE_hex:
2121                 if (!ishex(c))
2122                 {
2123                     if (c == '_')               // ignore embedded _
2124                     {   p++;
2125                         continue;
2126                     }
2127                     if (c == '.' && p[1] != '.')
2128                         goto real;
2129                     if (c == 'P' || c == 'p' || c == 'i')
2130                         goto real;
2131                     if (state == STATE_hex0)
2132                         error("Hex digit expected, not '%c'", c);
2133                     goto done;
2134                 }
2135                 state = STATE_hex;
2136                 break;
2137
2138 #if ZEROH
2139             hexh:
2140                 state = STATE_hexh;
2141             case STATE_hexh:            // parse numbers like 0FFh
2142                 if (!ishex(c))
2143                 {
2144                     if (c == 'H' || c == 'h')
2145                     {
2146                         p++;
2147                         base = 16;
2148                         goto done;
2149                     }
2150                     else
2151                     {
2152                         // Check for something like 1E3 or 0E24
2153                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2154                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2155                             goto real;
2156                         error("Hex digit expected, not '%c'", c);
2157                         goto done;
2158                     }
2159                 }
2160                 break;
2161 #endif
2162
2163             case STATE_octal:           // reading octal number
2164             case STATE_octale:          // reading octal number with non-octal digits
2165                 if (!isoctal(c))
2166                 {
2167 #if ZEROH
2168                     if (ishex(c)
2169                         || c == 'H' || c == 'h'
2170                        )
2171                         goto hexh;
2172 #endif
2173                     if (c == '_')               // ignore embedded _
2174                     {   p++;
2175                         continue;
2176                     }
2177                     if (c == '.' && p[1] != '.')
2178                         goto real;
2179                     if (c == 'i')
2180                         goto real;
2181                     if (isdigit(c))
2182                     {
2183                         state = STATE_octale;
2184                     }
2185                     else
2186                         goto done;
2187                 }
2188                 break;
2189
2190             case STATE_binary0:         // starting binary number
2191             case STATE_binary:          // reading binary number
2192                 if (c != '0' && c != '1')
2193                 {
2194 #if ZEROH
2195                     if (ishex(c)
2196                         || c == 'H' || c == 'h'
2197                        )
2198                         goto hexh;
2199 #endif
2200                     if (c == '_')               // ignore embedded _
2201                     {   p++;
2202                         continue;
2203                     }
2204                     if (state == STATE_binary0)
2205                     {   error("binary digit expected");
2206                         state = STATE_error;
2207                         break;
2208                     }
2209                     else
2210                         goto done;
2211                 }
2212                 state = STATE_binary;
2213                 break;
2214
2215             case STATE_error:           // for error recovery
2216                 if (!isdigit(c))        // scan until non-digit
2217                     goto done;
2218                 break;
2219
2220             default:
2221                 assert(0);
2222         }
2223         stringbuffer.writeByte(c);
2224         p++;
2225     }
2226 done:
2227     stringbuffer.writeByte(0);          // terminate string
2228     if (state == STATE_octale)
2229         error("Octal digit expected");
2230
2231     uinteger_t n;                       // unsigned >=64 bit integer type
2232
2233     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2234         n = stringbuffer.data[0] - '0';
2235     else
2236     {
2237         // Convert string to integer
2238 #if __DMC__
2239         errno = 0;
2240         n = strtoull((char *)stringbuffer.data,NULL,base);
2241         if (errno == ERANGE)
2242             error("integer overflow");
2243 #else
2244         // Not everybody implements strtoull()
2245         char *p = (char *)stringbuffer.data;
2246         int r = 10, d;
2247
2248         if (*p == '0')
2249         {
2250             if (p[1] == 'x' || p[1] == 'X')
2251                 p += 2, r = 16;
2252             else if (p[1] == 'b' || p[1] == 'B')
2253                 p += 2, r = 2;
2254             else if (isdigit(p[1]))
2255                 p += 1, r = 8;
2256         }
2257
2258         n = 0;
2259         while (1)
2260         {
2261             if (*p >= '0' && *p <= '9')
2262                 d = *p - '0';
2263             else if (*p >= 'a' && *p <= 'z')
2264                 d = *p - 'a' + 10;
2265             else if (*p >= 'A' && *p <= 'Z')
2266                 d = *p - 'A' + 10;
2267             else
2268                 break;
2269             if (d >= r)
2270                 break;
2271             if (n && n * r + d <= n)
2272             {
2273                 error ("integer overflow");
2274                 break;
2275             }
2276
2277             n = n * r + d;
2278             p++;
2279         }
2280 #endif
2281         if (sizeof(n) > 8 &&
2282             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2283             error("integer overflow");
2284     }
2285
2286     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2287     while (1)
2288     {   unsigned char f;
2289
2290         switch (*p)
2291         {   case 'U':
2292             case 'u':
2293                 f = FLAGS_unsigned;
2294                 goto L1;
2295
2296             case 'l':
2297                 if (1 || !global.params.useDeprecated)
2298                     error("'l' suffix is deprecated, use 'L' instead");
2299             case 'L':
2300                 f = FLAGS_long;
2301             L1:
2302                 p++;
2303                 if (flags & f)
2304                     error("unrecognized token");
2305                 flags = (FLAGS) (flags | f);
2306                 continue;
2307             default:
2308                 break;
2309         }
2310         break;
2311     }
2312
2313     switch (flags)
2314     {
2315         case 0:
2316             /* Octal or Hexadecimal constant.
2317              * First that fits: int, uint, long, ulong
2318              */
2319             if (n & 0x8000000000000000LL)
2320                     result = TOKuns64v;
2321             else if (n & 0xFFFFFFFF00000000LL)
2322                     result = TOKint64v;
2323             else if (n & 0x80000000)
2324                     result = TOKuns32v;
2325             else
2326                     result = TOKint32v;
2327             break;
2328
2329         case FLAGS_decimal:
2330             /* First that fits: int, long, long long
2331              */
2332             if (n & 0x8000000000000000LL)
2333             {       error("signed integer overflow");
2334                     result = TOKuns64v;
2335             }
2336             else if (n & 0xFFFFFFFF80000000LL)
2337                     result = TOKint64v;
2338             else
2339                     result = TOKint32v;
2340             break;
2341
2342         case FLAGS_unsigned:
2343         case FLAGS_decimal | FLAGS_unsigned:
2344             /* First that fits: uint, ulong
2345              */
2346             if (n & 0xFFFFFFFF00000000LL)
2347                     result = TOKuns64v;
2348             else
2349                     result = TOKuns32v;
2350             break;
2351
2352         case FLAGS_decimal | FLAGS_long:
2353             if (n & 0x8000000000000000LL)
2354             {       error("signed integer overflow");
2355                     result = TOKuns64v;
2356             }
2357             else
2358                     result = TOKint64v;
2359             break;
2360
2361         case FLAGS_long:
2362             if (n & 0x8000000000000000LL)
2363                     result = TOKuns64v;
2364             else
2365                     result = TOKint64v;
2366             break;
2367
2368         case FLAGS_unsigned | FLAGS_long:
2369         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2370             result = TOKuns64v;
2371             break;
2372
2373         default:
2374             #ifdef DEBUG
2375                 printf("%x\n",flags);
2376             #endif
2377             assert(0);
2378     }
2379     t->uns64value = n;
2380     return result;
2381 }
2382
2383 /**************************************
2384  * Read in characters, converting them to real.
2385  * Bugs:
2386  *      Exponent overflow not detected.
2387  *      Too much requested precision is not detected.
2388  */
2389
2390 TOK Lexer::inreal(Token *t)
2391 #ifdef __DMC__
2392 __in
2393 {
2394     assert(*p == '.' || isdigit(*p));
2395 }
2396 __out (result)
2397 {
2398     switch (result)
2399     {
2400         case TOKfloat32v:
2401         case TOKfloat64v:
2402         case TOKfloat80v:
2403         case TOKimaginary32v:
2404         case TOKimaginary64v:
2405         case TOKimaginary80v:
2406             break;
2407
2408         default:
2409             assert(0);
2410     }
2411 }
2412 __body
2413 #endif /* __DMC__ */
2414 {   int dblstate;
2415     unsigned c;
2416     char hex;                   // is this a hexadecimal-floating-constant?
2417     TOK result;
2418
2419     //printf("Lexer::inreal()\n");
2420     stringbuffer.reset();
2421     dblstate = 0;
2422     hex = 0;
2423 Lnext:
2424     while (1)
2425     {
2426         // Get next char from input
2427         c = *p++;
2428         //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2429         while (1)
2430         {
2431             switch (dblstate)
2432             {
2433                 case 0:                 // opening state
2434                     if (c == '0')
2435                         dblstate = 9;
2436                     else if (c == '.')
2437                         dblstate = 3;
2438                     else
2439                         dblstate = 1;
2440                     break;
2441
2442                 case 9:
2443                     dblstate = 1;
2444                     if (c == 'X' || c == 'x')
2445                     {   hex++;
2446                         break;
2447                     }
2448                 case 1:                 // digits to left of .
2449                 case 3:                 // digits to right of .
2450                 case 7:                 // continuing exponent digits
2451                     if (!isdigit(c) && !(hex && isxdigit(c)))
2452                     {
2453                         if (c == '_')
2454                             goto Lnext; // ignore embedded '_'
2455                         dblstate++;
2456                         continue;
2457                     }
2458                     break;
2459
2460                 case 2:                 // no more digits to left of .
2461                     if (c == '.')
2462                     {   dblstate++;
2463                         break;
2464                     }
2465                 case 4:                 // no more digits to right of .
2466                     if ((c == 'E' || c == 'e') ||
2467                         hex && (c == 'P' || c == 'p'))
2468                     {   dblstate = 5;
2469                         hex = 0;        // exponent is always decimal
2470                         break;
2471                     }
2472                     if (hex)
2473                         error("binary-exponent-part required");
2474                     goto done;
2475
2476                 case 5:                 // looking immediately to right of E
2477                     dblstate++;
2478                     if (c == '-' || c == '+')
2479                         break;
2480                 case 6:                 // 1st exponent digit expected
2481                     if (!isdigit(c))
2482                         error("exponent expected");
2483                     dblstate++;
2484                     break;
2485
2486                 case 8:                 // past end of exponent digits
2487                     goto done;
2488             }
2489             break;
2490         }
2491         stringbuffer.writeByte(c);
2492     }
2493 done:
2494     p--;
2495
2496     stringbuffer.writeByte(0);
2497
2498 #if _WIN32 && __DMC__
2499     char *save = __locale_decpoint;
2500     __locale_decpoint = ".";
2501 #endif
2502 #ifdef IN_GCC
2503     t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2504 #else
2505     t->float80value = strtold((char *)stringbuffer.data, NULL);
2506 #endif
2507     errno = 0;
2508     switch (*p)
2509     {
2510         case 'F':
2511         case 'f':
2512 #ifdef IN_GCC
2513             real_t::parse((char *)stringbuffer.data, real_t::Float);
2514 #else
2515             strtof((char *)stringbuffer.data, NULL);
2516 #endif
2517             result = TOKfloat32v;
2518             p++;
2519             break;
2520
2521         default:
2522 #ifdef IN_GCC
2523             real_t::parse((char *)stringbuffer.data, real_t::Double);
2524 #else
2525             strtod((char *)stringbuffer.data, NULL);
2526 #endif
2527             result = TOKfloat64v;
2528             break;
2529
2530         case 'l':
2531             if (!global.params.useDeprecated)
2532                 error("'l' suffix is deprecated, use 'L' instead");
2533         case 'L':
2534             result = TOKfloat80v;
2535             p++;
2536             break;
2537     }
2538     if (*p == 'i' || *p == 'I')
2539     {
2540         if (!global.params.useDeprecated && *p == 'I')
2541             error("'I' suffix is deprecated, use 'i' instead");
2542         p++;
2543         switch (result)
2544         {
2545             case TOKfloat32v:
2546                 result = TOKimaginary32v;
2547                 break;
2548             case TOKfloat64v:
2549                 result = TOKimaginary64v;
2550                 break;
2551             case TOKfloat80v:
2552                 result = TOKimaginary80v;
2553                 break;
2554         }
2555     }
2556 #if _WIN32 && __DMC__
2557     __locale_decpoint = save;
2558 #endif
2559     if (errno == ERANGE)
2560         error("number is not representable");
2561     return result;
2562 }
2563
2564 /*********************************************
2565  * Do pragma.
2566  * Currently, the only pragma supported is:
2567  *      #line linnum [filespec]
2568  */
2569
2570 void Lexer::pragma()
2571 {
2572     Token tok;
2573     int linnum;
2574     char *filespec = NULL;
2575     Loc loc = this->loc;
2576
2577     while (isblank(*p)) p++;
2578     if (*p == '\n')
2579         goto Lerr;
2580
2581     scan(&tok);
2582     if (tok.value != TOKidentifier || tok.ident != Id::line)
2583         goto Lerr;
2584
2585     scan(&tok);
2586     if (tok.value == TOKint32v || tok.value == TOKint64v)
2587         linnum = tok.uns64value - 1;
2588     else
2589         goto Lerr;
2590
2591     while (1)
2592     {
2593         switch (*p)
2594         {
2595             case 0:
2596             case 0x1A:
2597             case '\n':
2598             Lnewline:
2599                 this->loc.linnum = linnum;
2600                 if (filespec)
2601                     this->loc.filename = filespec;
2602                 return;
2603
2604             case '\r':
2605                 p++;
2606                 if (*p != '\n')
2607                 {   p--;
2608                     goto Lnewline;
2609                 }
2610                 continue;
2611
2612             case ' ':
2613             case '\t':
2614             case '\v':
2615             case '\f':
2616                 p++;
2617                 continue;                       // skip white space
2618
2619             case '_':
2620                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2621                 {
2622                     p += 8;
2623                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2624                 }
2625                 continue;
2626
2627             case '"':
2628                 if (filespec)
2629                     goto Lerr;
2630                 stringbuffer.reset();
2631                 p++;
2632                 while (1)
2633                 {   unsigned c;
2634
2635                     c = *p;
2636                     switch (c)
2637                     {
2638                         case '\n':
2639                         case '\r':
2640                         case 0:
2641                         case 0x1A:
2642                             goto Lerr;
2643
2644                         case '"':
2645                             stringbuffer.writeByte(0);
2646                             filespec = mem.strdup((char *)stringbuffer.data);
2647                             p++;
2648                             break;
2649
2650                         default:
2651                             if (c & 0x80)
2652                             {   unsigned u = decodeUTF();
2653                                 if (u == PS || u == LS)
2654                                     goto Lerr;
2655                             }
2656                             stringbuffer.writeByte(c);
2657                             p++;
2658                             continue;
2659                     }
2660                     break;
2661                 }
2662                 continue;
2663
2664             default:
2665                 if (*p & 0x80)
2666                 {   unsigned u = decodeUTF();
2667                     if (u == PS || u == LS)
2668                         goto Lnewline;
2669                 }
2670                 goto Lerr;
2671         }
2672     }
2673
2674 Lerr:
2675     error(loc, "#line integer [\"filespec\"]\\n expected");
2676 }
2677
2678
2679 /********************************************
2680  * Decode UTF character.
2681  * Issue error messages for invalid sequences.
2682  * Return decoded character, advance p to last character in UTF sequence.
2683  */
2684
2685 unsigned Lexer::decodeUTF()
2686 {
2687     dchar_t u;
2688     unsigned char c;
2689     unsigned char *s = p;
2690     size_t len;
2691     size_t idx;
2692     char *msg;
2693
2694     c = *s;
2695     assert(c & 0x80);
2696
2697     // Check length of remaining string up to 6 UTF-8 characters
2698     for (len = 1; len < 6 && s[len]; len++)
2699         ;
2700
2701     idx = 0;
2702     msg = utf_decodeChar(s, len, &idx, &u);
2703     p += idx - 1;
2704     if (msg)
2705     {
2706         error("%s", msg);
2707     }
2708     return u;
2709 }
2710
2711
2712 /***************************************************
2713  * Parse doc comment embedded between t->ptr and p.
2714  * Remove trailing blanks and tabs from lines.
2715  * Replace all newlines with \n.
2716  * Remove leading comment character from each line.
2717  * Decide if it's a lineComment or a blockComment.
2718  * Append to previous one for this token.
2719  */
2720
2721 void Lexer::getDocComment(Token *t, unsigned lineComment)
2722 {
2723     OutBuffer buf;
2724     unsigned char ct = t->ptr[2];
2725     unsigned char *q = t->ptr + 3;      // start of comment text
2726     int linestart = 0;
2727
2728     unsigned char *qend = p;
2729     if (ct == '*' || ct == '+')
2730         qend -= 2;
2731
2732     /* Scan over initial row of ****'s or ++++'s or ////'s
2733      */
2734     for (; q < qend; q++)
2735     {
2736         if (*q != ct)
2737             break;
2738     }
2739
2740     /* Remove trailing row of ****'s or ++++'s
2741      */
2742     if (ct != '/')
2743     {
2744         for (; q < qend; qend--)
2745         {
2746             if (qend[-1] != ct)
2747                 break;
2748         }
2749     }
2750
2751     for (; q < qend; q++)
2752     {
2753         unsigned char c = *q;
2754
2755         switch (c)
2756         {
2757             case '*':
2758             case '+':
2759                 if (linestart && c == ct)
2760                 {   linestart = 0;
2761                     /* Trim preceding whitespace up to preceding \n
2762                      */
2763                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2764                         buf.offset--;
2765                     continue;
2766                 }
2767                 break;
2768
2769             case ' ':
2770             case '\t':
2771                 break;
2772
2773             case '\r':
2774                 if (q[1] == '\n')
2775                     continue;           // skip the \r
2776                 goto Lnewline;
2777
2778             default:
2779                 if (c == 226)
2780                 {
2781                     // If LS or PS
2782                     if (q[1] == 128 &&
2783                         (q[2] == 168 || q[2] == 169))
2784                     {
2785                         q += 2;
2786                         goto Lnewline;
2787                     }
2788                 }
2789                 linestart = 0;
2790                 break;
2791
2792             Lnewline:
2793                 c = '\n';               // replace all newlines with \n
2794             case '\n':
2795                 linestart = 1;
2796
2797                 /* Trim trailing whitespace
2798                  */
2799                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2800                     buf.offset--;
2801
2802                 break;
2803         }
2804         buf.writeByte(c);
2805     }
2806
2807     // Always end with a newline
2808     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2809         buf.writeByte('\n');
2810
2811     buf.writeByte(0);
2812
2813     // It's a line comment if the start of the doc comment comes
2814     // after other non-whitespace on the same line.
2815     unsigned char** dc = (lineComment && anyToken)
2816                          ? &t->lineComment
2817                          : &t->blockComment;
2818
2819     // Combine with previous doc comment, if any
2820     if (*dc)
2821         *dc = combineComments(*dc, (unsigned char *)buf.data);
2822     else
2823         *dc = (unsigned char *)buf.extractData();
2824 }
2825
2826 /********************************************
2827  * Combine two document comments into one.
2828  */
2829
2830 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2831 {
2832     unsigned char *c = c2;
2833
2834     if (c1)
2835     {   c = c1;
2836         if (c2)
2837         {   size_t len1 = strlen((char *)c1);
2838             size_t len2 = strlen((char *)c2);
2839
2840             c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2841             memcpy(c, c1, len1);
2842             c[len1] = '\n';
2843             memcpy(c + len1 + 1, c2, len2);
2844             c[len1 + 1 + len2] = 0;
2845         }
2846     }
2847     return c;
2848 }
2849
2850 /********************************************
2851  * Create an identifier in the string table.
2852  */
2853
2854 Identifier *Lexer::idPool(const char *s)
2855 {
2856     size_t len = strlen(s);
2857     StringValue *sv = stringtable.update(s, len);
2858     Identifier *id = (Identifier *) sv->ptrvalue;
2859     if (!id)
2860     {
2861         id = new Identifier(sv->lstring.string, TOKidentifier);
2862         sv->ptrvalue = id;
2863     }
2864     return id;
2865 }
2866
2867 /*********************************************
2868  * Create a unique identifier using the prefix s.
2869  */
2870
2871 Identifier *Lexer::uniqueId(const char *s, int num)
2872 {   char buffer[32];
2873     size_t slen = strlen(s);
2874
2875     assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2876     sprintf(buffer, "%s%d", s, num);
2877     return idPool(buffer);
2878 }
2879
2880 Identifier *Lexer::uniqueId(const char *s)
2881 {
2882     static int num;
2883     return uniqueId(s, ++num);
2884 }
2885
2886 /****************************************
2887  */
2888
2889 struct Keyword
2890 {   char *name;
2891     enum TOK value;
2892 };
2893
2894 static Keyword keywords[] =
2895 {
2896 //    { "",             TOK     },
2897
2898     {   "this",         TOKthis         },
2899     {   "super",        TOKsuper        },
2900     {   "assert",       TOKassert       },
2901     {   "null",         TOKnull         },
2902     {   "true",         TOKtrue         },
2903     {   "false",        TOKfalse        },
2904     {   "cast",         TOKcast         },
2905     {   "new",          TOKnew          },
2906     {   "delete",       TOKdelete       },
2907     {   "throw",        TOKthrow        },
2908     {   "module",       TOKmodule       },
2909     {   "pragma",       TOKpragma       },
2910     {   "typeof",       TOKtypeof       },
2911     {   "typeid",       TOKtypeid       },
2912
2913     {   "template",     TOKtemplate     },
2914
2915     {   "void",         TOKvoid         },
2916     {   "byte",         TOKint8         },
2917     {   "ubyte",        TOKuns8         },
2918     {   "short",        TOKint16        },
2919     {   "ushort",       TOKuns16        },
2920     {   "int",          TOKint32        },
2921     {   "uint",         TOKuns32        },
2922     {   "long",         TOKint64        },
2923     {   "ulong",        TOKuns64        },
2924     {   "cent",         TOKcent,        },
2925     {   "ucent",        TOKucent,       },
2926     {   "float",        TOKfloat32      },
2927     {   "double",       TOKfloat64      },
2928     {   "real",         TOKfloat80      },
2929
2930     {   "bool",         TOKbool         },
2931     {   "char",         TOKchar         },
2932     {   "wchar",        TOKwchar        },
2933     {   "dchar",        TOKdchar        },
2934
2935     {   "ifloat",       TOKimaginary32  },
2936     {   "idouble",      TOKimaginary64  },
2937     {   "ireal",        TOKimaginary80  },
2938
2939     {   "cfloat",       TOKcomplex32    },
2940     {   "cdouble",      TOKcomplex64    },
2941     {   "creal",        TOKcomplex80    },
2942
2943     {   "delegate",     TOKdelegate     },
2944     {   "function",     TOKfunction     },
2945
2946     {   "is",           TOKis           },
2947     {   "if",           TOKif           },
2948     {   "else",         TOKelse         },
2949     {   "while",        TOKwhile        },
2950     {   "for",          TOKfor          },
2951     {   "do",           TOKdo           },
2952     {   "switch",       TOKswitch       },
2953     {   "case",         TOKcase         },
2954     {   "default",      TOKdefault      },
2955     {   "break",        TOKbreak        },
2956     {   "continue",     TOKcontinue     },
2957     {   "synchronized", TOKsynchronized },
2958     {   "return",       TOKreturn       },
2959     {   "goto",         TOKgoto         },
2960     {   "try",          TOKtry          },
2961     {   "catch",        TOKcatch        },
2962     {   "finally",      TOKfinally      },
2963     {   "with",         TOKwith         },
2964     {   "asm",          TOKasm          },
2965     {   "foreach",      TOKforeach      },
2966     {   "foreach_reverse",      TOKforeach_reverse      },
2967     {   "reversed",     TOKreversed     },
2968     {   "scope",        TOKscope        },
2969
2970     {   "struct",       TOKstruct       },
2971     {   "class",        TOKclass        },
2972     {   "interface",    TOKinterface    },
2973     {   "union",        TOKunion        },
2974     {   "enum",         TOKenum         },
2975     {   "import",       TOKimport       },
2976     {   "mixin",        TOKmixin        },
2977     {   "static",       TOKstatic       },
2978     {   "final",        TOKfinal        },
2979     {   "const",        TOKconst        },
2980     {   "typedef",      TOKtypedef      },
2981     {   "alias",        TOKalias        },
2982     {   "override",     TOKoverride     },
2983     {   "abstract",     TOKabstract     },
2984     {   "volatile",     TOKvolatile     },
2985     {   "debug",        TOKdebug        },
2986     {   "deprecated",   TOKdeprecated   },
2987     {   "in",           TOKin           },
2988     {   "out",          TOKout          },
2989     {   "inout",        TOKinout        },
2990     {   "lazy",         TOKlazy         },
2991     {   "auto",         TOKauto         },
2992
2993     {   "align",        TOKalign        },
2994     {   "extern",       TOKextern       },
2995     {   "private",      TOKprivate      },
2996     {   "package",      TOKpackage      },
2997     {   "protected",    TOKprotected    },
2998     {   "public",       TOKpublic       },
2999     {   "export",       TOKexport       },
3000
3001     {   "body",         TOKbody         },
3002     {   "invariant",    TOKinvariant    },
3003     {   "unittest",     TOKunittest     },
3004     {   "version",      TOKversion      },
3005     //{ "manifest",     TOKmanifest     },
3006
3007     // Added after 1.0
3008     {   "ref",          TOKref          },
3009     {   "macro",        TOKmacro        },
3010
3011
3012     // TAL
3013     {   "and",          TOKandand       },
3014     {   "or",           TOKoror         },
3015     {   "not",          TOKnot          },
3016     {   "extends",      TOKextends      },
3017     {   "implements",   TOKimplements   },
3018     {   "log_error",    TOKlog_error    },
3019     {   "log_warning",  TOKlog_warning  },
3020     {   "log_info",     TOKlog_info     },
3021     {   "log_trace",    TOKlog_trace    },
3022 #if V2
3023     {   "pure",         TOKpure         },
3024     {   "nothrow",      TOKnothrow      },
3025     {   "__thread",     TOKtls          },
3026     {   "__traits",     TOKtraits       },
3027     {   "__overloadset", TOKoverloadset },
3028     {   "__FILE__",     TOKfile         },
3029     {   "__LINE__",     TOKline         },
3030 #endif
3031 };
3032
3033 int Token::isKeyword()
3034 {
3035     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3036     {
3037         if (keywords[u].value == value)
3038             return 1;
3039     }
3040     return 0;
3041 }
3042
3043 void Lexer::initKeywords()
3044 {   StringValue *sv;
3045     unsigned u;
3046     enum TOK v;
3047     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3048
3049     if (global.params.Dversion == 1)
3050         nkeywords -= 2;
3051
3052     cmtable_init();
3053
3054     for (u = 0; u < nkeywords; u++)
3055     {   char *s;
3056
3057         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3058         s = keywords[u].name;
3059         v = keywords[u].value;
3060         sv = stringtable.insert(s, strlen(s));
3061         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3062
3063         //printf("tochars[%d] = '%s'\n",v, s);
3064         Token::tochars[v] = s;
3065     }
3066
3067     Token::tochars[TOKeof]              = "EOF";
3068     Token::tochars[TOKlcurly]           = "{";
3069     Token::tochars[TOKrcurly]           = "}";
3070     Token::tochars[TOKlparen]           = "(";
3071     Token::tochars[TOKrparen]           = ")";
3072     Token::tochars[TOKlbracket]         = "[";
3073     Token::tochars[TOKrbracket]         = "]";
3074     Token::tochars[TOKsemicolon]        = ";";
3075     Token::tochars[TOKcolon]            = ":";
3076     Token::tochars[TOKcomma]            = ",";
3077     Token::tochars[TOKdot]              = ".";
3078     Token::tochars[TOKxor]              = "^";
3079     Token::tochars[TOKxorass]           = "^=";
3080     Token::tochars[TOKassign]           = "=";
3081     Token::tochars[TOKconstruct]        = "=";
3082 #if V2
3083     Token::tochars[TOKblit]             = "=";
3084 #endif
3085     Token::tochars[TOKlt]               = "<";
3086     Token::tochars[TOKgt]               = ">";
3087     Token::tochars[TOKle]               = "<=";
3088     Token::tochars[TOKge]               = ">=";
3089     Token::tochars[TOKequal]            = "==";
3090     Token::tochars[TOKnotequal]         = "!=";
3091     Token::tochars[TOKnotidentity]      = "!is";
3092     Token::tochars[TOKtobool]           = "!!";
3093     Token::tochars[TOKat]               = "@";
3094
3095     Token::tochars[TOKunord]            = "!<>=";
3096     Token::tochars[TOKue]               = "!<>";
3097     Token::tochars[TOKlg]               = "<>";
3098     Token::tochars[TOKleg]              = "<>=";
3099     Token::tochars[TOKule]              = "!>";
3100     Token::tochars[TOKul]               = "!>=";
3101     Token::tochars[TOKuge]              = "!<";
3102     Token::tochars[TOKug]               = "!<=";
3103
3104     Token::tochars[TOKnot]              = "!";
3105     Token::tochars[TOKtobool]           = "!!";
3106     Token::tochars[TOKshl]              = "<<";
3107     Token::tochars[TOKshr]              = ">>";
3108     Token::tochars[TOKushr]             = ">>>";
3109     Token::tochars[TOKadd]              = "+";
3110     Token::tochars[TOKmin]              = "-";
3111     Token::tochars[TOKmul]              = "*";
3112     Token::tochars[TOKdiv]              = "/";
3113     Token::tochars[TOKmod]              = "%";
3114     Token::tochars[TOKslice]            = "..";
3115     Token::tochars[TOKdotdotdot]        = "...";
3116     Token::tochars[TOKand]              = "&";
3117     Token::tochars[TOKandand]           = "&&";
3118     Token::tochars[TOKor]               = "|";
3119     Token::tochars[TOKoror]             = "||";
3120     Token::tochars[TOKarray]            = "[]";
3121     Token::tochars[TOKindex]            = "[i]";
3122     Token::tochars[TOKaddress]          = "&";
3123     Token::tochars[TOKstar]             = "*";
3124     Token::tochars[TOKtilde]            = "~";
3125     Token::tochars[TOKdollar]           = "$";
3126     Token::tochars[TOKcast]             = "cast";
3127     Token::tochars[TOKplusplus]         = "++";
3128     Token::tochars[TOKminusminus]       = "--";
3129     Token::tochars[TOKtype]             = "type";
3130     Token::tochars[TOKquestion]         = "?";
3131     Token::tochars[TOKneg]              = "-";
3132     Token::tochars[TOKuadd]             = "+";
3133     Token::tochars[TOKvar]              = "var";
3134     Token::tochars[TOKaddass]           = "+=";
3135     Token::tochars[TOKminass]           = "-=";
3136     Token::tochars[TOKmulass]           = "*=";
3137     Token::tochars[TOKdivass]           = "/=";
3138     Token::tochars[TOKmodass]           = "%=";
3139     Token::tochars[TOKshlass]           = "<<=";
3140     Token::tochars[TOKshrass]           = ">>=";
3141     Token::tochars[TOKushrass]          = ">>>=";
3142     Token::tochars[TOKandass]           = "&=";
3143     Token::tochars[TOKorass]            = "|=";
3144     Token::tochars[TOKcatass]           = "~=";
3145     Token::tochars[TOKcat]              = "~";
3146     Token::tochars[TOKcall]             = "call";
3147     Token::tochars[TOKidentity]         = "is";
3148     Token::tochars[TOKnotidentity]      = "!is";
3149     Token::tochars[TOKendline]          = "\\n";
3150
3151     Token::tochars[TOKorass]            = "|=";
3152     Token::tochars[TOKidentifier]       = "identifier";
3153
3154      // For debugging
3155     Token::tochars[TOKdotexp]           = "dotexp";
3156     Token::tochars[TOKdotti]            = "dotti";
3157     Token::tochars[TOKdotvar]           = "dotvar";
3158     Token::tochars[TOKdottype]          = "dottype";
3159     Token::tochars[TOKsymoff]           = "symoff";
3160     Token::tochars[TOKtypedot]          = "typedot";
3161     Token::tochars[TOKarraylength]      = "arraylength";
3162     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3163     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3164     Token::tochars[TOKstructliteral]    = "structliteral";
3165     Token::tochars[TOKstring]           = "string";
3166     Token::tochars[TOKdsymbol]          = "symbol";
3167     Token::tochars[TOKtuple]            = "tuple";
3168     Token::tochars[TOKdeclaration]      = "declaration";
3169     Token::tochars[TOKdottd]            = "dottd";
3170     Token::tochars[TOKlogger]           = "logger";
3171     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3172 }