dmd/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  65 /********************************************
  66  * Do our own char maps
  67  */
  68
  69 static unsigned char cmtable[256];
  70
  71 const int CMoctal =     0x1;
  72 const int CMhex =       0x2;
  73 const int CMidchar =    0x4;
  74
  75 inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
  76 inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
  77 inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
  78
  79 static void cmtable_init()
  80 {
  81     for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++)
  82     {
  83         if ('0' <= c && c <= '7')
  84             cmtable[c] |= CMoctal;
  85         if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
  86             cmtable[c] |= CMhex;
  87         if (isalnum(c) || c == '_')
  88             cmtable[c] |= CMidchar;
  89     }
  90 }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 110 #ifdef DEBUG
 111 void Token::print()
 112 {
 113     fprintf(stdmsg, "%s\n", toChars());
 114 }
 115 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken, bool dltSyntax)
 268     : loc(mod, 1), dltSyntax(dltSyntax)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     this->nesting = 0;
 281     this->indent = 0;
 282     this->atStartOfLine = 1;
 283     this->incLineno = 0;
 284     //initKeywords();
 285
 286     /* If first line starts with '#!', ignore the line
 287      */
 288
 289     if (p[0] == '#' && p[1] =='!')
 290     {
 291         p += 2;
 292         while (1)
 293         {   unsigned char c = *p;
 294             switch (c)
 295             {
 296                 case '\n':
 297                     p++;
 298                     break;
 299
 300                 case '\r':
 301                     p++;
 302                     if (*p == '\n')
 303                         p++;
 304                     break;
 305
 306                 case 0:
 307                 case 0x1A:
 308                     break;
 309
 310                 default:
 311                     if (c & 0x80)
 312                     {   unsigned u = decodeUTF();
 313                         if (u == PS || u == LS)
 314                             break;
 315                     }
 316                     p++;
 317                     continue;
 318             }
 319             break;
 320         }
 321         loc.linnum = 2;
 322     }
 323 }
 324
 325
 326 void Lexer::error(const char *format, ...)
 327 {
 328     if (mod && !global.gag)
 329     {
 330         char *p = loc.toChars();
 331         if (*p)
 332             fprintf(stdmsg, "%s: ", p);
 333         mem.free(p);
 334
 335         va_list ap;
 336         va_start(ap, format);
 337         vfprintf(stdmsg, format, ap);
 338         va_end(ap);
 339
 340         fprintf(stdmsg, "\n");
 341         fflush(stdmsg);
 342
 343         if (global.errors >= 20)        // moderate blizzard of cascading messages
 344             fatal();
 345     }
 346     global.errors++;
 347 }
 348
 349 void Lexer::error(Loc loc, const char *format, ...)
 350 {
 351     if (mod && !global.gag)
 352     {
 353         char *p = loc.toChars();
 354         if (*p)
 355             fprintf(stdmsg, "%s: ", p);
 356         mem.free(p);
 357
 358         va_list ap;
 359         va_start(ap, format);
 360         vfprintf(stdmsg, format, ap);
 361         va_end(ap);
 362
 363         fprintf(stdmsg, "\n");
 364         fflush(stdmsg);
 365
 366         if (global.errors >= 20)        // moderate blizzard of cascading messages
 367             fatal();
 368     }
 369     global.errors++;
 370 }
 371
 372 TOK Lexer::nextToken()
 373 {   Token *t;
 374
 375     if (token.next)
 376     {
 377         t = token.next;
 378         memcpy(&token,t,sizeof(Token));
 379         t->next = freelist;
 380         freelist = t;
 381     }
 382     else
 383     {
 384         scan(&token);
 385     }
 386     //token.print();
 387     return token.value;
 388 }
 389
 390 Token *Lexer::peek(Token *ct)
 391 {   Token *t;
 392
 393     if (ct->next)
 394         t = ct->next;
 395     else
 396     {
 397         t = new Token();
 398         scan(t);
 399         t->next = NULL;
 400         ct->next = t;
 401     }
 402     return t;
 403 }
 404
 405 /*********************************
 406  * tk is on the opening (.
 407  * Look ahead and return token that is past the closing ).
 408  */
 409
 410 Token *Lexer::peekPastParen(Token *tk)
 411 {
 412     //printf("peekPastParen()\n");
 413     int parens = 1;
 414     int curlynest = 0;
 415     while (1)
 416     {
 417         tk = peek(tk);
 418         //tk->print();
 419         switch (tk->value)
 420         {
 421             case TOKlparen:
 422                 parens++;
 423                 continue;
 424
 425             case TOKrparen:
 426                 --parens;
 427                 if (parens)
 428                     continue;
 429                 tk = peek(tk);
 430                 break;
 431
 432             case TOKlcurly:
 433                 curlynest++;
 434                 continue;
 435
 436             case TOKrcurly:
 437                 if (--curlynest >= 0)
 438                     continue;
 439                 break;
 440
 441             case TOKsemicolon:
 442                 if (curlynest)
 443                     continue;
 444                 break;
 445
 446             case TOKeof:
 447                 break;
 448
 449             default:
 450                 continue;
 451         }
 452         return tk;
 453     }
 454 }
 455
 456 /**********************************
 457  * Determine if string is a valid Identifier.
 458  * Placed here because of commonality with Lexer functionality.
 459  * Returns:
 460  *      0       invalid
 461  */
 462
 463 int Lexer::isValidIdentifier(char *p)
 464 {
 465     size_t len;
 466     size_t idx;
 467
 468     if (!p || !*p)
 469         goto Linvalid;
 470
 471     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 472         goto Linvalid;
 473
 474     len = strlen(p);
 475     idx = 0;
 476     while (p[idx])
 477     {   dchar_t dc;
 478
 479         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 480         if (q)
 481             goto Linvalid;
 482
 483         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 484             goto Linvalid;
 485     }
 486     return 1;
 487
 488 Linvalid:
 489     return 0;
 490 }
 491
 492 /****************************
 493  * Turn next token in buffer into a token.
 494  */
 495
 496 void Lexer::scan(Token *t)
 497 {
 498     unsigned lastLine = loc.linnum;
 499     unsigned linnum;
 500
 501     // Delayed line-number updating
 502     if (incLineno)
 503     {
 504         assert(incLineno == 1);
 505         incLineno = 0;
 506         loc.linnum++;
 507     }
 508
 509     t->blockComment = NULL;
 510     t->lineComment = NULL;
 511     while (1)
 512     {
 513         t->ptr = p;
 514
 515         if (dltSyntax && atStartOfLine) {
 516                 // Check indent
 517                 int i;
 518                 for (i = 0; p[i] == '\t'; i++) {
 519                 }
 520                 if (p[i] == ' ') {
 521                     error("Whitespace error: use tabs to indent!");
 522                 }
 523                 if (p[i] == '#') {
 524                     p += i;
 525                     atStartOfLine = 0;
 526                 } else if (p[i] != '\n' && p[i] != '\r') {
 527                     if (p[i] == '\0')
 528                         i = 0;                  // End-of-file always has no indent
 529                     if (i > indent) {
 530                         error("unexpected indentation (expected %d tabs, not %d)",
 531                                 indent, i);
 532                     } else if (i < indent) {
 533                         indent -= 1;
 534                         t->value = TOKrcurly;
 535                         return;
 536                     }
 537                     atStartOfLine = 0;
 538                 } /* else ignore blank line */
 539         }
 540
 541         //printf("p = %p, *p = '%c'\n",p,*p);
 542         switch (*p)
 543         {
 544             case 0:
 545             case 0x1A:
 546                 t->value = TOKeof;                      // end of file
 547                 return;
 548
 549             case ' ':
 550             case '\t':
 551             case '\v':
 552             case '\f':
 553                 p++;
 554                 continue;                       // skip white space
 555
 556             case '\r':
 557                 if (p[1] == '\n') {             // if CRLF
 558                     p++;
 559                     continue;
 560                 }
 561                 // fall-through
 562             case '\n':
 563                 p++;
 564                 if (dltSyntax)
 565                 {
 566                     // Delay incrementing the line number until after sending
 567                     // the TOKendline, for better error messages
 568                     assert(!incLineno);
 569                     incLineno++;
 570
 571                     if (!nesting)
 572                     {
 573                         atStartOfLine = 1;
 574                         t->value = TOKendline;
 575                         return;
 576                     }
 577                 }
 578                 else
 579                     loc.linnum++;
 580                 continue;                       // Ignore newlines inside brackets
 581             case '0':   case '1':   case '2':   case '3':   case '4':
 582             case '5':   case '6':   case '7':   case '8':   case '9':
 583                 t->value = number(t);
 584                 return;
 585
 586 #if CSTRINGS
 587             case '\'':
 588                 t->value = charConstant(t, 0);
 589                 return;
 590
 591             case '"':
 592                 t->value = stringConstant(t,0);
 593                 return;
 594
 595             case 'l':
 596             case 'L':
 597                 if (p[1] == '\'')
 598                 {
 599                     p++;
 600                     t->value = charConstant(t, 1);
 601                     return;
 602                 }
 603                 else if (p[1] == '"')
 604                 {
 605                     p++;
 606                     t->value = stringConstant(t, 1);
 607                     return;
 608                 }
 609 #else
 610             case '\'':
 611                 t->value = charConstant(t,0);
 612                 return;
 613
 614             case 'r':
 615                 if (p[1] != '"')
 616                     goto case_ident;
 617                 p++;
 618             case '`':
 619                 t->value = wysiwygStringConstant(t, *p);
 620                 return;
 621
 622             case 'x':
 623                 if (p[1] != '"')
 624                     goto case_ident;
 625                 p++;
 626                 t->value = hexStringConstant(t);
 627                 return;
 628
 629 #if V2
 630             case 'q':
 631                 if (p[1] == '"')
 632                 {
 633                     p++;
 634                     t->value = delimitedStringConstant(t);
 635                     return;
 636                 }
 637                 else if (p[1] == '{')
 638                 {
 639                     p++;
 640                     t->value = tokenStringConstant(t);
 641                     return;
 642                 }
 643                 else
 644                     goto case_ident;
 645 #endif
 646
 647             case '"':
 648                 t->value = escapeStringConstant(t,0);
 649                 return;
 650
 651             case '\\':                  // escaped string literal
 652             {   unsigned c;
 653
 654                 stringbuffer.reset();
 655                 do
 656                 {
 657                     p++;
 658                     switch (*p)
 659                     {
 660                         case 'u':
 661                         case 'U':
 662                         case '&':
 663                             c = escapeSequence();
 664                             stringbuffer.writeUTF8(c);
 665                             break;
 666
 667                         default:
 668                             c = escapeSequence();
 669                             stringbuffer.writeByte(c);
 670                             break;
 671                     }
 672                 } while (*p == '\\');
 673                 t->len = stringbuffer.offset;
 674                 stringbuffer.writeByte(0);
 675                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 676                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 677                 t->postfix = 0;
 678                 t->value = TOKstring;
 679                 return;
 680             }
 681
 682             case 'l':
 683             case 'L':
 684 #endif
 685             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 686             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 687             case 'k':               case 'm':   case 'n':   case 'o':
 688 #if V2
 689             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 690 #else
 691             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 692 #endif
 693             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 694             case 'z':
 695             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 696             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 697             case 'K':               case 'M':   case 'N':   case 'O':
 698             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 699             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 700             case 'Z':
 701             case '_':
 702             case_ident:
 703             {   unsigned char c;
 704                 StringValue *sv;
 705                 Identifier *id;
 706
 707                 do
 708                 {
 709                     c = *++p;
 710                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 711                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 712                 id = (Identifier *) sv->ptrvalue;
 713                 if (!id)
 714                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 715                     sv->ptrvalue = id;
 716                 }
 717                 t->ident = id;
 718                 t->value = (enum TOK) id->value;
 719                 anyToken = 1;
 720                 if (*t->ptr == '_')     // if special identifier token
 721                 {
 722                     static char date[11+1];
 723                     static char time[8+1];
 724                     static char timestamp[24+1];
 725
 726                     if (!date[0])       // lazy evaluation
 727                     {   time_t t;
 728                         char *p;
 729
 730                         ::time(&t);
 731                         p = ctime(&t);
 732                         assert(p);
 733                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 734                         sprintf(time, "%.8s", p + 11);
 735                         sprintf(timestamp, "%.24s", p);
 736                     }
 737
 738                     if (mod && id == Id::FILE)
 739                     {
 740                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 741                         goto Lstring;
 742                     }
 743                     else if (mod && id == Id::LINE)
 744                     {
 745                         t->value = TOKint64v;
 746                         t->uns64value = loc.linnum;
 747                     }
 748                     else if (id == Id::DATE)
 749                     {
 750                         t->ustring = (unsigned char *)date;
 751                         goto Lstring;
 752                     }
 753                     else if (id == Id::TIME)
 754                     {
 755                         t->ustring = (unsigned char *)time;
 756                         goto Lstring;
 757                     }
 758                     else if (id == Id::VENDOR)
 759                     {
 760 #ifdef IN_GCC
 761                         t->ustring = (unsigned char *)"GDC";
 762 #else
 763                         t->ustring = (unsigned char *)"Digital Mars D";
 764 #endif
 765                         goto Lstring;
 766                     }
 767                     else if (id == Id::TIMESTAMP)
 768                     {
 769                         t->ustring = (unsigned char *)timestamp;
 770                      Lstring:
 771                         t->value = TOKstring;
 772                      Llen:
 773                         t->postfix = 0;
 774                         t->len = strlen((char *)t->ustring);
 775                     }
 776                     else if (id == Id::VERSIONX)
 777                     {   unsigned major = 0;
 778                         unsigned minor = 0;
 779
 780                         for (char *p = global.version + 1; 1; p++)
 781                         {
 782                             char c = *p;
 783                             if (isdigit(c))
 784                                 minor = minor * 10 + c - '0';
 785                             else if (c == '.')
 786                             {   major = minor;
 787                                 minor = 0;
 788                             }
 789                             else
 790                                 break;
 791                         }
 792                         t->value = TOKint64v;
 793                         t->uns64value = major * 1000 + minor;
 794                     }
 795 #if V2
 796                     else if (id == Id::EOFX)
 797                     {
 798                         t->value = TOKeof;
 799                         // Advance scanner to end of file
 800                         while (!(*p == 0 || *p == 0x1A))
 801                             p++;
 802                     }
 803 #endif
 804                 }
 805                 //printf("t->value = %d\n",t->value);
 806                 return;
 807             }
 808
 809             case '/':
 810                 p++;
 811                 switch (*p)
 812                 {
 813                     case '=':
 814                         p++;
 815                         t->value = TOKdivass;
 816                         return;
 817
 818                     case '*':
 819                         p++;
 820                         linnum = loc.linnum;
 821                         while (1)
 822                         {
 823                             while (1)
 824                             {   unsigned char c = *p;
 825                                 switch (c)
 826                                 {
 827                                     case '/':
 828                                         break;
 829
 830                                     case '\n':
 831                                         loc.linnum++;
 832                                         p++;
 833                                         continue;
 834
 835                                     case '\r':
 836                                         p++;
 837                                         if (*p != '\n')
 838                                             loc.linnum++;
 839                                         continue;
 840
 841                                     case 0:
 842                                     case 0x1A:
 843                                         error("unterminated /* */ comment");
 844                                         p = end;
 845                                         t->value = TOKeof;
 846                                         return;
 847
 848                                     default:
 849                                         if (c & 0x80)
 850                                         {   unsigned u = decodeUTF();
 851                                             if (u == PS || u == LS)
 852                                                 loc.linnum++;
 853                                         }
 854                                         p++;
 855                                         continue;
 856                                 }
 857                                 break;
 858                             }
 859                             p++;
 860                             if (p[-2] == '*' && p - 3 != t->ptr)
 861                                 break;
 862                         }
 863                         if (commentToken)
 864                         {
 865                             t->value = TOKcomment;
 866                             return;
 867                         }
 868                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 869                         {   // if /** but not /**/
 870                             getDocComment(t, lastLine == linnum);
 871                         }
 872                         continue;
 873
 874                     case '/':           // do // style comments
 875                         linnum = loc.linnum;
 876                         while (1)
 877                         {   unsigned char c = *++p;
 878                             switch (c)
 879                             {
 880                                 case '\n':
 881                                     break;
 882
 883                                 case '\r':
 884                                     if (p[1] == '\n')
 885                                         p++;
 886                                     break;
 887
 888                                 case 0:
 889                                 case 0x1A:
 890                                     if (commentToken)
 891                                     {
 892                                         p = end;
 893                                         t->value = TOKcomment;
 894                                         return;
 895                                     }
 896                                     if (doDocComment && t->ptr[2] == '/')
 897                                         getDocComment(t, lastLine == linnum);
 898                                     p = end;
 899                                     t->value = TOKeof;
 900                                     return;
 901
 902                                 default:
 903                                     if (c & 0x80)
 904                                     {   unsigned u = decodeUTF();
 905                                         if (u == PS || u == LS)
 906                                             break;
 907                                     }
 908                                     continue;
 909                             }
 910                             break;
 911                         }
 912
 913                         if (commentToken)
 914                         {
 915                             p++;
 916                             loc.linnum++;
 917                             t->value = TOKcomment;
 918                             return;
 919                         }
 920                         if (doDocComment && t->ptr[2] == '/')
 921                             getDocComment(t, lastLine == linnum);
 922
 923                         p++;
 924                         loc.linnum++;
 925                         continue;
 926
 927                     case '+':
 928                     {   int nest;
 929
 930                         linnum = loc.linnum;
 931                         p++;
 932                         nest = 1;
 933                         while (1)
 934                         {   unsigned char c = *p;
 935                             switch (c)
 936                             {
 937                                 case '/':
 938                                     p++;
 939                                     if (*p == '+')
 940                                     {
 941                                         p++;
 942                                         nest++;
 943                                     }
 944                                     continue;
 945
 946                                 case '+':
 947                                     p++;
 948                                     if (*p == '/')
 949                                     {
 950                                         p++;
 951                                         if (--nest == 0)
 952                                             break;
 953                                     }
 954                                     continue;
 955
 956                                 case '\r':
 957                                     p++;
 958                                     if (*p != '\n')
 959                                         loc.linnum++;
 960                                     continue;
 961
 962                                 case '\n':
 963                                     loc.linnum++;
 964                                     p++;
 965                                     continue;
 966
 967                                 case 0:
 968                                 case 0x1A:
 969                                     error("unterminated /+ +/ comment");
 970                                     p = end;
 971                                     t->value = TOKeof;
 972                                     return;
 973
 974                                 default:
 975                                     if (c & 0x80)
 976                                     {   unsigned u = decodeUTF();
 977                                         if (u == PS || u == LS)
 978                                             loc.linnum++;
 979                                     }
 980                                     p++;
 981                                     continue;
 982                             }
 983                             break;
 984                         }
 985                         if (commentToken)
 986                         {
 987                             t->value = TOKcomment;
 988                             return;
 989                         }
 990                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
 991                         {   // if /++ but not /++/
 992                             getDocComment(t, lastLine == linnum);
 993                         }
 994                         continue;
 995                     }
 996                 }
 997                 t->value = TOKdiv;
 998                 return;
 999
1000             case '.':
1001                 p++;
1002                 if (isdigit(*p))
1003                 {   /* Note that we don't allow ._1 and ._ as being
1004                      * valid floating point numbers.
1005                      */
1006                     p--;
1007                     t->value = inreal(t);
1008                 }
1009                 else if (p[0] == '.')
1010                 {
1011                     if (p[1] == '.')
1012                     {   p += 2;
1013                         t->value = TOKdotdotdot;
1014                     }
1015                     else
1016                     {   p++;
1017                         t->value = TOKslice;
1018                     }
1019                 }
1020                 else
1021                     t->value = TOKdot;
1022                 return;
1023
1024             case '&':
1025                 p++;
1026                 if (*p == '=')
1027                 {   p++;
1028                     t->value = TOKandass;
1029                 }
1030                 else if (*p == '&')
1031                 {   p++;
1032                     t->value = TOKandand;
1033                     if (dltSyntax)
1034                         error("Use 'and' instead of '&&'");
1035                 }
1036                 else
1037                     t->value = TOKand;
1038                 return;
1039
1040             case '|':
1041                 p++;
1042                 if (*p == '=')
1043                 {   p++;
1044                     t->value = TOKorass;
1045                 }
1046                 else if (*p == '|')
1047                 {   p++;
1048                     t->value = TOKoror;
1049                     if (dltSyntax)
1050                         error("Use 'or' instead of '||'");
1051                 }
1052                 else
1053                     t->value = TOKor;
1054                 return;
1055
1056             case '-':
1057                 p++;
1058                 if (*p == '=')
1059                 {   p++;
1060                     t->value = TOKminass;
1061                 }
1062 #if 0
1063                 else if (*p == '>')
1064                 {   p++;
1065                     t->value = TOKarrow;
1066                 }
1067 #endif
1068                 else if (*p == '-')
1069                 {   p++;
1070                     t->value = TOKminusminus;
1071                 }
1072                 else
1073                     t->value = TOKmin;
1074                 return;
1075
1076             case '+':
1077                 p++;
1078                 if (*p == '=')
1079                 {   p++;
1080                     t->value = TOKaddass;
1081                 }
1082                 else if (*p == '+')
1083                 {   p++;
1084                     t->value = TOKplusplus;
1085                 }
1086                 else
1087                     t->value = TOKadd;
1088                 return;
1089
1090             case '<':
1091                 p++;
1092                 if (*p == '=')
1093                 {   p++;
1094                     t->value = TOKle;                   // <=
1095                 }
1096                 else if (*p == '<')
1097                 {   p++;
1098                     if (*p == '=')
1099                     {   p++;
1100                         t->value = TOKshlass;           // <<=
1101                     }
1102                     else
1103                         t->value = TOKshl;              // <<
1104                 }
1105                 else if (*p == '>')
1106                 {   p++;
1107                     if (*p == '=')
1108                     {   p++;
1109                         t->value = TOKleg;              // <>=
1110                     }
1111                     else
1112                         t->value = TOKlg;               // <>
1113                 }
1114                 else
1115                     t->value = TOKlt;                   // <
1116                 return;
1117
1118             case '>':
1119                 p++;
1120                 if (*p == '=')
1121                 {   p++;
1122                     t->value = TOKge;                   // >=
1123                 }
1124                 else if (*p == '>')
1125                 {   p++;
1126                     if (*p == '=')
1127                     {   p++;
1128                         t->value = TOKshrass;           // >>=
1129                     }
1130                     else if (*p == '>')
1131                     {   p++;
1132                         if (*p == '=')
1133                         {   p++;
1134                             t->value = TOKushrass;      // >>>=
1135                         }
1136                         else
1137                             t->value = TOKushr;         // >>>
1138                     }
1139                     else
1140                         t->value = TOKshr;              // >>
1141                 }
1142                 else
1143                     t->value = TOKgt;                   // >
1144                 return;
1145
1146             case '!':
1147                 p++;
1148                 if (*p == '=')
1149                 {   p++;
1150                     if (*p == '=' && global.params.Dversion == 1)
1151                     {   p++;
1152                         t->value = TOKnotidentity;      // !==
1153                     }
1154                     else
1155                         t->value = TOKnotequal;         // !=
1156                 }
1157                 else if (*p == '<')
1158                 {   p++;
1159                     if (*p == '>')
1160                     {   p++;
1161                         if (*p == '=')
1162                         {   p++;
1163                             t->value = TOKunord; // !<>=
1164                         }
1165                         else
1166                             t->value = TOKue;   // !<>
1167                     }
1168                     else if (*p == '=')
1169                     {   p++;
1170                         t->value = TOKug;       // !<=
1171                     }
1172                     else
1173                         t->value = TOKuge;      // !<
1174                 }
1175                 else if (*p == '>')
1176                 {   p++;
1177                     if (*p == '=')
1178                     {   p++;
1179                         t->value = TOKul;       // !>=
1180                     }
1181                     else
1182                         t->value = TOKule;      // !>
1183                 }
1184                 else
1185                     t->value = TOKnot;          // !
1186                 return;
1187
1188             case '=':
1189                 p++;
1190                 if (*p == '=')
1191                 {   p++;
1192                     if (*p == '=' && global.params.Dversion == 1)
1193                     {   p++;
1194                         t->value = TOKidentity;         // ===
1195                     }
1196                     else
1197                         t->value = TOKequal;            // ==
1198                 }
1199                 else
1200                     t->value = TOKassign;               // =
1201                 return;
1202
1203             case '~':
1204                 p++;
1205                 if (*p == '=')
1206                 {   p++;
1207                     t->value = TOKcatass;               // ~=
1208                 }
1209                 else
1210                     t->value = TOKtilde;                // ~
1211                 return;
1212
1213 #define NESTED(cin,tokin,cout,tokout) \
1214             case cin: nesting++; p++; t->value = tokin; return;\
1215             case cout: if (nesting == 0) {error("Unexpected '%c'", cout);} else {nesting--;} p++; t->value = tokout; return;
1216
1217             NESTED('(', TOKlparen, ')', TOKrparen)
1218             NESTED('[', TOKlbracket, ']', TOKrbracket)
1219             NESTED('{', TOKlcurly, '}', TOKrcurly)
1220 #undef NESTED
1221
1222 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1223             SINGLE('?', TOKquestion)
1224             SINGLE(',', TOKcomma)
1225             SINGLE(';', TOKsemicolon)
1226             SINGLE('$', TOKdollar)
1227             SINGLE('@', TOKat)
1228
1229 #undef SINGLE
1230
1231             case ':':
1232                 p++;
1233                 if (!nesting)
1234                         indent += 1;
1235                 t->value = TOKcolon;
1236                 return;
1237
1238 #define DOUBLE(c1,tok1,c2,tok2)         \
1239             case c1:                    \
1240                 p++;                    \
1241                 if (*p == c2)           \
1242                 {   p++;                \
1243                     t->value = tok2;    \
1244                 }                       \
1245                 else                    \
1246                     t->value = tok1;    \
1247                 return;
1248
1249             DOUBLE('*', TOKmul, '=', TOKmulass)
1250             DOUBLE('%', TOKmod, '=', TOKmodass)
1251             DOUBLE('^', TOKxor, '=', TOKxorass)
1252
1253 #undef DOUBLE
1254
1255             case '#':           // do # style comments and pragmas
1256                 if (dltSyntax)
1257                 {
1258                     do { p++; } while (*p != '\n');
1259                 }
1260                 else
1261                 {
1262                     p++;
1263                     pragma();
1264                 }
1265                 continue;
1266
1267             default:
1268             {   unsigned char c = *p;
1269
1270                 if (c & 0x80)
1271                 {   unsigned u = decodeUTF();
1272
1273                     // Check for start of unicode identifier
1274                     if (isUniAlpha(u))
1275                         goto case_ident;
1276
1277                     if (u == PS || u == LS)
1278                     {
1279                         loc.linnum++;
1280                         p++;
1281                         continue;
1282                     }
1283                 }
1284                 if (isprint(c))
1285                     error("unsupported char '%c'", c);
1286                 else
1287                     error("unsupported char 0x%02x", c);
1288                 p++;
1289                 continue;
1290             }
1291         }
1292     }
1293 }
1294
1295 /*******************************************
1296  * Parse escape sequence.
1297  */
1298
1299 unsigned Lexer::escapeSequence()
1300 {   unsigned c;
1301     int n;
1302     int ndigits;
1303
1304     c = *p;
1305     switch (c)
1306     {
1307         case '\'':
1308         case '"':
1309         case '?':
1310         case '\\':
1311         Lconsume:
1312                 p++;
1313                 break;
1314
1315         case 'a':       c = 7;          goto Lconsume;
1316         case 'b':       c = 8;          goto Lconsume;
1317         case 'f':       c = 12;         goto Lconsume;
1318         case 'n':       c = 10;         goto Lconsume;
1319         case 'r':       c = 13;         goto Lconsume;
1320         case 't':       c = 9;          goto Lconsume;
1321         case 'v':       c = 11;         goto Lconsume;
1322
1323         case 'u':
1324                 ndigits = 4;
1325                 goto Lhex;
1326         case 'U':
1327                 ndigits = 8;
1328                 goto Lhex;
1329         case 'x':
1330                 ndigits = 2;
1331         Lhex:
1332                 p++;
1333                 c = *p;
1334                 if (ishex(c))
1335                 {   unsigned v;
1336
1337                     n = 0;
1338                     v = 0;
1339                     while (1)
1340                     {
1341                         if (isdigit(c))
1342                             c -= '0';
1343                         else if (islower(c))
1344                             c -= 'a' - 10;
1345                         else
1346                             c -= 'A' - 10;
1347                         v = v * 16 + c;
1348                         c = *++p;
1349                         if (++n == ndigits)
1350                             break;
1351                         if (!ishex(c))
1352                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1353                             break;
1354                         }
1355                     }
1356                     if (ndigits != 2 && !utf_isValidDchar(v))
1357                         error("invalid UTF character \\U%08x", v);
1358                     c = v;
1359                 }
1360                 else
1361                     error("undefined escape hex sequence \\%c\n",c);
1362                 break;
1363
1364         case '&':                       // named character entity
1365                 for (unsigned char *idstart = ++p; 1; p++)
1366                 {
1367                     switch (*p)
1368                     {
1369                         case ';':
1370                             c = HtmlNamedEntity(idstart, p - idstart);
1371                             if (c == ~0)
1372                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1373                                 c = ' ';
1374                             }
1375                             p++;
1376                             break;
1377
1378                         default:
1379                             if (isalpha(*p) ||
1380                                 (p != idstart + 1 && isdigit(*p)))
1381                                 continue;
1382                             error("unterminated named entity");
1383                             break;
1384                     }
1385                     break;
1386                 }
1387                 break;
1388
1389         case 0:
1390         case 0x1A:                      // end of file
1391                 c = '\\';
1392                 break;
1393
1394         default:
1395                 if (isoctal(c))
1396                 {   unsigned v;
1397
1398                     n = 0;
1399                     v = 0;
1400                     do
1401                     {
1402                         v = v * 8 + (c - '0');
1403                         c = *++p;
1404                     } while (++n < 3 && isoctal(c));
1405                     c = v;
1406                     if (c > 0xFF)
1407                         error("0%03o is larger than a byte", c);
1408                 }
1409                 else
1410                     error("undefined escape sequence \\%c\n",c);
1411                 break;
1412     }
1413     return c;
1414 }
1415
1416 /**************************************
1417  */
1418
1419 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1420 {   unsigned c;
1421     Loc start = loc;
1422
1423     p++;
1424     stringbuffer.reset();
1425     while (1)
1426     {
1427         c = *p++;
1428         switch (c)
1429         {
1430             case '\n':
1431                 loc.linnum++;
1432                 break;
1433
1434             case '\r':
1435                 if (*p == '\n')
1436                     continue;   // ignore
1437                 c = '\n';       // treat EndOfLine as \n character
1438                 loc.linnum++;
1439                 break;
1440
1441             case 0:
1442             case 0x1A:
1443                 error("unterminated string constant starting at %s", start.toChars());
1444                 t->ustring = (unsigned char *)"";
1445                 t->len = 0;
1446                 t->postfix = 0;
1447                 return TOKstring;
1448
1449             case '"':
1450             case '`':
1451                 if (c == tc)
1452                 {
1453                     t->len = stringbuffer.offset;
1454                     stringbuffer.writeByte(0);
1455                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1456                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1457                     stringPostfix(t);
1458                     return TOKstring;
1459                 }
1460                 break;
1461
1462             default:
1463                 if (c & 0x80)
1464                 {   p--;
1465                     unsigned u = decodeUTF();
1466                     p++;
1467                     if (u == PS || u == LS)
1468                         loc.linnum++;
1469                     stringbuffer.writeUTF8(u);
1470                     continue;
1471                 }
1472                 break;
1473         }
1474         stringbuffer.writeByte(c);
1475     }
1476 }
1477
1478 /**************************************
1479  * Lex hex strings:
1480  *      x"0A ae 34FE BD"
1481  */
1482
1483 TOK Lexer::hexStringConstant(Token *t)
1484 {   unsigned c;
1485     Loc start = loc;
1486     unsigned n = 0;
1487     unsigned v;
1488
1489     p++;
1490     stringbuffer.reset();
1491     while (1)
1492     {
1493         c = *p++;
1494         switch (c)
1495         {
1496             case ' ':
1497             case '\t':
1498             case '\v':
1499             case '\f':
1500                 continue;                       // skip white space
1501
1502             case '\r':
1503                 if (*p == '\n')
1504                     continue;                   // ignore
1505                 // Treat isolated '\r' as if it were a '\n'
1506             case '\n':
1507                 loc.linnum++;
1508                 continue;
1509
1510             case 0:
1511             case 0x1A:
1512                 error("unterminated string constant starting at %s", start.toChars());
1513                 t->ustring = (unsigned char *)"";
1514                 t->len = 0;
1515                 t->postfix = 0;
1516                 return TOKstring;
1517
1518             case '"':
1519                 if (n & 1)
1520                 {   error("odd number (%d) of hex characters in hex string", n);
1521                     stringbuffer.writeByte(v);
1522                 }
1523                 t->len = stringbuffer.offset;
1524                 stringbuffer.writeByte(0);
1525                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1526                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1527                 stringPostfix(t);
1528                 return TOKstring;
1529
1530             default:
1531                 if (c >= '0' && c <= '9')
1532                     c -= '0';
1533                 else if (c >= 'a' && c <= 'f')
1534                     c -= 'a' - 10;
1535                 else if (c >= 'A' && c <= 'F')
1536                     c -= 'A' - 10;
1537                 else if (c & 0x80)
1538                 {   p--;
1539                     unsigned u = decodeUTF();
1540                     p++;
1541                     if (u == PS || u == LS)
1542                         loc.linnum++;
1543                     else
1544                         error("non-hex character \\u%x", u);
1545                 }
1546                 else
1547                     error("non-hex character '%c'", c);
1548                 if (n & 1)
1549                 {   v = (v << 4) | c;
1550                     stringbuffer.writeByte(v);
1551                 }
1552                 else
1553                     v = c;
1554                 n++;
1555                 break;
1556         }
1557     }
1558 }
1559
1560
1561 #if V2
1562 /**************************************
1563  * Lex delimited strings:
1564  *      q"(foo(xxx))"   // "foo(xxx)"
1565  *      q"[foo(]"       // "foo("
1566  *      q"/foo]/"       // "foo]"
1567  *      q"HERE
1568  *      foo
1569  *      HERE"           // "foo\n"
1570  * Input:
1571  *      p is on the "
1572  */
1573
1574 TOK Lexer::delimitedStringConstant(Token *t)
1575 {   unsigned c;
1576     Loc start = loc;
1577     unsigned delimleft = 0;
1578     unsigned delimright = 0;
1579     unsigned nest = 1;
1580     unsigned nestcount;
1581     Identifier *hereid = NULL;
1582     unsigned blankrol = 0;
1583     unsigned startline = 0;
1584
1585     p++;
1586     stringbuffer.reset();
1587     while (1)
1588     {
1589         c = *p++;
1590         //printf("c = '%c'\n", c);
1591         switch (c)
1592         {
1593             case '\n':
1594             Lnextline:
1595                 loc.linnum++;
1596                 startline = 1;
1597                 if (blankrol)
1598                 {   blankrol = 0;
1599                     continue;
1600                 }
1601                 if (hereid)
1602                 {
1603                     stringbuffer.writeUTF8(c);
1604                     continue;
1605                 }
1606                 break;
1607
1608             case '\r':
1609                 if (*p == '\n')
1610                     continue;   // ignore
1611                 c = '\n';       // treat EndOfLine as \n character
1612                 goto Lnextline;
1613
1614             case 0:
1615             case 0x1A:
1616                 goto Lerror;
1617
1618             default:
1619                 if (c & 0x80)
1620                 {   p--;
1621                     c = decodeUTF();
1622                     p++;
1623                     if (c == PS || c == LS)
1624                         goto Lnextline;
1625                 }
1626                 break;
1627         }
1628         if (delimleft == 0)
1629         {   delimleft = c;
1630             nest = 1;
1631             nestcount = 1;
1632             if (c == '(')
1633                 delimright = ')';
1634             else if (c == '{')
1635                 delimright = '}';
1636             else if (c == '[')
1637                 delimright = ']';
1638             else if (c == '<')
1639                 delimright = '>';
1640             else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1641             {   // Start of identifier; must be a heredoc
1642                 Token t;
1643                 p--;
1644                 scan(&t);               // read in heredoc identifier
1645                 if (t.value != TOKidentifier)
1646                 {   error("identifier expected for heredoc, not %s", t.toChars());
1647                     delimright = c;
1648                 }
1649                 else
1650                 {   hereid = t.ident;
1651                     //printf("hereid = '%s'\n", hereid->toChars());
1652                     blankrol = 1;
1653                 }
1654                 nest = 0;
1655             }
1656             else
1657             {   delimright = c;
1658                 nest = 0;
1659             }
1660         }
1661         else
1662         {
1663             if (blankrol)
1664             {   error("heredoc rest of line should be blank");
1665                 blankrol = 0;
1666                 continue;
1667             }
1668             if (nest == 1)
1669             {
1670                 if (c == delimleft)
1671                     nestcount++;
1672                 else if (c == delimright)
1673                 {   nestcount--;
1674                     if (nestcount == 0)
1675                         goto Ldone;
1676                 }
1677             }
1678             else if (c == delimright)
1679                 goto Ldone;
1680             if (startline && isalpha(c))
1681             {   Token t;
1682                 unsigned char *psave = p;
1683                 p--;
1684                 scan(&t);               // read in possible heredoc identifier
1685                 //printf("endid = '%s'\n", t.ident->toChars());
1686                 if (t.value == TOKidentifier && t.ident->equals(hereid))
1687                 {   /* should check that rest of line is blank
1688                      */
1689                     goto Ldone;
1690                 }
1691                 p = psave;
1692             }
1693             stringbuffer.writeUTF8(c);
1694             startline = 0;
1695         }
1696     }
1697
1698 Ldone:
1699     if (*p == '"')
1700         p++;
1701     else
1702         error("delimited string must end in %c\"", delimright);
1703     t->len = stringbuffer.offset;
1704     stringbuffer.writeByte(0);
1705     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1706     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1707     stringPostfix(t);
1708     return TOKstring;
1709
1710 Lerror:
1711     error("unterminated string constant starting at %s", start.toChars());
1712     t->ustring = (unsigned char *)"";
1713     t->len = 0;
1714     t->postfix = 0;
1715     return TOKstring;
1716 }
1717
1718 /**************************************
1719  * Lex delimited strings:
1720  *      q{ foo(xxx) } // " foo(xxx) "
1721  *      q{foo(}       // "foo("
1722  *      q{{foo}"}"}   // "{foo}"}""
1723  * Input:
1724  *      p is on the q
1725  */
1726
1727 TOK Lexer::tokenStringConstant(Token *t)
1728 {
1729     unsigned nest = 1;
1730     Loc start = loc;
1731     unsigned char *pstart = ++p;
1732
1733     while (1)
1734     {   Token tok;
1735
1736         scan(&tok);
1737         switch (tok.value)
1738         {
1739             case TOKlcurly:
1740                 nest++;
1741                 continue;
1742
1743             case TOKrcurly:
1744                 if (--nest == 0)
1745                     goto Ldone;
1746                 continue;
1747
1748             case TOKeof:
1749                 goto Lerror;
1750
1751             default:
1752                 continue;
1753         }
1754     }
1755
1756 Ldone:
1757     t->len = p - 1 - pstart;
1758     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1759     memcpy(t->ustring, pstart, t->len);
1760     t->ustring[t->len] = 0;
1761     stringPostfix(t);
1762     return TOKstring;
1763
1764 Lerror:
1765     error("unterminated token string constant starting at %s", start.toChars());
1766     t->ustring = (unsigned char *)"";
1767     t->len = 0;
1768     t->postfix = 0;
1769     return TOKstring;
1770 }
1771
1772 #endif
1773
1774
1775 /**************************************
1776  */
1777
1778 TOK Lexer::escapeStringConstant(Token *t, int wide)
1779 {   unsigned c;
1780     Loc start = loc;
1781
1782     p++;
1783     stringbuffer.reset();
1784     while (1)
1785     {
1786         c = *p++;
1787         switch (c)
1788         {
1789             case '\\':
1790                 switch (*p)
1791                 {
1792                     case 'u':
1793                     case 'U':
1794                     case '&':
1795                         c = escapeSequence();
1796                         stringbuffer.writeUTF8(c);
1797                         continue;
1798
1799                     default:
1800                         c = escapeSequence();
1801                         break;
1802                 }
1803                 break;
1804
1805             case '\n':
1806                 loc.linnum++;
1807                 break;
1808
1809             case '\r':
1810                 if (*p == '\n')
1811                     continue;   // ignore
1812                 c = '\n';       // treat EndOfLine as \n character
1813                 loc.linnum++;
1814                 break;
1815
1816             case '"':
1817                 t->len = stringbuffer.offset;
1818                 stringbuffer.writeByte(0);
1819                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1820                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1821                 stringPostfix(t);
1822                 return TOKstring;
1823
1824             case 0:
1825             case 0x1A:
1826                 p--;
1827                 error("unterminated string constant starting at %s", start.toChars());
1828                 t->ustring = (unsigned char *)"";
1829                 t->len = 0;
1830                 t->postfix = 0;
1831                 return TOKstring;
1832
1833             default:
1834                 if (c & 0x80)
1835                 {
1836                     p--;
1837                     c = decodeUTF();
1838                     if (c == LS || c == PS)
1839                     {   c = '\n';
1840                         loc.linnum++;
1841                     }
1842                     p++;
1843                     stringbuffer.writeUTF8(c);
1844                     continue;
1845                 }
1846                 break;
1847         }
1848         stringbuffer.writeByte(c);
1849     }
1850 }
1851
1852 /**************************************
1853  */
1854
1855 TOK Lexer::charConstant(Token *t, int wide)
1856 {
1857     unsigned c;
1858     TOK tk = TOKcharv;
1859
1860     //printf("Lexer::charConstant\n");
1861     p++;
1862     c = *p++;
1863     switch (c)
1864     {
1865         case '\\':
1866             switch (*p)
1867             {
1868                 case 'u':
1869                     t->uns64value = escapeSequence();
1870                     tk = TOKwcharv;
1871                     break;
1872
1873                 case 'U':
1874                 case '&':
1875                     t->uns64value = escapeSequence();
1876                     tk = TOKdcharv;
1877                     break;
1878
1879                 default:
1880                     t->uns64value = escapeSequence();
1881                     break;
1882             }
1883             break;
1884
1885         case '\n':
1886         L1:
1887             loc.linnum++;
1888         case '\r':
1889         case 0:
1890         case 0x1A:
1891         case '\'':
1892             error("unterminated character constant");
1893             return tk;
1894
1895         default:
1896             if (c & 0x80)
1897             {
1898                 p--;
1899                 c = decodeUTF();
1900                 p++;
1901                 if (c == LS || c == PS)
1902                     goto L1;
1903                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1904                     tk = TOKwcharv;
1905                 else
1906                     tk = TOKdcharv;
1907             }
1908             t->uns64value = c;
1909             break;
1910     }
1911
1912     if (*p != '\'')
1913     {   error("unterminated character constant");
1914         return tk;
1915     }
1916     p++;
1917     return tk;
1918 }
1919
1920 /***************************************
1921  * Get postfix of string literal.
1922  */
1923
1924 void Lexer::stringPostfix(Token *t)
1925 {
1926     switch (*p)
1927     {
1928         case 'c':
1929         case 'w':
1930         case 'd':
1931             t->postfix = *p;
1932             p++;
1933             break;
1934
1935         default:
1936             t->postfix = 0;
1937             break;
1938     }
1939 }
1940
/***************************************
 * Read \u or \U unicode sequence
 * Input:
 *      u       'u' or 'U'
 * Returns:
 *      the value of the hex digits read (8 digits for 'U', otherwise 4)
 *
 * This function is compiled out by the surrounding #if 0.
 */

#if 0
unsigned Lexer::wchar(unsigned u)
{
    const unsigned nchars = (u == 'U') ? 8 : 4;
    unsigned value = 0;

    for (unsigned n = 0; ; n++)
    {
        ++p;                    // advance first, then decide whether to stop
        if (n == nchars)
            return value;

        unsigned char c = *p;
        if (!ishex(c))
        {
            error("\\%c sequence must be followed by %d hex characters", u, nchars);
            return value;
        }

        // Convert the hex digit to its numeric value
        unsigned char digit;
        if (isdigit(c))
            digit = c - '0';
        else if (islower(c))
            digit = c - 'a' + 10;
        else
            digit = c - 'A' + 10;

        value = (value << 4) | digit;
    }
}
#endif
1979
1980 /**************************************
1981  * Read in a number.
1982  * If it's an integer, store it in tok.TKutok.Vlong.
1983  *      integers can be decimal, octal or hex
1984  *      Handle the suffixes U, UL, LU, L, etc.
1985  * If it's double, store it in tok.TKutok.Vdouble.
1986  * Returns:
1987  *      TKnum
1988  *      TKdouble,...
1989  */
1990
1991 TOK Lexer::number(Token *t)
1992 {
1993     // We use a state machine to collect numbers
1994     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1995         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1996         STATE_hexh, STATE_error };
1997     enum STATE state;
1998
1999     enum FLAGS
2000     {   FLAGS_decimal  = 1,             // decimal
2001         FLAGS_unsigned = 2,             // u or U suffix
2002         FLAGS_long     = 4,             // l or L suffix
2003     };
2004     enum FLAGS flags = FLAGS_decimal;
2005
2006     int i;
2007     int base;
2008     unsigned c;
2009     unsigned char *start;
2010     TOK result;
2011
2012     //printf("Lexer::number()\n");
2013     state = STATE_initial;
2014     base = 0;
2015     stringbuffer.reset();
2016     start = p;
2017     while (1)
2018     {
2019         c = *p;
2020         switch (state)
2021         {
2022             case STATE_initial:         // opening state
2023                 if (c == '0')
2024                     state = STATE_0;
2025                 else
2026                     state = STATE_decimal;
2027                 break;
2028
2029             case STATE_0:
2030                 flags = (FLAGS) (flags & ~FLAGS_decimal);
2031                 switch (c)
2032                 {
2033 #if ZEROH
2034                     case 'H':                   // 0h
2035                     case 'h':
2036                         goto hexh;
2037 #endif
2038                     case 'X':
2039                     case 'x':
2040                         state = STATE_hex0;
2041                         break;
2042
2043                     case '.':
2044                         if (p[1] == '.')        // .. is a separate token
2045                             goto done;
2046                     case 'i':
2047                     case 'f':
2048                     case 'F':
2049                         goto real;
2050 #if ZEROH
2051                     case 'E':
2052                     case 'e':
2053                         goto case_hex;
2054 #endif
2055                     case 'B':
2056                     case 'b':
2057                         state = STATE_binary0;
2058                         break;
2059
2060                     case '0': case '1': case '2': case '3':
2061                     case '4': case '5': case '6': case '7':
2062                         state = STATE_octal;
2063                         break;
2064
2065 #if ZEROH
2066                     case '8': case '9': case 'A':
2067                     case 'C': case 'D': case 'F':
2068                     case 'a': case 'c': case 'd': case 'f':
2069                     case_hex:
2070                         state = STATE_hexh;
2071                         break;
2072 #endif
2073                     case '_':
2074                         state = STATE_octal;
2075                         p++;
2076                         continue;
2077
2078                     case 'L':
2079                         if (p[1] == 'i')
2080                             goto real;
2081                         goto done;
2082
2083                     default:
2084                         goto done;
2085                 }
2086                 break;
2087
2088             case STATE_decimal:         // reading decimal number
2089                 if (!isdigit(c))
2090                 {
2091 #if ZEROH
2092                     if (ishex(c)
2093                         || c == 'H' || c == 'h'
2094                        )
2095                         goto hexh;
2096 #endif
2097                     if (c == '_')               // ignore embedded _
2098                     {   p++;
2099                         continue;
2100                     }
2101                     if (c == '.' && p[1] != '.')
2102                         goto real;
2103                     else if (c == 'i' || c == 'f' || c == 'F' ||
2104                              c == 'e' || c == 'E')
2105                     {
2106             real:       // It's a real number. Back up and rescan as a real
2107                         p = start;
2108                         return inreal(t);
2109                     }
2110                     else if (c == 'L' && p[1] == 'i')
2111                         goto real;
2112                     goto done;
2113                 }
2114                 break;
2115
2116             case STATE_hex0:            // reading hex number
2117             case STATE_hex:
2118                 if (!ishex(c))
2119                 {
2120                     if (c == '_')               // ignore embedded _
2121                     {   p++;
2122                         continue;
2123                     }
2124                     if (c == '.' && p[1] != '.')
2125                         goto real;
2126                     if (c == 'P' || c == 'p' || c == 'i')
2127                         goto real;
2128                     if (state == STATE_hex0)
2129                         error("Hex digit expected, not '%c'", c);
2130                     goto done;
2131                 }
2132                 state = STATE_hex;
2133                 break;
2134
2135 #if ZEROH
2136             hexh:
2137                 state = STATE_hexh;
2138             case STATE_hexh:            // parse numbers like 0FFh
2139                 if (!ishex(c))
2140                 {
2141                     if (c == 'H' || c == 'h')
2142                     {
2143                         p++;
2144                         base = 16;
2145                         goto done;
2146                     }
2147                     else
2148                     {
2149                         // Check for something like 1E3 or 0E24
2150                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2151                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2152                             goto real;
2153                         error("Hex digit expected, not '%c'", c);
2154                         goto done;
2155                     }
2156                 }
2157                 break;
2158 #endif
2159
2160             case STATE_octal:           // reading octal number
2161             case STATE_octale:          // reading octal number with non-octal digits
2162                 if (!isoctal(c))
2163                 {
2164 #if ZEROH
2165                     if (ishex(c)
2166                         || c == 'H' || c == 'h'
2167                        )
2168                         goto hexh;
2169 #endif
2170                     if (c == '_')               // ignore embedded _
2171                     {   p++;
2172                         continue;
2173                     }
2174                     if (c == '.' && p[1] != '.')
2175                         goto real;
2176                     if (c == 'i')
2177                         goto real;
2178                     if (isdigit(c))
2179                     {
2180                         state = STATE_octale;
2181                     }
2182                     else
2183                         goto done;
2184                 }
2185                 break;
2186
2187             case STATE_binary0:         // starting binary number
2188             case STATE_binary:          // reading binary number
2189                 if (c != '0' && c != '1')
2190                 {
2191 #if ZEROH
2192                     if (ishex(c)
2193                         || c == 'H' || c == 'h'
2194                        )
2195                         goto hexh;
2196 #endif
2197                     if (c == '_')               // ignore embedded _
2198                     {   p++;
2199                         continue;
2200                     }
2201                     if (state == STATE_binary0)
2202                     {   error("binary digit expected");
2203                         state = STATE_error;
2204                         break;
2205                     }
2206                     else
2207                         goto done;
2208                 }
2209                 state = STATE_binary;
2210                 break;
2211
2212             case STATE_error:           // for error recovery
2213                 if (!isdigit(c))        // scan until non-digit
2214                     goto done;
2215                 break;
2216
2217             default:
2218                 assert(0);
2219         }
2220         stringbuffer.writeByte(c);
2221         p++;
2222     }
2223 done:
2224     stringbuffer.writeByte(0);          // terminate string
2225     if (state == STATE_octale)
2226         error("Octal digit expected");
2227
2228     uinteger_t n;                       // unsigned >=64 bit integer type
2229
2230     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2231         n = stringbuffer.data[0] - '0';
2232     else
2233     {
2234         // Convert string to integer
2235 #if __DMC__
2236         errno = 0;
2237         n = strtoull((char *)stringbuffer.data,NULL,base);
2238         if (errno == ERANGE)
2239             error("integer overflow");
2240 #else
2241         // Not everybody implements strtoull()
2242         char *p = (char *)stringbuffer.data;
2243         int r = 10, d;
2244
2245         if (*p == '0')
2246         {
2247             if (p[1] == 'x' || p[1] == 'X')
2248                 p += 2, r = 16;
2249             else if (p[1] == 'b' || p[1] == 'B')
2250                 p += 2, r = 2;
2251             else if (isdigit(p[1]))
2252                 p += 1, r = 8;
2253         }
2254
2255         n = 0;
2256         while (1)
2257         {
2258             if (*p >= '0' && *p <= '9')
2259                 d = *p - '0';
2260             else if (*p >= 'a' && *p <= 'z')
2261                 d = *p - 'a' + 10;
2262             else if (*p >= 'A' && *p <= 'Z')
2263                 d = *p - 'A' + 10;
2264             else
2265                 break;
2266             if (d >= r)
2267                 break;
2268             if (n && n * r + d <= n)
2269             {
2270                 error ("integer overflow");
2271                 break;
2272             }
2273
2274             n = n * r + d;
2275             p++;
2276         }
2277 #endif
2278         if (sizeof(n) > 8 &&
2279             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2280             error("integer overflow");
2281     }
2282
2283     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2284     while (1)
2285     {   unsigned char f;
2286
2287         switch (*p)
2288         {   case 'U':
2289             case 'u':
2290                 f = FLAGS_unsigned;
2291                 goto L1;
2292
2293             case 'l':
2294                 if (1 || !global.params.useDeprecated)
2295                     error("'l' suffix is deprecated, use 'L' instead");
2296             case 'L':
2297                 f = FLAGS_long;
2298             L1:
2299                 p++;
2300                 if (flags & f)
2301                     error("unrecognized token");
2302                 flags = (FLAGS) (flags | f);
2303                 continue;
2304             default:
2305                 break;
2306         }
2307         break;
2308     }
2309
2310     switch (flags)
2311     {
2312         case 0:
2313             /* Octal or Hexadecimal constant.
2314              * First that fits: int, uint, long, ulong
2315              */
2316             if (n & 0x8000000000000000LL)
2317                     result = TOKuns64v;
2318             else if (n & 0xFFFFFFFF00000000LL)
2319                     result = TOKint64v;
2320             else if (n & 0x80000000)
2321                     result = TOKuns32v;
2322             else
2323                     result = TOKint32v;
2324             break;
2325
2326         case FLAGS_decimal:
2327             /* First that fits: int, long, long long
2328              */
2329             if (n & 0x8000000000000000LL)
2330             {       error("signed integer overflow");
2331                     result = TOKuns64v;
2332             }
2333             else if (n & 0xFFFFFFFF80000000LL)
2334                     result = TOKint64v;
2335             else
2336                     result = TOKint32v;
2337             break;
2338
2339         case FLAGS_unsigned:
2340         case FLAGS_decimal | FLAGS_unsigned:
2341             /* First that fits: uint, ulong
2342              */
2343             if (n & 0xFFFFFFFF00000000LL)
2344                     result = TOKuns64v;
2345             else
2346                     result = TOKuns32v;
2347             break;
2348
2349         case FLAGS_decimal | FLAGS_long:
2350             if (n & 0x8000000000000000LL)
2351             {       error("signed integer overflow");
2352                     result = TOKuns64v;
2353             }
2354             else
2355                     result = TOKint64v;
2356             break;
2357
2358         case FLAGS_long:
2359             if (n & 0x8000000000000000LL)
2360                     result = TOKuns64v;
2361             else
2362                     result = TOKint64v;
2363             break;
2364
2365         case FLAGS_unsigned | FLAGS_long:
2366         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2367             result = TOKuns64v;
2368             break;
2369
2370         default:
2371             #ifdef DEBUG
2372                 printf("%x\n",flags);
2373             #endif
2374             assert(0);
2375     }
2376     t->uns64value = n;
2377     return result;
2378 }
2379
2380 /**************************************
2381  * Read in characters, converting them to real.
2382  * Bugs:
2383  *      Exponent overflow not detected.
2384  *      Too much requested precision is not detected.
2385  */
2386
2387 TOK Lexer::inreal(Token *t)
2388 #ifdef __DMC__
2389 __in
2390 {
2391     assert(*p == '.' || isdigit(*p));
2392 }
2393 __out (result)
2394 {
2395     switch (result)
2396     {
2397         case TOKfloat32v:
2398         case TOKfloat64v:
2399         case TOKfloat80v:
2400         case TOKimaginary32v:
2401         case TOKimaginary64v:
2402         case TOKimaginary80v:
2403             break;
2404
2405         default:
2406             assert(0);
2407     }
2408 }
2409 __body
2410 #endif /* __DMC__ */
2411 {   int dblstate;
2412     unsigned c;
2413     char hex;                   // is this a hexadecimal-floating-constant?
2414     TOK result;
2415
2416     //printf("Lexer::inreal()\n");
2417     stringbuffer.reset();
2418     dblstate = 0;
2419     hex = 0;
2420 Lnext:
2421     while (1)
2422     {
2423         // Get next char from input
2424         c = *p++;
2425         //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2426         while (1)
2427         {
2428             switch (dblstate)
2429             {
2430                 case 0:                 // opening state
2431                     if (c == '0')
2432                         dblstate = 9;
2433                     else if (c == '.')
2434                         dblstate = 3;
2435                     else
2436                         dblstate = 1;
2437                     break;
2438
2439                 case 9:
2440                     dblstate = 1;
2441                     if (c == 'X' || c == 'x')
2442                     {   hex++;
2443                         break;
2444                     }
2445                 case 1:                 // digits to left of .
2446                 case 3:                 // digits to right of .
2447                 case 7:                 // continuing exponent digits
2448                     if (!isdigit(c) && !(hex && isxdigit(c)))
2449                     {
2450                         if (c == '_')
2451                             goto Lnext; // ignore embedded '_'
2452                         dblstate++;
2453                         continue;
2454                     }
2455                     break;
2456
2457                 case 2:                 // no more digits to left of .
2458                     if (c == '.')
2459                     {   dblstate++;
2460                         break;
2461                     }
2462                 case 4:                 // no more digits to right of .
2463                     if ((c == 'E' || c == 'e') ||
2464                         hex && (c == 'P' || c == 'p'))
2465                     {   dblstate = 5;
2466                         hex = 0;        // exponent is always decimal
2467                         break;
2468                     }
2469                     if (hex)
2470                         error("binary-exponent-part required");
2471                     goto done;
2472
2473                 case 5:                 // looking immediately to right of E
2474                     dblstate++;
2475                     if (c == '-' || c == '+')
2476                         break;
2477                 case 6:                 // 1st exponent digit expected
2478                     if (!isdigit(c))
2479                         error("exponent expected");
2480                     dblstate++;
2481                     break;
2482
2483                 case 8:                 // past end of exponent digits
2484                     goto done;
2485             }
2486             break;
2487         }
2488         stringbuffer.writeByte(c);
2489     }
2490 done:
2491     p--;
2492
2493     stringbuffer.writeByte(0);
2494
2495 #if _WIN32 && __DMC__
2496     char *save = __locale_decpoint;
2497     __locale_decpoint = ".";
2498 #endif
2499 #ifdef IN_GCC
2500     t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2501 #else
2502     t->float80value = strtold((char *)stringbuffer.data, NULL);
2503 #endif
2504     errno = 0;
2505     switch (*p)
2506     {
2507         case 'F':
2508         case 'f':
2509 #ifdef IN_GCC
2510             real_t::parse((char *)stringbuffer.data, real_t::Float);
2511 #else
2512             strtof((char *)stringbuffer.data, NULL);
2513 #endif
2514             result = TOKfloat32v;
2515             p++;
2516             break;
2517
2518         default:
2519 #ifdef IN_GCC
2520             real_t::parse((char *)stringbuffer.data, real_t::Double);
2521 #else
2522             strtod((char *)stringbuffer.data, NULL);
2523 #endif
2524             result = TOKfloat64v;
2525             break;
2526
2527         case 'l':
2528             if (!global.params.useDeprecated)
2529                 error("'l' suffix is deprecated, use 'L' instead");
2530         case 'L':
2531             result = TOKfloat80v;
2532             p++;
2533             break;
2534     }
2535     if (*p == 'i' || *p == 'I')
2536     {
2537         if (!global.params.useDeprecated && *p == 'I')
2538             error("'I' suffix is deprecated, use 'i' instead");
2539         p++;
2540         switch (result)
2541         {
2542             case TOKfloat32v:
2543                 result = TOKimaginary32v;
2544                 break;
2545             case TOKfloat64v:
2546                 result = TOKimaginary64v;
2547                 break;
2548             case TOKfloat80v:
2549                 result = TOKimaginary80v;
2550                 break;
2551         }
2552     }
2553 #if _WIN32 && __DMC__
2554     __locale_decpoint = save;
2555 #endif
2556     if (errno == ERANGE)
2557         error("number is not representable");
2558     return result;
2559 }
2560
2561 /*********************************************
2562  * Do pragma.
2563  * Currently, the only pragma supported is:
2564  *      #line linnum [filespec]
2565  */
2566
2567 void Lexer::pragma()
2568 {
2569     Token tok;
2570     int linnum;
2571     char *filespec = NULL;
2572     Loc loc = this->loc;
2573
2574     while (isblank(*p)) p++;
2575     if (*p == '\n')
2576         goto Lerr;
2577
2578     scan(&tok);
2579     if (tok.value != TOKidentifier || tok.ident != Id::line)
2580         goto Lerr;
2581
2582     scan(&tok);
2583     if (tok.value == TOKint32v || tok.value == TOKint64v)
2584         linnum = tok.uns64value - 1;
2585     else
2586         goto Lerr;
2587
2588     while (1)
2589     {
2590         switch (*p)
2591         {
2592             case 0:
2593             case 0x1A:
2594             case '\n':
2595             Lnewline:
2596                 this->loc.linnum = linnum;
2597                 if (filespec)
2598                     this->loc.filename = filespec;
2599                 return;
2600
2601             case '\r':
2602                 p++;
2603                 if (*p != '\n')
2604                 {   p--;
2605                     goto Lnewline;
2606                 }
2607                 continue;
2608
2609             case ' ':
2610             case '\t':
2611             case '\v':
2612             case '\f':
2613                 p++;
2614                 continue;                       // skip white space
2615
2616             case '_':
2617                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2618                 {
2619                     p += 8;
2620                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2621                 }
2622                 continue;
2623
2624             case '"':
2625                 if (filespec)
2626                     goto Lerr;
2627                 stringbuffer.reset();
2628                 p++;
2629                 while (1)
2630                 {   unsigned c;
2631
2632                     c = *p;
2633                     switch (c)
2634                     {
2635                         case '\n':
2636                         case '\r':
2637                         case 0:
2638                         case 0x1A:
2639                             goto Lerr;
2640
2641                         case '"':
2642                             stringbuffer.writeByte(0);
2643                             filespec = mem.strdup((char *)stringbuffer.data);
2644                             p++;
2645                             break;
2646
2647                         default:
2648                             if (c & 0x80)
2649                             {   unsigned u = decodeUTF();
2650                                 if (u == PS || u == LS)
2651                                     goto Lerr;
2652                             }
2653                             stringbuffer.writeByte(c);
2654                             p++;
2655                             continue;
2656                     }
2657                     break;
2658                 }
2659                 continue;
2660
2661             default:
2662                 if (*p & 0x80)
2663                 {   unsigned u = decodeUTF();
2664                     if (u == PS || u == LS)
2665                         goto Lnewline;
2666                 }
2667                 goto Lerr;
2668         }
2669     }
2670
2671 Lerr:
2672     error(loc, "#line integer [\"filespec\"]\\n expected");
2673 }
2674
2675
2676 /********************************************
2677  * Decode UTF character.
2678  * Issue error messages for invalid sequences.
2679  * Return decoded character, advance p to last character in UTF sequence.
2680  */
2681
2682 unsigned Lexer::decodeUTF()
2683 {
2684     dchar_t u;
2685     unsigned char c;
2686     unsigned char *s = p;
2687     size_t len;
2688     size_t idx;
2689     char *msg;
2690
2691     c = *s;
2692     assert(c & 0x80);
2693
2694     // Check length of remaining string up to 6 UTF-8 characters
2695     for (len = 1; len < 6 && s[len]; len++)
2696         ;
2697
2698     idx = 0;
2699     msg = utf_decodeChar(s, len, &idx, &u);
2700     p += idx - 1;
2701     if (msg)
2702     {
2703         error("%s", msg);
2704     }
2705     return u;
2706 }
2707
2708
2709 /***************************************************
2710  * Parse doc comment embedded between t->ptr and p.
2711  * Remove trailing blanks and tabs from lines.
2712  * Replace all newlines with \n.
2713  * Remove leading comment character from each line.
2714  * Decide if it's a lineComment or a blockComment.
2715  * Append to previous one for this token.
2716  */
2717
2718 void Lexer::getDocComment(Token *t, unsigned lineComment)
2719 {
2720     OutBuffer buf;
2721     unsigned char ct = t->ptr[2];
2722     unsigned char *q = t->ptr + 3;      // start of comment text
2723     int linestart = 0;
2724
2725     unsigned char *qend = p;
2726     if (ct == '*' || ct == '+')
2727         qend -= 2;
2728
2729     /* Scan over initial row of ****'s or ++++'s or ////'s
2730      */
2731     for (; q < qend; q++)
2732     {
2733         if (*q != ct)
2734             break;
2735     }
2736
2737     /* Remove trailing row of ****'s or ++++'s
2738      */
2739     if (ct != '/')
2740     {
2741         for (; q < qend; qend--)
2742         {
2743             if (qend[-1] != ct)
2744                 break;
2745         }
2746     }
2747
2748     for (; q < qend; q++)
2749     {
2750         unsigned char c = *q;
2751
2752         switch (c)
2753         {
2754             case '*':
2755             case '+':
2756                 if (linestart && c == ct)
2757                 {   linestart = 0;
2758                     /* Trim preceding whitespace up to preceding \n
2759                      */
2760                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2761                         buf.offset--;
2762                     continue;
2763                 }
2764                 break;
2765
2766             case ' ':
2767             case '\t':
2768                 break;
2769
2770             case '\r':
2771                 if (q[1] == '\n')
2772                     continue;           // skip the \r
2773                 goto Lnewline;
2774
2775             default:
2776                 if (c == 226)
2777                 {
2778                     // If LS or PS
2779                     if (q[1] == 128 &&
2780                         (q[2] == 168 || q[2] == 169))
2781                     {
2782                         q += 2;
2783                         goto Lnewline;
2784                     }
2785                 }
2786                 linestart = 0;
2787                 break;
2788
2789             Lnewline:
2790                 c = '\n';               // replace all newlines with \n
2791             case '\n':
2792                 linestart = 1;
2793
2794                 /* Trim trailing whitespace
2795                  */
2796                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2797                     buf.offset--;
2798
2799                 break;
2800         }
2801         buf.writeByte(c);
2802     }
2803
2804     // Always end with a newline
2805     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2806         buf.writeByte('\n');
2807
2808     buf.writeByte(0);
2809
2810     // It's a line comment if the start of the doc comment comes
2811     // after other non-whitespace on the same line.
2812     unsigned char** dc = (lineComment && anyToken)
2813                          ? &t->lineComment
2814                          : &t->blockComment;
2815
2816     // Combine with previous doc comment, if any
2817     if (*dc)
2818         *dc = combineComments(*dc, (unsigned char *)buf.data);
2819     else
2820         *dc = (unsigned char *)buf.extractData();
2821 }
2822
2823 /********************************************
2824  * Combine two document comments into one.
2825  */
2826
2827 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2828 {
2829     unsigned char *c = c2;
2830
2831     if (c1)
2832     {   c = c1;
2833         if (c2)
2834         {   size_t len1 = strlen((char *)c1);
2835             size_t len2 = strlen((char *)c2);
2836
2837             c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2838             memcpy(c, c1, len1);
2839             c[len1] = '\n';
2840             memcpy(c + len1 + 1, c2, len2);
2841             c[len1 + 1 + len2] = 0;
2842         }
2843     }
2844     return c;
2845 }
2846
2847 /********************************************
2848  * Create an identifier in the string table.
2849  */
2850
2851 Identifier *Lexer::idPool(const char *s)
2852 {
2853     size_t len = strlen(s);
2854     StringValue *sv = stringtable.update(s, len);
2855     Identifier *id = (Identifier *) sv->ptrvalue;
2856     if (!id)
2857     {
2858         id = new Identifier(sv->lstring.string, TOKidentifier);
2859         sv->ptrvalue = id;
2860     }
2861     return id;
2862 }
2863
2864 /*********************************************
2865  * Create a unique identifier using the prefix s.
2866  */
2867
2868 Identifier *Lexer::uniqueId(const char *s, int num)
2869 {   char buffer[32];
2870     size_t slen = strlen(s);
2871
2872     assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2873     sprintf(buffer, "%s%d", s, num);
2874     return idPool(buffer);
2875 }
2876
2877 Identifier *Lexer::uniqueId(const char *s)
2878 {
2879     static int num;
2880     return uniqueId(s, ++num);
2881 }
2882
2883 /****************************************
2884  */
2885
2886 struct Keyword
2887 {   char *name;
2888     enum TOK value;
2889 };
2890
2891 static Keyword keywords[] =
2892 {
2893 //    { "",             TOK     },
2894
2895     {   "this",         TOKthis         },
2896     {   "super",        TOKsuper        },
2897     {   "assert",       TOKassert       },
2898     {   "null",         TOKnull         },
2899     {   "true",         TOKtrue         },
2900     {   "false",        TOKfalse        },
2901     {   "cast",         TOKcast         },
2902     {   "new",          TOKnew          },
2903     {   "delete",       TOKdelete       },
2904     {   "throw",        TOKthrow        },
2905     {   "module",       TOKmodule       },
2906     {   "pragma",       TOKpragma       },
2907     {   "typeof",       TOKtypeof       },
2908     {   "typeid",       TOKtypeid       },
2909
2910     {   "template",     TOKtemplate     },
2911
2912     {   "void",         TOKvoid         },
2913     {   "byte",         TOKint8         },
2914     {   "ubyte",        TOKuns8         },
2915     {   "short",        TOKint16        },
2916     {   "ushort",       TOKuns16        },
2917     {   "int",          TOKint32        },
2918     {   "uint",         TOKuns32        },
2919     {   "long",         TOKint64        },
2920     {   "ulong",        TOKuns64        },
2921     {   "cent",         TOKcent,        },
2922     {   "ucent",        TOKucent,       },
2923     {   "float",        TOKfloat32      },
2924     {   "double",       TOKfloat64      },
2925     {   "real",         TOKfloat80      },
2926
2927     {   "bool",         TOKbool         },
2928     {   "char",         TOKchar         },
2929     {   "wchar",        TOKwchar        },
2930     {   "dchar",        TOKdchar        },
2931
2932     {   "ifloat",       TOKimaginary32  },
2933     {   "idouble",      TOKimaginary64  },
2934     {   "ireal",        TOKimaginary80  },
2935
2936     {   "cfloat",       TOKcomplex32    },
2937     {   "cdouble",      TOKcomplex64    },
2938     {   "creal",        TOKcomplex80    },
2939
2940     {   "delegate",     TOKdelegate     },
2941     {   "function",     TOKfunction     },
2942
2943     {   "is",           TOKis           },
2944     {   "if",           TOKif           },
2945     {   "else",         TOKelse         },
2946     {   "while",        TOKwhile        },
2947     {   "for",          TOKfor          },
2948     {   "do",           TOKdo           },
2949     {   "switch",       TOKswitch       },
2950     {   "case",         TOKcase         },
2951     {   "default",      TOKdefault      },
2952     {   "break",        TOKbreak        },
2953     {   "continue",     TOKcontinue     },
2954     {   "synchronized", TOKsynchronized },
2955     {   "return",       TOKreturn       },
2956     {   "goto",         TOKgoto         },
2957     {   "try",          TOKtry          },
2958     {   "catch",        TOKcatch        },
2959     {   "finally",      TOKfinally      },
2960     {   "with",         TOKwith         },
2961     {   "asm",          TOKasm          },
2962     {   "foreach",      TOKforeach      },
2963     {   "foreach_reverse",      TOKforeach_reverse      },
2964     {   "reversed",     TOKreversed     },
2965     {   "scope",        TOKscope        },
2966
2967     {   "struct",       TOKstruct       },
2968     {   "class",        TOKclass        },
2969     {   "interface",    TOKinterface    },
2970     {   "union",        TOKunion        },
2971     {   "enum",         TOKenum         },
2972     {   "import",       TOKimport       },
2973     {   "mixin",        TOKmixin        },
2974     {   "static",       TOKstatic       },
2975     {   "final",        TOKfinal        },
2976     {   "const",        TOKconst        },
2977     {   "typedef",      TOKtypedef      },
2978     {   "alias",        TOKalias        },
2979     {   "override",     TOKoverride     },
2980     {   "abstract",     TOKabstract     },
2981     {   "volatile",     TOKvolatile     },
2982     {   "debug",        TOKdebug        },
2983     {   "deprecated",   TOKdeprecated   },
2984     {   "in",           TOKin           },
2985     {   "out",          TOKout          },
2986     {   "inout",        TOKinout        },
2987     {   "lazy",         TOKlazy         },
2988     {   "auto",         TOKauto         },
2989
2990     {   "align",        TOKalign        },
2991     {   "extern",       TOKextern       },
2992     {   "private",      TOKprivate      },
2993     {   "package",      TOKpackage      },
2994     {   "protected",    TOKprotected    },
2995     {   "public",       TOKpublic       },
2996     {   "export",       TOKexport       },
2997
2998     {   "body",         TOKbody         },
2999     {   "invariant",    TOKinvariant    },
3000     {   "unittest",     TOKunittest     },
3001     {   "version",      TOKversion      },
3002     //{ "manifest",     TOKmanifest     },
3003
3004     // Added after 1.0
3005     {   "ref",          TOKref          },
3006     {   "macro",        TOKmacro        },
3007
3008
3009     // TAL
3010     {   "and",          TOKandand       },
3011     {   "or",           TOKoror         },
3012     {   "not",          TOKnot          },
3013     {   "extends",      TOKextends      },
3014     {   "implements",   TOKimplements   },
3015     {   "log_error",    TOKlog_error    },
3016     {   "log_warning",  TOKlog_warning  },
3017     {   "log_info",     TOKlog_info     },
3018     {   "log_trace",    TOKlog_trace    },
3019 #if V2
3020     {   "pure",         TOKpure         },
3021     {   "nothrow",      TOKnothrow      },
3022     {   "__traits",     TOKtraits       },
3023     {   "__overloadset", TOKoverloadset },
3024 #endif
3025 };
3026
3027 int Token::isKeyword()
3028 {
3029     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3030     {
3031         if (keywords[u].value == value)
3032             return 1;
3033     }
3034     return 0;
3035 }
3036
3037 void Lexer::initKeywords()
3038 {   StringValue *sv;
3039     unsigned u;
3040     enum TOK v;
3041     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3042
3043     if (global.params.Dversion == 1)
3044         nkeywords -= 2;
3045
3046     cmtable_init();
3047
3048     for (u = 0; u < nkeywords; u++)
3049     {   char *s;
3050
3051         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3052         s = keywords[u].name;
3053         v = keywords[u].value;
3054         sv = stringtable.insert(s, strlen(s));
3055         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3056
3057         //printf("tochars[%d] = '%s'\n",v, s);
3058         Token::tochars[v] = s;
3059     }
3060
3061     Token::tochars[TOKeof]              = "EOF";
3062     Token::tochars[TOKlcurly]           = "{";
3063     Token::tochars[TOKrcurly]           = "}";
3064     Token::tochars[TOKlparen]           = "(";
3065     Token::tochars[TOKrparen]           = ")";
3066     Token::tochars[TOKlbracket]         = "[";
3067     Token::tochars[TOKrbracket]         = "]";
3068     Token::tochars[TOKsemicolon]        = ";";
3069     Token::tochars[TOKcolon]            = ":";
3070     Token::tochars[TOKcomma]            = ",";
3071     Token::tochars[TOKdot]              = ".";
3072     Token::tochars[TOKxor]              = "^";
3073     Token::tochars[TOKxorass]           = "^=";
3074     Token::tochars[TOKassign]           = "=";
3075     Token::tochars[TOKconstruct]        = "=";
3076 #if V2
3077     Token::tochars[TOKblit]             = "=";
3078 #endif
3079     Token::tochars[TOKlt]               = "<";
3080     Token::tochars[TOKgt]               = ">";
3081     Token::tochars[TOKle]               = "<=";
3082     Token::tochars[TOKge]               = ">=";
3083     Token::tochars[TOKequal]            = "==";
3084     Token::tochars[TOKnotequal]         = "!=";
3085     Token::tochars[TOKnotidentity]      = "!is";
3086     Token::tochars[TOKtobool]           = "!!";
3087     Token::tochars[TOKat]               = "@";
3088
3089     Token::tochars[TOKunord]            = "!<>=";
3090     Token::tochars[TOKue]               = "!<>";
3091     Token::tochars[TOKlg]               = "<>";
3092     Token::tochars[TOKleg]              = "<>=";
3093     Token::tochars[TOKule]              = "!>";
3094     Token::tochars[TOKul]               = "!>=";
3095     Token::tochars[TOKuge]              = "!<";
3096     Token::tochars[TOKug]               = "!<=";
3097
3098     Token::tochars[TOKnot]              = "!";
3099     Token::tochars[TOKtobool]           = "!!";
3100     Token::tochars[TOKshl]              = "<<";
3101     Token::tochars[TOKshr]              = ">>";
3102     Token::tochars[TOKushr]             = ">>>";
3103     Token::tochars[TOKadd]              = "+";
3104     Token::tochars[TOKmin]              = "-";
3105     Token::tochars[TOKmul]              = "*";
3106     Token::tochars[TOKdiv]              = "/";
3107     Token::tochars[TOKmod]              = "%";
3108     Token::tochars[TOKslice]            = "..";
3109     Token::tochars[TOKdotdotdot]        = "...";
3110     Token::tochars[TOKand]              = "&";
3111     Token::tochars[TOKandand]           = "&&";
3112     Token::tochars[TOKor]               = "|";
3113     Token::tochars[TOKoror]             = "||";
3114     Token::tochars[TOKarray]            = "[]";
3115     Token::tochars[TOKindex]            = "[i]";
3116     Token::tochars[TOKaddress]          = "&";
3117     Token::tochars[TOKstar]             = "*";
3118     Token::tochars[TOKtilde]            = "~";
3119     Token::tochars[TOKdollar]           = "$";
3120     Token::tochars[TOKcast]             = "cast";
3121     Token::tochars[TOKplusplus]         = "++";
3122     Token::tochars[TOKminusminus]       = "--";
3123     Token::tochars[TOKtype]             = "type";
3124     Token::tochars[TOKquestion]         = "?";
3125     Token::tochars[TOKneg]              = "-";
3126     Token::tochars[TOKuadd]             = "+";
3127     Token::tochars[TOKvar]              = "var";
3128     Token::tochars[TOKaddass]           = "+=";
3129     Token::tochars[TOKminass]           = "-=";
3130     Token::tochars[TOKmulass]           = "*=";
3131     Token::tochars[TOKdivass]           = "/=";
3132     Token::tochars[TOKmodass]           = "%=";
3133     Token::tochars[TOKshlass]           = "<<=";
3134     Token::tochars[TOKshrass]           = ">>=";
3135     Token::tochars[TOKushrass]          = ">>>=";
3136     Token::tochars[TOKandass]           = "&=";
3137     Token::tochars[TOKorass]            = "|=";
3138     Token::tochars[TOKcatass]           = "~=";
3139     Token::tochars[TOKcat]              = "~";
3140     Token::tochars[TOKcall]             = "call";
3141     Token::tochars[TOKidentity]         = "is";
3142     Token::tochars[TOKnotidentity]      = "!is";
3143     Token::tochars[TOKendline]          = "\\n";
3144
3145     Token::tochars[TOKorass]            = "|=";
3146     Token::tochars[TOKidentifier]       = "identifier";
3147
3148      // For debugging
3149     Token::tochars[TOKdotexp]           = "dotexp";
3150     Token::tochars[TOKdotti]            = "dotti";
3151     Token::tochars[TOKdotvar]           = "dotvar";
3152     Token::tochars[TOKdottype]          = "dottype";
3153     Token::tochars[TOKsymoff]           = "symoff";
3154     Token::tochars[TOKtypedot]          = "typedot";
3155     Token::tochars[TOKarraylength]      = "arraylength";
3156     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3157     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3158     Token::tochars[TOKstructliteral]    = "structliteral";
3159     Token::tochars[TOKstring]           = "string";
3160     Token::tochars[TOKdsymbol]          = "symbol";
3161     Token::tochars[TOKtuple]            = "tuple";
3162     Token::tochars[TOKdeclaration]      = "declaration";
3163     Token::tochars[TOKdottd]            = "dottd";
3164     Token::tochars[TOKlogger]           = "logger";
3165     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3166 }