dmd2/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  65 /********************************************
  66  * Do our own char maps
  67  */
  68
  69 static unsigned char cmtable[256];
  70
  71 const int CMoctal =     0x1;
  72 const int CMhex =       0x2;
  73 const int CMidchar =    0x4;
  74
  75 inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
  76 inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
  77 inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
  78
  79 static void cmtable_init()
  80 {
  81     for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++)
  82     {
  83         if ('0' <= c && c <= '7')
  84             cmtable[c] |= CMoctal;
  85         if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
  86             cmtable[c] |= CMhex;
  87         if (isalnum(c) || c == '_')
  88             cmtable[c] |= CMidchar;
  89     }
  90 }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 110 #ifdef DEBUG
 111 void Token::print()
 112 {
 113     fprintf(stdmsg, "%s\n", toChars());
 114 }
 115 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken)
 268     : loc(mod, 1)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     //initKeywords();
 281
 282     /* If first line starts with '#!', ignore the line
 283      */
 284
 285     if (p[0] == '#' && p[1] =='!')
 286     {
 287         p += 2;
 288         while (1)
 289         {   unsigned char c = *p;
 290             switch (c)
 291             {
 292                 case '\n':
 293                     p++;
 294                     break;
 295
 296                 case '\r':
 297                     p++;
 298                     if (*p == '\n')
 299                         p++;
 300                     break;
 301
 302                 case 0:
 303                 case 0x1A:
 304                     break;
 305
 306                 default:
 307                     if (c & 0x80)
 308                     {   unsigned u = decodeUTF();
 309                         if (u == PS || u == LS)
 310                             break;
 311                     }
 312                     p++;
 313                     continue;
 314             }
 315             break;
 316         }
 317         loc.linnum = 2;
 318     }
 319 }
 320
 321
 322 void Lexer::error(const char *format, ...)
 323 {
 324     if (mod && !global.gag)
 325     {
 326         char *p = loc.toChars();
 327         if (*p)
 328             fprintf(stdmsg, "%s: ", p);
 329         mem.free(p);
 330
 331         va_list ap;
 332         va_start(ap, format);
 333         vfprintf(stdmsg, format, ap);
 334         va_end(ap);
 335
 336         fprintf(stdmsg, "\n");
 337         fflush(stdmsg);
 338
 339         if (global.errors >= 20)        // moderate blizzard of cascading messages
 340             fatal();
 341     }
 342     global.errors++;
 343 }
 344
 345 void Lexer::error(Loc loc, const char *format, ...)
 346 {
 347     if (mod && !global.gag)
 348     {
 349         char *p = loc.toChars();
 350         if (*p)
 351             fprintf(stdmsg, "%s: ", p);
 352         mem.free(p);
 353
 354         va_list ap;
 355         va_start(ap, format);
 356         vfprintf(stdmsg, format, ap);
 357         va_end(ap);
 358
 359         fprintf(stdmsg, "\n");
 360         fflush(stdmsg);
 361
 362         if (global.errors >= 20)        // moderate blizzard of cascading messages
 363             fatal();
 364     }
 365     global.errors++;
 366 }
 367
 368 TOK Lexer::nextToken()
 369 {   Token *t;
 370
 371     if (token.next)
 372     {
 373         t = token.next;
 374         memcpy(&token,t,sizeof(Token));
 375         t->next = freelist;
 376         freelist = t;
 377     }
 378     else
 379     {
 380         scan(&token);
 381     }
 382     //token.print();
 383     return token.value;
 384 }
 385
 386 Token *Lexer::peek(Token *ct)
 387 {   Token *t;
 388
 389     if (ct->next)
 390         t = ct->next;
 391     else
 392     {
 393         t = new Token();
 394         scan(t);
 395         t->next = NULL;
 396         ct->next = t;
 397     }
 398     return t;
 399 }
 400
 401 /*********************************
 402  * tk is on the opening (.
 403  * Look ahead and return token that is past the closing ).
 404  */
 405
 406 Token *Lexer::peekPastParen(Token *tk)
 407 {
 408     //printf("peekPastParen()\n");
 409     int parens = 1;
 410     int curlynest = 0;
 411     while (1)
 412     {
 413         tk = peek(tk);
 414         //tk->print();
 415         switch (tk->value)
 416         {
 417             case TOKlparen:
 418                 parens++;
 419                 continue;
 420
 421             case TOKrparen:
 422                 --parens;
 423                 if (parens)
 424                     continue;
 425                 tk = peek(tk);
 426                 break;
 427
 428             case TOKlcurly:
 429                 curlynest++;
 430                 continue;
 431
 432             case TOKrcurly:
 433                 if (--curlynest >= 0)
 434                     continue;
 435                 break;
 436
 437             case TOKsemicolon:
 438                 if (curlynest)
 439                     continue;
 440                 break;
 441
 442             case TOKeof:
 443                 break;
 444
 445             default:
 446                 continue;
 447         }
 448         return tk;
 449     }
 450 }
 451
 452 /**********************************
 453  * Determine if string is a valid Identifier.
 454  * Placed here because of commonality with Lexer functionality.
 455  * Returns:
 456  *      0       invalid
 457  */
 458
 459 int Lexer::isValidIdentifier(char *p)
 460 {
 461     size_t len;
 462     size_t idx;
 463
 464     if (!p || !*p)
 465         goto Linvalid;
 466
 467     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 468         goto Linvalid;
 469
 470     len = strlen(p);
 471     idx = 0;
 472     while (p[idx])
 473     {   dchar_t dc;
 474
 475         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 476         if (q)
 477             goto Linvalid;
 478
 479         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 480             goto Linvalid;
 481     }
 482     return 1;
 483
 484 Linvalid:
 485     return 0;
 486 }
 487
 488 /****************************
 489  * Turn next token in buffer into a token.
 490  */
 491
 492 void Lexer::scan(Token *t)
 493 {
 494     unsigned lastLine = loc.linnum;
 495     unsigned linnum;
 496
 497     t->blockComment = NULL;
 498     t->lineComment = NULL;
 499     while (1)
 500     {
 501         t->ptr = p;
 502         //printf("p = %p, *p = '%c'\n",p,*p);
 503         switch (*p)
 504         {
 505             case 0:
 506             case 0x1A:
 507                 t->value = TOKeof;                      // end of file
 508                 return;
 509
 510             case ' ':
 511             case '\t':
 512             case '\v':
 513             case '\f':
 514                 p++;
 515                 continue;                       // skip white space
 516
 517             case '\r':
 518                 p++;
 519                 if (*p != '\n')                 // if CR stands by itself
 520                     loc.linnum++;
 521                 continue;                       // skip white space
 522
 523             case '\n':
 524                 p++;
 525                 loc.linnum++;
 526                 continue;                       // skip white space
 527
 528             case '0':   case '1':   case '2':   case '3':   case '4':
 529             case '5':   case '6':   case '7':   case '8':   case '9':
 530                 t->value = number(t);
 531                 return;
 532
 533 #if CSTRINGS
 534             case '\'':
 535                 t->value = charConstant(t, 0);
 536                 return;
 537
 538             case '"':
 539                 t->value = stringConstant(t,0);
 540                 return;
 541
 542             case 'l':
 543             case 'L':
 544                 if (p[1] == '\'')
 545                 {
 546                     p++;
 547                     t->value = charConstant(t, 1);
 548                     return;
 549                 }
 550                 else if (p[1] == '"')
 551                 {
 552                     p++;
 553                     t->value = stringConstant(t, 1);
 554                     return;
 555                 }
 556 #else
 557             case '\'':
 558                 t->value = charConstant(t,0);
 559                 return;
 560
 561             case 'r':
 562                 if (p[1] != '"')
 563                     goto case_ident;
 564                 p++;
 565             case '`':
 566                 t->value = wysiwygStringConstant(t, *p);
 567                 return;
 568
 569             case 'x':
 570                 if (p[1] != '"')
 571                     goto case_ident;
 572                 p++;
 573                 t->value = hexStringConstant(t);
 574                 return;
 575
 576 #if V2
 577             case 'q':
 578                 if (p[1] == '"')
 579                 {
 580                     p++;
 581                     t->value = delimitedStringConstant(t);
 582                     return;
 583                 }
 584                 else if (p[1] == '{')
 585                 {
 586                     p++;
 587                     t->value = tokenStringConstant(t);
 588                     return;
 589                 }
 590                 else
 591                     goto case_ident;
 592 #endif
 593
 594             case '"':
 595                 t->value = escapeStringConstant(t,0);
 596                 return;
 597
 598             case '\\':                  // escaped string literal
 599             {   unsigned c;
 600
 601                 stringbuffer.reset();
 602                 do
 603                 {
 604                     p++;
 605                     switch (*p)
 606                     {
 607                         case 'u':
 608                         case 'U':
 609                         case '&':
 610                             c = escapeSequence();
 611                             stringbuffer.writeUTF8(c);
 612                             break;
 613
 614                         default:
 615                             c = escapeSequence();
 616                             stringbuffer.writeByte(c);
 617                             break;
 618                     }
 619                 } while (*p == '\\');
 620                 t->len = stringbuffer.offset;
 621                 stringbuffer.writeByte(0);
 622                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 623                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 624                 t->postfix = 0;
 625                 t->value = TOKstring;
 626                 return;
 627             }
 628
 629             case 'l':
 630             case 'L':
 631 #endif
 632             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 633             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 634             case 'k':               case 'm':   case 'n':   case 'o':
 635 #if V2
 636             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 637 #else
 638             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 639 #endif
 640             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 641             case 'z':
 642             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 643             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 644             case 'K':               case 'M':   case 'N':   case 'O':
 645             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 646             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 647             case 'Z':
 648             case '_':
 649             case_ident:
 650             {   unsigned char c;
 651                 StringValue *sv;
 652                 Identifier *id;
 653
 654                 do
 655                 {
 656                     c = *++p;
 657                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 658                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 659                 id = (Identifier *) sv->ptrvalue;
 660                 if (!id)
 661                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 662                     sv->ptrvalue = id;
 663                 }
 664                 t->ident = id;
 665                 t->value = (enum TOK) id->value;
 666                 anyToken = 1;
 667                 if (*t->ptr == '_')     // if special identifier token
 668                 {
 669                     static char date[11+1];
 670                     static char time[8+1];
 671                     static char timestamp[24+1];
 672
 673                     if (!date[0])       // lazy evaluation
 674                     {   time_t t;
 675                         char *p;
 676
 677                         ::time(&t);
 678                         p = ctime(&t);
 679                         assert(p);
 680                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 681                         sprintf(time, "%.8s", p + 11);
 682                         sprintf(timestamp, "%.24s", p);
 683                     }
 684
 685 #if !V2
 686                     if (mod && id == Id::FILE)
 687                     {
 688                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 689                         goto Lstring;
 690                     }
 691                     else if (mod && id == Id::LINE)
 692                     {
 693                         t->value = TOKint64v;
 694                         t->uns64value = loc.linnum;
 695                     }
 696                     else
 697 #endif
 698                     if (id == Id::DATE)
 699                     {
 700                         t->ustring = (unsigned char *)date;
 701                         goto Lstring;
 702                     }
 703                     else if (id == Id::TIME)
 704                     {
 705                         t->ustring = (unsigned char *)time;
 706                         goto Lstring;
 707                     }
 708                     else if (id == Id::VENDOR)
 709                     {
 710 #ifdef IN_GCC
 711                         t->ustring = (unsigned char *)"GDC";
 712 #else
 713                         t->ustring = (unsigned char *)"Digital Mars D";
 714 #endif
 715                         goto Lstring;
 716                     }
 717                     else if (id == Id::TIMESTAMP)
 718                     {
 719                         t->ustring = (unsigned char *)timestamp;
 720                      Lstring:
 721                         t->value = TOKstring;
 722                      Llen:
 723                         t->postfix = 0;
 724                         t->len = strlen((char *)t->ustring);
 725                     }
 726                     else if (id == Id::VERSIONX)
 727                     {   unsigned major = 0;
 728                         unsigned minor = 0;
 729
 730                         for (char *p = global.version + 1; 1; p++)
 731                         {
 732                             char c = *p;
 733                             if (isdigit(c))
 734                                 minor = minor * 10 + c - '0';
 735                             else if (c == '.')
 736                             {   major = minor;
 737                                 minor = 0;
 738                             }
 739                             else
 740                                 break;
 741                         }
 742                         t->value = TOKint64v;
 743                         t->uns64value = major * 1000 + minor;
 744                     }
 745 #if V2
 746                     else if (id == Id::EOFX)
 747                     {
 748                         t->value = TOKeof;
 749                         // Advance scanner to end of file
 750                         while (!(*p == 0 || *p == 0x1A))
 751                             p++;
 752                     }
 753 #endif
 754                 }
 755                 //printf("t->value = %d\n",t->value);
 756                 return;
 757             }
 758
 759             case '/':
 760                 p++;
 761                 switch (*p)
 762                 {
 763                     case '=':
 764                         p++;
 765                         t->value = TOKdivass;
 766                         return;
 767
 768                     case '*':
 769                         p++;
 770                         linnum = loc.linnum;
 771                         while (1)
 772                         {
 773                             while (1)
 774                             {   unsigned char c = *p;
 775                                 switch (c)
 776                                 {
 777                                     case '/':
 778                                         break;
 779
 780                                     case '\n':
 781                                         loc.linnum++;
 782                                         p++;
 783                                         continue;
 784
 785                                     case '\r':
 786                                         p++;
 787                                         if (*p != '\n')
 788                                             loc.linnum++;
 789                                         continue;
 790
 791                                     case 0:
 792                                     case 0x1A:
 793                                         error("unterminated /* */ comment");
 794                                         p = end;
 795                                         t->value = TOKeof;
 796                                         return;
 797
 798                                     default:
 799                                         if (c & 0x80)
 800                                         {   unsigned u = decodeUTF();
 801                                             if (u == PS || u == LS)
 802                                                 loc.linnum++;
 803                                         }
 804                                         p++;
 805                                         continue;
 806                                 }
 807                                 break;
 808                             }
 809                             p++;
 810                             if (p[-2] == '*' && p - 3 != t->ptr)
 811                                 break;
 812                         }
 813                         if (commentToken)
 814                         {
 815                             t->value = TOKcomment;
 816                             return;
 817                         }
 818                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 819                         {   // if /** but not /**/
 820                             getDocComment(t, lastLine == linnum);
 821                         }
 822                         continue;
 823
 824                     case '/':           // do // style comments
 825                         linnum = loc.linnum;
 826                         while (1)
 827                         {   unsigned char c = *++p;
 828                             switch (c)
 829                             {
 830                                 case '\n':
 831                                     break;
 832
 833                                 case '\r':
 834                                     if (p[1] == '\n')
 835                                         p++;
 836                                     break;
 837
 838                                 case 0:
 839                                 case 0x1A:
 840                                     if (commentToken)
 841                                     {
 842                                         p = end;
 843                                         t->value = TOKcomment;
 844                                         return;
 845                                     }
 846                                     if (doDocComment && t->ptr[2] == '/')
 847                                         getDocComment(t, lastLine == linnum);
 848                                     p = end;
 849                                     t->value = TOKeof;
 850                                     return;
 851
 852                                 default:
 853                                     if (c & 0x80)
 854                                     {   unsigned u = decodeUTF();
 855                                         if (u == PS || u == LS)
 856                                             break;
 857                                     }
 858                                     continue;
 859                             }
 860                             break;
 861                         }
 862
 863                         if (commentToken)
 864                         {
 865                             p++;
 866                             loc.linnum++;
 867                             t->value = TOKcomment;
 868                             return;
 869                         }
 870                         if (doDocComment && t->ptr[2] == '/')
 871                             getDocComment(t, lastLine == linnum);
 872
 873                         p++;
 874                         loc.linnum++;
 875                         continue;
 876
 877                     case '+':
 878                     {   int nest;
 879
 880                         linnum = loc.linnum;
 881                         p++;
 882                         nest = 1;
 883                         while (1)
 884                         {   unsigned char c = *p;
 885                             switch (c)
 886                             {
 887                                 case '/':
 888                                     p++;
 889                                     if (*p == '+')
 890                                     {
 891                                         p++;
 892                                         nest++;
 893                                     }
 894                                     continue;
 895
 896                                 case '+':
 897                                     p++;
 898                                     if (*p == '/')
 899                                     {
 900                                         p++;
 901                                         if (--nest == 0)
 902                                             break;
 903                                     }
 904                                     continue;
 905
 906                                 case '\r':
 907                                     p++;
 908                                     if (*p != '\n')
 909                                         loc.linnum++;
 910                                     continue;
 911
 912                                 case '\n':
 913                                     loc.linnum++;
 914                                     p++;
 915                                     continue;
 916
 917                                 case 0:
 918                                 case 0x1A:
 919                                     error("unterminated /+ +/ comment");
 920                                     p = end;
 921                                     t->value = TOKeof;
 922                                     return;
 923
 924                                 default:
 925                                     if (c & 0x80)
 926                                     {   unsigned u = decodeUTF();
 927                                         if (u == PS || u == LS)
 928                                             loc.linnum++;
 929                                     }
 930                                     p++;
 931                                     continue;
 932                             }
 933                             break;
 934                         }
 935                         if (commentToken)
 936                         {
 937                             t->value = TOKcomment;
 938                             return;
 939                         }
 940                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
 941                         {   // if /++ but not /++/
 942                             getDocComment(t, lastLine == linnum);
 943                         }
 944                         continue;
 945                     }
 946                 }
 947                 t->value = TOKdiv;
 948                 return;
 949
 950             case '.':
 951                 p++;
 952                 if (isdigit(*p))
 953                 {   /* Note that we don't allow ._1 and ._ as being
 954                      * valid floating point numbers.
 955                      */
 956                     p--;
 957                     t->value = inreal(t);
 958                 }
 959                 else if (p[0] == '.')
 960                 {
 961                     if (p[1] == '.')
 962                     {   p += 2;
 963                         t->value = TOKdotdotdot;
 964                     }
 965                     else
 966                     {   p++;
 967                         t->value = TOKslice;
 968                     }
 969                 }
 970                 else
 971                     t->value = TOKdot;
 972                 return;
 973
 974             case '&':
 975                 p++;
 976                 if (*p == '=')
 977                 {   p++;
 978                     t->value = TOKandass;
 979                 }
 980                 else if (*p == '&')
 981                 {   p++;
 982                     t->value = TOKandand;
 983                 }
 984                 else
 985                     t->value = TOKand;
 986                 return;
 987
 988             case '|':
 989                 p++;
 990                 if (*p == '=')
 991                 {   p++;
 992                     t->value = TOKorass;
 993                 }
 994                 else if (*p == '|')
 995                 {   p++;
 996                     t->value = TOKoror;
 997                 }
 998                 else
 999                     t->value = TOKor;
1000                 return;
1001
1002             case '-':
1003                 p++;
1004                 if (*p == '=')
1005                 {   p++;
1006                     t->value = TOKminass;
1007                 }
1008 #if 0
1009                 else if (*p == '>')
1010                 {   p++;
1011                     t->value = TOKarrow;
1012                 }
1013 #endif
1014                 else if (*p == '-')
1015                 {   p++;
1016                     t->value = TOKminusminus;
1017                 }
1018                 else
1019                     t->value = TOKmin;
1020                 return;
1021
1022             case '+':
1023                 p++;
1024                 if (*p == '=')
1025                 {   p++;
1026                     t->value = TOKaddass;
1027                 }
1028                 else if (*p == '+')
1029                 {   p++;
1030                     t->value = TOKplusplus;
1031                 }
1032                 else
1033                     t->value = TOKadd;
1034                 return;
1035
1036             case '<':
1037                 p++;
1038                 if (*p == '=')
1039                 {   p++;
1040                     t->value = TOKle;                   // <=
1041                 }
1042                 else if (*p == '<')
1043                 {   p++;
1044                     if (*p == '=')
1045                     {   p++;
1046                         t->value = TOKshlass;           // <<=
1047                     }
1048                     else
1049                         t->value = TOKshl;              // <<
1050                 }
1051                 else if (*p == '>')
1052                 {   p++;
1053                     if (*p == '=')
1054                     {   p++;
1055                         t->value = TOKleg;              // <>=
1056                     }
1057                     else
1058                         t->value = TOKlg;               // <>
1059                 }
1060                 else
1061                     t->value = TOKlt;                   // <
1062                 return;
1063
1064             case '>':
1065                 p++;
1066                 if (*p == '=')
1067                 {   p++;
1068                     t->value = TOKge;                   // >=
1069                 }
1070                 else if (*p == '>')
1071                 {   p++;
1072                     if (*p == '=')
1073                     {   p++;
1074                         t->value = TOKshrass;           // >>=
1075                     }
1076                     else if (*p == '>')
1077                     {   p++;
1078                         if (*p == '=')
1079                         {   p++;
1080                             t->value = TOKushrass;      // >>>=
1081                         }
1082                         else
1083                             t->value = TOKushr;         // >>>
1084                     }
1085                     else
1086                         t->value = TOKshr;              // >>
1087                 }
1088                 else
1089                     t->value = TOKgt;                   // >
1090                 return;
1091
1092             case '!':
1093                 p++;
1094                 if (*p == '=')
1095                 {   p++;
1096                     if (*p == '=' && global.params.Dversion == 1)
1097                     {   p++;
1098                         t->value = TOKnotidentity;      // !==
1099                     }
1100                     else
1101                         t->value = TOKnotequal;         // !=
1102                 }
1103                 else if (*p == '<')
1104                 {   p++;
1105                     if (*p == '>')
1106                     {   p++;
1107                         if (*p == '=')
1108                         {   p++;
1109                             t->value = TOKunord; // !<>=
1110                         }
1111                         else
1112                             t->value = TOKue;   // !<>
1113                     }
1114                     else if (*p == '=')
1115                     {   p++;
1116                         t->value = TOKug;       // !<=
1117                     }
1118                     else
1119                         t->value = TOKuge;      // !<
1120                 }
1121                 else if (*p == '>')
1122                 {   p++;
1123                     if (*p == '=')
1124                     {   p++;
1125                         t->value = TOKul;       // !>=
1126                     }
1127                     else
1128                         t->value = TOKule;      // !>
1129                 }
1130                 else
1131                     t->value = TOKnot;          // !
1132                 return;
1133
1134             case '=':
1135                 p++;
1136                 if (*p == '=')
1137                 {   p++;
1138                     if (*p == '=' && global.params.Dversion == 1)
1139                     {   p++;
1140                         t->value = TOKidentity;         // ===
1141                     }
1142                     else
1143                         t->value = TOKequal;            // ==
1144                 }
1145                 else
1146                     t->value = TOKassign;               // =
1147                 return;
1148
1149             case '~':
1150                 p++;
1151                 if (*p == '=')
1152                 {   p++;
1153                     t->value = TOKcatass;               // ~=
1154                 }
1155                 else
1156                     t->value = TOKtilde;                // ~
1157                 return;
1158
1159 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1160
1161             SINGLE('(', TOKlparen)
1162             SINGLE(')', TOKrparen)
1163             SINGLE('[', TOKlbracket)
1164             SINGLE(']', TOKrbracket)
1165             SINGLE('{', TOKlcurly)
1166             SINGLE('}', TOKrcurly)
1167             SINGLE('?', TOKquestion)
1168             SINGLE(',', TOKcomma)
1169             SINGLE(';', TOKsemicolon)
1170             SINGLE(':', TOKcolon)
1171             SINGLE('$', TOKdollar)
1172
1173 #undef SINGLE
1174
1175 #define DOUBLE(c1,tok1,c2,tok2)         \
1176             case c1:                    \
1177                 p++;                    \
1178                 if (*p == c2)           \
1179                 {   p++;                \
1180                     t->value = tok2;    \
1181                 }                       \
1182                 else                    \
1183                     t->value = tok1;    \
1184                 return;
1185
1186             DOUBLE('*', TOKmul, '=', TOKmulass)
1187             DOUBLE('%', TOKmod, '=', TOKmodass)
1188             DOUBLE('^', TOKxor, '=', TOKxorass)
1189
1190 #undef DOUBLE
1191
1192             case '#':
1193                 p++;
1194                 pragma();
1195                 continue;
1196
1197             default:
1198             {   unsigned char c = *p;
1199
1200                 if (c & 0x80)
1201                 {   unsigned u = decodeUTF();
1202
1203                     // Check for start of unicode identifier
1204                     if (isUniAlpha(u))
1205                         goto case_ident;
1206
1207                     if (u == PS || u == LS)
1208                     {
1209                         loc.linnum++;
1210                         p++;
1211                         continue;
1212                     }
1213                 }
1214                 if (isprint(c))
1215                     error("unsupported char '%c'", c);
1216                 else
1217                     error("unsupported char 0x%02x", c);
1218                 p++;
1219                 continue;
1220             }
1221         }
1222     }
1223 }
1224
1225 /*******************************************
1226  * Parse escape sequence.
      *
      * Input:
      *      p points at the character that follows the backslash
      * Output:
      *      p is advanced past the characters consumed as part of the
      *      escape; it is left where it was at end of file and for an
      *      unrecognized escape character
      * Returns:
      *      the value of the escape:
      *      - the character itself for \' \" \? \\
      *      - the control code for \a \b \f \n \r \t \v
      *      - the accumulated value for \x (2 digits), \u (4 digits),
      *        \U (8 digits) and for 1 to 3 octal digits
      *      - the value from HtmlNamedEntity() for \&name; or ' ' when the
      *        name is not known
      *      - '\\' at end of file
1227  */
1228
1229 unsigned Lexer::escapeSequence()
1230 {   unsigned c;
1231     int n;
1232     int ndigits;
1233
1234     c = *p;
1235     switch (c)
1236     {
1237         case '\'':
1238         case '"':
1239         case '?':
1240         case '\\':
1241         Lconsume:
1242                 p++;
1243                 break;
1244
1245         case 'a':       c = 7;          goto Lconsume;
1246         case 'b':       c = 8;          goto Lconsume;
1247         case 'f':       c = 12;         goto Lconsume;
1248         case 'n':       c = 10;         goto Lconsume;
1249         case 'r':       c = 13;         goto Lconsume;
1250         case 't':       c = 9;          goto Lconsume;
1251         case 'v':       c = 11;         goto Lconsume;
1252
1253         case 'u':
1254                 ndigits = 4;
1255                 goto Lhex;
1256         case 'U':
1257                 ndigits = 8;
1258                 goto Lhex;
1259         case 'x':
1260                 ndigits = 2;
1261         Lhex:
1262                 p++;
1263                 c = *p;
1264                 if (ishex(c))
1265                 {   unsigned v;
1266
1267                     n = 0;
1268                     v = 0;
1269                     while (1)
1270                     {
1271                         if (isdigit(c))
1272                             c -= '0';
1273                         else if (islower(c))
1274                             c -= 'a' - 10;
1275                         else
1276                             c -= 'A' - 10;
1277                         v = v * 16 + c;
1278                         c = *++p;
1279                         if (++n == ndigits)
1280                             break;
1281                         if (!ishex(c))
1282                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1283                             break;
1284                         }
1285                     }
                          // \x yields a raw byte; only \u and \U are checked
                          // for being a valid code point.
1286                     if (ndigits != 2 && !utf_isValidDchar(v))
1287                         error("invalid UTF character \\U%08x", v);
1288                     c = v;
1289                 }
1290                 else
1291                     error("undefined escape hex sequence \\%c\n",c);
1292                 break;
1293
1294         case '&':                       // named character entity
1295                 for (unsigned char *idstart = ++p; 1; p++)
1296                 {
1297                     switch (*p)
1298                     {
1299                         case ';':
1300                             c = HtmlNamedEntity(idstart, p - idstart);
1301                             if (c == ~0)
1302                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1303                                 c = ' ';
1304                             }
1305                             p++;
1306                             break;
1307
1308                         default:
                                  // An entity name is made of letters, with
                                  // digits allowed anywhere except in the
                                  // first position (was: idstart + 1, which
                                  // accepted a leading digit and rejected a
                                  // digit in the second position).
1309                             if (isalpha(*p) ||
1310                                 (p != idstart && isdigit(*p)))
1311                                 continue;
1312                             error("unterminated named entity");
1313                             break;
1314                     }
1315                     break;
1316                 }
1317                 break;
1318
1319         case 0:
1320         case 0x1A:                      // end of file
1321                 c = '\\';
1322                 break;
1323
1324         default:
1325                 if (isoctal(c))
1326                 {   unsigned v;
1327
1328                     n = 0;
1329                     v = 0;
                          // At most three octal digits are consumed.
1330                     do
1331                     {
1332                         v = v * 8 + (c - '0');
1333                         c = *++p;
1334                     } while (++n < 3 && isoctal(c));
1335                     c = v;
1336                     if (c > 0xFF)
1337                         error("0%03o is larger than a byte", c);
1338                 }
1339                 else
1340                     error("undefined escape sequence \\%c\n",c);
1341                 break;
1342     }
1343     return c;
1344 }
1345
1346 /**************************************
      * Lex a string literal whose contents are taken verbatim: no escape
      * sequences are interpreted. A "\r\n" pair and a lone "\r" are each
      * stored as a single '\n'; non-ASCII characters are decoded and
      * written back UTF-8 encoded.
      * Input:
      *      t       token that receives the string, its length and postfix
      *      tc      the quote character ('"' or '`') that ends the literal;
      *              the other quote character is copied like ordinary text
      * Returns:
      *      TOKstring, also when the literal is unterminated (an error is
      *      reported and the token holds an empty string)
1347  */
1348
1349 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1350 {   unsigned c;
1351     Loc start = loc;
1352
1353     p++;
1354     stringbuffer.reset();
1355     while (1)
1356     {
1357         c = *p++;
1358         switch (c)
1359         {
1360             case '\n':
1361                 loc.linnum++;
1362                 break;
1363
1364             case '\r':
1365                 if (*p == '\n')
1366                     continue;   // ignore
1367                 c = '\n';       // treat EndOfLine as \n character
1368                 loc.linnum++;
1369                 break;
1370
1371             case 0:
1372             case 0x1A:
                      // Step back onto the end-of-file character so that the
                      // next scan sees it instead of reading past the end of
                      // the input (escapeStringConstant does the same).
                      p--;
1373                 error("unterminated string constant starting at %s", start.toChars());
1374                 t->ustring = (unsigned char *)"";
1375                 t->len = 0;
1376                 t->postfix = 0;
1377                 return TOKstring;
1378
1379             case '"':
1380             case '`':
1381                 if (c == tc)
1382                 {
                          // len excludes the terminating 0 that is appended
                          // to the copy handed to the token.
1383                     t->len = stringbuffer.offset;
1384                     stringbuffer.writeByte(0);
1385                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1386                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1387                     stringPostfix(t);
1388                     return TOKstring;
1389                 }
1390                 break;
1391
1392             default:
1393                 if (c & 0x80)
1394                 {   p--;
1395                     unsigned u = decodeUTF();
1396                     p++;
1397                     if (u == PS || u == LS)
1398                         loc.linnum++;
1399                     stringbuffer.writeUTF8(u);
1400                     continue;
1401                 }
1402                 break;
1403         }
1404         stringbuffer.writeByte(c);
1405     }
1406 }
1407
1408 /**************************************
1409  * Lex hex strings:
1410  *      x"0A ae 34FE BD"
      * Each pair of hex digits becomes one byte of the string. Blanks,
      * tabs, vertical tabs, form feeds and line ends between the digits are
      * skipped.
      * Input:
      *      t       token that receives the bytes, their count and the postfix
      * Returns:
      *      TOKstring, also when the literal is unterminated (an error is
      *      reported and the token holds an empty string)
1411  */
1412
1413 TOK Lexer::hexStringConstant(Token *t)
1414 {   unsigned c;
1415     Loc start = loc;
1416     unsigned n = 0;                 // number of hex digits seen so far
1417     unsigned v;                     // high digit, then the completed byte
1418
1419     p++;
1420     stringbuffer.reset();
1421     while (1)
1422     {
1423         c = *p++;
1424         switch (c)
1425         {
1426             case ' ':
1427             case '\t':
1428             case '\v':
1429             case '\f':
1430                 continue;                       // skip white space
1431
1432             case '\r':
1433                 if (*p == '\n')
1434                     continue;                   // ignore
1435                 // Treat isolated '\r' as if it were a '\n'
1436             case '\n':
1437                 loc.linnum++;
1438                 continue;
1439
1440             case 0:
1441             case 0x1A:
                      // Step back onto the end-of-file character so that the
                      // next scan sees it instead of reading past the end of
                      // the input (escapeStringConstant does the same).
                      p--;
1442                 error("unterminated string constant starting at %s", start.toChars());
1443                 t->ustring = (unsigned char *)"";
1444                 t->len = 0;
1445                 t->postfix = 0;
1446                 return TOKstring;
1447
1448             case '"':
1449                 if (n & 1)
1450                 {   error("odd number (%d) of hex characters in hex string", n);
1451                     stringbuffer.writeByte(v);  // v holds the unpaired digit
1452                 }
1453                 t->len = stringbuffer.offset;
1454                 stringbuffer.writeByte(0);
1455                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1456                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1457                 stringPostfix(t);
1458                 return TOKstring;
1459
1460             default:
1461                 if (c >= '0' && c <= '9')
1462                     c -= '0';
1463                 else if (c >= 'a' && c <= 'f')
1464                     c -= 'a' - 10;
1465                 else if (c >= 'A' && c <= 'F')
1466                     c -= 'A' - 10;
1467                 else if (c & 0x80)
1468                 {   p--;
1469                     unsigned u = decodeUTF();
1470                     p++;
1471                     if (u == PS || u == LS)
                          {   // A Unicode line or paragraph separator is an
                              // end of line like '\n': count it and skip it.
                              // It used to fall through and be merged into
                              // the data as if it were a hex digit.
1472                         loc.linnum++;
                              continue;
                          }
1473                     else
1474                         error("non-hex character \\u%x", u);
1475                 }
1476                 else
1477                     error("non-hex character '%c'", c);
                      // Even digits (0, 2, ...) start a byte, odd digits
                      // complete it.
1478                 if (n & 1)
1479                 {   v = (v << 4) | c;
1480                     stringbuffer.writeByte(v);
1481                 }
1482                 else
1483                     v = c;
1484                 n++;
1485                 break;
1486         }
1487     }
1488 }
1489
1490
1491 #if V2
1492 /**************************************
1493  * Lex delimited strings:
1494  *      q"(foo(xxx))"   // "foo(xxx)"
1495  *      q"[foo(]"       // "foo("
1496  *      q"/foo]/"       // "foo]"
1497  *      q"HERE
1498  *      foo
1499  *      HERE"           // "foo\n"
      * The first character after the opening quote selects the form:
      * ( { [ < nest with their closing counterpart, an identifier start
      * begins a heredoc, any other character is its own closing delimiter.
1500  * Input:
1501  *      p is on the "
      *      t receives the string contents, their length and the postfix
      * Returns:
      *      TOKstring, also when the literal is unterminated (an error is
      *      reported and the token holds an empty string)
1502  */
1503
1504 TOK Lexer::delimitedStringConstant(Token *t)
1505 {   unsigned c;
1506     Loc start = loc;
1507     unsigned delimleft = 0;
1508     unsigned delimright = 0;
1509     unsigned nest = 1;
1510     unsigned nestcount;
1511     Identifier *hereid = NULL;
1512     unsigned blankrol = 0;          // rest of the heredoc opening line must be blank
1513     unsigned startline = 0;         // next character is the first one of a line
          unsigned char *pchar;           // first byte of the character held in c
1514
1515     p++;
1516     stringbuffer.reset();
1517     while (1)
1518     {
              pchar = p;
1519         c = *p++;
1520         //printf("c = '%c'\n", c);
1521         switch (c)
1522         {
1523             case '\n':
1524             Lnextline:
1525                 loc.linnum++;
1526                 startline = 1;
1527                 if (blankrol)
1528                 {   blankrol = 0;
1529                     continue;
1530                 }
1531                 if (hereid)
1532                 {
1533                     stringbuffer.writeUTF8(c);
1534                     continue;
1535                 }
1536                 break;
1537
1538             case '\r':
1539                 if (*p == '\n')
1540                     continue;   // ignore
1541                 c = '\n';       // treat EndOfLine as \n character
1542                 goto Lnextline;
1543
1544             case 0:
1545             case 0x1A:
                      // Step back onto the end-of-file character so that the
                      // next scan sees it instead of reading past the end of
                      // the input.
                      p--;
1546                 goto Lerror;
1547
1548             default:
1549                 if (c & 0x80)
1550                 {   p--;
1551                     c = decodeUTF();
1552                     p++;
1553                     if (c == PS || c == LS)
1554                         goto Lnextline;
1555                 }
1556                 break;
1557         }
1558         if (delimleft == 0)
1559         {   delimleft = c;
1560             nest = 1;
1561             nestcount = 1;
1562             if (c == '(')
1563                 delimright = ')';
1564             else if (c == '{')
1565                 delimright = '}';
1566             else if (c == '[')
1567                 delimright = ']';
1568             else if (c == '<')
1569                 delimright = '>';
                  // c may be a decoded code point; isalpha() is only defined
                  // for values that fit in an unsigned char.
1570             else if ((c <= 0xFF && isalpha(c)) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1571             {   // Start of identifier; must be a heredoc
1572                 Token tok;
                      // Rewind to the first byte of the character; it may be
                      // a multi-byte sequence, so p-- is not enough.
1573                 p = pchar;
1574                 scan(&tok);             // read in heredoc identifier
1575                 if (tok.value != TOKidentifier)
1576                 {   error("identifier expected for heredoc, not %s", tok.toChars());
1577                     delimright = c;
1578                 }
1579                 else
1580                 {   hereid = tok.ident;
1581                     //printf("hereid = '%s'\n", hereid->toChars());
1582                     blankrol = 1;
1583                 }
1584                 nest = 0;
1585             }
1586             else
1587             {   delimright = c;
1588                 nest = 0;
1589             }
1590         }
1591         else
1592         {
1593             if (blankrol)
1594             {   error("heredoc rest of line should be blank");
1595                 blankrol = 0;
1596                 continue;
1597             }
1598             if (nest == 1)
1599             {
1600                 if (c == delimleft)
1601                     nestcount++;
1602                 else if (c == delimright)
1603                 {   nestcount--;
1604                     if (nestcount == 0)
1605                         goto Ldone;
1606                 }
1607             }
1608             else if (c == delimright)
1609                 goto Ldone;
                  // Only a heredoc can be closed by an identifier, and the
                  // closing identifier may start with any character that was
                  // accepted as an identifier start when it was opened.
1610             if (startline && hereid &&
                      ((c <= 0xFF && isalpha(c)) || c == '_' || (c >= 0x80 && isUniAlpha(c))))
1611             {   Token tok;
1612                 unsigned char *psave = p;
1613                 p = pchar;
1614                 scan(&tok);             // read in possible heredoc identifier
1615                 //printf("endid = '%s'\n", tok.ident->toChars());
1616                 if (tok.value == TOKidentifier && tok.ident->equals(hereid))
1617                 {   /* should check that rest of line is blank
1618                      */
1619                     goto Ldone;
1620                 }
1621                 p = psave;          // not the closing identifier: plain text
1622             }
1623             stringbuffer.writeUTF8(c);
1624             startline = 0;
1625         }
1626     }
1627
1628 Ldone:
1629     if (*p == '"')
1630         p++;
          else if (hereid)            // a heredoc has no closing delimiter character
              error("delimited string must end in %s\"", hereid->toChars());
1631     else
1632         error("delimited string must end in %c\"", delimright);
1633     t->len = stringbuffer.offset;
1634     stringbuffer.writeByte(0);
1635     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1636     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1637     stringPostfix(t);
1638     return TOKstring;
1639
1640 Lerror:
1641     error("unterminated string constant starting at %s", start.toChars());
1642     t->ustring = (unsigned char *)"";
1643     t->len = 0;
1644     t->postfix = 0;
1645     return TOKstring;
1646 }
1647
1648 /**************************************
1649  * Lex token strings:
1650  *      q{ foo(xxx) } // " foo(xxx) "
1651  *      q{foo(}       // "foo("
1652  *      q{{foo}"}"}   // "{foo}"}""
      * The contents are run through scan() token by token; only the nesting
      * of curly brace tokens is tracked to find the closing brace.
1653  * Input:
1654  *      p is on the q
      *      NOTE(review): the code starts the text at p + 1 with a nesting
      *      depth of 1, i.e. it treats the character at p as the opening
      *      curly brace -- confirm what the caller leaves p on.
      *      t receives the raw source text between the braces, its length
      *      and the postfix
      * Returns:
      *      TOKstring, also when end of file is reached first (an error is
      *      reported and the token holds an empty string)
1655  */
1656
1657 TOK Lexer::tokenStringConstant(Token *t)
1658 {
1659     unsigned nest = 1;                  // depth of open curly braces
1660     Loc start = loc;
1661     unsigned char *pstart = ++p;        // first byte of the string text
1662
1663     while (1)
1664     {   Token tok;
1665
1666         scan(&tok);
1667         switch (tok.value)
1668         {
1669             case TOKlcurly:
1670                 nest++;
1671                 continue;
1672
1673             case TOKrcurly:
1674                 if (--nest == 0)
1675                     goto Ldone;
1676                 continue;
1677
1678             case TOKeof:
1679                 goto Lerror;
1680
1681             default:
1682                 continue;
1683         }
1684     }
1685
1686 Ldone:
          // scan() has stepped past the closing curly brace, so the text
          // ends one byte before p.
1687     t->len = p - 1 - pstart;
1688     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1689     memcpy(t->ustring, pstart, t->len);
1690     t->ustring[t->len] = 0;
1691     stringPostfix(t);
1692     return TOKstring;
1693
1694 Lerror:
1695     error("unterminated token string constant starting at %s", start.toChars());
1696     t->ustring = (unsigned char *)"";
1697     t->len = 0;
1698     t->postfix = 0;
1699     return TOKstring;
1700 }
1701
1702 #endif
1703
1704
1705 /**************************************
      * Lex a double quoted string literal in which backslash escape
      * sequences are interpreted. A "\r\n" pair and a lone "\r" are each
      * stored as a single '\n'.
      * Input:
      *      t       token that receives the string, its length and postfix
      *      wide    not referenced in the body
      * Returns:
      *      TOKstring, also when the literal is unterminated (an error is
      *      reported and the token holds an empty string)
1706  */
1707
1708 TOK Lexer::escapeStringConstant(Token *t, int wide)
1709 {   unsigned c;
1710     Loc start = loc;
1711
1712     p++;
1713     stringbuffer.reset();
1714     while (1)
1715     {
1716         c = *p++;
1717         switch (c)
1718         {
1719             case '\\':
1720                 switch (*p)
1721                 {
                          // These escapes are written UTF-8 encoded ...
1722                     case 'u':
1723                     case 'U':
1724                     case '&':
1725                         c = escapeSequence();
1726                         stringbuffer.writeUTF8(c);
1727                         continue;
1728
                          // ... every other escape is written as one byte.
1729                     default:
1730                         c = escapeSequence();
1731                         break;
1732                 }
1733                 break;
1734
1735             case '\n':
1736                 loc.linnum++;
1737                 break;
1738
1739             case '\r':
1740                 if (*p == '\n')
1741                     continue;   // ignore
1742                 c = '\n';       // treat EndOfLine as \n character
1743                 loc.linnum++;
1744                 break;
1745
1746             case '"':
                      // len excludes the terminating 0 that is appended to
                      // the copy handed to the token.
1747                 t->len = stringbuffer.offset;
1748                 stringbuffer.writeByte(0);
1749                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1750                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1751                 stringPostfix(t);
1752                 return TOKstring;
1753
1754             case 0:
1755             case 0x1A:
1756                 p--;            // leave p on the end-of-file character
1757                 error("unterminated string constant starting at %s", start.toChars());
1758                 t->ustring = (unsigned char *)"";
1759                 t->len = 0;
1760                 t->postfix = 0;
1761                 return TOKstring;
1762
1763             default:
1764                 if (c & 0x80)
1765                 {
1766                     p--;
1767                     c = decodeUTF();
                          // Unicode line and paragraph separators are stored
                          // as '\n' and counted as a line end.
1768                     if (c == LS || c == PS)
1769                     {   c = '\n';
1770                         loc.linnum++;
1771                     }
1772                     p++;
1773                     stringbuffer.writeUTF8(c);
1774                     continue;
1775                 }
1776                 break;
1777         }
1778         stringbuffer.writeByte(c);
1779     }
1780 }
1781
1782 /**************************************
      * Lex a character literal.
      * Input:
      *      t       token whose uns64value receives the character value
      *      wide    not referenced in the body
      * Returns:
      *      TOKwcharv for a \u escape, TOKdcharv for a \U or \& escape,
      *      TOKwcharv or TOKdcharv for a non-ASCII character depending on
      *      its code point, TOKcharv otherwise. An error is reported when
      *      the literal is empty, broken by a line end or end of file, or
      *      not closed by a single quote.
1783  */
1784
1785 TOK Lexer::charConstant(Token *t, int wide)
1786 {
1787     unsigned c;
1788     TOK tk = TOKcharv;
1789
1790     //printf("Lexer::charConstant\n");
1791     p++;
1792     c = *p++;
1793     switch (c)
1794     {
1795         case '\\':
1796             switch (*p)
1797             {
1798                 case 'u':
1799                     t->uns64value = escapeSequence();
1800                     tk = TOKwcharv;
1801                     break;
1802
1803                 case 'U':
1804                 case '&':
1805                     t->uns64value = escapeSequence();
1806                     tk = TOKdcharv;
1807                     break;
1808
1809                 default:
1810                     t->uns64value = escapeSequence();
1811                     break;
1812             }
1813             break;
1814
1815         case '\n':
1816         L1:
1817             loc.linnum++;
                  goto Lunterminated;
1818         case '\r':
                  goto Lunterminated;
1819         case 0:
1820         case 0x1A:
                  // Step back onto the end-of-file character so that the
                  // next scan sees it instead of reading past the end of
                  // the input.
                  p--;
1821         case '\'':
              Lunterminated:
1822             error("unterminated character constant");
                  t->uns64value = '?';    // give the token a defined value
1823             return tk;
1824
1825         default:
1826             if (c & 0x80)
1827             {
1828                 p--;
1829                 c = decodeUTF();
1830                 p++;
1831                 if (c == LS || c == PS)
1832                     goto L1;
                      // Code points outside the surrogate range and below
                      // 0xFFFE are typed as wchar, all others as dchar.
1833                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1834                     tk = TOKwcharv;
1835                 else
1836                     tk = TOKdcharv;
1837             }
1838             t->uns64value = c;
1839             break;
1840     }
1841
1842     if (*p != '\'')
1843     {   error("unterminated character constant");
1844         return tk;
1845     }
1846     p++;
1847     return tk;
1848 }
1849
1850 /***************************************
1851  * Get postfix of string literal.
      * Input:
      *      p is on the character that follows the string literal
      *      t       token whose postfix member is set
      * Output:
      *      t->postfix is 'c', 'w' or 'd' when that character follows the
      *      literal (p is then stepped past it), otherwise 0 and p is left
      *      unchanged
1852  */
1853
1854 void Lexer::stringPostfix(Token *t)
1855 {
1856     switch (*p)
1857     {
1858         case 'c':
1859         case 'w':
1860         case 'd':
1861             t->postfix = *p;
1862             p++;
1863             break;
1864
1865         default:
1866             t->postfix = 0;
1867             break;
1868     }
1869 }
1870
1871 /***************************************
1872  * Read \u or \U unicode sequence
      * This function is compiled out by the #if 0 below.
1873  * Input:
1874  *      u       'u' or 'U'
      * Returns:
      *      the value of the hex digits read: 4 are expected after 'u',
      *      8 after 'U'. An error is reported at the first character that
      *      is not a hex digit and the value collected so far is returned.
1875  */
1876
1877 #if 0
1878 unsigned Lexer::wchar(unsigned u)
1879 {
1880     unsigned value;
1881     unsigned n;
1882     unsigned char c;
1883     unsigned nchars;
1884
1885     nchars = (u == 'U') ? 8 : 4;
1886     value = 0;
1887     for (n = 0; 1; n++)
1888     {
1889         ++p;                // step before reading, and once more after the last digit
1890         if (n == nchars)
1891             break;
1892         c = *p;
1893         if (!ishex(c))
1894         {   error("\\%c sequence must be followed by %d hex characters", u, nchars);
1895             break;
1896         }
1897         if (isdigit(c))
1898             c -= '0';
1899         else if (islower(c))
1900             c -= 'a' - 10;
1901         else
1902             c -= 'A' - 10;
1903         value <<= 4;
1904         value |= c;
1905     }
1906     return value;
1907 }
1908 #endif
1909
1910 /**************************************
1911  * Read in a number.
1912  * If it's an integer, store it in tok.TKutok.Vlong.
1913  *      integers can be decimal, octal or hex
1914  *      Handle the suffixes U, UL, LU, L, etc.
1915  * If it's double, store it in tok.TKutok.Vdouble.
1916  * Returns:
1917  *      TKnum
1918  *      TKdouble,...
1919  */
1920
1921 TOK Lexer::number(Token *t)
1922 {
1923     // We use a state machine to collect numbers
1924     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1925         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1926         STATE_hexh, STATE_error };
1927     enum STATE state;
1928
1929     enum FLAGS
1930     {   FLAGS_decimal  = 1,             // decimal
1931         FLAGS_unsigned = 2,             // u or U suffix
1932         FLAGS_long     = 4,             // l or L suffix
1933     };
1934     enum FLAGS flags = FLAGS_decimal;
1935
1936     int i;
1937     int base;
1938     unsigned c;
1939     unsigned char *start;
1940     TOK result;
1941
1942     //printf("Lexer::number()\n");
1943     state = STATE_initial;
1944     base = 0;
1945     stringbuffer.reset();
1946     start = p;
1947     while (1)
1948     {
1949         c = *p;
1950         switch (state)
1951         {
1952             case STATE_initial:         // opening state
1953                 if (c == '0')
1954                     state = STATE_0;
1955                 else
1956                     state = STATE_decimal;
1957                 break;
1958
1959             case STATE_0:
1960                 flags = (FLAGS) (flags & ~FLAGS_decimal);
1961                 switch (c)
1962                 {
1963 #if ZEROH
1964                     case 'H':                   // 0h
1965                     case 'h':
1966                         goto hexh;
1967 #endif
1968                     case 'X':
1969                     case 'x':
1970                         state = STATE_hex0;
1971                         break;
1972
1973                     case '.':
1974                         if (p[1] == '.')        // .. is a separate token
1975                             goto done;
1976                     case 'i':
1977                     case 'f':
1978                     case 'F':
1979                         goto real;
1980 #if ZEROH
1981                     case 'E':
1982                     case 'e':
1983                         goto case_hex;
1984 #endif
1985                     case 'B':
1986                     case 'b':
1987                         state = STATE_binary0;
1988                         break;
1989
1990                     case '0': case '1': case '2': case '3':
1991                     case '4': case '5': case '6': case '7':
1992                         state = STATE_octal;
1993                         break;
1994
1995 #if ZEROH
1996                     case '8': case '9': case 'A':
1997                     case 'C': case 'D': case 'F':
1998                     case 'a': case 'c': case 'd': case 'f':
1999                     case_hex:
2000                         state = STATE_hexh;
2001                         break;
2002 #endif
2003                     case '_':
2004                         state = STATE_octal;
2005                         p++;
2006                         continue;
2007
2008                     case 'L':
2009                         if (p[1] == 'i')
2010                             goto real;
2011                         goto done;
2012
2013                     default:
2014                         goto done;
2015                 }
2016                 break;
2017
2018             case STATE_decimal:         // reading decimal number
2019                 if (!isdigit(c))
2020                 {
2021 #if ZEROH
2022                     if (ishex(c)
2023                         || c == 'H' || c == 'h'
2024                        )
2025                         goto hexh;
2026 #endif
2027                     if (c == '_')               // ignore embedded _
2028                     {   p++;
2029                         continue;
2030                     }
2031                     if (c == '.' && p[1] != '.')
2032                         goto real;
2033                     else if (c == 'i' || c == 'f' || c == 'F' ||
2034                              c == 'e' || c == 'E')
2035                     {
2036             real:       // It's a real number. Back up and rescan as a real
2037                         p = start;
2038                         return inreal(t);
2039                     }
2040                     else if (c == 'L' && p[1] == 'i')
2041                         goto real;
2042                     goto done;
2043                 }
2044                 break;
2045
2046             case STATE_hex0:            // reading hex number
2047             case STATE_hex:
2048                 if (!ishex(c))
2049                 {
2050                     if (c == '_')               // ignore embedded _
2051                     {   p++;
2052                         continue;
2053                     }
2054                     if (c == '.' && p[1] != '.')
2055                         goto real;
2056                     if (c == 'P' || c == 'p' || c == 'i')
2057                         goto real;
2058                     if (state == STATE_hex0)
2059                         error("Hex digit expected, not '%c'", c);
2060                     goto done;
2061                 }
2062                 state = STATE_hex;
2063                 break;
2064
2065 #if ZEROH
2066             hexh:
2067                 state = STATE_hexh;
2068             case STATE_hexh:            // parse numbers like 0FFh
2069                 if (!ishex(c))
2070                 {
2071                     if (c == 'H' || c == 'h')
2072                     {
2073                         p++;
2074                         base = 16;
2075                         goto done;
2076                     }
2077                     else
2078                     {
2079                         // Check for something like 1E3 or 0E24
2080                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2081                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2082                             goto real;
2083                         error("Hex digit expected, not '%c'", c);
2084                         goto done;
2085                     }
2086                 }
2087                 break;
2088 #endif
2089
2090             case STATE_octal:           // reading octal number
2091             case STATE_octale:          // reading octal number with non-octal digits
2092                 if (!isoctal(c))
2093                 {
2094 #if ZEROH
2095                     if (ishex(c)
2096                         || c == 'H' || c == 'h'
2097                        )
2098                         goto hexh;
2099 #endif
2100                     if (c == '_')               // ignore embedded _
2101                     {   p++;
2102                         continue;
2103                     }
2104                     if (c == '.' && p[1] != '.')
2105                         goto real;
2106                     if (c == 'i')
2107                         goto real;
2108                     if (isdigit(c))
2109                     {
2110                         state = STATE_octale;
2111                     }
2112                     else
2113                         goto done;
2114                 }
2115                 break;
2116
2117             case STATE_binary0:         // starting binary number
2118             case STATE_binary:          // reading binary number
2119                 if (c != '0' && c != '1')
2120                 {
2121 #if ZEROH
2122                     if (ishex(c)
2123                         || c == 'H' || c == 'h'
2124                        )
2125                         goto hexh;
2126 #endif
2127                     if (c == '_')               // ignore embedded _
2128                     {   p++;
2129                         continue;
2130                     }
2131                     if (state == STATE_binary0)
2132                     {   error("binary digit expected");
2133                         state = STATE_error;
2134                         break;
2135                     }
2136                     else
2137                         goto done;
2138                 }
2139                 state = STATE_binary;
2140                 break;
2141
2142             case STATE_error:           // for error recovery
2143                 if (!isdigit(c))        // scan until non-digit
2144                     goto done;
2145                 break;
2146
2147             default:
2148                 assert(0);
2149         }
2150         stringbuffer.writeByte(c);
2151         p++;
2152     }
2153 done:
2154     stringbuffer.writeByte(0);          // terminate string
2155     if (state == STATE_octale)
2156         error("Octal digit expected");
2157
2158     uinteger_t n;                       // unsigned >=64 bit integer type
2159
2160     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2161         n = stringbuffer.data[0] - '0';
2162     else
2163     {
2164         // Convert string to integer
2165 #if __DMC__
2166         errno = 0;
2167         n = strtoull((char *)stringbuffer.data,NULL,base);
2168         if (errno == ERANGE)
2169             error("integer overflow");
2170 #else
2171         // Not everybody implements strtoull()
2172         char *p = (char *)stringbuffer.data;
2173         int r = 10, d;
2174
2175         if (*p == '0')
2176         {
2177             if (p[1] == 'x' || p[1] == 'X')
2178                 p += 2, r = 16;
2179             else if (p[1] == 'b' || p[1] == 'B')
2180                 p += 2, r = 2;
2181             else if (isdigit(p[1]))
2182                 p += 1, r = 8;
2183         }
2184
2185         n = 0;
2186         while (1)
2187         {
2188             if (*p >= '0' && *p <= '9')
2189                 d = *p - '0';
2190             else if (*p >= 'a' && *p <= 'z')
2191                 d = *p - 'a' + 10;
2192             else if (*p >= 'A' && *p <= 'Z')
2193                 d = *p - 'A' + 10;
2194             else
2195                 break;
2196             if (d >= r)
2197                 break;
2198             if (n && n * r + d <= n)
2199             {
2200                 error ("integer overflow");
2201                 break;
2202             }
2203
2204             n = n * r + d;
2205             p++;
2206         }
2207 #endif
2208         if (sizeof(n) > 8 &&
2209             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2210             error("integer overflow");
2211     }
2212
2213     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2214     while (1)
2215     {   unsigned char f;
2216
2217         switch (*p)
2218         {   case 'U':
2219             case 'u':
2220                 f = FLAGS_unsigned;
2221                 goto L1;
2222
2223             case 'l':
2224                 if (1 || !global.params.useDeprecated)
2225                     error("'l' suffix is deprecated, use 'L' instead");
2226             case 'L':
2227                 f = FLAGS_long;
2228             L1:
2229                 p++;
2230                 if (flags & f)
2231                     error("unrecognized token");
2232                 flags = (FLAGS) (flags | f);
2233                 continue;
2234             default:
2235                 break;
2236         }
2237         break;
2238     }
2239
2240     switch (flags)
2241     {
2242         case 0:
2243             /* Octal or Hexadecimal constant.
2244              * First that fits: int, uint, long, ulong
2245              */
2246             if (n & 0x8000000000000000LL)
2247                     result = TOKuns64v;
2248             else if (n & 0xFFFFFFFF00000000LL)
2249                     result = TOKint64v;
2250             else if (n & 0x80000000)
2251                     result = TOKuns32v;
2252             else
2253                     result = TOKint32v;
2254             break;
2255
2256         case FLAGS_decimal:
2257             /* First that fits: int, long, long long
2258              */
2259             if (n & 0x8000000000000000LL)
2260             {       error("signed integer overflow");
2261                     result = TOKuns64v;
2262             }
2263             else if (n & 0xFFFFFFFF80000000LL)
2264                     result = TOKint64v;
2265             else
2266                     result = TOKint32v;
2267             break;
2268
2269         case FLAGS_unsigned:
2270         case FLAGS_decimal | FLAGS_unsigned:
2271             /* First that fits: uint, ulong
2272              */
2273             if (n & 0xFFFFFFFF00000000LL)
2274                     result = TOKuns64v;
2275             else
2276                     result = TOKuns32v;
2277             break;
2278
2279         case FLAGS_decimal | FLAGS_long:
2280             if (n & 0x8000000000000000LL)
2281             {       error("signed integer overflow");
2282                     result = TOKuns64v;
2283             }
2284             else
2285                     result = TOKint64v;
2286             break;
2287
2288         case FLAGS_long:
2289             if (n & 0x8000000000000000LL)
2290                     result = TOKuns64v;
2291             else
2292                     result = TOKint64v;
2293             break;
2294
2295         case FLAGS_unsigned | FLAGS_long:
2296         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2297             result = TOKuns64v;
2298             break;
2299
2300         default:
2301             #ifdef DEBUG
2302                 printf("%x\n",flags);
2303             #endif
2304             assert(0);
2305     }
2306     t->uns64value = n;
2307     return result;
2308 }
2309
2310 /**************************************
2311  * Read in characters, converting them to real.
2312  * Bugs:
2313  *      Exponent overflow not detected.
2314  *      Too much requested precision is not detected.
2315  */
2316
2317 TOK Lexer::inreal(Token *t)
2318 #ifdef __DMC__
2319 __in
2320 {
2321     assert(*p == '.' || isdigit(*p));
2322 }
2323 __out (result)
2324 {
2325     switch (result)
2326     {
2327         case TOKfloat32v:
2328         case TOKfloat64v:
2329         case TOKfloat80v:
2330         case TOKimaginary32v:
2331         case TOKimaginary64v:
2332         case TOKimaginary80v:
2333             break;
2334
2335         default:
2336             assert(0);
2337     }
2338 }
2339 __body
2340 #endif /* __DMC__ */
2341 {   int dblstate;
2342     unsigned c;
2343     char hex;                   // is this a hexadecimal-floating-constant?
2344     TOK result;
2345
2346     //printf("Lexer::inreal()\n");
2347     stringbuffer.reset();
2348     dblstate = 0;
2349     hex = 0;
2350 Lnext:
2351     while (1)
2352     {
2353         // Get next char from input
2354         c = *p++;
2355         //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2356         while (1)
2357         {
2358             switch (dblstate)
2359             {
2360                 case 0:                 // opening state
2361                     if (c == '0')
2362                         dblstate = 9;
2363                     else if (c == '.')
2364                         dblstate = 3;
2365                     else
2366                         dblstate = 1;
2367                     break;
2368
2369                 case 9:
2370                     dblstate = 1;
2371                     if (c == 'X' || c == 'x')
2372                     {   hex++;
2373                         break;
2374                     }
2375                 case 1:                 // digits to left of .
2376                 case 3:                 // digits to right of .
2377                 case 7:                 // continuing exponent digits
2378                     if (!isdigit(c) && !(hex && isxdigit(c)))
2379                     {
2380                         if (c == '_')
2381                             goto Lnext; // ignore embedded '_'
2382                         dblstate++;
2383                         continue;
2384                     }
2385                     break;
2386
2387                 case 2:                 // no more digits to left of .
2388                     if (c == '.')
2389                     {   dblstate++;
2390                         break;
2391                     }
2392                 case 4:                 // no more digits to right of .
2393                     if ((c == 'E' || c == 'e') ||
2394                         hex && (c == 'P' || c == 'p'))
2395                     {   dblstate = 5;
2396                         hex = 0;        // exponent is always decimal
2397                         break;
2398                     }
2399                     if (hex)
2400                         error("binary-exponent-part required");
2401                     goto done;
2402
2403                 case 5:                 // looking immediately to right of E
2404                     dblstate++;
2405                     if (c == '-' || c == '+')
2406                         break;
2407                 case 6:                 // 1st exponent digit expected
2408                     if (!isdigit(c))
2409                         error("exponent expected");
2410                     dblstate++;
2411                     break;
2412
2413                 case 8:                 // past end of exponent digits
2414                     goto done;
2415             }
2416             break;
2417         }
2418         stringbuffer.writeByte(c);
2419     }
2420 done:
2421     p--;
2422
2423     stringbuffer.writeByte(0);
2424
2425 #if _WIN32 && __DMC__
2426     char *save = __locale_decpoint;
2427     __locale_decpoint = ".";
2428 #endif
2429 #ifdef IN_GCC
2430     t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2431 #else
2432     t->float80value = strtold((char *)stringbuffer.data, NULL);
2433 #endif
2434     errno = 0;
2435     switch (*p)
2436     {
2437         case 'F':
2438         case 'f':
2439 #ifdef IN_GCC
2440             real_t::parse((char *)stringbuffer.data, real_t::Float);
2441 #else
2442             strtof((char *)stringbuffer.data, NULL);
2443 #endif
2444             result = TOKfloat32v;
2445             p++;
2446             break;
2447
2448         default:
2449 #ifdef IN_GCC
2450             real_t::parse((char *)stringbuffer.data, real_t::Double);
2451 #else
2452             strtod((char *)stringbuffer.data, NULL);
2453 #endif
2454             result = TOKfloat64v;
2455             break;
2456
2457         case 'l':
2458             if (!global.params.useDeprecated)
2459                 error("'l' suffix is deprecated, use 'L' instead");
2460         case 'L':
2461             result = TOKfloat80v;
2462             p++;
2463             break;
2464     }
2465     if (*p == 'i' || *p == 'I')
2466     {
2467         if (!global.params.useDeprecated && *p == 'I')
2468             error("'I' suffix is deprecated, use 'i' instead");
2469         p++;
2470         switch (result)
2471         {
2472             case TOKfloat32v:
2473                 result = TOKimaginary32v;
2474                 break;
2475             case TOKfloat64v:
2476                 result = TOKimaginary64v;
2477                 break;
2478             case TOKfloat80v:
2479                 result = TOKimaginary80v;
2480                 break;
2481         }
2482     }
2483 #if _WIN32 && __DMC__
2484     __locale_decpoint = save;
2485 #endif
2486     if (errno == ERANGE)
2487         error("number is not representable");
2488     return result;
2489 }
2490
2491 /*********************************************
2492  * Do pragma.
2493  * Currently, the only pragma supported is:
2494  *      #line linnum [filespec]
2495  */
2496
2497 void Lexer::pragma()
2498 {
2499     Token tok;
2500     int linnum;
2501     char *filespec = NULL;
2502     Loc loc = this->loc;
2503
2504     scan(&tok);
2505     if (tok.value != TOKidentifier || tok.ident != Id::line)
2506         goto Lerr;
2507
2508     scan(&tok);
2509     if (tok.value == TOKint32v || tok.value == TOKint64v)
2510         linnum = tok.uns64value - 1;
2511     else
2512         goto Lerr;
2513
2514     while (1)
2515     {
2516         switch (*p)
2517         {
2518             case 0:
2519             case 0x1A:
2520             case '\n':
2521             Lnewline:
2522                 this->loc.linnum = linnum;
2523                 if (filespec)
2524                     this->loc.filename = filespec;
2525                 return;
2526
2527             case '\r':
2528                 p++;
2529                 if (*p != '\n')
2530                 {   p--;
2531                     goto Lnewline;
2532                 }
2533                 continue;
2534
2535             case ' ':
2536             case '\t':
2537             case '\v':
2538             case '\f':
2539                 p++;
2540                 continue;                       // skip white space
2541
2542             case '_':
2543                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2544                 {
2545                     p += 8;
2546                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2547                 }
2548                 continue;
2549
2550             case '"':
2551                 if (filespec)
2552                     goto Lerr;
2553                 stringbuffer.reset();
2554                 p++;
2555                 while (1)
2556                 {   unsigned c;
2557
2558                     c = *p;
2559                     switch (c)
2560                     {
2561                         case '\n':
2562                         case '\r':
2563                         case 0:
2564                         case 0x1A:
2565                             goto Lerr;
2566
2567                         case '"':
2568                             stringbuffer.writeByte(0);
2569                             filespec = mem.strdup((char *)stringbuffer.data);
2570                             p++;
2571                             break;
2572
2573                         default:
2574                             if (c & 0x80)
2575                             {   unsigned u = decodeUTF();
2576                                 if (u == PS || u == LS)
2577                                     goto Lerr;
2578                             }
2579                             stringbuffer.writeByte(c);
2580                             p++;
2581                             continue;
2582                     }
2583                     break;
2584                 }
2585                 continue;
2586
2587             default:
2588                 if (*p & 0x80)
2589                 {   unsigned u = decodeUTF();
2590                     if (u == PS || u == LS)
2591                         goto Lnewline;
2592                 }
2593                 goto Lerr;
2594         }
2595     }
2596
2597 Lerr:
2598     error(loc, "#line integer [\"filespec\"]\\n expected");
2599 }
2600
2601
2602 /********************************************
2603  * Decode UTF character.
2604  * Issue error messages for invalid sequences.
2605  * Return decoded character, advance p to last character in UTF sequence.
2606  */
2607
2608 unsigned Lexer::decodeUTF()
2609 {
2610     dchar_t u;
2611     unsigned char c;
2612     unsigned char *s = p;
2613     size_t len;
2614     size_t idx;
2615     char *msg;
2616
2617     c = *s;
2618     assert(c & 0x80);
2619
2620     // Check length of remaining string up to 6 UTF-8 characters
2621     for (len = 1; len < 6 && s[len]; len++)
2622         ;
2623
2624     idx = 0;
2625     msg = utf_decodeChar(s, len, &idx, &u);
2626     p += idx - 1;
2627     if (msg)
2628     {
2629         error("%s", msg);
2630     }
2631     return u;
2632 }
2633
2634
2635 /***************************************************
2636  * Parse doc comment embedded between t->ptr and p.
2637  * Remove trailing blanks and tabs from lines.
2638  * Replace all newlines with \n.
2639  * Remove leading comment character from each line.
2640  * Decide if it's a lineComment or a blockComment.
2641  * Append to previous one for this token.
2642  */
2643
2644 void Lexer::getDocComment(Token *t, unsigned lineComment)
2645 {
2646     OutBuffer buf;
2647     unsigned char ct = t->ptr[2];
2648     unsigned char *q = t->ptr + 3;      // start of comment text
2649     int linestart = 0;
2650
2651     unsigned char *qend = p;
2652     if (ct == '*' || ct == '+')
2653         qend -= 2;
2654
2655     /* Scan over initial row of ****'s or ++++'s or ////'s
2656      */
2657     for (; q < qend; q++)
2658     {
2659         if (*q != ct)
2660             break;
2661     }
2662
2663     /* Remove trailing row of ****'s or ++++'s
2664      */
2665     if (ct != '/')
2666     {
2667         for (; q < qend; qend--)
2668         {
2669             if (qend[-1] != ct)
2670                 break;
2671         }
2672     }
2673
2674     for (; q < qend; q++)
2675     {
2676         unsigned char c = *q;
2677
2678         switch (c)
2679         {
2680             case '*':
2681             case '+':
2682                 if (linestart && c == ct)
2683                 {   linestart = 0;
2684                     /* Trim preceding whitespace up to preceding \n
2685                      */
2686                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2687                         buf.offset--;
2688                     continue;
2689                 }
2690                 break;
2691
2692             case ' ':
2693             case '\t':
2694                 break;
2695
2696             case '\r':
2697                 if (q[1] == '\n')
2698                     continue;           // skip the \r
2699                 goto Lnewline;
2700
2701             default:
2702                 if (c == 226)
2703                 {
2704                     // If LS or PS
2705                     if (q[1] == 128 &&
2706                         (q[2] == 168 || q[2] == 169))
2707                     {
2708                         q += 2;
2709                         goto Lnewline;
2710                     }
2711                 }
2712                 linestart = 0;
2713                 break;
2714
2715             Lnewline:
2716                 c = '\n';               // replace all newlines with \n
2717             case '\n':
2718                 linestart = 1;
2719
2720                 /* Trim trailing whitespace
2721                  */
2722                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2723                     buf.offset--;
2724
2725                 break;
2726         }
2727         buf.writeByte(c);
2728     }
2729
2730     // Always end with a newline
2731     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2732         buf.writeByte('\n');
2733
2734     buf.writeByte(0);
2735
2736     // It's a line comment if the start of the doc comment comes
2737     // after other non-whitespace on the same line.
2738     unsigned char** dc = (lineComment && anyToken)
2739                          ? &t->lineComment
2740                          : &t->blockComment;
2741
2742     // Combine with previous doc comment, if any
2743     if (*dc)
2744         *dc = combineComments(*dc, (unsigned char *)buf.data);
2745     else
2746         *dc = (unsigned char *)buf.extractData();
2747 }
2748
2749 /********************************************
2750  * Combine two document comments into one.
2751  */
2752
2753 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2754 {
2755     unsigned char *c = c2;
2756
2757     if (c1)
2758     {   c = c1;
2759         if (c2)
2760         {   size_t len1 = strlen((char *)c1);
2761             size_t len2 = strlen((char *)c2);
2762
2763             c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2764             memcpy(c, c1, len1);
2765             c[len1] = '\n';
2766             memcpy(c + len1 + 1, c2, len2);
2767             c[len1 + 1 + len2] = 0;
2768         }
2769     }
2770     return c;
2771 }
2772
2773 /********************************************
2774  * Create an identifier in the string table.
2775  */
2776
2777 Identifier *Lexer::idPool(const char *s)
2778 {
2779     size_t len = strlen(s);
2780     StringValue *sv = stringtable.update(s, len);
2781     Identifier *id = (Identifier *) sv->ptrvalue;
2782     if (!id)
2783     {
2784         id = new Identifier(sv->lstring.string, TOKidentifier);
2785         sv->ptrvalue = id;
2786     }
2787     return id;
2788 }
2789
2790 /*********************************************
2791  * Create a unique identifier using the prefix s.
2792  */
2793
2794 Identifier *Lexer::uniqueId(const char *s, int num)
2795 {   char buffer[32];
2796     size_t slen = strlen(s);
2797
2798     assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2799     sprintf(buffer, "%s%d", s, num);
2800     return idPool(buffer);
2801 }
2802
2803 Identifier *Lexer::uniqueId(const char *s)
2804 {
2805     static int num;
2806     return uniqueId(s, ++num);
2807 }
2808
2809 /****************************************
2810  */
2811
2812 struct Keyword
2813 {   char *name;
2814     enum TOK value;
2815 };
2816
2817 static Keyword keywords[] =
2818 {
2819 //    { "",             TOK     },
2820
2821     {   "this",         TOKthis         },
2822     {   "super",        TOKsuper        },
2823     {   "assert",       TOKassert       },
2824     {   "null",         TOKnull         },
2825     {   "true",         TOKtrue         },
2826     {   "false",        TOKfalse        },
2827     {   "cast",         TOKcast         },
2828     {   "new",          TOKnew          },
2829     {   "delete",       TOKdelete       },
2830     {   "throw",        TOKthrow        },
2831     {   "module",       TOKmodule       },
2832     {   "pragma",       TOKpragma       },
2833     {   "typeof",       TOKtypeof       },
2834     {   "typeid",       TOKtypeid       },
2835
2836     {   "template",     TOKtemplate     },
2837
2838     {   "void",         TOKvoid         },
2839     {   "byte",         TOKint8         },
2840     {   "ubyte",        TOKuns8         },
2841     {   "short",        TOKint16        },
2842     {   "ushort",       TOKuns16        },
2843     {   "int",          TOKint32        },
2844     {   "uint",         TOKuns32        },
2845     {   "long",         TOKint64        },
2846     {   "ulong",        TOKuns64        },
2847     {   "cent",         TOKcent,        },
2848     {   "ucent",        TOKucent,       },
2849     {   "float",        TOKfloat32      },
2850     {   "double",       TOKfloat64      },
2851     {   "real",         TOKfloat80      },
2852
2853     {   "bool",         TOKbool         },
2854     {   "char",         TOKchar         },
2855     {   "wchar",        TOKwchar        },
2856     {   "dchar",        TOKdchar        },
2857
2858     {   "ifloat",       TOKimaginary32  },
2859     {   "idouble",      TOKimaginary64  },
2860     {   "ireal",        TOKimaginary80  },
2861
2862     {   "cfloat",       TOKcomplex32    },
2863     {   "cdouble",      TOKcomplex64    },
2864     {   "creal",        TOKcomplex80    },
2865
2866     {   "delegate",     TOKdelegate     },
2867     {   "function",     TOKfunction     },
2868
2869     {   "is",           TOKis           },
2870     {   "if",           TOKif           },
2871     {   "else",         TOKelse         },
2872     {   "while",        TOKwhile        },
2873     {   "for",          TOKfor          },
2874     {   "do",           TOKdo           },
2875     {   "switch",       TOKswitch       },
2876     {   "case",         TOKcase         },
2877     {   "default",      TOKdefault      },
2878     {   "break",        TOKbreak        },
2879     {   "continue",     TOKcontinue     },
2880     {   "synchronized", TOKsynchronized },
2881     {   "return",       TOKreturn       },
2882     {   "goto",         TOKgoto         },
2883     {   "try",          TOKtry          },
2884     {   "catch",        TOKcatch        },
2885     {   "finally",      TOKfinally      },
2886     {   "with",         TOKwith         },
2887     {   "asm",          TOKasm          },
2888     {   "foreach",      TOKforeach      },
2889     {   "foreach_reverse",      TOKforeach_reverse      },
2890     {   "scope",        TOKscope        },
2891
2892     {   "struct",       TOKstruct       },
2893     {   "class",        TOKclass        },
2894     {   "interface",    TOKinterface    },
2895     {   "union",        TOKunion        },
2896     {   "enum",         TOKenum         },
2897     {   "import",       TOKimport       },
2898     {   "mixin",        TOKmixin        },
2899     {   "static",       TOKstatic       },
2900     {   "final",        TOKfinal        },
2901     {   "const",        TOKconst        },
2902     {   "typedef",      TOKtypedef      },
2903     {   "alias",        TOKalias        },
2904     {   "override",     TOKoverride     },
2905     {   "abstract",     TOKabstract     },
2906     {   "volatile",     TOKvolatile     },
2907     {   "debug",        TOKdebug        },
2908     {   "deprecated",   TOKdeprecated   },
2909     {   "in",           TOKin           },
2910     {   "out",          TOKout          },
2911     {   "inout",        TOKinout        },
2912     {   "lazy",         TOKlazy         },
2913     {   "auto",         TOKauto         },
2914
2915     {   "align",        TOKalign        },
2916     {   "extern",       TOKextern       },
2917     {   "private",      TOKprivate      },
2918     {   "package",      TOKpackage      },
2919     {   "protected",    TOKprotected    },
2920     {   "public",       TOKpublic       },
2921     {   "export",       TOKexport       },
2922
2923     {   "body",         TOKbody         },
2924     {   "invariant",    TOKinvariant    },
2925     {   "unittest",     TOKunittest     },
2926     {   "version",      TOKversion      },
2927     //{ "manifest",     TOKmanifest     },
2928
2929     // Added after 1.0
2930     {   "ref",          TOKref          },
2931     {   "macro",        TOKmacro        },
2932 #if V2
2933     {   "pure",         TOKpure         },
2934     {   "nothrow",      TOKnothrow      },
2935     {   "__thread",     TOKtls          },
2936     {   "__traits",     TOKtraits       },
2937     {   "__overloadset", TOKoverloadset },
2938     {   "__FILE__",     TOKfile         },
2939     {   "__LINE__",     TOKline         },
2940 #endif
2941 };
2942
2943 int Token::isKeyword()
2944 {
2945     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
2946     {
2947         if (keywords[u].value == value)
2948             return 1;
2949     }
2950     return 0;
2951 }
2952
2953 void Lexer::initKeywords()
2954 {   StringValue *sv;
2955     unsigned u;
2956     enum TOK v;
2957     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
2958
2959     if (global.params.Dversion == 1)
2960         nkeywords -= 2;
2961
2962     cmtable_init();
2963
2964     for (u = 0; u < nkeywords; u++)
2965     {   char *s;
2966
2967         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
2968         s = keywords[u].name;
2969         v = keywords[u].value;
2970         sv = stringtable.insert(s, strlen(s));
2971         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
2972
2973         //printf("tochars[%d] = '%s'\n",v, s);
2974         Token::tochars[v] = s;
2975     }
2976
2977     Token::tochars[TOKeof]              = "EOF";
2978     Token::tochars[TOKlcurly]           = "{";
2979     Token::tochars[TOKrcurly]           = "}";
2980     Token::tochars[TOKlparen]           = "(";
2981     Token::tochars[TOKrparen]           = ")";
2982     Token::tochars[TOKlbracket]         = "[";
2983     Token::tochars[TOKrbracket]         = "]";
2984     Token::tochars[TOKsemicolon]        = ";";
2985     Token::tochars[TOKcolon]            = ":";
2986     Token::tochars[TOKcomma]            = ",";
2987     Token::tochars[TOKdot]              = ".";
2988     Token::tochars[TOKxor]              = "^";
2989     Token::tochars[TOKxorass]           = "^=";
2990     Token::tochars[TOKassign]           = "=";
2991     Token::tochars[TOKconstruct]        = "=";
2992 #if V2
2993     Token::tochars[TOKblit]             = "=";
2994 #endif
2995     Token::tochars[TOKlt]               = "<";
2996     Token::tochars[TOKgt]               = ">";
2997     Token::tochars[TOKle]               = "<=";
2998     Token::tochars[TOKge]               = ">=";
2999     Token::tochars[TOKequal]            = "==";
3000     Token::tochars[TOKnotequal]         = "!=";
3001     Token::tochars[TOKnotidentity]      = "!is";
3002     Token::tochars[TOKtobool]           = "!!";
3003
3004     Token::tochars[TOKunord]            = "!<>=";
3005     Token::tochars[TOKue]               = "!<>";
3006     Token::tochars[TOKlg]               = "<>";
3007     Token::tochars[TOKleg]              = "<>=";
3008     Token::tochars[TOKule]              = "!>";
3009     Token::tochars[TOKul]               = "!>=";
3010     Token::tochars[TOKuge]              = "!<";
3011     Token::tochars[TOKug]               = "!<=";
3012
3013     Token::tochars[TOKnot]              = "!";
3014     Token::tochars[TOKtobool]           = "!!";
3015     Token::tochars[TOKshl]              = "<<";
3016     Token::tochars[TOKshr]              = ">>";
3017     Token::tochars[TOKushr]             = ">>>";
3018     Token::tochars[TOKadd]              = "+";
3019     Token::tochars[TOKmin]              = "-";
3020     Token::tochars[TOKmul]              = "*";
3021     Token::tochars[TOKdiv]              = "/";
3022     Token::tochars[TOKmod]              = "%";
3023     Token::tochars[TOKslice]            = "..";
3024     Token::tochars[TOKdotdotdot]        = "...";
3025     Token::tochars[TOKand]              = "&";
3026     Token::tochars[TOKandand]           = "&&";
3027     Token::tochars[TOKor]               = "|";
3028     Token::tochars[TOKoror]             = "||";
3029     Token::tochars[TOKarray]            = "[]";
3030     Token::tochars[TOKindex]            = "[i]";
3031     Token::tochars[TOKaddress]          = "&";
3032     Token::tochars[TOKstar]             = "*";
3033     Token::tochars[TOKtilde]            = "~";
3034     Token::tochars[TOKdollar]           = "$";
3035     Token::tochars[TOKcast]             = "cast";
3036     Token::tochars[TOKplusplus]         = "++";
3037     Token::tochars[TOKminusminus]       = "--";
3038     Token::tochars[TOKtype]             = "type";
3039     Token::tochars[TOKquestion]         = "?";
3040     Token::tochars[TOKneg]              = "-";
3041     Token::tochars[TOKuadd]             = "+";
3042     Token::tochars[TOKvar]              = "var";
3043     Token::tochars[TOKaddass]           = "+=";
3044     Token::tochars[TOKminass]           = "-=";
3045     Token::tochars[TOKmulass]           = "*=";
3046     Token::tochars[TOKdivass]           = "/=";
3047     Token::tochars[TOKmodass]           = "%=";
3048     Token::tochars[TOKshlass]           = "<<=";
3049     Token::tochars[TOKshrass]           = ">>=";
3050     Token::tochars[TOKushrass]          = ">>>=";
3051     Token::tochars[TOKandass]           = "&=";
3052     Token::tochars[TOKorass]            = "|=";
3053     Token::tochars[TOKcatass]           = "~=";
3054     Token::tochars[TOKcat]              = "~";
3055     Token::tochars[TOKcall]             = "call";
3056     Token::tochars[TOKidentity]         = "is";
3057     Token::tochars[TOKnotidentity]      = "!is";
3058
3059     Token::tochars[TOKorass]            = "|=";
3060     Token::tochars[TOKidentifier]       = "identifier";
3061
3062      // For debugging
3063     Token::tochars[TOKdotexp]           = "dotexp";
3064     Token::tochars[TOKdotti]            = "dotti";
3065     Token::tochars[TOKdotvar]           = "dotvar";
3066     Token::tochars[TOKdottype]          = "dottype";
3067     Token::tochars[TOKsymoff]           = "symoff";
3068     Token::tochars[TOKtypedot]          = "typedot";
3069     Token::tochars[TOKarraylength]      = "arraylength";
3070     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3071     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3072     Token::tochars[TOKstructliteral]    = "structliteral";
3073     Token::tochars[TOKstring]           = "string";
3074     Token::tochars[TOKdsymbol]          = "symbol";
3075     Token::tochars[TOKtuple]            = "tuple";
3076     Token::tochars[TOKdeclaration]      = "declaration";
3077     Token::tochars[TOKdottd]            = "dottd";
3078     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3079 }