dmd/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  65 /********************************************
  66  * Do our own char maps
  67  */
  68
  69 static unsigned char cmtable[256];
  70
  71 const int CMoctal =     0x1;
  72 const int CMhex =       0x2;
  73 const int CMidchar =    0x4;
  74
  75 inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
  76 inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
  77 inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
  78
  79 static void cmtable_init()
  80 {
  81     for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++)
  82     {
  83         if ('0' <= c && c <= '7')
  84             cmtable[c] |= CMoctal;
  85         if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
  86             cmtable[c] |= CMhex;
  87         if (isalnum(c) || c == '_')
  88             cmtable[c] |= CMidchar;
  89     }
  90 }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 110 #ifdef DEBUG
 111 void Token::print()
 112 {
 113     fprintf(stdmsg, "%s\n", toChars());
 114 }
 115 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken, bool dltSyntax)
 268     : loc(mod, 1), dltSyntax(dltSyntax)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     this->nesting = 0;
 281     this->indent = 0;
 282     this->atStartOfLine = 1;
 283     //initKeywords();
 284
 285     /* If first line starts with '#!', ignore the line
 286      */
 287
 288     if (p[0] == '#' && p[1] =='!')
 289     {
 290         p += 2;
 291         while (1)
 292         {   unsigned char c = *p;
 293             switch (c)
 294             {
 295                 case '\n':
 296                     p++;
 297                     break;
 298
 299                 case '\r':
 300                     p++;
 301                     if (*p == '\n')
 302                         p++;
 303                     break;
 304
 305                 case 0:
 306                 case 0x1A:
 307                     break;
 308
 309                 default:
 310                     if (c & 0x80)
 311                     {   unsigned u = decodeUTF();
 312                         if (u == PS || u == LS)
 313                             break;
 314                     }
 315                     p++;
 316                     continue;
 317             }
 318             break;
 319         }
 320         loc.linnum = 2;
 321     }
 322 }
 323
 324
 325 void Lexer::error(const char *format, ...)
 326 {
 327     if (mod && !global.gag)
 328     {
 329         char *p = loc.toChars();
 330         if (*p)
 331             fprintf(stdmsg, "%s: ", p);
 332         mem.free(p);
 333
 334         va_list ap;
 335         va_start(ap, format);
 336         vfprintf(stdmsg, format, ap);
 337         va_end(ap);
 338
 339         fprintf(stdmsg, "\n");
 340         fflush(stdmsg);
 341
 342         if (global.errors >= 20)        // moderate blizzard of cascading messages
 343             fatal();
 344     }
 345     global.errors++;
 346 }
 347
 348 void Lexer::error(Loc loc, const char *format, ...)
 349 {
 350     if (mod && !global.gag)
 351     {
 352         char *p = loc.toChars();
 353         if (*p)
 354             fprintf(stdmsg, "%s: ", p);
 355         mem.free(p);
 356
 357         va_list ap;
 358         va_start(ap, format);
 359         vfprintf(stdmsg, format, ap);
 360         va_end(ap);
 361
 362         fprintf(stdmsg, "\n");
 363         fflush(stdmsg);
 364
 365         if (global.errors >= 20)        // moderate blizzard of cascading messages
 366             fatal();
 367     }
 368     global.errors++;
 369 }
 370
 371 TOK Lexer::nextToken()
 372 {   Token *t;
 373
 374     if (token.next)
 375     {
 376         t = token.next;
 377         memcpy(&token,t,sizeof(Token));
 378         t->next = freelist;
 379         freelist = t;
 380     }
 381     else
 382     {
 383         scan(&token);
 384     }
 385     //token.print();
 386     return token.value;
 387 }
 388
 389 Token *Lexer::peek(Token *ct)
 390 {   Token *t;
 391
 392     if (ct->next)
 393         t = ct->next;
 394     else
 395     {
 396         t = new Token();
 397         scan(t);
 398         t->next = NULL;
 399         ct->next = t;
 400     }
 401     return t;
 402 }
 403
 404 /*********************************
 405  * tk is on the opening (.
 406  * Look ahead and return token that is past the closing ).
 407  */
 408
 409 Token *Lexer::peekPastParen(Token *tk)
 410 {
 411     //printf("peekPastParen()\n");
 412     int parens = 1;
 413     int curlynest = 0;
 414     while (1)
 415     {
 416         tk = peek(tk);
 417         //tk->print();
 418         switch (tk->value)
 419         {
 420             case TOKlparen:
 421                 parens++;
 422                 continue;
 423
 424             case TOKrparen:
 425                 --parens;
 426                 if (parens)
 427                     continue;
 428                 tk = peek(tk);
 429                 break;
 430
 431             case TOKlcurly:
 432                 curlynest++;
 433                 continue;
 434
 435             case TOKrcurly:
 436                 if (--curlynest >= 0)
 437                     continue;
 438                 break;
 439
 440             case TOKsemicolon:
 441                 if (curlynest)
 442                     continue;
 443                 break;
 444
 445             case TOKeof:
 446                 break;
 447
 448             default:
 449                 continue;
 450         }
 451         return tk;
 452     }
 453 }
 454
 455 /**********************************
 456  * Determine if string is a valid Identifier.
 457  * Placed here because of commonality with Lexer functionality.
 458  * Returns:
 459  *      0       invalid
 460  */
 461
 462 int Lexer::isValidIdentifier(char *p)
 463 {
 464     size_t len;
 465     size_t idx;
 466
 467     if (!p || !*p)
 468         goto Linvalid;
 469
 470     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 471         goto Linvalid;
 472
 473     len = strlen(p);
 474     idx = 0;
 475     while (p[idx])
 476     {   dchar_t dc;
 477
 478         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 479         if (q)
 480             goto Linvalid;
 481
 482         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 483             goto Linvalid;
 484     }
 485     return 1;
 486
 487 Linvalid:
 488     return 0;
 489 }
 490
 491 /****************************
 492  * Turn next token in buffer into a token.
 493  */
 494
 495 void Lexer::scan(Token *t)
 496 {
 497     unsigned lastLine = loc.linnum;
 498     unsigned linnum;
 499
 500     t->blockComment = NULL;
 501     t->lineComment = NULL;
 502     while (1)
 503     {
 504         t->ptr = p;
 505
 506         if (dltSyntax && atStartOfLine) {
 507                 // Check indent
 508                 int i;
 509                 for (i = 0; p[i] == '\t'; i++) {
 510                 }
 511                 if (p[i] == ' ') {
 512                     error("Whitespace error: use tabs to indent!");
 513                 }
 514                 if (p[i] == '#') {
 515                     p += i;
 516                     atStartOfLine = 0;
 517                 } else if (p[i] != '\n') {
 518                     if (p[i] == '\0')
 519                         i = 0;                  // End-of-file always has no indent
 520                     if (i > indent) {
 521                         error("Unexpected indentation");
 522                     } else if (i < indent) {
 523                         indent -= 1;
 524                         t->value = TOKrcurly;
 525                         return;
 526                     }
 527                     atStartOfLine = 0;
 528                 } /* else ignore blank line */
 529         }
 530
 531         //printf("p = %p, *p = '%c'\n",p,*p);
 532         switch (*p)
 533         {
 534             case 0:
 535             case 0x1A:
 536                 t->value = TOKeof;                      // end of file
 537                 return;
 538
 539             case ' ':
 540             case '\t':
 541             case '\v':
 542             case '\f':
 543                 p++;
 544                 continue;                       // skip white space
 545
 546             case '\r':
 547                 if (*p == '\n') {               // if CRLF
 548                     p++;
 549                     continue;
 550                 }
 551                 // fall-through
 552             case '\n':
 553                 p++;
 554                 loc.linnum++;
 555                 if (dltSyntax && !nesting) {
 556                         atStartOfLine = 1;
 557                         t->value = TOKendline;
 558                         return;
 559                 }
 560                 continue;                       // Ignore newlines inside brackets
 561             case '0':   case '1':   case '2':   case '3':   case '4':
 562             case '5':   case '6':   case '7':   case '8':   case '9':
 563                 t->value = number(t);
 564                 return;
 565
 566 #if CSTRINGS
 567             case '\'':
 568                 t->value = charConstant(t, 0);
 569                 return;
 570
 571             case '"':
 572                 t->value = stringConstant(t,0);
 573                 return;
 574
 575             case 'l':
 576             case 'L':
 577                 if (p[1] == '\'')
 578                 {
 579                     p++;
 580                     t->value = charConstant(t, 1);
 581                     return;
 582                 }
 583                 else if (p[1] == '"')
 584                 {
 585                     p++;
 586                     t->value = stringConstant(t, 1);
 587                     return;
 588                 }
 589 #else
 590             case '\'':
 591                 t->value = charConstant(t,0);
 592                 return;
 593
 594             case 'r':
 595                 if (p[1] != '"')
 596                     goto case_ident;
 597                 p++;
 598             case '`':
 599                 t->value = wysiwygStringConstant(t, *p);
 600                 return;
 601
 602             case 'x':
 603                 if (p[1] != '"')
 604                     goto case_ident;
 605                 p++;
 606                 t->value = hexStringConstant(t);
 607                 return;
 608
 609 #if V2
 610             case 'q':
 611                 if (p[1] == '"')
 612                 {
 613                     p++;
 614                     t->value = delimitedStringConstant(t);
 615                     return;
 616                 }
 617                 else if (p[1] == '{')
 618                 {
 619                     p++;
 620                     t->value = tokenStringConstant(t);
 621                     return;
 622                 }
 623                 else
 624                     goto case_ident;
 625 #endif
 626
 627             case '"':
 628                 t->value = escapeStringConstant(t,0);
 629                 return;
 630
 631             case '\\':                  // escaped string literal
 632             {   unsigned c;
 633
 634                 stringbuffer.reset();
 635                 do
 636                 {
 637                     p++;
 638                     switch (*p)
 639                     {
 640                         case 'u':
 641                         case 'U':
 642                         case '&':
 643                             c = escapeSequence();
 644                             stringbuffer.writeUTF8(c);
 645                             break;
 646
 647                         default:
 648                             c = escapeSequence();
 649                             stringbuffer.writeByte(c);
 650                             break;
 651                     }
 652                 } while (*p == '\\');
 653                 t->len = stringbuffer.offset;
 654                 stringbuffer.writeByte(0);
 655                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 656                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 657                 t->postfix = 0;
 658                 t->value = TOKstring;
 659                 return;
 660             }
 661
 662             case 'l':
 663             case 'L':
 664 #endif
 665             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 666             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 667             case 'k':               case 'm':   case 'n':   case 'o':
 668 #if V2
 669             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 670 #else
 671             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 672 #endif
 673             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 674             case 'z':
 675             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 676             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 677             case 'K':               case 'M':   case 'N':   case 'O':
 678             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 679             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 680             case 'Z':
 681             case '_':
 682             case_ident:
 683             {   unsigned char c;
 684                 StringValue *sv;
 685                 Identifier *id;
 686
 687                 do
 688                 {
 689                     c = *++p;
 690                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 691                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 692                 id = (Identifier *) sv->ptrvalue;
 693                 if (!id)
 694                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 695                     sv->ptrvalue = id;
 696                 }
 697                 t->ident = id;
 698                 t->value = (enum TOK) id->value;
 699                 anyToken = 1;
 700                 if (*t->ptr == '_')     // if special identifier token
 701                 {
 702                     static char date[11+1];
 703                     static char time[8+1];
 704                     static char timestamp[24+1];
 705
 706                     if (!date[0])       // lazy evaluation
 707                     {   time_t t;
 708                         char *p;
 709
 710                         ::time(&t);
 711                         p = ctime(&t);
 712                         assert(p);
 713                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 714                         sprintf(time, "%.8s", p + 11);
 715                         sprintf(timestamp, "%.24s", p);
 716                     }
 717
 718                     if (mod && id == Id::FILE)
 719                     {
 720                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 721                         goto Lstring;
 722                     }
 723                     else if (mod && id == Id::LINE)
 724                     {
 725                         t->value = TOKint64v;
 726                         t->uns64value = loc.linnum;
 727                     }
 728                     else if (id == Id::DATE)
 729                     {
 730                         t->ustring = (unsigned char *)date;
 731                         goto Lstring;
 732                     }
 733                     else if (id == Id::TIME)
 734                     {
 735                         t->ustring = (unsigned char *)time;
 736                         goto Lstring;
 737                     }
 738                     else if (id == Id::VENDOR)
 739                     {
 740 #ifdef IN_GCC
 741                         t->ustring = (unsigned char *)"GDC";
 742 #else
 743                         t->ustring = (unsigned char *)"Digital Mars D";
 744 #endif
 745                         goto Lstring;
 746                     }
 747                     else if (id == Id::TIMESTAMP)
 748                     {
 749                         t->ustring = (unsigned char *)timestamp;
 750                      Lstring:
 751                         t->value = TOKstring;
 752                      Llen:
 753                         t->postfix = 0;
 754                         t->len = strlen((char *)t->ustring);
 755                     }
 756                     else if (id == Id::VERSIONX)
 757                     {   unsigned major = 0;
 758                         unsigned minor = 0;
 759
 760                         for (char *p = global.version + 1; 1; p++)
 761                         {
 762                             char c = *p;
 763                             if (isdigit(c))
 764                                 minor = minor * 10 + c - '0';
 765                             else if (c == '.')
 766                             {   major = minor;
 767                                 minor = 0;
 768                             }
 769                             else
 770                                 break;
 771                         }
 772                         t->value = TOKint64v;
 773                         t->uns64value = major * 1000 + minor;
 774                     }
 775 #if V2
 776                     else if (id == Id::EOFX)
 777                     {
 778                         t->value = TOKeof;
 779                         // Advance scanner to end of file
 780                         while (!(*p == 0 || *p == 0x1A))
 781                             p++;
 782                     }
 783 #endif
 784                 }
 785                 //printf("t->value = %d\n",t->value);
 786                 return;
 787             }
 788
 789             case '/':
 790                 p++;
 791                 switch (*p)
 792                 {
 793                     case '=':
 794                         p++;
 795                         t->value = TOKdivass;
 796                         return;
 797
 798                     case '*':
 799                         p++;
 800                         linnum = loc.linnum;
 801                         while (1)
 802                         {
 803                             while (1)
 804                             {   unsigned char c = *p;
 805                                 switch (c)
 806                                 {
 807                                     case '/':
 808                                         break;
 809
 810                                     case '\n':
 811                                         loc.linnum++;
 812                                         p++;
 813                                         continue;
 814
 815                                     case '\r':
 816                                         p++;
 817                                         if (*p != '\n')
 818                                             loc.linnum++;
 819                                         continue;
 820
 821                                     case 0:
 822                                     case 0x1A:
 823                                         error("unterminated /* */ comment");
 824                                         p = end;
 825                                         t->value = TOKeof;
 826                                         return;
 827
 828                                     default:
 829                                         if (c & 0x80)
 830                                         {   unsigned u = decodeUTF();
 831                                             if (u == PS || u == LS)
 832                                                 loc.linnum++;
 833                                         }
 834                                         p++;
 835                                         continue;
 836                                 }
 837                                 break;
 838                             }
 839                             p++;
 840                             if (p[-2] == '*' && p - 3 != t->ptr)
 841                                 break;
 842                         }
 843                         if (commentToken)
 844                         {
 845                             t->value = TOKcomment;
 846                             return;
 847                         }
 848                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 849                         {   // if /** but not /**/
 850                             getDocComment(t, lastLine == linnum);
 851                         }
 852                         continue;
 853
 854                     case '/':           // do // style comments
 855                         linnum = loc.linnum;
 856                         while (1)
 857                         {   unsigned char c = *++p;
 858                             switch (c)
 859                             {
 860                                 case '\n':
 861                                     break;
 862
 863                                 case '\r':
 864                                     if (p[1] == '\n')
 865                                         p++;
 866                                     break;
 867
 868                                 case 0:
 869                                 case 0x1A:
 870                                     if (commentToken)
 871                                     {
 872                                         p = end;
 873                                         t->value = TOKcomment;
 874                                         return;
 875                                     }
 876                                     if (doDocComment && t->ptr[2] == '/')
 877                                         getDocComment(t, lastLine == linnum);
 878                                     p = end;
 879                                     t->value = TOKeof;
 880                                     return;
 881
 882                                 default:
 883                                     if (c & 0x80)
 884                                     {   unsigned u = decodeUTF();
 885                                         if (u == PS || u == LS)
 886                                             break;
 887                                     }
 888                                     continue;
 889                             }
 890                             break;
 891                         }
 892
 893                         if (commentToken)
 894                         {
 895                             p++;
 896                             loc.linnum++;
 897                             t->value = TOKcomment;
 898                             return;
 899                         }
 900                         if (doDocComment && t->ptr[2] == '/')
 901                             getDocComment(t, lastLine == linnum);
 902
 903                         p++;
 904                         loc.linnum++;
 905                         continue;
 906
 907                     case '+':
 908                     {   int nest;
 909
 910                         linnum = loc.linnum;
 911                         p++;
 912                         nest = 1;
 913                         while (1)
 914                         {   unsigned char c = *p;
 915                             switch (c)
 916                             {
 917                                 case '/':
 918                                     p++;
 919                                     if (*p == '+')
 920                                     {
 921                                         p++;
 922                                         nest++;
 923                                     }
 924                                     continue;
 925
 926                                 case '+':
 927                                     p++;
 928                                     if (*p == '/')
 929                                     {
 930                                         p++;
 931                                         if (--nest == 0)
 932                                             break;
 933                                     }
 934                                     continue;
 935
 936                                 case '\r':
 937                                     p++;
 938                                     if (*p != '\n')
 939                                         loc.linnum++;
 940                                     continue;
 941
 942                                 case '\n':
 943                                     loc.linnum++;
 944                                     p++;
 945                                     continue;
 946
 947                                 case 0:
 948                                 case 0x1A:
 949                                     error("unterminated /+ +/ comment");
 950                                     p = end;
 951                                     t->value = TOKeof;
 952                                     return;
 953
 954                                 default:
 955                                     if (c & 0x80)
 956                                     {   unsigned u = decodeUTF();
 957                                         if (u == PS || u == LS)
 958                                             loc.linnum++;
 959                                     }
 960                                     p++;
 961                                     continue;
 962                             }
 963                             break;
 964                         }
 965                         if (commentToken)
 966                         {
 967                             t->value = TOKcomment;
 968                             return;
 969                         }
 970                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
 971                         {   // if /++ but not /++/
 972                             getDocComment(t, lastLine == linnum);
 973                         }
 974                         continue;
 975                     }
 976                 }
 977                 t->value = TOKdiv;
 978                 return;
 979
 980             case '.':
 981                 p++;
 982                 if (isdigit(*p))
 983                 {   /* Note that we don't allow ._1 and ._ as being
 984                      * valid floating point numbers.
 985                      */
 986                     p--;
 987                     t->value = inreal(t);
 988                 }
 989                 else if (p[0] == '.')
 990                 {
 991                     if (p[1] == '.')
 992                     {   p += 2;
 993                         t->value = TOKdotdotdot;
 994                     }
 995                     else
 996                     {   p++;
 997                         t->value = TOKslice;
 998                     }
 999                 }
1000                 else
1001                     t->value = TOKdot;
1002                 return;
1003
1004             case '&':
1005                 p++;
1006                 if (*p == '=')
1007                 {   p++;
1008                     t->value = TOKandass;
1009                 }
1010                 else if (*p == '&')
1011                 {   p++;
1012                     t->value = TOKandand;
1013                 }
1014                 else
1015                     t->value = TOKand;
1016                 return;
1017
1018             case '|':
1019                 p++;
1020                 if (*p == '=')
1021                 {   p++;
1022                     t->value = TOKorass;
1023                 }
1024                 else if (*p == '|')
1025                 {   p++;
1026                     t->value = TOKoror;
1027                 }
1028                 else
1029                     t->value = TOKor;
1030                 return;
1031
1032             case '-':
1033                 p++;
1034                 if (*p == '=')
1035                 {   p++;
1036                     t->value = TOKminass;
1037                 }
1038 #if 0
1039                 else if (*p == '>')
1040                 {   p++;
1041                     t->value = TOKarrow;
1042                 }
1043 #endif
1044                 else if (*p == '-')
1045                 {   p++;
1046                     t->value = TOKminusminus;
1047                 }
1048                 else
1049                     t->value = TOKmin;
1050                 return;
1051
1052             case '+':
1053                 p++;
1054                 if (*p == '=')
1055                 {   p++;
1056                     t->value = TOKaddass;
1057                 }
1058                 else if (*p == '+')
1059                 {   p++;
1060                     t->value = TOKplusplus;
1061                 }
1062                 else
1063                     t->value = TOKadd;
1064                 return;
1065
1066             case '<':
1067                 p++;
1068                 if (*p == '=')
1069                 {   p++;
1070                     t->value = TOKle;                   // <=
1071                 }
1072                 else if (*p == '<')
1073                 {   p++;
1074                     if (*p == '=')
1075                     {   p++;
1076                         t->value = TOKshlass;           // <<=
1077                     }
1078                     else
1079                         t->value = TOKshl;              // <<
1080                 }
1081                 else if (*p == '>')
1082                 {   p++;
1083                     if (*p == '=')
1084                     {   p++;
1085                         t->value = TOKleg;              // <>=
1086                     }
1087                     else
1088                         t->value = TOKlg;               // <>
1089                 }
1090                 else
1091                     t->value = TOKlt;                   // <
1092                 return;
1093
1094             case '>':
1095                 p++;
1096                 if (*p == '=')
1097                 {   p++;
1098                     t->value = TOKge;                   // >=
1099                 }
1100                 else if (*p == '>')
1101                 {   p++;
1102                     if (*p == '=')
1103                     {   p++;
1104                         t->value = TOKshrass;           // >>=
1105                     }
1106                     else if (*p == '>')
1107                     {   p++;
1108                         if (*p == '=')
1109                         {   p++;
1110                             t->value = TOKushrass;      // >>>=
1111                         }
1112                         else
1113                             t->value = TOKushr;         // >>>
1114                     }
1115                     else
1116                         t->value = TOKshr;              // >>
1117                 }
1118                 else
1119                     t->value = TOKgt;                   // >
1120                 return;
1121
1122             case '!':
1123                 p++;
1124                 if (*p == '=')
1125                 {   p++;
1126                     if (*p == '=' && global.params.Dversion == 1)
1127                     {   p++;
1128                         t->value = TOKnotidentity;      // !==
1129                     }
1130                     else
1131                         t->value = TOKnotequal;         // !=
1132                 }
1133                 else if (*p == '<')
1134                 {   p++;
1135                     if (*p == '>')
1136                     {   p++;
1137                         if (*p == '=')
1138                         {   p++;
1139                             t->value = TOKunord; // !<>=
1140                         }
1141                         else
1142                             t->value = TOKue;   // !<>
1143                     }
1144                     else if (*p == '=')
1145                     {   p++;
1146                         t->value = TOKug;       // !<=
1147                     }
1148                     else
1149                         t->value = TOKuge;      // !<
1150                 }
1151                 else if (*p == '>')
1152                 {   p++;
1153                     if (*p == '=')
1154                     {   p++;
1155                         t->value = TOKul;       // !>=
1156                     }
1157                     else
1158                         t->value = TOKule;      // !>
1159                 }
1160                 else
1161                     t->value = TOKnot;          // !
1162                 return;
1163
1164             case '=':
1165                 p++;
1166                 if (*p == '=')
1167                 {   p++;
1168                     if (*p == '=' && global.params.Dversion == 1)
1169                     {   p++;
1170                         t->value = TOKidentity;         // ===
1171                     }
1172                     else
1173                         t->value = TOKequal;            // ==
1174                 }
1175                 else
1176                     t->value = TOKassign;               // =
1177                 return;
1178
1179             case '~':
1180                 p++;
1181                 if (*p == '=')
1182                 {   p++;
1183                     t->value = TOKcatass;               // ~=
1184                 }
1185                 else
1186                     t->value = TOKtilde;                // ~
1187                 return;
1188
1189 #define NESTED(cin,tokin,cout,tokout) \
1190             case cin: nesting++; p++; t->value = tokin; return;\
1191             case cout: if (nesting == 0) {error("Unexpected '%c'", cout);} else {nesting--;} p++; t->value = tokout; return;
1192
1193             NESTED('(', TOKlparen, ')', TOKrparen)
1194             NESTED('[', TOKlbracket, ']', TOKrbracket)
1195             NESTED('{', TOKlcurly, '}', TOKrcurly)
1196 #undef NESTED
1197
1198 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1199             SINGLE('?', TOKquestion)
1200             SINGLE(',', TOKcomma)
1201             SINGLE(';', TOKsemicolon)
1202             SINGLE('$', TOKdollar)
1203             SINGLE('@', TOKat)
1204
1205 #undef SINGLE
1206
1207             case ':':
1208                 p++;
1209                 if (!nesting)
1210                         indent += 1;
1211                 t->value = TOKcolon;
1212                 return;
1213
1214 #define DOUBLE(c1,tok1,c2,tok2)         \
1215             case c1:                    \
1216                 p++;                    \
1217                 if (*p == c2)           \
1218                 {   p++;                \
1219                     t->value = tok2;    \
1220                 }                       \
1221                 else                    \
1222                     t->value = tok1;    \
1223                 return;
1224
1225             DOUBLE('*', TOKmul, '=', TOKmulass)
1226             DOUBLE('%', TOKmod, '=', TOKmodass)
1227             DOUBLE('^', TOKxor, '=', TOKxorass)
1228
1229 #undef DOUBLE
1230
1231             case '#':           // do # style comments and pragmas
1232                 p++;
1233                 pragma();
1234                 continue;
1235
1236             default:
1237             {   unsigned char c = *p;
1238
1239                 if (c & 0x80)
1240                 {   unsigned u = decodeUTF();
1241
1242                     // Check for start of unicode identifier
1243                     if (isUniAlpha(u))
1244                         goto case_ident;
1245
1246                     if (u == PS || u == LS)
1247                     {
1248                         loc.linnum++;
1249                         p++;
1250                         continue;
1251                     }
1252                 }
1253                 if (isprint(c))
1254                     error("unsupported char '%c'", c);
1255                 else
1256                     error("unsupported char 0x%02x", c);
1257                 p++;
1258                 continue;
1259             }
1260         }
1261     }
1262 }
1263
1264 /*******************************************
1265  * Parse escape sequence.
1266  */
1267
1268 unsigned Lexer::escapeSequence()
1269 {   unsigned c;
1270     int n;
1271     int ndigits;
1272
1273     c = *p;
1274     switch (c)
1275     {
1276         case '\'':
1277         case '"':
1278         case '?':
1279         case '\\':
1280         Lconsume:
1281                 p++;
1282                 break;
1283
1284         case 'a':       c = 7;          goto Lconsume;
1285         case 'b':       c = 8;          goto Lconsume;
1286         case 'f':       c = 12;         goto Lconsume;
1287         case 'n':       c = 10;         goto Lconsume;
1288         case 'r':       c = 13;         goto Lconsume;
1289         case 't':       c = 9;          goto Lconsume;
1290         case 'v':       c = 11;         goto Lconsume;
1291
1292         case 'u':
1293                 ndigits = 4;
1294                 goto Lhex;
1295         case 'U':
1296                 ndigits = 8;
1297                 goto Lhex;
1298         case 'x':
1299                 ndigits = 2;
1300         Lhex:
1301                 p++;
1302                 c = *p;
1303                 if (ishex(c))
1304                 {   unsigned v;
1305
1306                     n = 0;
1307                     v = 0;
1308                     while (1)
1309                     {
1310                         if (isdigit(c))
1311                             c -= '0';
1312                         else if (islower(c))
1313                             c -= 'a' - 10;
1314                         else
1315                             c -= 'A' - 10;
1316                         v = v * 16 + c;
1317                         c = *++p;
1318                         if (++n == ndigits)
1319                             break;
1320                         if (!ishex(c))
1321                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1322                             break;
1323                         }
1324                     }
1325                     if (ndigits != 2 && !utf_isValidDchar(v))
1326                         error("invalid UTF character \\U%08x", v);
1327                     c = v;
1328                 }
1329                 else
1330                     error("undefined escape hex sequence \\%c\n",c);
1331                 break;
1332
1333         case '&':                       // named character entity
1334                 for (unsigned char *idstart = ++p; 1; p++)
1335                 {
1336                     switch (*p)
1337                     {
1338                         case ';':
1339                             c = HtmlNamedEntity(idstart, p - idstart);
1340                             if (c == ~0)
1341                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1342                                 c = ' ';
1343                             }
1344                             p++;
1345                             break;
1346
1347                         default:
1348                             if (isalpha(*p) ||
1349                                 (p != idstart + 1 && isdigit(*p)))
1350                                 continue;
1351                             error("unterminated named entity");
1352                             break;
1353                     }
1354                     break;
1355                 }
1356                 break;
1357
1358         case 0:
1359         case 0x1A:                      // end of file
1360                 c = '\\';
1361                 break;
1362
1363         default:
1364                 if (isoctal(c))
1365                 {   unsigned v;
1366
1367                     n = 0;
1368                     v = 0;
1369                     do
1370                     {
1371                         v = v * 8 + (c - '0');
1372                         c = *++p;
1373                     } while (++n < 3 && isoctal(c));
1374                     c = v;
1375                     if (c > 0xFF)
1376                         error("0%03o is larger than a byte", c);
1377                 }
1378                 else
1379                     error("undefined escape sequence \\%c\n",c);
1380                 break;
1381     }
1382     return c;
1383 }
1384
1385 /**************************************
1386  */
1387
1388 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1389 {   unsigned c;
1390     Loc start = loc;
1391
1392     p++;
1393     stringbuffer.reset();
1394     while (1)
1395     {
1396         c = *p++;
1397         switch (c)
1398         {
1399             case '\n':
1400                 loc.linnum++;
1401                 break;
1402
1403             case '\r':
1404                 if (*p == '\n')
1405                     continue;   // ignore
1406                 c = '\n';       // treat EndOfLine as \n character
1407                 loc.linnum++;
1408                 break;
1409
1410             case 0:
1411             case 0x1A:
1412                 error("unterminated string constant starting at %s", start.toChars());
1413                 t->ustring = (unsigned char *)"";
1414                 t->len = 0;
1415                 t->postfix = 0;
1416                 return TOKstring;
1417
1418             case '"':
1419             case '`':
1420                 if (c == tc)
1421                 {
1422                     t->len = stringbuffer.offset;
1423                     stringbuffer.writeByte(0);
1424                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1425                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1426                     stringPostfix(t);
1427                     return TOKstring;
1428                 }
1429                 break;
1430
1431             default:
1432                 if (c & 0x80)
1433                 {   p--;
1434                     unsigned u = decodeUTF();
1435                     p++;
1436                     if (u == PS || u == LS)
1437                         loc.linnum++;
1438                     stringbuffer.writeUTF8(u);
1439                     continue;
1440                 }
1441                 break;
1442         }
1443         stringbuffer.writeByte(c);
1444     }
1445 }
1446
1447 /**************************************
1448  * Lex hex strings:
1449  *      x"0A ae 34FE BD"
1450  */
1451
1452 TOK Lexer::hexStringConstant(Token *t)
1453 {   unsigned c;
1454     Loc start = loc;
1455     unsigned n = 0;
1456     unsigned v;
1457
1458     p++;
1459     stringbuffer.reset();
1460     while (1)
1461     {
1462         c = *p++;
1463         switch (c)
1464         {
1465             case ' ':
1466             case '\t':
1467             case '\v':
1468             case '\f':
1469                 continue;                       // skip white space
1470
1471             case '\r':
1472                 if (*p == '\n')
1473                     continue;                   // ignore
1474                 // Treat isolated '\r' as if it were a '\n'
1475             case '\n':
1476                 loc.linnum++;
1477                 continue;
1478
1479             case 0:
1480             case 0x1A:
1481                 error("unterminated string constant starting at %s", start.toChars());
1482                 t->ustring = (unsigned char *)"";
1483                 t->len = 0;
1484                 t->postfix = 0;
1485                 return TOKstring;
1486
1487             case '"':
1488                 if (n & 1)
1489                 {   error("odd number (%d) of hex characters in hex string", n);
1490                     stringbuffer.writeByte(v);
1491                 }
1492                 t->len = stringbuffer.offset;
1493                 stringbuffer.writeByte(0);
1494                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1495                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1496                 stringPostfix(t);
1497                 return TOKstring;
1498
1499             default:
1500                 if (c >= '0' && c <= '9')
1501                     c -= '0';
1502                 else if (c >= 'a' && c <= 'f')
1503                     c -= 'a' - 10;
1504                 else if (c >= 'A' && c <= 'F')
1505                     c -= 'A' - 10;
1506                 else if (c & 0x80)
1507                 {   p--;
1508                     unsigned u = decodeUTF();
1509                     p++;
1510                     if (u == PS || u == LS)
1511                         loc.linnum++;
1512                     else
1513                         error("non-hex character \\u%x", u);
1514                 }
1515                 else
1516                     error("non-hex character '%c'", c);
1517                 if (n & 1)
1518                 {   v = (v << 4) | c;
1519                     stringbuffer.writeByte(v);
1520                 }
1521                 else
1522                     v = c;
1523                 n++;
1524                 break;
1525         }
1526     }
1527 }
1528
1529
1530 #if V2
1531 /**************************************
1532  * Lex delimited strings:
1533  *      q"(foo(xxx))"   // "foo(xxx)"
1534  *      q"[foo(]"       // "foo("
1535  *      q"/foo]/"       // "foo]"
1536  *      q"HERE
1537  *      foo
1538  *      HERE"           // "foo\n"
1539  * Input:
1540  *      p is on the "
1541  */
1542
1543 TOK Lexer::delimitedStringConstant(Token *t)
1544 {   unsigned c;
1545     Loc start = loc;
1546     unsigned delimleft = 0;
1547     unsigned delimright = 0;
1548     unsigned nest = 1;
1549     unsigned nestcount;
1550     Identifier *hereid = NULL;
1551     unsigned blankrol = 0;
1552     unsigned startline = 0;
1553
1554     p++;
1555     stringbuffer.reset();
1556     while (1)
1557     {
1558         c = *p++;
1559         //printf("c = '%c'\n", c);
1560         switch (c)
1561         {
1562             case '\n':
1563             Lnextline:
1564                 loc.linnum++;
1565                 startline = 1;
1566                 if (blankrol)
1567                 {   blankrol = 0;
1568                     continue;
1569                 }
1570                 if (hereid)
1571                 {
1572                     stringbuffer.writeUTF8(c);
1573                     continue;
1574                 }
1575                 break;
1576
1577             case '\r':
1578                 if (*p == '\n')
1579                     continue;   // ignore
1580                 c = '\n';       // treat EndOfLine as \n character
1581                 goto Lnextline;
1582
1583             case 0:
1584             case 0x1A:
1585                 goto Lerror;
1586
1587             default:
1588                 if (c & 0x80)
1589                 {   p--;
1590                     c = decodeUTF();
1591                     p++;
1592                     if (c == PS || c == LS)
1593                         goto Lnextline;
1594                 }
1595                 break;
1596         }
1597         if (delimleft == 0)
1598         {   delimleft = c;
1599             nest = 1;
1600             nestcount = 1;
1601             if (c == '(')
1602                 delimright = ')';
1603             else if (c == '{')
1604                 delimright = '}';
1605             else if (c == '[')
1606                 delimright = ']';
1607             else if (c == '<')
1608                 delimright = '>';
1609             else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1610             {   // Start of identifier; must be a heredoc
1611                 Token t;
1612                 p--;
1613                 scan(&t);               // read in heredoc identifier
1614                 if (t.value != TOKidentifier)
1615                 {   error("identifier expected for heredoc, not %s", t.toChars());
1616                     delimright = c;
1617                 }
1618                 else
1619                 {   hereid = t.ident;
1620                     //printf("hereid = '%s'\n", hereid->toChars());
1621                     blankrol = 1;
1622                 }
1623                 nest = 0;
1624             }
1625             else
1626             {   delimright = c;
1627                 nest = 0;
1628             }
1629         }
1630         else
1631         {
1632             if (blankrol)
1633             {   error("heredoc rest of line should be blank");
1634                 blankrol = 0;
1635                 continue;
1636             }
1637             if (nest == 1)
1638             {
1639                 if (c == delimleft)
1640                     nestcount++;
1641                 else if (c == delimright)
1642                 {   nestcount--;
1643                     if (nestcount == 0)
1644                         goto Ldone;
1645                 }
1646             }
1647             else if (c == delimright)
1648                 goto Ldone;
1649             if (startline && isalpha(c))
1650             {   Token t;
1651                 unsigned char *psave = p;
1652                 p--;
1653                 scan(&t);               // read in possible heredoc identifier
1654                 //printf("endid = '%s'\n", t.ident->toChars());
1655                 if (t.value == TOKidentifier && t.ident->equals(hereid))
1656                 {   /* should check that rest of line is blank
1657                      */
1658                     goto Ldone;
1659                 }
1660                 p = psave;
1661             }
1662             stringbuffer.writeUTF8(c);
1663             startline = 0;
1664         }
1665     }
1666
1667 Ldone:
1668     if (*p == '"')
1669         p++;
1670     else
1671         error("delimited string must end in %c\"", delimright);
1672     t->len = stringbuffer.offset;
1673     stringbuffer.writeByte(0);
1674     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1675     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1676     stringPostfix(t);
1677     return TOKstring;
1678
1679 Lerror:
1680     error("unterminated string constant starting at %s", start.toChars());
1681     t->ustring = (unsigned char *)"";
1682     t->len = 0;
1683     t->postfix = 0;
1684     return TOKstring;
1685 }
1686
/**************************************
 * Lex token strings:
 *      q{ foo(xxx) } // " foo(xxx) "
 *      q{foo(}       // "foo("
 *      q{{foo}"}"}   // "{foo}"}""
 * Input:
 *      p is on the opening {
 */
1695
1696 TOK Lexer::tokenStringConstant(Token *t)
1697 {
1698     unsigned nest = 1;
1699     Loc start = loc;
1700     unsigned char *pstart = ++p;
1701
1702     while (1)
1703     {   Token tok;
1704
1705         scan(&tok);
1706         switch (tok.value)
1707         {
1708             case TOKlcurly:
1709                 nest++;
1710                 continue;
1711
1712             case TOKrcurly:
1713                 if (--nest == 0)
1714                     goto Ldone;
1715                 continue;
1716
1717             case TOKeof:
1718                 goto Lerror;
1719
1720             default:
1721                 continue;
1722         }
1723     }
1724
1725 Ldone:
1726     t->len = p - 1 - pstart;
1727     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1728     memcpy(t->ustring, pstart, t->len);
1729     t->ustring[t->len] = 0;
1730     stringPostfix(t);
1731     return TOKstring;
1732
1733 Lerror:
1734     error("unterminated token string constant starting at %s", start.toChars());
1735     t->ustring = (unsigned char *)"";
1736     t->len = 0;
1737     t->postfix = 0;
1738     return TOKstring;
1739 }
1740
1741 #endif
1742
1743
1744 /**************************************
1745  */
1746
1747 TOK Lexer::escapeStringConstant(Token *t, int wide)
1748 {   unsigned c;
1749     Loc start = loc;
1750
1751     p++;
1752     stringbuffer.reset();
1753     while (1)
1754     {
1755         c = *p++;
1756         switch (c)
1757         {
1758             case '\\':
1759                 switch (*p)
1760                 {
1761                     case 'u':
1762                     case 'U':
1763                     case '&':
1764                         c = escapeSequence();
1765                         stringbuffer.writeUTF8(c);
1766                         continue;
1767
1768                     default:
1769                         c = escapeSequence();
1770                         break;
1771                 }
1772                 break;
1773
1774             case '\n':
1775                 loc.linnum++;
1776                 break;
1777
1778             case '\r':
1779                 if (*p == '\n')
1780                     continue;   // ignore
1781                 c = '\n';       // treat EndOfLine as \n character
1782                 loc.linnum++;
1783                 break;
1784
1785             case '"':
1786                 t->len = stringbuffer.offset;
1787                 stringbuffer.writeByte(0);
1788                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1789                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1790                 stringPostfix(t);
1791                 return TOKstring;
1792
1793             case 0:
1794             case 0x1A:
1795                 p--;
1796                 error("unterminated string constant starting at %s", start.toChars());
1797                 t->ustring = (unsigned char *)"";
1798                 t->len = 0;
1799                 t->postfix = 0;
1800                 return TOKstring;
1801
1802             default:
1803                 if (c & 0x80)
1804                 {
1805                     p--;
1806                     c = decodeUTF();
1807                     if (c == LS || c == PS)
1808                     {   c = '\n';
1809                         loc.linnum++;
1810                     }
1811                     p++;
1812                     stringbuffer.writeUTF8(c);
1813                     continue;
1814                 }
1815                 break;
1816         }
1817         stringbuffer.writeByte(c);
1818     }
1819 }
1820
1821 /**************************************
1822  */
1823
1824 TOK Lexer::charConstant(Token *t, int wide)
1825 {
1826     unsigned c;
1827     TOK tk = TOKcharv;
1828
1829     //printf("Lexer::charConstant\n");
1830     p++;
1831     c = *p++;
1832     switch (c)
1833     {
1834         case '\\':
1835             switch (*p)
1836             {
1837                 case 'u':
1838                     t->uns64value = escapeSequence();
1839                     tk = TOKwcharv;
1840                     break;
1841
1842                 case 'U':
1843                 case '&':
1844                     t->uns64value = escapeSequence();
1845                     tk = TOKdcharv;
1846                     break;
1847
1848                 default:
1849                     t->uns64value = escapeSequence();
1850                     break;
1851             }
1852             break;
1853
1854         case '\n':
1855         L1:
1856             loc.linnum++;
1857         case '\r':
1858         case 0:
1859         case 0x1A:
1860         case '\'':
1861             error("unterminated character constant");
1862             return tk;
1863
1864         default:
1865             if (c & 0x80)
1866             {
1867                 p--;
1868                 c = decodeUTF();
1869                 p++;
1870                 if (c == LS || c == PS)
1871                     goto L1;
1872                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1873                     tk = TOKwcharv;
1874                 else
1875                     tk = TOKdcharv;
1876             }
1877             t->uns64value = c;
1878             break;
1879     }
1880
1881     if (*p != '\'')
1882     {   error("unterminated character constant");
1883         return tk;
1884     }
1885     p++;
1886     return tk;
1887 }
1888
1889 /***************************************
1890  * Get postfix of string literal.
1891  */
1892
1893 void Lexer::stringPostfix(Token *t)
1894 {
1895     switch (*p)
1896     {
1897         case 'c':
1898         case 'w':
1899         case 'd':
1900             t->postfix = *p;
1901             p++;
1902             break;
1903
1904         default:
1905             t->postfix = 0;
1906             break;
1907     }
1908 }
1909
/***************************************
 * Read a \u or \U unicode escape sequence (currently compiled out).
 * Input:
 *      u       'u' (4 hex digits follow) or 'U' (8 hex digits follow)
 * Returns:
 *      the value accumulated from the hex digits read so far; an error
 *      is reported if a non-hex character shows up too early
 */

#if 0
unsigned Lexer::wchar(unsigned u)
{
    const unsigned nchars = (u == 'U') ? 8 : 4;
    unsigned value = 0;
    unsigned n = 0;

    while (1)
    {
        ++p;                    // step over the previous character
        if (n == nchars)
            break;

        unsigned char c = *p;
        if (!ishex(c))
        {   error("\\%c sequence must be followed by %d hex characters", u, nchars);
            break;
        }

        // Turn the hex digit into its numeric value
        if (isdigit(c))
            c -= '0';
        else if (islower(c))
            c -= 'a' - 10;
        else
            c -= 'A' - 10;

        value = (value << 4) | c;
        n++;
    }
    return value;
}
#endif
1948
1949 /**************************************
1950  * Read in a number.
1951  * If it's an integer, store it in tok.TKutok.Vlong.
1952  *      integers can be decimal, octal or hex
1953  *      Handle the suffixes U, UL, LU, L, etc.
1954  * If it's double, store it in tok.TKutok.Vdouble.
1955  * Returns:
1956  *      TKnum
1957  *      TKdouble,...
1958  */
1959
1960 TOK Lexer::number(Token *t)
1961 {
1962     // We use a state machine to collect numbers
1963     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1964         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1965         STATE_hexh, STATE_error };
1966     enum STATE state;
1967
1968     enum FLAGS
1969     {   FLAGS_decimal  = 1,             // decimal
1970         FLAGS_unsigned = 2,             // u or U suffix
1971         FLAGS_long     = 4,             // l or L suffix
1972     };
1973     enum FLAGS flags = FLAGS_decimal;
1974
1975     int i;
1976     int base;
1977     unsigned c;
1978     unsigned char *start;
1979     TOK result;
1980
1981     //printf("Lexer::number()\n");
1982     state = STATE_initial;
1983     base = 0;
1984     stringbuffer.reset();
1985     start = p;
1986     while (1)
1987     {
1988         c = *p;
1989         switch (state)
1990         {
1991             case STATE_initial:         // opening state
1992                 if (c == '0')
1993                     state = STATE_0;
1994                 else
1995                     state = STATE_decimal;
1996                 break;
1997
1998             case STATE_0:
1999                 flags = (FLAGS) (flags & ~FLAGS_decimal);
2000                 switch (c)
2001                 {
2002 #if ZEROH
2003                     case 'H':                   // 0h
2004                     case 'h':
2005                         goto hexh;
2006 #endif
2007                     case 'X':
2008                     case 'x':
2009                         state = STATE_hex0;
2010                         break;
2011
2012                     case '.':
2013                         if (p[1] == '.')        // .. is a separate token
2014                             goto done;
2015                     case 'i':
2016                     case 'f':
2017                     case 'F':
2018                         goto real;
2019 #if ZEROH
2020                     case 'E':
2021                     case 'e':
2022                         goto case_hex;
2023 #endif
2024                     case 'B':
2025                     case 'b':
2026                         state = STATE_binary0;
2027                         break;
2028
2029                     case '0': case '1': case '2': case '3':
2030                     case '4': case '5': case '6': case '7':
2031                         state = STATE_octal;
2032                         break;
2033
2034 #if ZEROH
2035                     case '8': case '9': case 'A':
2036                     case 'C': case 'D': case 'F':
2037                     case 'a': case 'c': case 'd': case 'f':
2038                     case_hex:
2039                         state = STATE_hexh;
2040                         break;
2041 #endif
2042                     case '_':
2043                         state = STATE_octal;
2044                         p++;
2045                         continue;
2046
2047                     case 'L':
2048                         if (p[1] == 'i')
2049                             goto real;
2050                         goto done;
2051
2052                     default:
2053                         goto done;
2054                 }
2055                 break;
2056
2057             case STATE_decimal:         // reading decimal number
2058                 if (!isdigit(c))
2059                 {
2060 #if ZEROH
2061                     if (ishex(c)
2062                         || c == 'H' || c == 'h'
2063                        )
2064                         goto hexh;
2065 #endif
2066                     if (c == '_')               // ignore embedded _
2067                     {   p++;
2068                         continue;
2069                     }
2070                     if (c == '.' && p[1] != '.')
2071                         goto real;
2072                     else if (c == 'i' || c == 'f' || c == 'F' ||
2073                              c == 'e' || c == 'E')
2074                     {
2075             real:       // It's a real number. Back up and rescan as a real
2076                         p = start;
2077                         return inreal(t);
2078                     }
2079                     else if (c == 'L' && p[1] == 'i')
2080                         goto real;
2081                     goto done;
2082                 }
2083                 break;
2084
2085             case STATE_hex0:            // reading hex number
2086             case STATE_hex:
2087                 if (!ishex(c))
2088                 {
2089                     if (c == '_')               // ignore embedded _
2090                     {   p++;
2091                         continue;
2092                     }
2093                     if (c == '.' && p[1] != '.')
2094                         goto real;
2095                     if (c == 'P' || c == 'p' || c == 'i')
2096                         goto real;
2097                     if (state == STATE_hex0)
2098                         error("Hex digit expected, not '%c'", c);
2099                     goto done;
2100                 }
2101                 state = STATE_hex;
2102                 break;
2103
2104 #if ZEROH
2105             hexh:
2106                 state = STATE_hexh;
2107             case STATE_hexh:            // parse numbers like 0FFh
2108                 if (!ishex(c))
2109                 {
2110                     if (c == 'H' || c == 'h')
2111                     {
2112                         p++;
2113                         base = 16;
2114                         goto done;
2115                     }
2116                     else
2117                     {
2118                         // Check for something like 1E3 or 0E24
2119                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2120                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2121                             goto real;
2122                         error("Hex digit expected, not '%c'", c);
2123                         goto done;
2124                     }
2125                 }
2126                 break;
2127 #endif
2128
2129             case STATE_octal:           // reading octal number
2130             case STATE_octale:          // reading octal number with non-octal digits
2131                 if (!isoctal(c))
2132                 {
2133 #if ZEROH
2134                     if (ishex(c)
2135                         || c == 'H' || c == 'h'
2136                        )
2137                         goto hexh;
2138 #endif
2139                     if (c == '_')               // ignore embedded _
2140                     {   p++;
2141                         continue;
2142                     }
2143                     if (c == '.' && p[1] != '.')
2144                         goto real;
2145                     if (c == 'i')
2146                         goto real;
2147                     if (isdigit(c))
2148                     {
2149                         state = STATE_octale;
2150                     }
2151                     else
2152                         goto done;
2153                 }
2154                 break;
2155
2156             case STATE_binary0:         // starting binary number
2157             case STATE_binary:          // reading binary number
2158                 if (c != '0' && c != '1')
2159                 {
2160 #if ZEROH
2161                     if (ishex(c)
2162                         || c == 'H' || c == 'h'
2163                        )
2164                         goto hexh;
2165 #endif
2166                     if (c == '_')               // ignore embedded _
2167                     {   p++;
2168                         continue;
2169                     }
2170                     if (state == STATE_binary0)
2171                     {   error("binary digit expected");
2172                         state = STATE_error;
2173                         break;
2174                     }
2175                     else
2176                         goto done;
2177                 }
2178                 state = STATE_binary;
2179                 break;
2180
2181             case STATE_error:           // for error recovery
2182                 if (!isdigit(c))        // scan until non-digit
2183                     goto done;
2184                 break;
2185
2186             default:
2187                 assert(0);
2188         }
2189         stringbuffer.writeByte(c);
2190         p++;
2191     }
2192 done:
2193     stringbuffer.writeByte(0);          // terminate string
2194     if (state == STATE_octale)
2195         error("Octal digit expected");
2196
2197     uinteger_t n;                       // unsigned >=64 bit integer type
2198
2199     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2200         n = stringbuffer.data[0] - '0';
2201     else
2202     {
2203         // Convert string to integer
2204 #if __DMC__
2205         errno = 0;
2206         n = strtoull((char *)stringbuffer.data,NULL,base);
2207         if (errno == ERANGE)
2208             error("integer overflow");
2209 #else
2210         // Not everybody implements strtoull()
2211         char *p = (char *)stringbuffer.data;
2212         int r = 10, d;
2213
2214         if (*p == '0')
2215         {
2216             if (p[1] == 'x' || p[1] == 'X')
2217                 p += 2, r = 16;
2218             else if (p[1] == 'b' || p[1] == 'B')
2219                 p += 2, r = 2;
2220             else if (isdigit(p[1]))
2221                 p += 1, r = 8;
2222         }
2223
2224         n = 0;
2225         while (1)
2226         {
2227             if (*p >= '0' && *p <= '9')
2228                 d = *p - '0';
2229             else if (*p >= 'a' && *p <= 'z')
2230                 d = *p - 'a' + 10;
2231             else if (*p >= 'A' && *p <= 'Z')
2232                 d = *p - 'A' + 10;
2233             else
2234                 break;
2235             if (d >= r)
2236                 break;
2237             if (n && n * r + d <= n)
2238             {
2239                 error ("integer overflow");
2240                 break;
2241             }
2242
2243             n = n * r + d;
2244             p++;
2245         }
2246 #endif
2247         if (sizeof(n) > 8 &&
2248             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2249             error("integer overflow");
2250     }
2251
2252     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2253     while (1)
2254     {   unsigned char f;
2255
2256         switch (*p)
2257         {   case 'U':
2258             case 'u':
2259                 f = FLAGS_unsigned;
2260                 goto L1;
2261
2262             case 'l':
2263                 if (1 || !global.params.useDeprecated)
2264                     error("'l' suffix is deprecated, use 'L' instead");
2265             case 'L':
2266                 f = FLAGS_long;
2267             L1:
2268                 p++;
2269                 if (flags & f)
2270                     error("unrecognized token");
2271                 flags = (FLAGS) (flags | f);
2272                 continue;
2273             default:
2274                 break;
2275         }
2276         break;
2277     }
2278
2279     switch (flags)
2280     {
2281         case 0:
2282             /* Octal or Hexadecimal constant.
2283              * First that fits: int, uint, long, ulong
2284              */
2285             if (n & 0x8000000000000000LL)
2286                     result = TOKuns64v;
2287             else if (n & 0xFFFFFFFF00000000LL)
2288                     result = TOKint64v;
2289             else if (n & 0x80000000)
2290                     result = TOKuns32v;
2291             else
2292                     result = TOKint32v;
2293             break;
2294
2295         case FLAGS_decimal:
2296             /* First that fits: int, long, long long
2297              */
2298             if (n & 0x8000000000000000LL)
2299             {       error("signed integer overflow");
2300                     result = TOKuns64v;
2301             }
2302             else if (n & 0xFFFFFFFF80000000LL)
2303                     result = TOKint64v;
2304             else
2305                     result = TOKint32v;
2306             break;
2307
2308         case FLAGS_unsigned:
2309         case FLAGS_decimal | FLAGS_unsigned:
2310             /* First that fits: uint, ulong
2311              */
2312             if (n & 0xFFFFFFFF00000000LL)
2313                     result = TOKuns64v;
2314             else
2315                     result = TOKuns32v;
2316             break;
2317
2318         case FLAGS_decimal | FLAGS_long:
2319             if (n & 0x8000000000000000LL)
2320             {       error("signed integer overflow");
2321                     result = TOKuns64v;
2322             }
2323             else
2324                     result = TOKint64v;
2325             break;
2326
2327         case FLAGS_long:
2328             if (n & 0x8000000000000000LL)
2329                     result = TOKuns64v;
2330             else
2331                     result = TOKint64v;
2332             break;
2333
2334         case FLAGS_unsigned | FLAGS_long:
2335         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2336             result = TOKuns64v;
2337             break;
2338
2339         default:
2340             #ifdef DEBUG
2341                 printf("%x\n",flags);
2342             #endif
2343             assert(0);
2344     }
2345     t->uns64value = n;
2346     return result;
2347 }
2348
2349 /**************************************
2350  * Read in characters, converting them to real.
2351  * Bugs:
2352  *      Exponent overflow not detected.
2353  *      Too much requested precision is not detected.
2354  */
2355
2356 TOK Lexer::inreal(Token *t)
2357 #ifdef __DMC__
2358 __in
2359 {
2360     assert(*p == '.' || isdigit(*p));
2361 }
2362 __out (result)
2363 {
2364     switch (result)
2365     {
2366         case TOKfloat32v:
2367         case TOKfloat64v:
2368         case TOKfloat80v:
2369         case TOKimaginary32v:
2370         case TOKimaginary64v:
2371         case TOKimaginary80v:
2372             break;
2373
2374         default:
2375             assert(0);
2376     }
2377 }
2378 __body
2379 #endif /* __DMC__ */
2380 {   int dblstate;
2381     unsigned c;
2382     char hex;                   // is this a hexadecimal-floating-constant?
2383     TOK result;
2384
2385     //printf("Lexer::inreal()\n");
2386     stringbuffer.reset();
2387     dblstate = 0;
2388     hex = 0;
2389 Lnext:
2390     while (1)
2391     {
2392         // Get next char from input
2393         c = *p++;
2394         //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2395         while (1)
2396         {
2397             switch (dblstate)
2398             {
2399                 case 0:                 // opening state
2400                     if (c == '0')
2401                         dblstate = 9;
2402                     else if (c == '.')
2403                         dblstate = 3;
2404                     else
2405                         dblstate = 1;
2406                     break;
2407
2408                 case 9:
2409                     dblstate = 1;
2410                     if (c == 'X' || c == 'x')
2411                     {   hex++;
2412                         break;
2413                     }
2414                 case 1:                 // digits to left of .
2415                 case 3:                 // digits to right of .
2416                 case 7:                 // continuing exponent digits
2417                     if (!isdigit(c) && !(hex && isxdigit(c)))
2418                     {
2419                         if (c == '_')
2420                             goto Lnext; // ignore embedded '_'
2421                         dblstate++;
2422                         continue;
2423                     }
2424                     break;
2425
2426                 case 2:                 // no more digits to left of .
2427                     if (c == '.')
2428                     {   dblstate++;
2429                         break;
2430                     }
2431                 case 4:                 // no more digits to right of .
2432                     if ((c == 'E' || c == 'e') ||
2433                         hex && (c == 'P' || c == 'p'))
2434                     {   dblstate = 5;
2435                         hex = 0;        // exponent is always decimal
2436                         break;
2437                     }
2438                     if (hex)
2439                         error("binary-exponent-part required");
2440                     goto done;
2441
2442                 case 5:                 // looking immediately to right of E
2443                     dblstate++;
2444                     if (c == '-' || c == '+')
2445                         break;
2446                 case 6:                 // 1st exponent digit expected
2447                     if (!isdigit(c))
2448                         error("exponent expected");
2449                     dblstate++;
2450                     break;
2451
2452                 case 8:                 // past end of exponent digits
2453                     goto done;
2454             }
2455             break;
2456         }
2457         stringbuffer.writeByte(c);
2458     }
2459 done:
2460     p--;
2461
2462     stringbuffer.writeByte(0);
2463
2464 #if _WIN32 && __DMC__
2465     char *save = __locale_decpoint;
2466     __locale_decpoint = ".";
2467 #endif
2468 #ifdef IN_GCC
2469     t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2470 #else
2471     t->float80value = strtold((char *)stringbuffer.data, NULL);
2472 #endif
2473     errno = 0;
2474     switch (*p)
2475     {
2476         case 'F':
2477         case 'f':
2478 #ifdef IN_GCC
2479             real_t::parse((char *)stringbuffer.data, real_t::Float);
2480 #else
2481             strtof((char *)stringbuffer.data, NULL);
2482 #endif
2483             result = TOKfloat32v;
2484             p++;
2485             break;
2486
2487         default:
2488 #ifdef IN_GCC
2489             real_t::parse((char *)stringbuffer.data, real_t::Double);
2490 #else
2491             strtod((char *)stringbuffer.data, NULL);
2492 #endif
2493             result = TOKfloat64v;
2494             break;
2495
2496         case 'l':
2497             if (!global.params.useDeprecated)
2498                 error("'l' suffix is deprecated, use 'L' instead");
2499         case 'L':
2500             result = TOKfloat80v;
2501             p++;
2502             break;
2503     }
2504     if (*p == 'i' || *p == 'I')
2505     {
2506         if (!global.params.useDeprecated && *p == 'I')
2507             error("'I' suffix is deprecated, use 'i' instead");
2508         p++;
2509         switch (result)
2510         {
2511             case TOKfloat32v:
2512                 result = TOKimaginary32v;
2513                 break;
2514             case TOKfloat64v:
2515                 result = TOKimaginary64v;
2516                 break;
2517             case TOKfloat80v:
2518                 result = TOKimaginary80v;
2519                 break;
2520         }
2521     }
2522 #if _WIN32 && __DMC__
2523     __locale_decpoint = save;
2524 #endif
2525     if (errno == ERANGE)
2526         error("number is not representable");
2527     return result;
2528 }
2529
2530 /*********************************************
2531  * Do pragma.
2532  * Currently, the only pragma supported is:
2533  *      #line linnum [filespec]
2534  */
2535
2536 void Lexer::pragma()
2537 {
2538     Token tok;
2539     int linnum;
2540     char *filespec = NULL;
2541     Loc loc = this->loc;
2542
2543     while (isblank(*p)) p++;
2544     if (*p == '\n')
2545         goto Lerr;
2546
2547     scan(&tok);
2548     if (tok.value != TOKidentifier || tok.ident != Id::line)
2549         goto Lerr;
2550
2551     scan(&tok);
2552     if (tok.value == TOKint32v || tok.value == TOKint64v)
2553         linnum = tok.uns64value - 1;
2554     else
2555         goto Lerr;
2556
2557     while (1)
2558     {
2559         switch (*p)
2560         {
2561             case 0:
2562             case 0x1A:
2563             case '\n':
2564             Lnewline:
2565                 this->loc.linnum = linnum;
2566                 if (filespec)
2567                     this->loc.filename = filespec;
2568                 return;
2569
2570             case '\r':
2571                 p++;
2572                 if (*p != '\n')
2573                 {   p--;
2574                     goto Lnewline;
2575                 }
2576                 continue;
2577
2578             case ' ':
2579             case '\t':
2580             case '\v':
2581             case '\f':
2582                 p++;
2583                 continue;                       // skip white space
2584
2585             case '_':
2586                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2587                 {
2588                     p += 8;
2589                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2590                 }
2591                 continue;
2592
2593             case '"':
2594                 if (filespec)
2595                     goto Lerr;
2596                 stringbuffer.reset();
2597                 p++;
2598                 while (1)
2599                 {   unsigned c;
2600
2601                     c = *p;
2602                     switch (c)
2603                     {
2604                         case '\n':
2605                         case '\r':
2606                         case 0:
2607                         case 0x1A:
2608                             goto Lerr;
2609
2610                         case '"':
2611                             stringbuffer.writeByte(0);
2612                             filespec = mem.strdup((char *)stringbuffer.data);
2613                             p++;
2614                             break;
2615
2616                         default:
2617                             if (c & 0x80)
2618                             {   unsigned u = decodeUTF();
2619                                 if (u == PS || u == LS)
2620                                     goto Lerr;
2621                             }
2622                             stringbuffer.writeByte(c);
2623                             p++;
2624                             continue;
2625                     }
2626                     break;
2627                 }
2628                 continue;
2629
2630             default:
2631                 if (*p & 0x80)
2632                 {   unsigned u = decodeUTF();
2633                     if (u == PS || u == LS)
2634                         goto Lnewline;
2635                 }
2636                 goto Lerr;
2637         }
2638     }
2639
2640 Lerr:
2641     // No problem: this is just a comment line
2642     while (*p != '\n')
2643             p++;
2644
2645     // error(loc, "#line integer [\"filespec\"]\\n expected");
2646 }
2647
2648
2649 /********************************************
2650  * Decode UTF character.
2651  * Issue error messages for invalid sequences.
2652  * Return decoded character, advance p to last character in UTF sequence.
2653  */
2654
2655 unsigned Lexer::decodeUTF()
2656 {
2657     dchar_t u;
2658     unsigned char c;
2659     unsigned char *s = p;
2660     size_t len;
2661     size_t idx;
2662     char *msg;
2663
2664     c = *s;
2665     assert(c & 0x80);
2666
2667     // Check length of remaining string up to 6 UTF-8 characters
2668     for (len = 1; len < 6 && s[len]; len++)
2669         ;
2670
2671     idx = 0;
2672     msg = utf_decodeChar(s, len, &idx, &u);
2673     p += idx - 1;
2674     if (msg)
2675     {
2676         error("%s", msg);
2677     }
2678     return u;
2679 }
2680
2681
2682 /***************************************************
2683  * Parse doc comment embedded between t->ptr and p.
2684  * Remove trailing blanks and tabs from lines.
2685  * Replace all newlines with \n.
2686  * Remove leading comment character from each line.
2687  * Decide if it's a lineComment or a blockComment.
2688  * Append to previous one for this token.
2689  */
2690
2691 void Lexer::getDocComment(Token *t, unsigned lineComment)
2692 {
2693     OutBuffer buf;
2694     unsigned char ct = t->ptr[2];
2695     unsigned char *q = t->ptr + 3;      // start of comment text
2696     int linestart = 0;
2697
2698     unsigned char *qend = p;
2699     if (ct == '*' || ct == '+')
2700         qend -= 2;
2701
2702     /* Scan over initial row of ****'s or ++++'s or ////'s
2703      */
2704     for (; q < qend; q++)
2705     {
2706         if (*q != ct)
2707             break;
2708     }
2709
2710     /* Remove trailing row of ****'s or ++++'s
2711      */
2712     if (ct != '/')
2713     {
2714         for (; q < qend; qend--)
2715         {
2716             if (qend[-1] != ct)
2717                 break;
2718         }
2719     }
2720
2721     for (; q < qend; q++)
2722     {
2723         unsigned char c = *q;
2724
2725         switch (c)
2726         {
2727             case '*':
2728             case '+':
2729                 if (linestart && c == ct)
2730                 {   linestart = 0;
2731                     /* Trim preceding whitespace up to preceding \n
2732                      */
2733                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2734                         buf.offset--;
2735                     continue;
2736                 }
2737                 break;
2738
2739             case ' ':
2740             case '\t':
2741                 break;
2742
2743             case '\r':
2744                 if (q[1] == '\n')
2745                     continue;           // skip the \r
2746                 goto Lnewline;
2747
2748             default:
2749                 if (c == 226)
2750                 {
2751                     // If LS or PS
2752                     if (q[1] == 128 &&
2753                         (q[2] == 168 || q[2] == 169))
2754                     {
2755                         q += 2;
2756                         goto Lnewline;
2757                     }
2758                 }
2759                 linestart = 0;
2760                 break;
2761
2762             Lnewline:
2763                 c = '\n';               // replace all newlines with \n
2764             case '\n':
2765                 linestart = 1;
2766
2767                 /* Trim trailing whitespace
2768                  */
2769                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2770                     buf.offset--;
2771
2772                 break;
2773         }
2774         buf.writeByte(c);
2775     }
2776
2777     // Always end with a newline
2778     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2779         buf.writeByte('\n');
2780
2781     buf.writeByte(0);
2782
2783     // It's a line comment if the start of the doc comment comes
2784     // after other non-whitespace on the same line.
2785     unsigned char** dc = (lineComment && anyToken)
2786                          ? &t->lineComment
2787                          : &t->blockComment;
2788
2789     // Combine with previous doc comment, if any
2790     if (*dc)
2791         *dc = combineComments(*dc, (unsigned char *)buf.data);
2792     else
2793         *dc = (unsigned char *)buf.extractData();
2794 }
2795
2796 /********************************************
2797  * Combine two document comments into one.
2798  */
2799
2800 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2801 {
2802     unsigned char *c = c2;
2803
2804     if (c1)
2805     {   c = c1;
2806         if (c2)
2807         {   size_t len1 = strlen((char *)c1);
2808             size_t len2 = strlen((char *)c2);
2809
2810             c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2811             memcpy(c, c1, len1);
2812             c[len1] = '\n';
2813             memcpy(c + len1 + 1, c2, len2);
2814             c[len1 + 1 + len2] = 0;
2815         }
2816     }
2817     return c;
2818 }
2819
2820 /********************************************
2821  * Create an identifier in the string table.
2822  */
2823
2824 Identifier *Lexer::idPool(const char *s)
2825 {
2826     size_t len = strlen(s);
2827     StringValue *sv = stringtable.update(s, len);
2828     Identifier *id = (Identifier *) sv->ptrvalue;
2829     if (!id)
2830     {
2831         id = new Identifier(sv->lstring.string, TOKidentifier);
2832         sv->ptrvalue = id;
2833     }
2834     return id;
2835 }
2836
2837 /*********************************************
2838  * Create a unique identifier using the prefix s.
2839  */
2840
2841 Identifier *Lexer::uniqueId(const char *s, int num)
2842 {   char buffer[32];
2843     size_t slen = strlen(s);
2844
2845     assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2846     sprintf(buffer, "%s%d", s, num);
2847     return idPool(buffer);
2848 }
2849
2850 Identifier *Lexer::uniqueId(const char *s)
2851 {
2852     static int num;
2853     return uniqueId(s, ++num);
2854 }
2855
2856 /****************************************
2857  */
2858
2859 struct Keyword
2860 {   char *name;
2861     enum TOK value;
2862 };
2863
2864 static Keyword keywords[] =
2865 {
2866 //    { "",             TOK     },
2867
2868     {   "this",         TOKthis         },
2869     {   "super",        TOKsuper        },
2870     {   "assert",       TOKassert       },
2871     {   "null",         TOKnull         },
2872     {   "true",         TOKtrue         },
2873     {   "false",        TOKfalse        },
2874     {   "cast",         TOKcast         },
2875     {   "new",          TOKnew          },
2876     {   "delete",       TOKdelete       },
2877     {   "throw",        TOKthrow        },
2878     {   "module",       TOKmodule       },
2879     {   "pragma",       TOKpragma       },
2880     {   "typeof",       TOKtypeof       },
2881     {   "typeid",       TOKtypeid       },
2882
2883     {   "template",     TOKtemplate     },
2884
2885     {   "void",         TOKvoid         },
2886     {   "byte",         TOKint8         },
2887     {   "ubyte",        TOKuns8         },
2888     {   "short",        TOKint16        },
2889     {   "ushort",       TOKuns16        },
2890     {   "int",          TOKint32        },
2891     {   "uint",         TOKuns32        },
2892     {   "long",         TOKint64        },
2893     {   "ulong",        TOKuns64        },
2894     {   "cent",         TOKcent,        },
2895     {   "ucent",        TOKucent,       },
2896     {   "float",        TOKfloat32      },
2897     {   "double",       TOKfloat64      },
2898     {   "real",         TOKfloat80      },
2899
2900     {   "bool",         TOKbool         },
2901     {   "char",         TOKchar         },
2902     {   "wchar",        TOKwchar        },
2903     {   "dchar",        TOKdchar        },
2904
2905     {   "ifloat",       TOKimaginary32  },
2906     {   "idouble",      TOKimaginary64  },
2907     {   "ireal",        TOKimaginary80  },
2908
2909     {   "cfloat",       TOKcomplex32    },
2910     {   "cdouble",      TOKcomplex64    },
2911     {   "creal",        TOKcomplex80    },
2912
2913     {   "delegate",     TOKdelegate     },
2914     {   "function",     TOKfunction     },
2915
2916     {   "is",           TOKis           },
2917     {   "if",           TOKif           },
2918     {   "else",         TOKelse         },
2919     {   "while",        TOKwhile        },
2920     {   "for",          TOKfor          },
2921     {   "do",           TOKdo           },
2922     {   "switch",       TOKswitch       },
2923     {   "case",         TOKcase         },
2924     {   "default",      TOKdefault      },
2925     {   "break",        TOKbreak        },
2926     {   "continue",     TOKcontinue     },
2927     {   "synchronized", TOKsynchronized },
2928     {   "return",       TOKreturn       },
2929     {   "goto",         TOKgoto         },
2930     {   "try",          TOKtry          },
2931     {   "catch",        TOKcatch        },
2932     {   "finally",      TOKfinally      },
2933     {   "with",         TOKwith         },
2934     {   "asm",          TOKasm          },
2935     {   "foreach",      TOKforeach      },
2936     {   "foreach_reverse",      TOKforeach_reverse      },
2937     {   "reversed",     TOKreversed     },
2938     {   "scope",        TOKscope        },
2939
2940     {   "struct",       TOKstruct       },
2941     {   "class",        TOKclass        },
2942     {   "interface",    TOKinterface    },
2943     {   "union",        TOKunion        },
2944     {   "enum",         TOKenum         },
2945     {   "import",       TOKimport       },
2946     {   "mixin",        TOKmixin        },
2947     {   "static",       TOKstatic       },
2948     {   "final",        TOKfinal        },
2949     {   "const",        TOKconst        },
2950     {   "typedef",      TOKtypedef      },
2951     {   "alias",        TOKalias        },
2952     {   "override",     TOKoverride     },
2953     {   "abstract",     TOKabstract     },
2954     {   "volatile",     TOKvolatile     },
2955     {   "debug",        TOKdebug        },
2956     {   "deprecated",   TOKdeprecated   },
2957     {   "in",           TOKin           },
2958     {   "out",          TOKout          },
2959     {   "inout",        TOKinout        },
2960     {   "lazy",         TOKlazy         },
2961     {   "auto",         TOKauto         },
2962
2963     {   "align",        TOKalign        },
2964     {   "extern",       TOKextern       },
2965     {   "private",      TOKprivate      },
2966     {   "package",      TOKpackage      },
2967     {   "protected",    TOKprotected    },
2968     {   "public",       TOKpublic       },
2969     {   "export",       TOKexport       },
2970
2971     {   "body",         TOKbody         },
2972     {   "invariant",    TOKinvariant    },
2973     {   "unittest",     TOKunittest     },
2974     {   "version",      TOKversion      },
2975     //{ "manifest",     TOKmanifest     },
2976
2977     // Added after 1.0
2978     {   "ref",          TOKref          },
2979     {   "macro",        TOKmacro        },
2980
2981
2982     // TAL
2983     {   "and",          TOKandand       },
2984     {   "or",           TOKoror         },
2985     {   "not",          TOKnot          },
2986     {   "extends",      TOKextends      },
2987     {   "log_error",    TOKlog_error    },
2988     {   "log_warning",  TOKlog_warning  },
2989     {   "log_info",     TOKlog_info     },
2990     {   "log_trace",    TOKlog_trace    },
2991 #if V2
2992     {   "pure",         TOKpure         },
2993     {   "nothrow",      TOKnothrow      },
2994     {   "__traits",     TOKtraits       },
2995     {   "__overloadset", TOKoverloadset },
2996 #endif
2997 };
2998
2999 int Token::isKeyword()
3000 {
3001     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3002     {
3003         if (keywords[u].value == value)
3004             return 1;
3005     }
3006     return 0;
3007 }
3008
3009 void Lexer::initKeywords()
3010 {   StringValue *sv;
3011     unsigned u;
3012     enum TOK v;
3013     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3014
3015     if (global.params.Dversion == 1)
3016         nkeywords -= 2;
3017
3018     cmtable_init();
3019
3020     for (u = 0; u < nkeywords; u++)
3021     {   char *s;
3022
3023         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3024         s = keywords[u].name;
3025         v = keywords[u].value;
3026         sv = stringtable.insert(s, strlen(s));
3027         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3028
3029         //printf("tochars[%d] = '%s'\n",v, s);
3030         Token::tochars[v] = s;
3031     }
3032
3033     Token::tochars[TOKeof]              = "EOF";
3034     Token::tochars[TOKlcurly]           = "{";
3035     Token::tochars[TOKrcurly]           = "}";
3036     Token::tochars[TOKlparen]           = "(";
3037     Token::tochars[TOKrparen]           = ")";
3038     Token::tochars[TOKlbracket]         = "[";
3039     Token::tochars[TOKrbracket]         = "]";
3040     Token::tochars[TOKsemicolon]        = ";";
3041     Token::tochars[TOKcolon]            = ":";
3042     Token::tochars[TOKcomma]            = ",";
3043     Token::tochars[TOKdot]              = ".";
3044     Token::tochars[TOKxor]              = "^";
3045     Token::tochars[TOKxorass]           = "^=";
3046     Token::tochars[TOKassign]           = "=";
3047     Token::tochars[TOKconstruct]        = "=";
3048 #if V2
3049     Token::tochars[TOKblit]             = "=";
3050 #endif
3051     Token::tochars[TOKlt]               = "<";
3052     Token::tochars[TOKgt]               = ">";
3053     Token::tochars[TOKle]               = "<=";
3054     Token::tochars[TOKge]               = ">=";
3055     Token::tochars[TOKequal]            = "==";
3056     Token::tochars[TOKnotequal]         = "!=";
3057     Token::tochars[TOKnotidentity]      = "!is";
3058     Token::tochars[TOKtobool]           = "!!";
3059     Token::tochars[TOKat]               = "@";
3060
3061     Token::tochars[TOKunord]            = "!<>=";
3062     Token::tochars[TOKue]               = "!<>";
3063     Token::tochars[TOKlg]               = "<>";
3064     Token::tochars[TOKleg]              = "<>=";
3065     Token::tochars[TOKule]              = "!>";
3066     Token::tochars[TOKul]               = "!>=";
3067     Token::tochars[TOKuge]              = "!<";
3068     Token::tochars[TOKug]               = "!<=";
3069
3070     Token::tochars[TOKnot]              = "!";
3071     Token::tochars[TOKtobool]           = "!!";
3072     Token::tochars[TOKshl]              = "<<";
3073     Token::tochars[TOKshr]              = ">>";
3074     Token::tochars[TOKushr]             = ">>>";
3075     Token::tochars[TOKadd]              = "+";
3076     Token::tochars[TOKmin]              = "-";
3077     Token::tochars[TOKmul]              = "*";
3078     Token::tochars[TOKdiv]              = "/";
3079     Token::tochars[TOKmod]              = "%";
3080     Token::tochars[TOKslice]            = "..";
3081     Token::tochars[TOKdotdotdot]        = "...";
3082     Token::tochars[TOKand]              = "&";
3083     Token::tochars[TOKandand]           = "&&";
3084     Token::tochars[TOKor]               = "|";
3085     Token::tochars[TOKoror]             = "||";
3086     Token::tochars[TOKarray]            = "[]";
3087     Token::tochars[TOKindex]            = "[i]";
3088     Token::tochars[TOKaddress]          = "&";
3089     Token::tochars[TOKstar]             = "*";
3090     Token::tochars[TOKtilde]            = "~";
3091     Token::tochars[TOKdollar]           = "$";
3092     Token::tochars[TOKcast]             = "cast";
3093     Token::tochars[TOKplusplus]         = "++";
3094     Token::tochars[TOKminusminus]       = "--";
3095     Token::tochars[TOKtype]             = "type";
3096     Token::tochars[TOKquestion]         = "?";
3097     Token::tochars[TOKneg]              = "-";
3098     Token::tochars[TOKuadd]             = "+";
3099     Token::tochars[TOKvar]              = "var";
3100     Token::tochars[TOKaddass]           = "+=";
3101     Token::tochars[TOKminass]           = "-=";
3102     Token::tochars[TOKmulass]           = "*=";
3103     Token::tochars[TOKdivass]           = "/=";
3104     Token::tochars[TOKmodass]           = "%=";
3105     Token::tochars[TOKshlass]           = "<<=";
3106     Token::tochars[TOKshrass]           = ">>=";
3107     Token::tochars[TOKushrass]          = ">>>=";
3108     Token::tochars[TOKandass]           = "&=";
3109     Token::tochars[TOKorass]            = "|=";
3110     Token::tochars[TOKcatass]           = "~=";
3111     Token::tochars[TOKcat]              = "~";
3112     Token::tochars[TOKcall]             = "call";
3113     Token::tochars[TOKidentity]         = "is";
3114     Token::tochars[TOKnotidentity]      = "!is";
3115     Token::tochars[TOKendline]          = "\\n";
3116
3117     Token::tochars[TOKorass]            = "|=";
3118     Token::tochars[TOKidentifier]       = "identifier";
3119
3120      // For debugging
3121     Token::tochars[TOKdotexp]           = "dotexp";
3122     Token::tochars[TOKdotti]            = "dotti";
3123     Token::tochars[TOKdotvar]           = "dotvar";
3124     Token::tochars[TOKdottype]          = "dottype";
3125     Token::tochars[TOKsymoff]           = "symoff";
3126     Token::tochars[TOKtypedot]          = "typedot";
3127     Token::tochars[TOKarraylength]      = "arraylength";
3128     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3129     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3130     Token::tochars[TOKstructliteral]    = "structliteral";
3131     Token::tochars[TOKstring]           = "string";
3132     Token::tochars[TOKdsymbol]          = "symbol";
3133     Token::tochars[TOKtuple]            = "tuple";
3134     Token::tochars[TOKdeclaration]      = "declaration";
3135     Token::tochars[TOKdottd]            = "dottd";
3136     Token::tochars[TOKlogger]           = "logger";
3137     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3138 }