dmd/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  /********************************************
   * Private character classification table.
   *
   * cmtable[] holds one flag byte for every possible unsigned char value.
   * The CM* bits mark octal digits, hexadecimal digits and identifier
   * characters.  Being a static array it starts out all zero, so the
   * is*() predicates below answer 0 until cmtable_init() has run.
   */

  static unsigned char cmtable[256];

  const int CMoctal =     0x1;    // '0' .. '7'
  const int CMhex =       0x2;    // decimal digit, 'a' .. 'f' or 'A' .. 'F'
  const int CMidchar =    0x4;    // alphanumeric or '_'

  // Each predicate yields the matching flag bit (non-zero) or 0.
  inline unsigned char isoctal (unsigned char c)
  {
      return cmtable[c] & CMoctal;
  }

  inline unsigned char ishex   (unsigned char c)
  {
      return cmtable[c] & CMhex;
  }

  inline unsigned char isidchar(unsigned char c)
  {
      return cmtable[c] & CMidchar;
  }

  /********************************************
   * Fill in cmtable[].  Flags are OR-ed into whatever each entry already
   * holds, so running this more than once is harmless.
   */
  static void cmtable_init()
  {
      const unsigned entries = sizeof(cmtable) / sizeof(cmtable[0]);

      for (unsigned ch = 0; ch != entries; ++ch)
      {
          unsigned char flags = cmtable[ch];

          if (ch >= '0' && ch <= '7')
              flags |= CMoctal;

          if (isdigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'))
              flags |= CMhex;

          if (ch == '_' || isalnum(ch))
              flags |= CMidchar;

          cmtable[ch] = flags;
      }
  }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 #ifdef DEBUG
 /* Debug aid: write this token's text, followed by a newline, to stdmsg. */
 void Token::print()
 {
     const char *text = toChars();

     fprintf(stdmsg, "%s\n", text);
 }
 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken, bool dltSyntax)
 268     : loc(mod, 1), dltSyntax(dltSyntax)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     this->nesting = 0;
 281     this->indent = 0;
 282     this->atStartOfLine = 1;
 283     //initKeywords();
 284
 285     /* If first line starts with '#!', ignore the line
 286      */
 287
 288     if (p[0] == '#' && p[1] =='!')
 289     {
 290         p += 2;
 291         while (1)
 292         {   unsigned char c = *p;
 293             switch (c)
 294             {
 295                 case '\n':
 296                     p++;
 297                     break;
 298
 299                 case '\r':
 300                     p++;
 301                     if (*p == '\n')
 302                         p++;
 303                     break;
 304
 305                 case 0:
 306                 case 0x1A:
 307                     break;
 308
 309                 default:
 310                     if (c & 0x80)
 311                     {   unsigned u = decodeUTF();
 312                         if (u == PS || u == LS)
 313                             break;
 314                     }
 315                     p++;
 316                     continue;
 317             }
 318             break;
 319         }
 320         loc.linnum = 2;
 321     }
 322 }
 323
 324
 325 void Lexer::error(const char *format, ...)
 326 {
 327     if (mod && !global.gag)
 328     {
 329         char *p = loc.toChars();
 330         if (*p)
 331             fprintf(stdmsg, "%s: ", p);
 332         mem.free(p);
 333
 334         va_list ap;
 335         va_start(ap, format);
 336         vfprintf(stdmsg, format, ap);
 337         va_end(ap);
 338
 339         fprintf(stdmsg, "\n");
 340         fflush(stdmsg);
 341
 342         if (global.errors >= 20)        // moderate blizzard of cascading messages
 343             fatal();
 344     }
 345     global.errors++;
 346 }
 347
 348 void Lexer::error(Loc loc, const char *format, ...)
 349 {
 350     if (mod && !global.gag)
 351     {
 352         char *p = loc.toChars();
 353         if (*p)
 354             fprintf(stdmsg, "%s: ", p);
 355         mem.free(p);
 356
 357         va_list ap;
 358         va_start(ap, format);
 359         vfprintf(stdmsg, format, ap);
 360         va_end(ap);
 361
 362         fprintf(stdmsg, "\n");
 363         fflush(stdmsg);
 364
 365         if (global.errors >= 20)        // moderate blizzard of cascading messages
 366             fatal();
 367     }
 368     global.errors++;
 369 }
 370
 371 TOK Lexer::nextToken()
 372 {   Token *t;
 373
 374     if (token.next)
 375     {
 376         t = token.next;
 377         memcpy(&token,t,sizeof(Token));
 378         t->next = freelist;
 379         freelist = t;
 380     }
 381     else
 382     {
 383         scan(&token);
 384     }
 385     //token.print();
 386     return token.value;
 387 }
 388
 389 Token *Lexer::peek(Token *ct)
 390 {   Token *t;
 391
 392     if (ct->next)
 393         t = ct->next;
 394     else
 395     {
 396         t = new Token();
 397         scan(t);
 398         t->next = NULL;
 399         ct->next = t;
 400     }
 401     return t;
 402 }
 403
 404 /*********************************
 405  * tk is on the opening (.
 406  * Look ahead and return token that is past the closing ).
 407  */
 408
 409 Token *Lexer::peekPastParen(Token *tk)
 410 {
 411     //printf("peekPastParen()\n");
 412     int parens = 1;
 413     int curlynest = 0;
 414     while (1)
 415     {
 416         tk = peek(tk);
 417         //tk->print();
 418         switch (tk->value)
 419         {
 420             case TOKlparen:
 421                 parens++;
 422                 continue;
 423
 424             case TOKrparen:
 425                 --parens;
 426                 if (parens)
 427                     continue;
 428                 tk = peek(tk);
 429                 break;
 430
 431             case TOKlcurly:
 432                 curlynest++;
 433                 continue;
 434
 435             case TOKrcurly:
 436                 if (--curlynest >= 0)
 437                     continue;
 438                 break;
 439
 440             case TOKsemicolon:
 441                 if (curlynest)
 442                     continue;
 443                 break;
 444
 445             case TOKeof:
 446                 break;
 447
 448             default:
 449                 continue;
 450         }
 451         return tk;
 452     }
 453 }
 454
 455 /**********************************
 456  * Determine if string is a valid Identifier.
 457  * Placed here because of commonality with Lexer functionality.
 458  * Returns:
 459  *      0       invalid
 460  */
 461
 462 int Lexer::isValidIdentifier(char *p)
 463 {
 464     size_t len;
 465     size_t idx;
 466
 467     if (!p || !*p)
 468         goto Linvalid;
 469
 470     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 471         goto Linvalid;
 472
 473     len = strlen(p);
 474     idx = 0;
 475     while (p[idx])
 476     {   dchar_t dc;
 477
 478         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 479         if (q)
 480             goto Linvalid;
 481
 482         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 483             goto Linvalid;
 484     }
 485     return 1;
 486
 487 Linvalid:
 488     return 0;
 489 }
 490
 491 /****************************
 492  * Turn next token in buffer into a token.
 493  */
 494
 495 void Lexer::scan(Token *t)
 496 {
 497     unsigned lastLine = loc.linnum;
 498     unsigned linnum;
 499
 500     t->blockComment = NULL;
 501     t->lineComment = NULL;
 502     while (1)
 503     {
 504         t->ptr = p;
 505
 506         if (dltSyntax && atStartOfLine) {
 507                 // Check indent
 508                 int i;
 509                 for (i = 0; p[i] == '\t'; i++) {
 510                 }
 511                 if (p[i] == ' ') {
 512                     error("Whitespace error: use tabs to indent!");
 513                 }
 514                 if (p[i] == '#') {
 515                     p += i;
 516                     atStartOfLine = 0;
 517                 } else if (p[i] != '\n' && p[i] != '\r') {
 518                     if (p[i] == '\0')
 519                         i = 0;                  // End-of-file always has no indent
 520                     if (i > indent) {
 521                         error("unexpected indentation (expected %d tabs, not %d)",
 522                                 indent, i);
 523                     } else if (i < indent) {
 524                         indent -= 1;
 525                         t->value = TOKrcurly;
 526                         return;
 527                     }
 528                     atStartOfLine = 0;
 529                 } /* else ignore blank line */
 530         }
 531
 532         //printf("p = %p, *p = '%c'\n",p,*p);
 533         switch (*p)
 534         {
 535             case 0:
 536             case 0x1A:
 537                 t->value = TOKeof;                      // end of file
 538                 return;
 539
 540             case ' ':
 541             case '\t':
 542             case '\v':
 543             case '\f':
 544                 p++;
 545                 continue;                       // skip white space
 546
 547             case '\r':
 548                 if (p[1] == '\n') {             // if CRLF
 549                     p++;
 550                     continue;
 551                 }
 552                 // fall-through
 553             case '\n':
 554                 p++;
 555                 loc.linnum++;
 556                 if (dltSyntax && !nesting) {
 557                         atStartOfLine = 1;
 558                         t->value = TOKendline;
 559                         return;
 560                 }
 561                 continue;                       // Ignore newlines inside brackets
 562             case '0':   case '1':   case '2':   case '3':   case '4':
 563             case '5':   case '6':   case '7':   case '8':   case '9':
 564                 t->value = number(t);
 565                 return;
 566
 567 #if CSTRINGS
 568             case '\'':
 569                 t->value = charConstant(t, 0);
 570                 return;
 571
 572             case '"':
 573                 t->value = stringConstant(t,0);
 574                 return;
 575
 576             case 'l':
 577             case 'L':
 578                 if (p[1] == '\'')
 579                 {
 580                     p++;
 581                     t->value = charConstant(t, 1);
 582                     return;
 583                 }
 584                 else if (p[1] == '"')
 585                 {
 586                     p++;
 587                     t->value = stringConstant(t, 1);
 588                     return;
 589                 }
 590 #else
 591             case '\'':
 592                 t->value = charConstant(t,0);
 593                 return;
 594
 595             case 'r':
 596                 if (p[1] != '"')
 597                     goto case_ident;
 598                 p++;
 599             case '`':
 600                 t->value = wysiwygStringConstant(t, *p);
 601                 return;
 602
 603             case 'x':
 604                 if (p[1] != '"')
 605                     goto case_ident;
 606                 p++;
 607                 t->value = hexStringConstant(t);
 608                 return;
 609
 610 #if V2
 611             case 'q':
 612                 if (p[1] == '"')
 613                 {
 614                     p++;
 615                     t->value = delimitedStringConstant(t);
 616                     return;
 617                 }
 618                 else if (p[1] == '{')
 619                 {
 620                     p++;
 621                     t->value = tokenStringConstant(t);
 622                     return;
 623                 }
 624                 else
 625                     goto case_ident;
 626 #endif
 627
 628             case '"':
 629                 t->value = escapeStringConstant(t,0);
 630                 return;
 631
 632             case '\\':                  // escaped string literal
 633             {   unsigned c;
 634
 635                 stringbuffer.reset();
 636                 do
 637                 {
 638                     p++;
 639                     switch (*p)
 640                     {
 641                         case 'u':
 642                         case 'U':
 643                         case '&':
 644                             c = escapeSequence();
 645                             stringbuffer.writeUTF8(c);
 646                             break;
 647
 648                         default:
 649                             c = escapeSequence();
 650                             stringbuffer.writeByte(c);
 651                             break;
 652                     }
 653                 } while (*p == '\\');
 654                 t->len = stringbuffer.offset;
 655                 stringbuffer.writeByte(0);
 656                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 657                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 658                 t->postfix = 0;
 659                 t->value = TOKstring;
 660                 return;
 661             }
 662
 663             case 'l':
 664             case 'L':
 665 #endif
 666             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 667             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 668             case 'k':               case 'm':   case 'n':   case 'o':
 669 #if V2
 670             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 671 #else
 672             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 673 #endif
 674             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 675             case 'z':
 676             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 677             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 678             case 'K':               case 'M':   case 'N':   case 'O':
 679             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 680             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 681             case 'Z':
 682             case '_':
 683             case_ident:
 684             {   unsigned char c;
 685                 StringValue *sv;
 686                 Identifier *id;
 687
 688                 do
 689                 {
 690                     c = *++p;
 691                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 692                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 693                 id = (Identifier *) sv->ptrvalue;
 694                 if (!id)
 695                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 696                     sv->ptrvalue = id;
 697                 }
 698                 t->ident = id;
 699                 t->value = (enum TOK) id->value;
 700                 anyToken = 1;
 701                 if (*t->ptr == '_')     // if special identifier token
 702                 {
 703                     static char date[11+1];
 704                     static char time[8+1];
 705                     static char timestamp[24+1];
 706
 707                     if (!date[0])       // lazy evaluation
 708                     {   time_t t;
 709                         char *p;
 710
 711                         ::time(&t);
 712                         p = ctime(&t);
 713                         assert(p);
 714                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 715                         sprintf(time, "%.8s", p + 11);
 716                         sprintf(timestamp, "%.24s", p);
 717                     }
 718
 719                     if (mod && id == Id::FILE)
 720                     {
 721                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 722                         goto Lstring;
 723                     }
 724                     else if (mod && id == Id::LINE)
 725                     {
 726                         t->value = TOKint64v;
 727                         t->uns64value = loc.linnum;
 728                     }
 729                     else if (id == Id::DATE)
 730                     {
 731                         t->ustring = (unsigned char *)date;
 732                         goto Lstring;
 733                     }
 734                     else if (id == Id::TIME)
 735                     {
 736                         t->ustring = (unsigned char *)time;
 737                         goto Lstring;
 738                     }
 739                     else if (id == Id::VENDOR)
 740                     {
 741 #ifdef IN_GCC
 742                         t->ustring = (unsigned char *)"GDC";
 743 #else
 744                         t->ustring = (unsigned char *)"Digital Mars D";
 745 #endif
 746                         goto Lstring;
 747                     }
 748                     else if (id == Id::TIMESTAMP)
 749                     {
 750                         t->ustring = (unsigned char *)timestamp;
 751                      Lstring:
 752                         t->value = TOKstring;
 753                      Llen:
 754                         t->postfix = 0;
 755                         t->len = strlen((char *)t->ustring);
 756                     }
 757                     else if (id == Id::VERSIONX)
 758                     {   unsigned major = 0;
 759                         unsigned minor = 0;
 760
 761                         for (char *p = global.version + 1; 1; p++)
 762                         {
 763                             char c = *p;
 764                             if (isdigit(c))
 765                                 minor = minor * 10 + c - '0';
 766                             else if (c == '.')
 767                             {   major = minor;
 768                                 minor = 0;
 769                             }
 770                             else
 771                                 break;
 772                         }
 773                         t->value = TOKint64v;
 774                         t->uns64value = major * 1000 + minor;
 775                     }
 776 #if V2
 777                     else if (id == Id::EOFX)
 778                     {
 779                         t->value = TOKeof;
 780                         // Advance scanner to end of file
 781                         while (!(*p == 0 || *p == 0x1A))
 782                             p++;
 783                     }
 784 #endif
 785                 }
 786                 //printf("t->value = %d\n",t->value);
 787                 return;
 788             }
 789
 790             case '/':
 791                 p++;
 792                 switch (*p)
 793                 {
 794                     case '=':
 795                         p++;
 796                         t->value = TOKdivass;
 797                         return;
 798
 799                     case '*':
 800                         p++;
 801                         linnum = loc.linnum;
 802                         while (1)
 803                         {
 804                             while (1)
 805                             {   unsigned char c = *p;
 806                                 switch (c)
 807                                 {
 808                                     case '/':
 809                                         break;
 810
 811                                     case '\n':
 812                                         loc.linnum++;
 813                                         p++;
 814                                         continue;
 815
 816                                     case '\r':
 817                                         p++;
 818                                         if (*p != '\n')
 819                                             loc.linnum++;
 820                                         continue;
 821
 822                                     case 0:
 823                                     case 0x1A:
 824                                         error("unterminated /* */ comment");
 825                                         p = end;
 826                                         t->value = TOKeof;
 827                                         return;
 828
 829                                     default:
 830                                         if (c & 0x80)
 831                                         {   unsigned u = decodeUTF();
 832                                             if (u == PS || u == LS)
 833                                                 loc.linnum++;
 834                                         }
 835                                         p++;
 836                                         continue;
 837                                 }
 838                                 break;
 839                             }
 840                             p++;
 841                             if (p[-2] == '*' && p - 3 != t->ptr)
 842                                 break;
 843                         }
 844                         if (commentToken)
 845                         {
 846                             t->value = TOKcomment;
 847                             return;
 848                         }
 849                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 850                         {   // if /** but not /**/
 851                             getDocComment(t, lastLine == linnum);
 852                         }
 853                         continue;
 854
 855                     case '/':           // do // style comments
 856                         linnum = loc.linnum;
 857                         while (1)
 858                         {   unsigned char c = *++p;
 859                             switch (c)
 860                             {
 861                                 case '\n':
 862                                     break;
 863
 864                                 case '\r':
 865                                     if (p[1] == '\n')
 866                                         p++;
 867                                     break;
 868
 869                                 case 0:
 870                                 case 0x1A:
 871                                     if (commentToken)
 872                                     {
 873                                         p = end;
 874                                         t->value = TOKcomment;
 875                                         return;
 876                                     }
 877                                     if (doDocComment && t->ptr[2] == '/')
 878                                         getDocComment(t, lastLine == linnum);
 879                                     p = end;
 880                                     t->value = TOKeof;
 881                                     return;
 882
 883                                 default:
 884                                     if (c & 0x80)
 885                                     {   unsigned u = decodeUTF();
 886                                         if (u == PS || u == LS)
 887                                             break;
 888                                     }
 889                                     continue;
 890                             }
 891                             break;
 892                         }
 893
 894                         if (commentToken)
 895                         {
 896                             p++;
 897                             loc.linnum++;
 898                             t->value = TOKcomment;
 899                             return;
 900                         }
 901                         if (doDocComment && t->ptr[2] == '/')
 902                             getDocComment(t, lastLine == linnum);
 903
 904                         p++;
 905                         loc.linnum++;
 906                         continue;
 907
 908                     case '+':
 909                     {   int nest;
 910
 911                         linnum = loc.linnum;
 912                         p++;
 913                         nest = 1;
 914                         while (1)
 915                         {   unsigned char c = *p;
 916                             switch (c)
 917                             {
 918                                 case '/':
 919                                     p++;
 920                                     if (*p == '+')
 921                                     {
 922                                         p++;
 923                                         nest++;
 924                                     }
 925                                     continue;
 926
 927                                 case '+':
 928                                     p++;
 929                                     if (*p == '/')
 930                                     {
 931                                         p++;
 932                                         if (--nest == 0)
 933                                             break;
 934                                     }
 935                                     continue;
 936
 937                                 case '\r':
 938                                     p++;
 939                                     if (*p != '\n')
 940                                         loc.linnum++;
 941                                     continue;
 942
 943                                 case '\n':
 944                                     loc.linnum++;
 945                                     p++;
 946                                     continue;
 947
 948                                 case 0:
 949                                 case 0x1A:
 950                                     error("unterminated /+ +/ comment");
 951                                     p = end;
 952                                     t->value = TOKeof;
 953                                     return;
 954
 955                                 default:
 956                                     if (c & 0x80)
 957                                     {   unsigned u = decodeUTF();
 958                                         if (u == PS || u == LS)
 959                                             loc.linnum++;
 960                                     }
 961                                     p++;
 962                                     continue;
 963                             }
 964                             break;
 965                         }
 966                         if (commentToken)
 967                         {
 968                             t->value = TOKcomment;
 969                             return;
 970                         }
 971                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
 972                         {   // if /++ but not /++/
 973                             getDocComment(t, lastLine == linnum);
 974                         }
 975                         continue;
 976                     }
 977                 }
 978                 t->value = TOKdiv;
 979                 return;
 980
 981             case '.':
 982                 p++;
 983                 if (isdigit(*p))
 984                 {   /* Note that we don't allow ._1 and ._ as being
 985                      * valid floating point numbers.
 986                      */
 987                     p--;
 988                     t->value = inreal(t);
 989                 }
 990                 else if (p[0] == '.')
 991                 {
 992                     if (p[1] == '.')
 993                     {   p += 2;
 994                         t->value = TOKdotdotdot;
 995                     }
 996                     else
 997                     {   p++;
 998                         t->value = TOKslice;
 999                     }
1000                 }
1001                 else
1002                     t->value = TOKdot;
1003                 return;
1004
1005             case '&':
1006                 p++;
1007                 if (*p == '=')
1008                 {   p++;
1009                     t->value = TOKandass;
1010                 }
1011                 else if (*p == '&')
1012                 {   p++;
1013                     t->value = TOKandand;
1014                     if (dltSyntax)
1015                         error("Use 'and' instead of '&&'");
1016                 }
1017                 else
1018                     t->value = TOKand;
1019                 return;
1020
1021             case '|':
1022                 p++;
1023                 if (*p == '=')
1024                 {   p++;
1025                     t->value = TOKorass;
1026                 }
1027                 else if (*p == '|')
1028                 {   p++;
1029                     t->value = TOKoror;
1030                     if (dltSyntax)
1031                         error("Use 'or' instead of '||'");
1032                 }
1033                 else
1034                     t->value = TOKor;
1035                 return;
1036
1037             case '-':
1038                 p++;
1039                 if (*p == '=')
1040                 {   p++;
1041                     t->value = TOKminass;
1042                 }
1043 #if 0
1044                 else if (*p == '>')
1045                 {   p++;
1046                     t->value = TOKarrow;
1047                 }
1048 #endif
1049                 else if (*p == '-')
1050                 {   p++;
1051                     t->value = TOKminusminus;
1052                 }
1053                 else
1054                     t->value = TOKmin;
1055                 return;
1056
1057             case '+':
1058                 p++;
1059                 if (*p == '=')
1060                 {   p++;
1061                     t->value = TOKaddass;
1062                 }
1063                 else if (*p == '+')
1064                 {   p++;
1065                     t->value = TOKplusplus;
1066                 }
1067                 else
1068                     t->value = TOKadd;
1069                 return;
1070
1071             case '<':
1072                 p++;
1073                 if (*p == '=')
1074                 {   p++;
1075                     t->value = TOKle;                   // <=
1076                 }
1077                 else if (*p == '<')
1078                 {   p++;
1079                     if (*p == '=')
1080                     {   p++;
1081                         t->value = TOKshlass;           // <<=
1082                     }
1083                     else
1084                         t->value = TOKshl;              // <<
1085                 }
1086                 else if (*p == '>')
1087                 {   p++;
1088                     if (*p == '=')
1089                     {   p++;
1090                         t->value = TOKleg;              // <>=
1091                     }
1092                     else
1093                         t->value = TOKlg;               // <>
1094                 }
1095                 else
1096                     t->value = TOKlt;                   // <
1097                 return;
1098
1099             case '>':
1100                 p++;
1101                 if (*p == '=')
1102                 {   p++;
1103                     t->value = TOKge;                   // >=
1104                 }
1105                 else if (*p == '>')
1106                 {   p++;
1107                     if (*p == '=')
1108                     {   p++;
1109                         t->value = TOKshrass;           // >>=
1110                     }
1111                     else if (*p == '>')
1112                     {   p++;
1113                         if (*p == '=')
1114                         {   p++;
1115                             t->value = TOKushrass;      // >>>=
1116                         }
1117                         else
1118                             t->value = TOKushr;         // >>>
1119                     }
1120                     else
1121                         t->value = TOKshr;              // >>
1122                 }
1123                 else
1124                     t->value = TOKgt;                   // >
1125                 return;
1126
1127             case '!':
1128                 p++;
1129                 if (*p == '=')
1130                 {   p++;
1131                     if (*p == '=' && global.params.Dversion == 1)
1132                     {   p++;
1133                         t->value = TOKnotidentity;      // !==
1134                     }
1135                     else
1136                         t->value = TOKnotequal;         // !=
1137                 }
1138                 else if (*p == '<')
1139                 {   p++;
1140                     if (*p == '>')
1141                     {   p++;
1142                         if (*p == '=')
1143                         {   p++;
1144                             t->value = TOKunord; // !<>=
1145                         }
1146                         else
1147                             t->value = TOKue;   // !<>
1148                     }
1149                     else if (*p == '=')
1150                     {   p++;
1151                         t->value = TOKug;       // !<=
1152                     }
1153                     else
1154                         t->value = TOKuge;      // !<
1155                 }
1156                 else if (*p == '>')
1157                 {   p++;
1158                     if (*p == '=')
1159                     {   p++;
1160                         t->value = TOKul;       // !>=
1161                     }
1162                     else
1163                         t->value = TOKule;      // !>
1164                 }
1165                 else
1166                     t->value = TOKnot;          // !
1167                 return;
1168
1169             case '=':
1170                 p++;
1171                 if (*p == '=')
1172                 {   p++;
1173                     if (*p == '=' && global.params.Dversion == 1)
1174                     {   p++;
1175                         t->value = TOKidentity;         // ===
1176                     }
1177                     else
1178                         t->value = TOKequal;            // ==
1179                 }
1180                 else
1181                     t->value = TOKassign;               // =
1182                 return;
1183
1184             case '~':
1185                 p++;
1186                 if (*p == '=')
1187                 {   p++;
1188                     t->value = TOKcatass;               // ~=
1189                 }
1190                 else
1191                     t->value = TOKtilde;                // ~
1192                 return;
1193
1194 #define NESTED(cin,tokin,cout,tokout) \
1195             case cin: nesting++; p++; t->value = tokin; return;\
1196             case cout: if (nesting == 0) {error("Unexpected '%c'", cout);} else {nesting--;} p++; t->value = tokout; return;
1197
1198             NESTED('(', TOKlparen, ')', TOKrparen)
1199             NESTED('[', TOKlbracket, ']', TOKrbracket)
1200             NESTED('{', TOKlcurly, '}', TOKrcurly)
1201 #undef NESTED
1202
1203 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1204             SINGLE('?', TOKquestion)
1205             SINGLE(',', TOKcomma)
1206             SINGLE(';', TOKsemicolon)
1207             SINGLE('$', TOKdollar)
1208             SINGLE('@', TOKat)
1209
1210 #undef SINGLE
1211
1212             case ':':
1213                 p++;
1214                 if (!nesting)
1215                         indent += 1;
1216                 t->value = TOKcolon;
1217                 return;
1218
1219 #define DOUBLE(c1,tok1,c2,tok2)         \
1220             case c1:                    \
1221                 p++;                    \
1222                 if (*p == c2)           \
1223                 {   p++;                \
1224                     t->value = tok2;    \
1225                 }                       \
1226                 else                    \
1227                     t->value = tok1;    \
1228                 return;
1229
1230             DOUBLE('*', TOKmul, '=', TOKmulass)
1231             DOUBLE('%', TOKmod, '=', TOKmodass)
1232             DOUBLE('^', TOKxor, '=', TOKxorass)
1233
1234 #undef DOUBLE
1235
1236             case '#':           // do # style comments and pragmas
1237                 if (dltSyntax)
1238                 {
1239                     do { p++; } while (*p != '\n');
1240                 }
1241                 else
1242                 {
1243                     p++;
1244                     pragma();
1245                 }
1246                 continue;
1247
1248             default:
1249             {   unsigned char c = *p;
1250
1251                 if (c & 0x80)
1252                 {   unsigned u = decodeUTF();
1253
1254                     // Check for start of unicode identifier
1255                     if (isUniAlpha(u))
1256                         goto case_ident;
1257
1258                     if (u == PS || u == LS)
1259                     {
1260                         loc.linnum++;
1261                         p++;
1262                         continue;
1263                     }
1264                 }
1265                 if (isprint(c))
1266                     error("unsupported char '%c'", c);
1267                 else
1268                     error("unsupported char 0x%02x", c);
1269                 p++;
1270                 continue;
1271             }
1272         }
1273     }
1274 }
1275
1276 /*******************************************
1277  * Parse escape sequence.
1278  */
1279
1280 unsigned Lexer::escapeSequence()
1281 {   unsigned c;
1282     int n;
1283     int ndigits;
1284
1285     c = *p;
1286     switch (c)
1287     {
1288         case '\'':
1289         case '"':
1290         case '?':
1291         case '\\':
1292         Lconsume:
1293                 p++;
1294                 break;
1295
1296         case 'a':       c = 7;          goto Lconsume;
1297         case 'b':       c = 8;          goto Lconsume;
1298         case 'f':       c = 12;         goto Lconsume;
1299         case 'n':       c = 10;         goto Lconsume;
1300         case 'r':       c = 13;         goto Lconsume;
1301         case 't':       c = 9;          goto Lconsume;
1302         case 'v':       c = 11;         goto Lconsume;
1303
1304         case 'u':
1305                 ndigits = 4;
1306                 goto Lhex;
1307         case 'U':
1308                 ndigits = 8;
1309                 goto Lhex;
1310         case 'x':
1311                 ndigits = 2;
1312         Lhex:
1313                 p++;
1314                 c = *p;
1315                 if (ishex(c))
1316                 {   unsigned v;
1317
1318                     n = 0;
1319                     v = 0;
1320                     while (1)
1321                     {
1322                         if (isdigit(c))
1323                             c -= '0';
1324                         else if (islower(c))
1325                             c -= 'a' - 10;
1326                         else
1327                             c -= 'A' - 10;
1328                         v = v * 16 + c;
1329                         c = *++p;
1330                         if (++n == ndigits)
1331                             break;
1332                         if (!ishex(c))
1333                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1334                             break;
1335                         }
1336                     }
1337                     if (ndigits != 2 && !utf_isValidDchar(v))
1338                         error("invalid UTF character \\U%08x", v);
1339                     c = v;
1340                 }
1341                 else
1342                     error("undefined escape hex sequence \\%c\n",c);
1343                 break;
1344
1345         case '&':                       // named character entity
1346                 for (unsigned char *idstart = ++p; 1; p++)
1347                 {
1348                     switch (*p)
1349                     {
1350                         case ';':
1351                             c = HtmlNamedEntity(idstart, p - idstart);
1352                             if (c == ~0)
1353                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1354                                 c = ' ';
1355                             }
1356                             p++;
1357                             break;
1358
1359                         default:
1360                             if (isalpha(*p) ||
1361                                 (p != idstart + 1 && isdigit(*p)))
1362                                 continue;
1363                             error("unterminated named entity");
1364                             break;
1365                     }
1366                     break;
1367                 }
1368                 break;
1369
1370         case 0:
1371         case 0x1A:                      // end of file
1372                 c = '\\';
1373                 break;
1374
1375         default:
1376                 if (isoctal(c))
1377                 {   unsigned v;
1378
1379                     n = 0;
1380                     v = 0;
1381                     do
1382                     {
1383                         v = v * 8 + (c - '0');
1384                         c = *++p;
1385                     } while (++n < 3 && isoctal(c));
1386                     c = v;
1387                     if (c > 0xFF)
1388                         error("0%03o is larger than a byte", c);
1389                 }
1390                 else
1391                     error("undefined escape sequence \\%c\n",c);
1392                 break;
1393     }
1394     return c;
1395 }
1396
1397 /**************************************
1398  */
1399
1400 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1401 {   unsigned c;
1402     Loc start = loc;
1403
1404     p++;
1405     stringbuffer.reset();
1406     while (1)
1407     {
1408         c = *p++;
1409         switch (c)
1410         {
1411             case '\n':
1412                 loc.linnum++;
1413                 break;
1414
1415             case '\r':
1416                 if (*p == '\n')
1417                     continue;   // ignore
1418                 c = '\n';       // treat EndOfLine as \n character
1419                 loc.linnum++;
1420                 break;
1421
1422             case 0:
1423             case 0x1A:
1424                 error("unterminated string constant starting at %s", start.toChars());
1425                 t->ustring = (unsigned char *)"";
1426                 t->len = 0;
1427                 t->postfix = 0;
1428                 return TOKstring;
1429
1430             case '"':
1431             case '`':
1432                 if (c == tc)
1433                 {
1434                     t->len = stringbuffer.offset;
1435                     stringbuffer.writeByte(0);
1436                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1437                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1438                     stringPostfix(t);
1439                     return TOKstring;
1440                 }
1441                 break;
1442
1443             default:
1444                 if (c & 0x80)
1445                 {   p--;
1446                     unsigned u = decodeUTF();
1447                     p++;
1448                     if (u == PS || u == LS)
1449                         loc.linnum++;
1450                     stringbuffer.writeUTF8(u);
1451                     continue;
1452                 }
1453                 break;
1454         }
1455         stringbuffer.writeByte(c);
1456     }
1457 }
1458
1459 /**************************************
1460  * Lex hex strings:
1461  *      x"0A ae 34FE BD"
1462  */
1463
1464 TOK Lexer::hexStringConstant(Token *t)
1465 {   unsigned c;
1466     Loc start = loc;
1467     unsigned n = 0;
1468     unsigned v;
1469
1470     p++;
1471     stringbuffer.reset();
1472     while (1)
1473     {
1474         c = *p++;
1475         switch (c)
1476         {
1477             case ' ':
1478             case '\t':
1479             case '\v':
1480             case '\f':
1481                 continue;                       // skip white space
1482
1483             case '\r':
1484                 if (*p == '\n')
1485                     continue;                   // ignore
1486                 // Treat isolated '\r' as if it were a '\n'
1487             case '\n':
1488                 loc.linnum++;
1489                 continue;
1490
1491             case 0:
1492             case 0x1A:
1493                 error("unterminated string constant starting at %s", start.toChars());
1494                 t->ustring = (unsigned char *)"";
1495                 t->len = 0;
1496                 t->postfix = 0;
1497                 return TOKstring;
1498
1499             case '"':
1500                 if (n & 1)
1501                 {   error("odd number (%d) of hex characters in hex string", n);
1502                     stringbuffer.writeByte(v);
1503                 }
1504                 t->len = stringbuffer.offset;
1505                 stringbuffer.writeByte(0);
1506                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1507                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1508                 stringPostfix(t);
1509                 return TOKstring;
1510
1511             default:
1512                 if (c >= '0' && c <= '9')
1513                     c -= '0';
1514                 else if (c >= 'a' && c <= 'f')
1515                     c -= 'a' - 10;
1516                 else if (c >= 'A' && c <= 'F')
1517                     c -= 'A' - 10;
1518                 else if (c & 0x80)
1519                 {   p--;
1520                     unsigned u = decodeUTF();
1521                     p++;
1522                     if (u == PS || u == LS)
1523                         loc.linnum++;
1524                     else
1525                         error("non-hex character \\u%x", u);
1526                 }
1527                 else
1528                     error("non-hex character '%c'", c);
1529                 if (n & 1)
1530                 {   v = (v << 4) | c;
1531                     stringbuffer.writeByte(v);
1532                 }
1533                 else
1534                     v = c;
1535                 n++;
1536                 break;
1537         }
1538     }
1539 }
1540
1541
1542 #if V2
1543 /**************************************
1544  * Lex delimited strings:
1545  *      q"(foo(xxx))"   // "foo(xxx)"
1546  *      q"[foo(]"       // "foo("
1547  *      q"/foo]/"       // "foo]"
1548  *      q"HERE
1549  *      foo
1550  *      HERE"           // "foo\n"
1551  * Input:
1552  *      p is on the "
1553  */
1554
1555 TOK Lexer::delimitedStringConstant(Token *t)
1556 {   unsigned c;
1557     Loc start = loc;
1558     unsigned delimleft = 0;
1559     unsigned delimright = 0;
1560     unsigned nest = 1;
1561     unsigned nestcount;
1562     Identifier *hereid = NULL;
1563     unsigned blankrol = 0;
1564     unsigned startline = 0;
1565
1566     p++;
1567     stringbuffer.reset();
1568     while (1)
1569     {
1570         c = *p++;
1571         //printf("c = '%c'\n", c);
1572         switch (c)
1573         {
1574             case '\n':
1575             Lnextline:
1576                 loc.linnum++;
1577                 startline = 1;
1578                 if (blankrol)
1579                 {   blankrol = 0;
1580                     continue;
1581                 }
1582                 if (hereid)
1583                 {
1584                     stringbuffer.writeUTF8(c);
1585                     continue;
1586                 }
1587                 break;
1588
1589             case '\r':
1590                 if (*p == '\n')
1591                     continue;   // ignore
1592                 c = '\n';       // treat EndOfLine as \n character
1593                 goto Lnextline;
1594
1595             case 0:
1596             case 0x1A:
1597                 goto Lerror;
1598
1599             default:
1600                 if (c & 0x80)
1601                 {   p--;
1602                     c = decodeUTF();
1603                     p++;
1604                     if (c == PS || c == LS)
1605                         goto Lnextline;
1606                 }
1607                 break;
1608         }
1609         if (delimleft == 0)
1610         {   delimleft = c;
1611             nest = 1;
1612             nestcount = 1;
1613             if (c == '(')
1614                 delimright = ')';
1615             else if (c == '{')
1616                 delimright = '}';
1617             else if (c == '[')
1618                 delimright = ']';
1619             else if (c == '<')
1620                 delimright = '>';
1621             else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1622             {   // Start of identifier; must be a heredoc
1623                 Token t;
1624                 p--;
1625                 scan(&t);               // read in heredoc identifier
1626                 if (t.value != TOKidentifier)
1627                 {   error("identifier expected for heredoc, not %s", t.toChars());
1628                     delimright = c;
1629                 }
1630                 else
1631                 {   hereid = t.ident;
1632                     //printf("hereid = '%s'\n", hereid->toChars());
1633                     blankrol = 1;
1634                 }
1635                 nest = 0;
1636             }
1637             else
1638             {   delimright = c;
1639                 nest = 0;
1640             }
1641         }
1642         else
1643         {
1644             if (blankrol)
1645             {   error("heredoc rest of line should be blank");
1646                 blankrol = 0;
1647                 continue;
1648             }
1649             if (nest == 1)
1650             {
1651                 if (c == delimleft)
1652                     nestcount++;
1653                 else if (c == delimright)
1654                 {   nestcount--;
1655                     if (nestcount == 0)
1656                         goto Ldone;
1657                 }
1658             }
1659             else if (c == delimright)
1660                 goto Ldone;
1661             if (startline && isalpha(c))
1662             {   Token t;
1663                 unsigned char *psave = p;
1664                 p--;
1665                 scan(&t);               // read in possible heredoc identifier
1666                 //printf("endid = '%s'\n", t.ident->toChars());
1667                 if (t.value == TOKidentifier && t.ident->equals(hereid))
1668                 {   /* should check that rest of line is blank
1669                      */
1670                     goto Ldone;
1671                 }
1672                 p = psave;
1673             }
1674             stringbuffer.writeUTF8(c);
1675             startline = 0;
1676         }
1677     }
1678
1679 Ldone:
1680     if (*p == '"')
1681         p++;
1682     else
1683         error("delimited string must end in %c\"", delimright);
1684     t->len = stringbuffer.offset;
1685     stringbuffer.writeByte(0);
1686     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1687     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1688     stringPostfix(t);
1689     return TOKstring;
1690
1691 Lerror:
1692     error("unterminated string constant starting at %s", start.toChars());
1693     t->ustring = (unsigned char *)"";
1694     t->len = 0;
1695     t->postfix = 0;
1696     return TOKstring;
1697 }
1698
1699 /**************************************
1700  * Lex delimited strings:
1701  *      q{ foo(xxx) } // " foo(xxx) "
1702  *      q{foo(}       // "foo("
1703  *      q{{foo}"}"}   // "{foo}"}""
1704  * Input:
1705  *      p is on the q
1706  */
1707
1708 TOK Lexer::tokenStringConstant(Token *t)
1709 {
1710     unsigned nest = 1;
1711     Loc start = loc;
1712     unsigned char *pstart = ++p;
1713
1714     while (1)
1715     {   Token tok;
1716
1717         scan(&tok);
1718         switch (tok.value)
1719         {
1720             case TOKlcurly:
1721                 nest++;
1722                 continue;
1723
1724             case TOKrcurly:
1725                 if (--nest == 0)
1726                     goto Ldone;
1727                 continue;
1728
1729             case TOKeof:
1730                 goto Lerror;
1731
1732             default:
1733                 continue;
1734         }
1735     }
1736
1737 Ldone:
1738     t->len = p - 1 - pstart;
1739     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1740     memcpy(t->ustring, pstart, t->len);
1741     t->ustring[t->len] = 0;
1742     stringPostfix(t);
1743     return TOKstring;
1744
1745 Lerror:
1746     error("unterminated token string constant starting at %s", start.toChars());
1747     t->ustring = (unsigned char *)"";
1748     t->len = 0;
1749     t->postfix = 0;
1750     return TOKstring;
1751 }
1752
1753 #endif
1754
1755
1756 /**************************************
1757  */
1758
1759 TOK Lexer::escapeStringConstant(Token *t, int wide)
1760 {   unsigned c;
1761     Loc start = loc;
1762
1763     p++;
1764     stringbuffer.reset();
1765     while (1)
1766     {
1767         c = *p++;
1768         switch (c)
1769         {
1770             case '\\':
1771                 switch (*p)
1772                 {
1773                     case 'u':
1774                     case 'U':
1775                     case '&':
1776                         c = escapeSequence();
1777                         stringbuffer.writeUTF8(c);
1778                         continue;
1779
1780                     default:
1781                         c = escapeSequence();
1782                         break;
1783                 }
1784                 break;
1785
1786             case '\n':
1787                 loc.linnum++;
1788                 break;
1789
1790             case '\r':
1791                 if (*p == '\n')
1792                     continue;   // ignore
1793                 c = '\n';       // treat EndOfLine as \n character
1794                 loc.linnum++;
1795                 break;
1796
1797             case '"':
1798                 t->len = stringbuffer.offset;
1799                 stringbuffer.writeByte(0);
1800                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1801                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1802                 stringPostfix(t);
1803                 return TOKstring;
1804
1805             case 0:
1806             case 0x1A:
1807                 p--;
1808                 error("unterminated string constant starting at %s", start.toChars());
1809                 t->ustring = (unsigned char *)"";
1810                 t->len = 0;
1811                 t->postfix = 0;
1812                 return TOKstring;
1813
1814             default:
1815                 if (c & 0x80)
1816                 {
1817                     p--;
1818                     c = decodeUTF();
1819                     if (c == LS || c == PS)
1820                     {   c = '\n';
1821                         loc.linnum++;
1822                     }
1823                     p++;
1824                     stringbuffer.writeUTF8(c);
1825                     continue;
1826                 }
1827                 break;
1828         }
1829         stringbuffer.writeByte(c);
1830     }
1831 }
1832
1833 /**************************************
1834  */
1835
1836 TOK Lexer::charConstant(Token *t, int wide)
1837 {
1838     unsigned c;
1839     TOK tk = TOKcharv;
1840
1841     //printf("Lexer::charConstant\n");
1842     p++;
1843     c = *p++;
1844     switch (c)
1845     {
1846         case '\\':
1847             switch (*p)
1848             {
1849                 case 'u':
1850                     t->uns64value = escapeSequence();
1851                     tk = TOKwcharv;
1852                     break;
1853
1854                 case 'U':
1855                 case '&':
1856                     t->uns64value = escapeSequence();
1857                     tk = TOKdcharv;
1858                     break;
1859
1860                 default:
1861                     t->uns64value = escapeSequence();
1862                     break;
1863             }
1864             break;
1865
1866         case '\n':
1867         L1:
1868             loc.linnum++;
1869         case '\r':
1870         case 0:
1871         case 0x1A:
1872         case '\'':
1873             error("unterminated character constant");
1874             return tk;
1875
1876         default:
1877             if (c & 0x80)
1878             {
1879                 p--;
1880                 c = decodeUTF();
1881                 p++;
1882                 if (c == LS || c == PS)
1883                     goto L1;
1884                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1885                     tk = TOKwcharv;
1886                 else
1887                     tk = TOKdcharv;
1888             }
1889             t->uns64value = c;
1890             break;
1891     }
1892
1893     if (*p != '\'')
1894     {   error("unterminated character constant");
1895         return tk;
1896     }
1897     p++;
1898     return tk;
1899 }
1900
1901 /***************************************
1902  * Get postfix of string literal.
1903  */
1904
1905 void Lexer::stringPostfix(Token *t)
1906 {
1907     switch (*p)
1908     {
1909         case 'c':
1910         case 'w':
1911         case 'd':
1912             t->postfix = *p;
1913             p++;
1914             break;
1915
1916         default:
1917             t->postfix = 0;
1918             break;
1919     }
1920 }
1921
1922 /***************************************
1923  * Read \u or \U unicode sequence
1924  * Input:
1925  *      u       'u' or 'U'
      * Returns:
      *      the value accumulated from the hex digits that were read
      * Note:
      *      this function is compiled out by the surrounding #if 0.
1926  */
1927
1928 #if 0
1929 unsigned Lexer::wchar(unsigned u)
1930 {
1931     unsigned value;
1932     unsigned n;
1933     unsigned char c;
1934     unsigned nchars;
1935
         // 'U' is followed by 8 hex digits, anything else ('u') by 4
1936     nchars = (u == 'U') ? 8 : 4;
1937     value = 0;
1938     for (n = 0; 1; n++)
1939     {
             // p is advanced before every test, including the final one that
             // ends the loop, so after nchars digits p has moved nchars + 1 times
1940         ++p;
1941         if (n == nchars)
1942             break;
1943         c = *p;
1944         if (!ishex(c))
1945         {   error("\\%c sequence must be followed by %d hex characters", u, nchars);
                 // stop early; value holds only the digits seen so far
1946             break;
1947         }
             // convert the hex digit character to its numeric value 0..15
1948         if (isdigit(c))
1949             c -= '0';
1950         else if (islower(c))
1951             c -= 'a' - 10;
1952         else
1953             c -= 'A' - 10;
1954         value <<= 4;
1955         value |= c;
1956     }
1957     return value;
1958 }
1959 #endif
1960
1961 /**************************************
1962  * Read in a number.
1963  * If it's an integer, store it in t->uns64value.
1964  *      integers can be decimal, octal, hex or binary
1965  *      Handle the suffixes U, UL, LU, L, etc.
1966  * If it's a real, rewind p to the first character and hand over to inreal().
1967  * Returns:
1968  *      TOKint32v, TOKuns32v, TOKint64v or TOKuns64v for an integer,
1969  *      otherwise the token returned by inreal()
1970  */
1971
1972 TOK Lexer::number(Token *t)
1973 {
1974     // We use a state machine to collect numbers
1975     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1976         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1977         STATE_hexh, STATE_error };
1978     enum STATE state;
1979
1980     enum FLAGS
1981     {   FLAGS_decimal  = 1,             // decimal
1982         FLAGS_unsigned = 2,             // u or U suffix
1983         FLAGS_long     = 4,             // l or L suffix
1984     };
1985     enum FLAGS flags = FLAGS_decimal;
1986
1987     int i;
1988     int base;
1989     unsigned c;
1990     unsigned char *start;
1991     TOK result;
1992
1993     //printf("Lexer::number()\n");
1994     state = STATE_initial;
1995     base = 0;
1996     stringbuffer.reset();
1997     start = p;
         // Phase 1: copy the characters of the literal into stringbuffer,
         // dropping embedded '_' separators.  Every path that recognizes a
         // real number jumps to 'real', which discards this work.
1998     while (1)
1999     {
2000         c = *p;
2001         switch (state)
2002         {
2003             case STATE_initial:         // opening state
2004                 if (c == '0')
2005                     state = STATE_0;
2006                 else
2007                     state = STATE_decimal;
2008                 break;
2009
2010             case STATE_0:
                     // a leading 0 means the literal is not a plain decimal
2011                 flags = (FLAGS) (flags & ~FLAGS_decimal);
2012                 switch (c)
2013                 {
2014 #if ZEROH
2015                     case 'H':                   // 0h
2016                     case 'h':
2017                         goto hexh;
2018 #endif
2019                     case 'X':
2020                     case 'x':
2021                         state = STATE_hex0;
2022                         break;
2023
2024                     case '.':
2025                         if (p[1] == '.')        // .. is a separate token
2026                             goto done;
                             // fall through: "0." starts a real
2027                     case 'i':
2028                     case 'f':
2029                     case 'F':
2030                         goto real;
2031 #if ZEROH
2032                     case 'E':
2033                     case 'e':
2034                         goto case_hex;
2035 #endif
2036                     case 'B':
2037                     case 'b':
2038                         state = STATE_binary0;
2039                         break;
2040
2041                     case '0': case '1': case '2': case '3':
2042                     case '4': case '5': case '6': case '7':
2043                         state = STATE_octal;
2044                         break;
2045
2046 #if ZEROH
2047                     case '8': case '9': case 'A':
2048                     case 'C': case 'D': case 'F':
2049                     case 'a': case 'c': case 'd': case 'f':
2050                     case_hex:
2051                         state = STATE_hexh;
2052                         break;
2053 #endif
2054                     case '_':
2055                         state = STATE_octal;
2056                         p++;
2057                         continue;
2058
2059                     case 'L':
2060                         if (p[1] == 'i')
2061                             goto real;
2062                         goto done;
2063
2064                     default:
2065                         goto done;
2066                 }
2067                 break;
2068
2069             case STATE_decimal:         // reading decimal number
2070                 if (!isdigit(c))
2071                 {
2072 #if ZEROH
2073                     if (ishex(c)
2074                         || c == 'H' || c == 'h'
2075                        )
2076                         goto hexh;
2077 #endif
2078                     if (c == '_')               // ignore embedded _
2079                     {   p++;
2080                         continue;
2081                     }
2082                     if (c == '.' && p[1] != '.')
2083                         goto real;
2084                     else if (c == 'i' || c == 'f' || c == 'F' ||
2085                              c == 'e' || c == 'E')
2086                     {
2087             real:       // It's a real number. Back up and rescan as a real
2088                         p = start;
2089                         return inreal(t);
2090                     }
2091                     else if (c == 'L' && p[1] == 'i')
2092                         goto real;
2093                     goto done;
2094                 }
2095                 break;
2096
2097             case STATE_hex0:            // reading hex number
2098             case STATE_hex:
2099                 if (!ishex(c))
2100                 {
2101                     if (c == '_')               // ignore embedded _
2102                     {   p++;
2103                         continue;
2104                     }
2105                     if (c == '.' && p[1] != '.')
2106                         goto real;
2107                     if (c == 'P' || c == 'p' || c == 'i')
2108                         goto real;
                         // STATE_hex0 means no hex digit has followed the 0x yet
2109                     if (state == STATE_hex0)
2110                         error("Hex digit expected, not '%c'", c);
2111                     goto done;
2112                 }
2113                 state = STATE_hex;
2114                 break;
2115
2116 #if ZEROH
2117             hexh:
2118                 state = STATE_hexh;
2119             case STATE_hexh:            // parse numbers like 0FFh
2120                 if (!ishex(c))
2121                 {
2122                     if (c == 'H' || c == 'h')
2123                     {
2124                         p++;
2125                         base = 16;
2126                         goto done;
2127                     }
2128                     else
2129                     {
2130                         // Check for something like 1E3 or 0E24
2131                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2132                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2133                             goto real;
2134                         error("Hex digit expected, not '%c'", c);
2135                         goto done;
2136                     }
2137                 }
2138                 break;
2139 #endif
2140
2141             case STATE_octal:           // reading octal number
2142             case STATE_octale:          // reading octal number with non-octal digits
2143                 if (!isoctal(c))
2144                 {
2145 #if ZEROH
2146                     if (ishex(c)
2147                         || c == 'H' || c == 'h'
2148                        )
2149                         goto hexh;
2150 #endif
2151                     if (c == '_')               // ignore embedded _
2152                     {   p++;
2153                         continue;
2154                     }
2155                     if (c == '.' && p[1] != '.')
2156                         goto real;
2157                     if (c == 'i')
2158                         goto real;
                         // an 8 or 9: keep collecting, the error is issued at 'done'
2159                     if (isdigit(c))
2160                     {
2161                         state = STATE_octale;
2162                     }
2163                     else
2164                         goto done;
2165                 }
2166                 break;
2167
2168             case STATE_binary0:         // starting binary number
2169             case STATE_binary:          // reading binary number
2170                 if (c != '0' && c != '1')
2171                 {
2172 #if ZEROH
2173                     if (ishex(c)
2174                         || c == 'H' || c == 'h'
2175                        )
2176                         goto hexh;
2177 #endif
2178                     if (c == '_')               // ignore embedded _
2179                     {   p++;
2180                         continue;
2181                     }
2182                     if (state == STATE_binary0)
2183                     {   error("binary digit expected");
2184                         state = STATE_error;
2185                         break;
2186                     }
2187                     else
2188                         goto done;
2189                 }
2190                 state = STATE_binary;
2191                 break;
2192
2193             case STATE_error:           // for error recovery
2194                 if (!isdigit(c))        // scan until non-digit
2195                     goto done;
2196                 break;
2197
2198             default:
2199                 assert(0);
2200         }
2201         stringbuffer.writeByte(c);
2202         p++;
2203     }
2204 done:
2205     stringbuffer.writeByte(0);          // terminate string
2206     if (state == STATE_octale)
2207         error("Octal digit expected");
2208
         // Phase 2: convert the collected text to an integer value
2209     uinteger_t n;                       // unsigned >=64 bit integer type
2210
         // fast path: exactly one digit plus the terminating 0
2211     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2212         n = stringbuffer.data[0] - '0';
2213     else
2214     {
2215         // Convert string to integer
2216 #if __DMC__
2217         errno = 0;
2218         n = strtoull((char *)stringbuffer.data,NULL,base);
2219         if (errno == ERANGE)
2220             error("integer overflow");
2221 #else
2222         // Not everybody implements strtoull()
             // NB: this local p hides the member p until the end of this block
2223         char *p = (char *)stringbuffer.data;
2224         int r = 10, d;
2225
             // pick the radix from the prefix: 0x -> 16, 0b -> 2, 0<digit> -> 8
2226         if (*p == '0')
2227         {
2228             if (p[1] == 'x' || p[1] == 'X')
2229                 p += 2, r = 16;
2230             else if (p[1] == 'b' || p[1] == 'B')
2231                 p += 2, r = 2;
2232             else if (isdigit(p[1]))
2233                 p += 1, r = 8;
2234         }
2235
2236         n = 0;
2237         while (1)
2238         {
2239             if (*p >= '0' && *p <= '9')
2240                 d = *p - '0';
2241             else if (*p >= 'a' && *p <= 'z')
2242                 d = *p - 'a' + 10;
2243             else if (*p >= 'A' && *p <= 'Z')
2244                 d = *p - 'A' + 10;
2245             else
2246                 break;
2247             if (d >= r)
2248                 break;
                 // Detect wrap-around in both steps of n * r + d.  The product
                 // wrapped if dividing it by r does not give n back; the sum
                 // wrapped if it is smaller than the product it started from.
                 // (Comparing only the final result against n misses cases
                 // where the wrapped value is still larger than n.)
                 uinteger_t n2 = n * r;
2249             if (n2 / r != n || n2 + d < n2)
2250             {
2251                 error ("integer overflow");
2252                 break;
2253             }
2254
2255             n = n2 + d;
2256             p++;
2257         }
2258 #endif
2259         if (sizeof(n) > 8 &&
2260             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2261             error("integer overflow");
2262     }
2263
2264     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2265     while (1)
2266     {   unsigned char f;
2267
2268         switch (*p)
2269         {   case 'U':
2270             case 'u':
2271                 f = FLAGS_unsigned;
2272                 goto L1;
2273
2274             case 'l':
                     // the "1 ||" makes this error unconditional
2275                 if (1 || !global.params.useDeprecated)
2276                     error("'l' suffix is deprecated, use 'L' instead");
2277             case 'L':
2278                 f = FLAGS_long;
2279             L1:
2280                 p++;
                     // the same kind of suffix given twice, e.g. "UU" or "LL"
2281                 if (flags & f)
2282                     error("unrecognized token");
2283                 flags = (FLAGS) (flags | f);
2284                 continue;
2285             default:
2286                 break;
2287         }
2288         break;
2289     }
2290
         // Phase 3: choose the token type from the suffix flags and the magnitude
2291     switch (flags)
2292     {
2293         case 0:
2294             /* Octal or Hexadecimal constant.
2295              * First that fits: int, uint, long, ulong
2296              */
2297             if (n & 0x8000000000000000LL)
2298                     result = TOKuns64v;
2299             else if (n & 0xFFFFFFFF00000000LL)
2300                     result = TOKint64v;
2301             else if (n & 0x80000000)
2302                     result = TOKuns32v;
2303             else
2304                     result = TOKint32v;
2305             break;
2306
2307         case FLAGS_decimal:
2308             /* First that fits: int, long, long long
2309              */
2310             if (n & 0x8000000000000000LL)
2311             {       error("signed integer overflow");
2312                     result = TOKuns64v;
2313             }
2314             else if (n & 0xFFFFFFFF80000000LL)
2315                     result = TOKint64v;
2316             else
2317                     result = TOKint32v;
2318             break;
2319
2320         case FLAGS_unsigned:
2321         case FLAGS_decimal | FLAGS_unsigned:
2322             /* First that fits: uint, ulong
2323              */
2324             if (n & 0xFFFFFFFF00000000LL)
2325                     result = TOKuns64v;
2326             else
2327                     result = TOKuns32v;
2328             break;
2329
2330         case FLAGS_decimal | FLAGS_long:
2331             if (n & 0x8000000000000000LL)
2332             {       error("signed integer overflow");
2333                     result = TOKuns64v;
2334             }
2335             else
2336                     result = TOKint64v;
2337             break;
2338
2339         case FLAGS_long:
2340             if (n & 0x8000000000000000LL)
2341                     result = TOKuns64v;
2342             else
2343                     result = TOKint64v;
2344             break;
2345
2346         case FLAGS_unsigned | FLAGS_long:
2347         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2348             result = TOKuns64v;
2349             break;
2350
2351         default:
2352             #ifdef DEBUG
2353                 printf("%x\n",flags);
2354             #endif
2355             assert(0);
2356     }
2357     t->uns64value = n;
2358     return result;
2359 }
2360
2361 /**************************************
2362  * Read in characters, converting them to real.
      * The characters of the literal (without embedded '_') are collected in
      * stringbuffer, parsed at 80 bit precision into t->float80value, and the
      * suffix (f/F, l/L, i/I) then selects the token type that is returned.
2363  * Bugs:
2364  *      Exponent overflow not detected.
2365  *      Too much requested precision is not detected.
2366  */
2367
2368 TOK Lexer::inreal(Token *t)
     // Contract blocks, only compiled when __DMC__ is defined
2369 #ifdef __DMC__
2370 __in
2371 {
2372     assert(*p == '.' || isdigit(*p));
2373 }
2374 __out (result)
2375 {
2376     switch (result)
2377     {
2378         case TOKfloat32v:
2379         case TOKfloat64v:
2380         case TOKfloat80v:
2381         case TOKimaginary32v:
2382         case TOKimaginary64v:
2383         case TOKimaginary80v:
2384             break;
2385
2386         default:
2387             assert(0);
2388     }
2389 }
2390 __body
2391 #endif /* __DMC__ */
2392 {   int dblstate;
2393     unsigned c;
2394     char hex;                   // is this a hexadecimal-floating-constant?
2395     TOK result;
2396
2397     //printf("Lexer::inreal()\n");
2398     stringbuffer.reset();
2399     dblstate = 0;
2400     hex = 0;
     // Lnext is the re-entry point used to drop a character (an embedded '_')
     // without writing it to stringbuffer.
2401 Lnext:
2402     while (1)
2403     {
2404         // Get next char from input
2405         c = *p++;
2406         //printf("dblstate = %d, c = '%c'\n", dblstate, c);
             // The inner loop lets a state hand the same character on to the
             // next state: 'continue' re-runs the switch, 'break' leaves it so
             // that the character is appended to stringbuffer below.
2407         while (1)
2408         {
2409             switch (dblstate)
2410             {
2411                 case 0:                 // opening state
2412                     if (c == '0')
2413                         dblstate = 9;
2414                     else if (c == '.')
2415                         dblstate = 3;
2416                     else
2417                         dblstate = 1;
2418                     break;
2419
                     // state 9: the character right after a leading '0'
2420                 case 9:
2421                     dblstate = 1;
2422                     if (c == 'X' || c == 'x')
2423                     {   hex++;
2424                         break;
2425                     }
                         // not a hex prefix: treat as an ordinary digit position
2426                 case 1:                 // digits to left of .
2427                 case 3:                 // digits to right of .
2428                 case 7:                 // continuing exponent digits
2429                     if (!isdigit(c) && !(hex && isxdigit(c)))
2430                     {
2431                         if (c == '_')
2432                             goto Lnext; // ignore embedded '_'
                             // end of this digit run: move to the following
                             // state (2, 4 or 8) and re-examine the same char
2433                         dblstate++;
2434                         continue;
2435                     }
2436                     break;
2437
2438                 case 2:                 // no more digits to left of .
2439                     if (c == '.')
2440                     {   dblstate++;
2441                         break;
2442                     }
                         // no '.': fall through and look for an exponent
2443                 case 4:                 // no more digits to right of .
2444                     if ((c == 'E' || c == 'e') ||
2445                         hex && (c == 'P' || c == 'p'))
2446                     {   dblstate = 5;
2447                         hex = 0;        // exponent is always decimal
2448                         break;
2449                     }
                         // hex is still set here only if no exponent marker was seen
2450                     if (hex)
2451                         error("binary-exponent-part required");
2452                     goto done;
2453
2454                 case 5:                 // looking immediately to right of E
2455                     dblstate++;
2456                     if (c == '-' || c == '+')
2457                         break;
                         // no sign: this char must already be the first digit
2458                 case 6:                 // 1st exponent digit expected
2459                     if (!isdigit(c))
2460                         error("exponent expected");
2461                     dblstate++;
2462                     break;
2463
2464                 case 8:                 // past end of exponent digits
2465                     goto done;
2466             }
2467             break;
2468         }
2469         stringbuffer.writeByte(c);
2470     }
2471 done:
         // un-read the character that terminated the literal
2472     p--;
2473
2474     stringbuffer.writeByte(0);
2475
     // Force '.' as the decimal point while parsing; restored further down
2476 #if _WIN32 && __DMC__
2477     char *save = __locale_decpoint;
2478     __locale_decpoint = ".";
2479 #endif
2480 #ifdef IN_GCC
2481     t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2482 #else
2483     t->float80value = strtold((char *)stringbuffer.data, NULL);
2484 #endif
         // errno is cleared only after the 80 bit parse above, so the ERANGE
         // test at the end of the function can only be triggered by the
         // float/double parses in the switch below.  Their return values are
         // discarded; the 'l'/'L' cases do not parse again.
2485     errno = 0;
2486     switch (*p)
2487     {
2488         case 'F':
2489         case 'f':
2490 #ifdef IN_GCC
2491             real_t::parse((char *)stringbuffer.data, real_t::Float);
2492 #else
2493             strtof((char *)stringbuffer.data, NULL);
2494 #endif
2495             result = TOKfloat32v;
2496             p++;
2497             break;
2498
                 // no type suffix: the literal is a double; p is not advanced
2499         default:
2500 #ifdef IN_GCC
2501             real_t::parse((char *)stringbuffer.data, real_t::Double);
2502 #else
2503             strtod((char *)stringbuffer.data, NULL);
2504 #endif
2505             result = TOKfloat64v;
2506             break;
2507
2508         case 'l':
2509             if (!global.params.useDeprecated)
2510                 error("'l' suffix is deprecated, use 'L' instead");
2511         case 'L':
2512             result = TOKfloat80v;
2513             p++;
2514             break;
2515     }
         // a trailing 'i' (or deprecated 'I') turns the real into an imaginary
2516     if (*p == 'i' || *p == 'I')
2517     {
2518         if (!global.params.useDeprecated && *p == 'I')
2519             error("'I' suffix is deprecated, use 'i' instead");
2520         p++;
2521         switch (result)
2522         {
2523             case TOKfloat32v:
2524                 result = TOKimaginary32v;
2525                 break;
2526             case TOKfloat64v:
2527                 result = TOKimaginary64v;
2528                 break;
2529             case TOKfloat80v:
2530                 result = TOKimaginary80v;
2531                 break;
2532         }
2533     }
2534 #if _WIN32 && __DMC__
2535     __locale_decpoint = save;
2536 #endif
2537     if (errno == ERANGE)
2538         error("number is not representable");
2539     return result;
2540 }
2541
2542 /*********************************************
2543  * Do pragma.
2544  * Currently, the only pragma supported is:
2545  *      #line linnum [filespec]
      * where filespec is a "quoted string" or __FILE__.
      * On success this->loc.linnum is set to the given number minus one and,
      * if a filespec was seen, this->loc.filename to a copy of it.
      * Anything else reports the "#line integer ..." error at the location
      * the lexer had when this function was entered.
2546  */
2547
2548 void Lexer::pragma()
2549 {
2550     Token tok;
2551     int linnum;
2552     char *filespec = NULL;
         // snapshot of the location at entry, used for the error message
         // and for the file name substituted for __FILE__
2553     Loc loc = this->loc;
2554
2555     while (isblank(*p)) p++;
2556     if (*p == '\n')
2557         goto Lerr;
2558
         // the first token must be the identifier 'line'
2559     scan(&tok);
2560     if (tok.value != TOKidentifier || tok.ident != Id::line)
2561         goto Lerr;
2562
         // the second token must be an int or long integer literal
2563     scan(&tok);
2564     if (tok.value == TOKint32v || tok.value == TOKint64v)
2565         linnum = tok.uns64value - 1;
2566     else
2567         goto Lerr;
2568
         // scan the remainder of the line for an optional filespec
2569     while (1)
2570     {
2571         switch (*p)
2572         {
2573             case 0:
2574             case 0x1A:
2575             case '\n':
2576             Lnewline:
                     // end of the directive: commit the new position
2577                 this->loc.linnum = linnum;
2578                 if (filespec)
2579                     this->loc.filename = filespec;
2580                 return;
2581
2582             case '\r':
2583                 p++;
                     // a lone \r ends the line; for \r\n go round again so
                     // that the \n case finishes the directive
2584                 if (*p != '\n')
2585                 {   p--;
2586                     goto Lnewline;
2587                 }
2588                 continue;
2589
2590             case ' ':
2591             case '\t':
2592             case '\v':
2593             case '\f':
2594                 p++;
2595                 continue;                       // skip white space
2596
2597             case '_':
                     // Only __FILE__ is accepted here.  Anything else is a
                     // syntax error: looping again without advancing p would
                     // examine the same '_' forever.
2598                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2599                 {
2600                     p += 8;
2601                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2602                     continue;
2603                 }
                     goto Lerr;
2604
2605             case '"':
                     // only one filespec is allowed
2606                 if (filespec)
2607                     goto Lerr;
2608                 stringbuffer.reset();
2609                 p++;
2610                 while (1)
2611                 {   unsigned c;
2612
2613                     c = *p;
2614                     switch (c)
2615                     {
                             // the string must be closed on the same line
2616                         case '\n':
2617                         case '\r':
2618                         case 0:
2619                         case 0x1A:
2620                             goto Lerr;
2621
2622                         case '"':
2623                             stringbuffer.writeByte(0);
2624                             filespec = mem.strdup((char *)stringbuffer.data);
2625                             p++;
2626                             break;
2627
2628                         default:
                                 // NOTE(review): decodeUTF() moves p to the last
                                 // byte of the sequence but only the lead byte c
                                 // is written, so the remaining bytes appear to be
                                 // dropped from filespec - confirm.
2629                             if (c & 0x80)
2630                             {   unsigned u = decodeUTF();
2631                                 if (u == PS || u == LS)
2632                                     goto Lerr;
2633                             }
2634                             stringbuffer.writeByte(c);
2635                             p++;
2636                             continue;
2637                     }
2638                     break;
2639                 }
2640                 continue;
2641
2642             default:
                     // a Unicode line/paragraph separator also ends the directive
2643                 if (*p & 0x80)
2644                 {   unsigned u = decodeUTF();
2645                     if (u == PS || u == LS)
2646                         goto Lnewline;
2647                 }
2648                 goto Lerr;
2649         }
2650     }
2651
2652 Lerr:
2653     error(loc, "#line integer [\"filespec\"]\\n expected");
2654 }
2655
2656
2657 /********************************************
2658  * Decode the multi-byte UTF-8 sequence that starts at p.
2659  * An invalid sequence is reported through error().
2660  * Returns the decoded character; p is moved forward by the number of
2661  * bytes consumed minus one, i.e. onto the last byte of the sequence.
2662  */
2663
2664 unsigned Lexer::decodeUTF()
2665 {
2666     unsigned char *seq = p;
2667
2668     assert(*seq & 0x80);
2669
2670     // Bytes offered to the decoder: at most 6, cut short at a 0 byte
2671     size_t avail = 1;
2672     while (avail < 6 && seq[avail])
2673         avail++;
2674
2675     dchar_t decoded;
2676     size_t consumed = 0;
2677     char *errmsg = utf_decodeChar(seq, avail, &consumed, &decoded);
2678
2679     p += consumed - 1;
2680     if (errmsg)
2681         error("%s", errmsg);
2682     return decoded;
2683 }
2688
2689
2690 /***************************************************
2691  * Parse doc comment embedded between t->ptr and p.
2692  * Remove trailing blanks and tabs from lines.
2693  * Replace all newlines with \n.
2694  * Remove leading comment character from each line.
2695  * Decide if it's a lineComment or a blockComment.
2696  * Append to previous one for this token.
      * Input:
      *      t               token whose lineComment or blockComment receives the text
      *      lineComment     nonzero if the comment may be attached as a line comment
2697  */
2698
2699 void Lexer::getDocComment(Token *t, unsigned lineComment)
2700 {
2701     OutBuffer buf;
         // ct is the third character of the comment; it is compared below
         // against '*', '+' and '/'
2702     unsigned char ct = t->ptr[2];
2703     unsigned char *q = t->ptr + 3;      // start of comment text
         // linestart is set after a newline and stays set across blanks/tabs,
         // so that a leading ct on the new line can be recognized and dropped
2704     int linestart = 0;
2705
2706     unsigned char *qend = p;
         // for '*' and '+' comments leave out the last two characters
         // (presumably the closing */ or +/)
2707     if (ct == '*' || ct == '+')
2708         qend -= 2;
2709
2710     /* Scan over initial row of ****'s or ++++'s or ////'s
2711      */
2712     for (; q < qend; q++)
2713     {
2714         if (*q != ct)
2715             break;
2716     }
2717
2718     /* Remove trailing row of ****'s or ++++'s
2719      */
2720     if (ct != '/')
2721     {
2722         for (; q < qend; qend--)
2723         {
2724             if (qend[-1] != ct)
2725                 break;
2726         }
2727     }
2728
         // Copy the remaining text into buf one byte at a time.  'continue'
         // skips the byte, 'break' out of the switch writes c to buf.
2729     for (; q < qend; q++)
2730     {
2731         unsigned char c = *q;
2732
2733         switch (c)
2734         {
2735             case '*':
2736             case '+':
                     // the comment character as first non-blank of a line is dropped
2737                 if (linestart && c == ct)
2738                 {   linestart = 0;
2739                     /* Trim preceding whitespace up to preceding \n
2740                      */
2741                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2742                         buf.offset--;
2743                     continue;
2744                 }
2745                 break;
2746
                 // blanks and tabs are copied and leave linestart unchanged
2747             case ' ':
2748             case '\t':
2749                 break;
2750
2751             case '\r':
2752                 if (q[1] == '\n')
2753                     continue;           // skip the \r
                     // a lone \r counts as a newline
2754                 goto Lnewline;
2755
2756             default:
                     // 226 128 168 and 226 128 169 are the bytes E2 80 A8 and
                     // E2 80 A9, the UTF-8 forms of LS (0x2028) and PS (0x2029)
2757                 if (c == 226)
2758                 {
2759                     // If LS or PS
2760                     if (q[1] == 128 &&
2761                         (q[2] == 168 || q[2] == 169))
2762                     {
2763                         q += 2;
2764                         goto Lnewline;
2765                     }
2766                 }
2767                 linestart = 0;
2768                 break;
2769
2770             Lnewline:
2771                 c = '\n';               // replace all newlines with \n
2772             case '\n':
2773                 linestart = 1;
2774
2775                 /* Trim trailing whitespace
2776                  */
2777                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2778                     buf.offset--;
2779
2780                 break;
2781         }
2782         buf.writeByte(c);
2783     }
2784
2785     // Always end with a newline
2786     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2787         buf.writeByte('\n');
2788
         // 0-terminate so that the text can be handled as a C string
2789     buf.writeByte(0);
2790
2791     // It's a line comment if the start of the doc comment comes
2792     // after other non-whitespace on the same line.
2793     unsigned char** dc = (lineComment && anyToken)
2794                          ? &t->lineComment
2795                          : &t->blockComment;
2796
2797     // Combine with previous doc comment, if any
         // (combineComments() copies both strings into a new allocation;
         // otherwise the buffer's own data is taken via extractData())
2798     if (*dc)
2799         *dc = combineComments(*dc, (unsigned char *)buf.data);
2800     else
2801         *dc = (unsigned char *)buf.extractData();
2802 }
2803
2804 /********************************************
2805  * Combine two document comments into one.
2806  * A NULL argument yields the other argument unchanged; otherwise the
2807  * result is a newly allocated string: c1, then '\n', then c2.
2808  */
2809
2810 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2811 {
2812     if (!c1)
2813         return c2;
2814     if (!c2)
2815         return c1;
2816
2817     const size_t n1 = strlen((char *)c1);
2818     const size_t n2 = strlen((char *)c2);
2819     // room for c1, the separating '\n', c2 and the terminating 0
2820     unsigned char *joined = (unsigned char *)mem.malloc(n1 + 1 + n2 + 1);
2821
2822     memcpy(joined, c1, n1);
2823     joined[n1] = '\n';
2824     memcpy(joined + n1 + 1, c2, n2);
2825     joined[n1 + 1 + n2] = 0;
2826     return joined;
2827 }
2827
2828 /********************************************
2829  * Return the Identifier stored in the string table entry for s.
2830  * If the entry has none yet, one is created from the entry's own string
2831  * with token value TOKidentifier and stored in the entry first.
2832  */
2833
2834 Identifier *Lexer::idPool(const char *s)
2835 {
2836     StringValue *entry = stringtable.update(s, strlen(s));
2837
2838     if (!entry->ptrvalue)
2839         entry->ptrvalue = new Identifier(entry->lstring.string, TOKidentifier);
2840     return (Identifier *) entry->ptrvalue;
2841 }
2844
2845 /*********************************************
2846  * Create the identifier whose name is the prefix s followed by num in decimal.
2847  */
2848
2849 Identifier *Lexer::uniqueId(const char *s, int num)
2850 {
2851     char name[32];
2852     // the prefix plus 3 characters per byte of num plus the 0 must fit
2853     assert(strlen(s) + sizeof(num) * 3 + 1 <= sizeof(name));
2854     sprintf(name, "%s%d", s, num);
2855     return idPool(name);
2856 }
2857
2858 Identifier *Lexer::uniqueId(const char *s)
2859 {   // function-local counter; incremented before use, so the first suffix is 1
2860     static int serial;
2861     serial += 1;
2862     return uniqueId(s, serial);
     }
2863
2864 /****************************************
2865  * Entry of the keyword table: a keyword's spelling and its token value.
2866  */
2867 struct Keyword {
2868     char *name;         // keyword text
2869     TOK value;          // token value for that keyword
2870 };
2871
/* Table of reserved words and the token each one maps to.
 * Scanned by Token::isKeyword() and used by Lexer::initKeywords() to
 * populate the string table and Token::tochars[].
 */
static Keyword keywords[] =
{
// Entry template:
//    { "",             TOK     },

    // Expression-level keywords
    {   "this",         TOKthis         },
    {   "super",        TOKsuper        },
    {   "assert",       TOKassert       },
    {   "null",         TOKnull         },
    {   "true",         TOKtrue         },
    {   "false",        TOKfalse        },
    {   "cast",         TOKcast         },
    {   "new",          TOKnew          },
    {   "delete",       TOKdelete       },
    {   "throw",        TOKthrow        },
    {   "module",       TOKmodule       },
    {   "pragma",       TOKpragma       },
    {   "typeof",       TOKtypeof       },
    {   "typeid",       TOKtypeid       },

    {   "template",     TOKtemplate     },

    // Basic types
    {   "void",         TOKvoid         },
    {   "byte",         TOKint8         },
    {   "ubyte",        TOKuns8         },
    {   "short",        TOKint16        },
    {   "ushort",       TOKuns16        },
    {   "int",          TOKint32        },
    {   "uint",         TOKuns32        },
    {   "long",         TOKint64        },
    {   "ulong",        TOKuns64        },
    {   "cent",         TOKcent,        },
    {   "ucent",        TOKucent,       },
    {   "float",        TOKfloat32      },
    {   "double",       TOKfloat64      },
    {   "real",         TOKfloat80      },

    {   "bool",         TOKbool         },
    {   "char",         TOKchar         },
    {   "wchar",        TOKwchar        },
    {   "dchar",        TOKdchar        },

    {   "ifloat",       TOKimaginary32  },
    {   "idouble",      TOKimaginary64  },
    {   "ireal",        TOKimaginary80  },

    {   "cfloat",       TOKcomplex32    },
    {   "cdouble",      TOKcomplex64    },
    {   "creal",        TOKcomplex80    },

    {   "delegate",     TOKdelegate     },
    {   "function",     TOKfunction     },

    // Statements and control flow
    {   "is",           TOKis           },
    {   "if",           TOKif           },
    {   "else",         TOKelse         },
    {   "while",        TOKwhile        },
    {   "for",          TOKfor          },
    {   "do",           TOKdo           },
    {   "switch",       TOKswitch       },
    {   "case",         TOKcase         },
    {   "default",      TOKdefault      },
    {   "break",        TOKbreak        },
    {   "continue",     TOKcontinue     },
    {   "synchronized", TOKsynchronized },
    {   "return",       TOKreturn       },
    {   "goto",         TOKgoto         },
    {   "try",          TOKtry          },
    {   "catch",        TOKcatch        },
    {   "finally",      TOKfinally      },
    {   "with",         TOKwith         },
    {   "asm",          TOKasm          },
    {   "foreach",      TOKforeach      },
    {   "foreach_reverse",      TOKforeach_reverse      },
    {   "reversed",     TOKreversed     },
    {   "scope",        TOKscope        },

    // Declarations, aggregates and storage classes
    {   "struct",       TOKstruct       },
    {   "class",        TOKclass        },
    {   "interface",    TOKinterface    },
    {   "union",        TOKunion        },
    {   "enum",         TOKenum         },
    {   "import",       TOKimport       },
    {   "mixin",        TOKmixin        },
    {   "static",       TOKstatic       },
    {   "final",        TOKfinal        },
    {   "const",        TOKconst        },
    {   "typedef",      TOKtypedef      },
    {   "alias",        TOKalias        },
    {   "override",     TOKoverride     },
    {   "abstract",     TOKabstract     },
    {   "volatile",     TOKvolatile     },
    {   "debug",        TOKdebug        },
    {   "deprecated",   TOKdeprecated   },
    {   "in",           TOKin           },
    {   "out",          TOKout          },
    {   "inout",        TOKinout        },
    {   "lazy",         TOKlazy         },
    {   "auto",         TOKauto         },

    // Alignment, linkage and protection attributes
    {   "align",        TOKalign        },
    {   "extern",       TOKextern       },
    {   "private",      TOKprivate      },
    {   "package",      TOKpackage      },
    {   "protected",    TOKprotected    },
    {   "public",       TOKpublic       },
    {   "export",       TOKexport       },

    {   "body",         TOKbody         },
    {   "invariant",    TOKinvariant    },
    {   "unittest",     TOKunittest     },
    {   "version",      TOKversion      },
    //{ "manifest",     TOKmanifest     },

    // Added after 1.0
    {   "ref",          TOKref          },
    {   "macro",        TOKmacro        },


    // TAL
    // "and" / "or" / "not" share the tokens of the && / || / ! operators.
    {   "and",          TOKandand       },
    {   "or",           TOKoror         },
    {   "not",          TOKnot          },
    {   "extends",      TOKextends      },
    {   "implements",   TOKimplements   },
    {   "log_error",    TOKlog_error    },
    {   "log_warning",  TOKlog_warning  },
    {   "log_info",     TOKlog_info     },
    {   "log_trace",    TOKlog_trace    },
#if V2
    // Only compiled in when V2 is set
    {   "pure",         TOKpure         },
    {   "nothrow",      TOKnothrow      },
    {   "__traits",     TOKtraits       },
    {   "__overloadset", TOKoverloadset },
#endif
};
3007
3008 int Token::isKeyword()
3009 {
3010     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3011     {
3012         if (keywords[u].value == value)
3013             return 1;
3014     }
3015     return 0;
3016 }
3017
3018 void Lexer::initKeywords()
3019 {   StringValue *sv;
3020     unsigned u;
3021     enum TOK v;
3022     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3023
3024     if (global.params.Dversion == 1)
3025         nkeywords -= 2;
3026
3027     cmtable_init();
3028
3029     for (u = 0; u < nkeywords; u++)
3030     {   char *s;
3031
3032         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3033         s = keywords[u].name;
3034         v = keywords[u].value;
3035         sv = stringtable.insert(s, strlen(s));
3036         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3037
3038         //printf("tochars[%d] = '%s'\n",v, s);
3039         Token::tochars[v] = s;
3040     }
3041
3042     Token::tochars[TOKeof]              = "EOF";
3043     Token::tochars[TOKlcurly]           = "{";
3044     Token::tochars[TOKrcurly]           = "}";
3045     Token::tochars[TOKlparen]           = "(";
3046     Token::tochars[TOKrparen]           = ")";
3047     Token::tochars[TOKlbracket]         = "[";
3048     Token::tochars[TOKrbracket]         = "]";
3049     Token::tochars[TOKsemicolon]        = ";";
3050     Token::tochars[TOKcolon]            = ":";
3051     Token::tochars[TOKcomma]            = ",";
3052     Token::tochars[TOKdot]              = ".";
3053     Token::tochars[TOKxor]              = "^";
3054     Token::tochars[TOKxorass]           = "^=";
3055     Token::tochars[TOKassign]           = "=";
3056     Token::tochars[TOKconstruct]        = "=";
3057 #if V2
3058     Token::tochars[TOKblit]             = "=";
3059 #endif
3060     Token::tochars[TOKlt]               = "<";
3061     Token::tochars[TOKgt]               = ">";
3062     Token::tochars[TOKle]               = "<=";
3063     Token::tochars[TOKge]               = ">=";
3064     Token::tochars[TOKequal]            = "==";
3065     Token::tochars[TOKnotequal]         = "!=";
3066     Token::tochars[TOKnotidentity]      = "!is";
3067     Token::tochars[TOKtobool]           = "!!";
3068     Token::tochars[TOKat]               = "@";
3069
3070     Token::tochars[TOKunord]            = "!<>=";
3071     Token::tochars[TOKue]               = "!<>";
3072     Token::tochars[TOKlg]               = "<>";
3073     Token::tochars[TOKleg]              = "<>=";
3074     Token::tochars[TOKule]              = "!>";
3075     Token::tochars[TOKul]               = "!>=";
3076     Token::tochars[TOKuge]              = "!<";
3077     Token::tochars[TOKug]               = "!<=";
3078
3079     Token::tochars[TOKnot]              = "!";
3080     Token::tochars[TOKtobool]           = "!!";
3081     Token::tochars[TOKshl]              = "<<";
3082     Token::tochars[TOKshr]              = ">>";
3083     Token::tochars[TOKushr]             = ">>>";
3084     Token::tochars[TOKadd]              = "+";
3085     Token::tochars[TOKmin]              = "-";
3086     Token::tochars[TOKmul]              = "*";
3087     Token::tochars[TOKdiv]              = "/";
3088     Token::tochars[TOKmod]              = "%";
3089     Token::tochars[TOKslice]            = "..";
3090     Token::tochars[TOKdotdotdot]        = "...";
3091     Token::tochars[TOKand]              = "&";
3092     Token::tochars[TOKandand]           = "&&";
3093     Token::tochars[TOKor]               = "|";
3094     Token::tochars[TOKoror]             = "||";
3095     Token::tochars[TOKarray]            = "[]";
3096     Token::tochars[TOKindex]            = "[i]";
3097     Token::tochars[TOKaddress]          = "&";
3098     Token::tochars[TOKstar]             = "*";
3099     Token::tochars[TOKtilde]            = "~";
3100     Token::tochars[TOKdollar]           = "$";
3101     Token::tochars[TOKcast]             = "cast";
3102     Token::tochars[TOKplusplus]         = "++";
3103     Token::tochars[TOKminusminus]       = "--";
3104     Token::tochars[TOKtype]             = "type";
3105     Token::tochars[TOKquestion]         = "?";
3106     Token::tochars[TOKneg]              = "-";
3107     Token::tochars[TOKuadd]             = "+";
3108     Token::tochars[TOKvar]              = "var";
3109     Token::tochars[TOKaddass]           = "+=";
3110     Token::tochars[TOKminass]           = "-=";
3111     Token::tochars[TOKmulass]           = "*=";
3112     Token::tochars[TOKdivass]           = "/=";
3113     Token::tochars[TOKmodass]           = "%=";
3114     Token::tochars[TOKshlass]           = "<<=";
3115     Token::tochars[TOKshrass]           = ">>=";
3116     Token::tochars[TOKushrass]          = ">>>=";
3117     Token::tochars[TOKandass]           = "&=";
3118     Token::tochars[TOKorass]            = "|=";
3119     Token::tochars[TOKcatass]           = "~=";
3120     Token::tochars[TOKcat]              = "~";
3121     Token::tochars[TOKcall]             = "call";
3122     Token::tochars[TOKidentity]         = "is";
3123     Token::tochars[TOKnotidentity]      = "!is";
3124     Token::tochars[TOKendline]          = "\\n";
3125
3126     Token::tochars[TOKorass]            = "|=";
3127     Token::tochars[TOKidentifier]       = "identifier";
3128
3129      // For debugging
3130     Token::tochars[TOKdotexp]           = "dotexp";
3131     Token::tochars[TOKdotti]            = "dotti";
3132     Token::tochars[TOKdotvar]           = "dotvar";
3133     Token::tochars[TOKdottype]          = "dottype";
3134     Token::tochars[TOKsymoff]           = "symoff";
3135     Token::tochars[TOKtypedot]          = "typedot";
3136     Token::tochars[TOKarraylength]      = "arraylength";
3137     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3138     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3139     Token::tochars[TOKstructliteral]    = "structliteral";
3140     Token::tochars[TOKstring]           = "string";
3141     Token::tochars[TOKdsymbol]          = "symbol";
3142     Token::tochars[TOKtuple]            = "tuple";
3143     Token::tochars[TOKdeclaration]      = "declaration";
3144     Token::tochars[TOKdottd]            = "dottd";
3145     Token::tochars[TOKlogger]           = "logger";
3146     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3147 }