dmd2/lexer.c

   1
   2 // Compiler implementation of the D programming language
   3 // Copyright (c) 1999-2008 by Digital Mars
   4 // All Rights Reserved
   5 // written by Walter Bright
   6 // http://www.digitalmars.com
   7 // License for redistribution is by either the Artistic License
   8 // in artistic.txt, or the GNU General Public License in gnu.txt.
   9 // See the included readme.txt for details.
  10
  11 /* NOTE: This file has been patched from the original DMD distribution to
  12    work with the GDC compiler.
  13
  14    Modified by David Friedman, December 2006
  15 */
  16
  17 /* Lexical Analyzer */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21 #include <ctype.h>
  22 #include <stdarg.h>
  23 #include <errno.h>
  24 //#include <wchar.h>
  25 #include <stdlib.h>
  26 #include <assert.h>
  27 #include <sys/time.h>
  28
  29 #ifdef IN_GCC
  30
  31 #include <time.h>
  32 #include "mem.h"
  33
  34 #else
  35
  36 #if __GNUC__
  37 #include <time.h>
  38 #endif
  39
  40 #if _WIN32
  41 #include "..\root\mem.h"
  42 #else
  43 #include "../root/mem.h"
  44 #endif
  45 #endif
  46
  47 #include "stringtable.h"
  48
  49 #include "lexer.h"
  50 #include "utf.h"
  51 #include "identifier.h"
  52 #include "id.h"
  53 #include "module.h"
  54
  55 #if _WIN32 && __DMC__
  56 // from \dm\src\include\setlocal.h
  57 extern "C" char * __cdecl __locale_decpoint;
  58 #endif
  59
  60 extern int HtmlNamedEntity(unsigned char *p, int length);
  61
  62 #define LS 0x2028       // UTF line separator
  63 #define PS 0x2029       // UTF paragraph separator
  64
  65 /********************************************
  66  * Do our own char maps
  67  */
  68
  69 static unsigned char cmtable[256];
  70
  71 const int CMoctal =     0x1;
  72 const int CMhex =       0x2;
  73 const int CMidchar =    0x4;
  74
  75 inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; }
  76 inline unsigned char ishex   (unsigned char c) { return cmtable[c] & CMhex; }
  77 inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; }
  78
  79 static void cmtable_init()
  80 {
  81     for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++)
  82     {
  83         if ('0' <= c && c <= '7')
  84             cmtable[c] |= CMoctal;
  85         if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
  86             cmtable[c] |= CMhex;
  87         if (isalnum(c) || c == '_')
  88             cmtable[c] |= CMidchar;
  89     }
  90 }
  91
  92
  93 /************************* Token **********************************************/
  94
  95 char *Token::tochars[TOKMAX];
  96
  97 void *Token::operator new(size_t size)
  98 {   Token *t;
  99
 100     if (Lexer::freelist)
 101     {
 102         t = Lexer::freelist;
 103         Lexer::freelist = t->next;
 104         return t;
 105     }
 106
 107     return ::operator new(size);
 108 }
 109
 110 #ifdef DEBUG
 111 void Token::print()
 112 {
 113     fprintf(stdmsg, "%s\n", toChars());
 114 }
 115 #endif
 116
 117 char *Token::toChars()
 118 {   char *p;
 119     static char buffer[3 + 3 * sizeof(value) + 1];
 120
 121     p = buffer;
 122     switch (value)
 123     {
 124         case TOKint32v:
 125 #if IN_GCC
 126             sprintf(buffer,"%d",(d_int32)int64value);
 127 #else
 128             sprintf(buffer,"%d",int32value);
 129 #endif
 130             break;
 131
 132         case TOKuns32v:
 133         case TOKcharv:
 134         case TOKwcharv:
 135         case TOKdcharv:
 136 #if IN_GCC
 137             sprintf(buffer,"%uU",(d_uns32)uns64value);
 138 #else
 139             sprintf(buffer,"%uU",uns32value);
 140 #endif
 141             break;
 142
 143         case TOKint64v:
 144             sprintf(buffer,"%"PRIdMAX"L",int64value);
 145             break;
 146
 147         case TOKuns64v:
 148             sprintf(buffer,"%"PRIuMAX"UL",uns64value);
 149             break;
 150
 151 #if IN_GCC
 152         case TOKfloat32v:
 153         case TOKfloat64v:
 154         case TOKfloat80v:
 155             float80value.format(buffer, sizeof(buffer));
 156             break;
 157         case TOKimaginary32v:
 158         case TOKimaginary64v:
 159         case TOKimaginary80v:
 160             float80value.format(buffer, sizeof(buffer));
 161             // %% buffer
 162             strcat(buffer, "i");
 163             break;
 164 #else
 165         case TOKfloat32v:
 166             sprintf(buffer,"%Lgf", float80value);
 167             break;
 168
 169         case TOKfloat64v:
 170             sprintf(buffer,"%Lg", float80value);
 171             break;
 172
 173         case TOKfloat80v:
 174             sprintf(buffer,"%LgL", float80value);
 175             break;
 176
 177         case TOKimaginary32v:
 178             sprintf(buffer,"%Lgfi", float80value);
 179             break;
 180
 181         case TOKimaginary64v:
 182             sprintf(buffer,"%Lgi", float80value);
 183             break;
 184
 185         case TOKimaginary80v:
 186             sprintf(buffer,"%LgLi", float80value);
 187             break;
 188 #endif
 189
 190
 191         case TOKstring:
 192 #if CSTRINGS
 193             p = string;
 194 #else
 195         {   OutBuffer buf;
 196
 197             buf.writeByte('"');
 198             for (size_t i = 0; i < len; )
 199             {   unsigned c;
 200
 201                 utf_decodeChar((unsigned char *)ustring, len, &i, &c);
 202                 switch (c)
 203                 {
 204                     case 0:
 205                         break;
 206
 207                     case '"':
 208                     case '\\':
 209                         buf.writeByte('\\');
 210                     default:
 211                         if (isprint(c))
 212                             buf.writeByte(c);
 213                         else if (c <= 0x7F)
 214                             buf.printf("\\x%02x", c);
 215                         else if (c <= 0xFFFF)
 216                             buf.printf("\\u%04x", c);
 217                         else
 218                             buf.printf("\\U%08x", c);
 219                         continue;
 220                 }
 221                 break;
 222             }
 223             buf.writeByte('"');
 224             if (postfix)
 225                 buf.writeByte('"');
 226             buf.writeByte(0);
 227             p = (char *)buf.extractData();
 228         }
 229 #endif
 230             break;
 231
 232         case TOKidentifier:
 233         case TOKenum:
 234         case TOKstruct:
 235         case TOKimport:
 236         CASE_BASIC_TYPES:
 237             p = ident->toChars();
 238             break;
 239
 240         default:
 241             p = toChars(value);
 242             break;
 243     }
 244     return p;
 245 }
 246
 247 char *Token::toChars(enum TOK value)
 248 {   char *p;
 249     static char buffer[3 + 3 * sizeof(value) + 1];
 250
 251     p = tochars[value];
 252     if (!p)
 253     {   sprintf(buffer,"TOK%d",value);
 254         p = buffer;
 255     }
 256     return p;
 257 }
 258
 259 /*************************** Lexer ********************************************/
 260
 261 Token *Lexer::freelist = NULL;
 262 StringTable Lexer::stringtable;
 263 OutBuffer Lexer::stringbuffer;
 264
 265 Lexer::Lexer(Module *mod,
 266         unsigned char *base, unsigned begoffset, unsigned endoffset,
 267         int doDocComment, int commentToken, bool dltSyntax)
 268     : loc(mod, 1), dltSyntax(dltSyntax)
 269 {
 270     //printf("Lexer::Lexer(%p,%d)\n",base,length);
 271     //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
 272     memset(&token,0,sizeof(token));
 273     this->base = base;
 274     this->end  = base + endoffset;
 275     p = base + begoffset;
 276     this->mod = mod;
 277     this->doDocComment = doDocComment;
 278     this->anyToken = 0;
 279     this->commentToken = commentToken;
 280     this->nesting = 0;
 281     this->indent = 0;
 282     this->atStartOfLine = 1;
 283     this->incLineno = 0;
 284     //initKeywords();
 285
 286     /* If first line starts with '#!', ignore the line
 287      */
 288
 289     if (p[0] == '#' && p[1] =='!')
 290     {
 291         p += 2;
 292         while (1)
 293         {   unsigned char c = *p;
 294             switch (c)
 295             {
 296                 case '\n':
 297                     p++;
 298                     break;
 299
 300                 case '\r':
 301                     p++;
 302                     if (*p == '\n')
 303                         p++;
 304                     break;
 305
 306                 case 0:
 307                 case 0x1A:
 308                     break;
 309
 310                 default:
 311                     if (c & 0x80)
 312                     {   unsigned u = decodeUTF();
 313                         if (u == PS || u == LS)
 314                             break;
 315                     }
 316                     p++;
 317                     continue;
 318             }
 319             break;
 320         }
 321         loc.linnum = 2;
 322     }
 323 }
 324
 325
 326 void Lexer::error(const char *format, ...)
 327 {
 328     if (mod && !global.gag)
 329     {
 330         char *p = loc.toChars();
 331         if (*p)
 332             fprintf(stdmsg, "%s: ", p);
 333         mem.free(p);
 334
 335         va_list ap;
 336         va_start(ap, format);
 337         vfprintf(stdmsg, format, ap);
 338         va_end(ap);
 339
 340         fprintf(stdmsg, "\n");
 341         fflush(stdmsg);
 342
 343         if (global.errors >= 20)        // moderate blizzard of cascading messages
 344             fatal();
 345     }
 346     global.errors++;
 347 }
 348
 349 void Lexer::error(Loc loc, const char *format, ...)
 350 {
 351     if (mod && !global.gag)
 352     {
 353         char *p = loc.toChars();
 354         if (*p)
 355             fprintf(stdmsg, "%s: ", p);
 356         mem.free(p);
 357
 358         va_list ap;
 359         va_start(ap, format);
 360         vfprintf(stdmsg, format, ap);
 361         va_end(ap);
 362
 363         fprintf(stdmsg, "\n");
 364         fflush(stdmsg);
 365
 366         if (global.errors >= 20)        // moderate blizzard of cascading messages
 367             fatal();
 368     }
 369     global.errors++;
 370 }
 371
 372 TOK Lexer::nextToken()
 373 {   Token *t;
 374
 375     if (token.next)
 376     {
 377         t = token.next;
 378         memcpy(&token,t,sizeof(Token));
 379         t->next = freelist;
 380         freelist = t;
 381     }
 382     else
 383     {
 384         scan(&token);
 385     }
 386     //token.print();
 387     return token.value;
 388 }
 389
 390 Token *Lexer::peek(Token *ct)
 391 {   Token *t;
 392
 393     if (ct->next)
 394         t = ct->next;
 395     else
 396     {
 397         t = new Token();
 398         scan(t);
 399         t->next = NULL;
 400         ct->next = t;
 401     }
 402     return t;
 403 }
 404
 405 /*********************************
 406  * tk is on the opening (.
 407  * Look ahead and return token that is past the closing ).
 408  */
 409
 410 Token *Lexer::peekPastParen(Token *tk)
 411 {
 412     //printf("peekPastParen()\n");
 413     int parens = 1;
 414     int curlynest = 0;
 415     while (1)
 416     {
 417         tk = peek(tk);
 418         //tk->print();
 419         switch (tk->value)
 420         {
 421             case TOKlparen:
 422                 parens++;
 423                 continue;
 424
 425             case TOKrparen:
 426                 --parens;
 427                 if (parens)
 428                     continue;
 429                 tk = peek(tk);
 430                 break;
 431
 432             case TOKlcurly:
 433                 curlynest++;
 434                 continue;
 435
 436             case TOKrcurly:
 437                 if (--curlynest >= 0)
 438                     continue;
 439                 break;
 440
 441             case TOKsemicolon:
 442                 if (curlynest)
 443                     continue;
 444                 break;
 445
 446             case TOKeof:
 447                 break;
 448
 449             default:
 450                 continue;
 451         }
 452         return tk;
 453     }
 454 }
 455
 456 /**********************************
 457  * Determine if string is a valid Identifier.
 458  * Placed here because of commonality with Lexer functionality.
 459  * Returns:
 460  *      0       invalid
 461  */
 462
 463 int Lexer::isValidIdentifier(char *p)
 464 {
 465     size_t len;
 466     size_t idx;
 467
 468     if (!p || !*p)
 469         goto Linvalid;
 470
 471     if (*p >= '0' && *p <= '9')         // beware of isdigit() on signed chars
 472         goto Linvalid;
 473
 474     len = strlen(p);
 475     idx = 0;
 476     while (p[idx])
 477     {   dchar_t dc;
 478
 479         char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
 480         if (q)
 481             goto Linvalid;
 482
 483         if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
 484             goto Linvalid;
 485     }
 486     return 1;
 487
 488 Linvalid:
 489     return 0;
 490 }
 491
 492 /****************************
 493  * Turn next token in buffer into a token.
 494  */
 495
 496 void Lexer::scan(Token *t)
 497 {
 498     unsigned lastLine = loc.linnum;
 499     unsigned linnum;
 500
 501     // Delayed line-number updating
 502     if (incLineno)
 503     {
 504         assert(incLineno == 1);
 505         incLineno = 0;
 506         loc.linnum++;
 507     }
 508
 509     t->blockComment = NULL;
 510     t->lineComment = NULL;
 511     while (1)
 512     {
 513         t->ptr = p;
 514
 515         if (dltSyntax && atStartOfLine) {
 516                 // Check indent
 517                 int i;
 518                 for (i = 0; p[i] == '\t'; i++) {
 519                 }
 520                 if (p[i] == ' ') {
 521                     error("Whitespace error: use tabs to indent!");
 522                 }
 523                 if (p[i] == '#') {
 524                     p += i;
 525                     atStartOfLine = 0;
 526                 } else if (p[i] != '\n' && p[i] != '\r') {
 527                     if (p[i] == '\0')
 528                         i = 0;                  // End-of-file always has no indent
 529                     if (i > indent) {
 530                         error("unexpected indentation (expected %d tabs, not %d)",
 531                                 indent, i);
 532                     } else if (i < indent) {
 533                         indent -= 1;
 534                         t->value = TOKrcurly;
 535                         return;
 536                     }
 537                     atStartOfLine = 0;
 538                 } /* else ignore blank line */
 539         }
 540
 541         //printf("p = %p, *p = '%c'\n",p,*p);
 542         switch (*p)
 543         {
 544             case 0:
 545             case 0x1A:
 546                 t->value = TOKeof;                      // end of file
 547                 return;
 548
 549             case ' ':
 550             case '\t':
 551             case '\v':
 552             case '\f':
 553                 p++;
 554                 continue;                       // skip white space
 555
 556             case '\r':
 557                 if (p[1] == '\n') {             // if CRLF
 558                     p++;
 559                     continue;
 560                 }
 561                 // fall-through
 562             case '\n':
 563                 p++;
 564                 if (dltSyntax)
 565                 {
 566                     // Delay incrementing the line number until after sending
 567                     // the TOKendline, for better error messages
 568                     if (incLineno)
 569                         loc.linnum++;
 570                     incLineno = 1;
 571
 572                     if (!nesting)
 573                     {
 574                         atStartOfLine = 1;
 575                         t->value = TOKendline;
 576                         return;
 577                     }
 578                 }
 579                 else
 580                     loc.linnum++;
 581                 continue;                       // Ignore newlines inside brackets
 582             case '0':   case '1':   case '2':   case '3':   case '4':
 583             case '5':   case '6':   case '7':   case '8':   case '9':
 584                 t->value = number(t);
 585                 return;
 586
 587 #if CSTRINGS
 588             case '\'':
 589                 t->value = charConstant(t, 0);
 590                 return;
 591
 592             case '"':
 593                 t->value = stringConstant(t,0);
 594                 return;
 595
 596             case 'l':
 597             case 'L':
 598                 if (p[1] == '\'')
 599                 {
 600                     p++;
 601                     t->value = charConstant(t, 1);
 602                     return;
 603                 }
 604                 else if (p[1] == '"')
 605                 {
 606                     p++;
 607                     t->value = stringConstant(t, 1);
 608                     return;
 609                 }
 610 #else
 611             case '\'':
 612                 t->value = charConstant(t,0);
 613                 return;
 614
 615             case 'r':
 616                 if (p[1] != '"')
 617                     goto case_ident;
 618                 p++;
 619             case '`':
 620                 t->value = wysiwygStringConstant(t, *p);
 621                 return;
 622
 623             case 'x':
 624                 if (p[1] != '"')
 625                     goto case_ident;
 626                 p++;
 627                 t->value = hexStringConstant(t);
 628                 return;
 629
 630 #if V2
 631             case 'q':
 632                 if (p[1] == '"')
 633                 {
 634                     p++;
 635                     t->value = delimitedStringConstant(t);
 636                     return;
 637                 }
 638                 else if (p[1] == '{')
 639                 {
 640                     p++;
 641                     t->value = tokenStringConstant(t);
 642                     return;
 643                 }
 644                 else
 645                     goto case_ident;
 646 #endif
 647
 648             case '"':
 649                 t->value = escapeStringConstant(t,0);
 650                 return;
 651
 652             case '\\':                  // escaped string literal
 653             {   unsigned c;
 654
 655                 stringbuffer.reset();
 656                 do
 657                 {
 658                     p++;
 659                     switch (*p)
 660                     {
 661                         case 'u':
 662                         case 'U':
 663                         case '&':
 664                             c = escapeSequence();
 665                             stringbuffer.writeUTF8(c);
 666                             break;
 667
 668                         default:
 669                             c = escapeSequence();
 670                             stringbuffer.writeByte(c);
 671                             break;
 672                     }
 673                 } while (*p == '\\');
 674                 t->len = stringbuffer.offset;
 675                 stringbuffer.writeByte(0);
 676                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
 677                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
 678                 t->postfix = 0;
 679                 t->value = TOKstring;
 680                 return;
 681             }
 682
 683             case 'l':
 684             case 'L':
 685 #endif
 686             case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 687             case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 688             case 'k':               case 'm':   case 'n':   case 'o':
 689 #if V2
 690             case 'p':   /*case 'q': case 'r':*/ case 's':   case 't':
 691 #else
 692             case 'p':   case 'q': /*case 'r':*/ case 's':   case 't':
 693 #endif
 694             case 'u':   case 'v':   case 'w': /*case 'x':*/ case 'y':
 695             case 'z':
 696             case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 697             case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 698             case 'K':               case 'M':   case 'N':   case 'O':
 699             case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 700             case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 701             case 'Z':
 702             case '_':
 703             case_ident:
 704             {   unsigned char c;
 705                 StringValue *sv;
 706                 Identifier *id;
 707
 708                 do
 709                 {
 710                     c = *++p;
 711                 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
 712                 sv = stringtable.update((char *)t->ptr, p - t->ptr);
 713                 id = (Identifier *) sv->ptrvalue;
 714                 if (!id)
 715                 {   id = new Identifier(sv->lstring.string,TOKidentifier);
 716                     sv->ptrvalue = id;
 717                 }
 718                 t->ident = id;
 719                 t->value = (enum TOK) id->value;
 720                 if (!dltSyntax)
 721                 {
 722                     if (t->value == TOKand ||
 723                         t->value == TOKor ||
 724                         t->value == TOKnot)
 725                     {
 726                         t->value = TOKidentifier;
 727                     }
 728                 }
 729                 anyToken = 1;
 730                 if (*t->ptr == '_')     // if special identifier token
 731                 {
 732                     static char date[11+1];
 733                     static char time[8+1];
 734                     static char timestamp[24+1];
 735
 736                     if (!date[0])       // lazy evaluation
 737                     {   time_t t;
 738                         char *p;
 739
 740                         ::time(&t);
 741                         p = ctime(&t);
 742                         assert(p);
 743                         sprintf(date, "%.6s %.4s", p + 4, p + 20);
 744                         sprintf(time, "%.8s", p + 11);
 745                         sprintf(timestamp, "%.24s", p);
 746                     }
 747
 748 #if !V2
 749                     if (mod && id == Id::FILE)
 750                     {
 751                         t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
 752                         goto Lstring;
 753                     }
 754                     else if (mod && id == Id::LINE)
 755                     {
 756                         t->value = TOKint64v;
 757                         t->uns64value = loc.linnum;
 758                     }
 759                     else
 760 #endif
 761                     if (id == Id::DATE)
 762                     {
 763                         t->ustring = (unsigned char *)date;
 764                         goto Lstring;
 765                     }
 766                     else if (id == Id::TIME)
 767                     {
 768                         t->ustring = (unsigned char *)time;
 769                         goto Lstring;
 770                     }
 771                     else if (id == Id::VENDOR)
 772                     {
 773 #ifdef IN_GCC
 774                         t->ustring = (unsigned char *)"GDC";
 775 #else
 776                         t->ustring = (unsigned char *)"Digital Mars D";
 777 #endif
 778                         goto Lstring;
 779                     }
 780                     else if (id == Id::TIMESTAMP)
 781                     {
 782                         t->ustring = (unsigned char *)timestamp;
 783                      Lstring:
 784                         t->value = TOKstring;
 785                      Llen:
 786                         t->postfix = 0;
 787                         t->len = strlen((char *)t->ustring);
 788                     }
 789                     else if (id == Id::VERSIONX)
 790                     {   unsigned major = 0;
 791                         unsigned minor = 0;
 792
 793                         for (char *p = global.version + 1; 1; p++)
 794                         {
 795                             char c = *p;
 796                             if (isdigit(c))
 797                                 minor = minor * 10 + c - '0';
 798                             else if (c == '.')
 799                             {   major = minor;
 800                                 minor = 0;
 801                             }
 802                             else
 803                                 break;
 804                         }
 805                         t->value = TOKint64v;
 806                         t->uns64value = major * 1000 + minor;
 807                     }
 808 #if V2
 809                     else if (id == Id::EOFX)
 810                     {
 811                         t->value = TOKeof;
 812                         // Advance scanner to end of file
 813                         while (!(*p == 0 || *p == 0x1A))
 814                             p++;
 815                     }
 816 #endif
 817                 }
 818                 //printf("t->value = %d\n",t->value);
 819                 return;
 820             }
 821
 822             case '/':
 823                 p++;
 824                 switch (*p)
 825                 {
 826                     case '=':
 827                         p++;
 828                         t->value = TOKdivass;
 829                         return;
 830
 831                     case '*':
 832                         p++;
 833                         linnum = loc.linnum;
 834                         while (1)
 835                         {
 836                             while (1)
 837                             {   unsigned char c = *p;
 838                                 switch (c)
 839                                 {
 840                                     case '/':
 841                                         break;
 842
 843                                     case '\n':
 844                                         loc.linnum++;
 845                                         p++;
 846                                         continue;
 847
 848                                     case '\r':
 849                                         p++;
 850                                         if (*p != '\n')
 851                                             loc.linnum++;
 852                                         continue;
 853
 854                                     case 0:
 855                                     case 0x1A:
 856                                         error("unterminated /* */ comment");
 857                                         p = end;
 858                                         t->value = TOKeof;
 859                                         return;
 860
 861                                     default:
 862                                         if (c & 0x80)
 863                                         {   unsigned u = decodeUTF();
 864                                             if (u == PS || u == LS)
 865                                                 loc.linnum++;
 866                                         }
 867                                         p++;
 868                                         continue;
 869                                 }
 870                                 break;
 871                             }
 872                             p++;
 873                             if (p[-2] == '*' && p - 3 != t->ptr)
 874                                 break;
 875                         }
 876                         if (commentToken)
 877                         {
 878                             t->value = TOKcomment;
 879                             return;
 880                         }
 881                         else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
 882                         {   // if /** but not /**/
 883                             getDocComment(t, lastLine == linnum);
 884                         }
 885                         continue;
 886
 887                     case '/':           // do // style comments
 888                         linnum = loc.linnum;
 889                         while (1)
 890                         {   unsigned char c = *++p;
 891                             switch (c)
 892                             {
 893                                 case '\n':
 894                                     break;
 895
 896                                 case '\r':
 897                                     if (p[1] == '\n')
 898                                         p++;
 899                                     break;
 900
 901                                 case 0:
 902                                 case 0x1A:
 903                                     if (commentToken)
 904                                     {
 905                                         p = end;
 906                                         t->value = TOKcomment;
 907                                         return;
 908                                     }
 909                                     if (doDocComment && t->ptr[2] == '/')
 910                                         getDocComment(t, lastLine == linnum);
 911                                     p = end;
 912                                     t->value = TOKeof;
 913                                     return;
 914
 915                                 default:
 916                                     if (c & 0x80)
 917                                     {   unsigned u = decodeUTF();
 918                                         if (u == PS || u == LS)
 919                                             break;
 920                                     }
 921                                     continue;
 922                             }
 923                             break;
 924                         }
 925
 926                         if (commentToken)
 927                         {
 928                             p++;
 929                             loc.linnum++;
 930                             t->value = TOKcomment;
 931                             return;
 932                         }
 933                         if (doDocComment && t->ptr[2] == '/')
 934                             getDocComment(t, lastLine == linnum);
 935
 936                         p++;
 937                         loc.linnum++;
 938                         continue;
 939
 940                     case '+':
 941                     {   int nest;
 942
 943                         linnum = loc.linnum;
 944                         p++;
 945                         nest = 1;
 946                         while (1)
 947                         {   unsigned char c = *p;
 948                             switch (c)
 949                             {
 950                                 case '/':
 951                                     p++;
 952                                     if (*p == '+')
 953                                     {
 954                                         p++;
 955                                         nest++;
 956                                     }
 957                                     continue;
 958
 959                                 case '+':
 960                                     p++;
 961                                     if (*p == '/')
 962                                     {
 963                                         p++;
 964                                         if (--nest == 0)
 965                                             break;
 966                                     }
 967                                     continue;
 968
 969                                 case '\r':
 970                                     p++;
 971                                     if (*p != '\n')
 972                                         loc.linnum++;
 973                                     continue;
 974
 975                                 case '\n':
 976                                     loc.linnum++;
 977                                     p++;
 978                                     continue;
 979
 980                                 case 0:
 981                                 case 0x1A:
 982                                     error("unterminated /+ +/ comment");
 983                                     p = end;
 984                                     t->value = TOKeof;
 985                                     return;
 986
 987                                 default:
 988                                     if (c & 0x80)
 989                                     {   unsigned u = decodeUTF();
 990                                         if (u == PS || u == LS)
 991                                             loc.linnum++;
 992                                     }
 993                                     p++;
 994                                     continue;
 995                             }
 996                             break;
 997                         }
 998                         if (commentToken)
 999                         {
1000                             t->value = TOKcomment;
1001                             return;
1002                         }
1003                         if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
1004                         {   // if /++ but not /++/
1005                             getDocComment(t, lastLine == linnum);
1006                         }
1007                         continue;
1008                     }
1009                 }
1010                 t->value = TOKdiv;
1011                 return;
1012
1013             case '.':
1014                 p++;
1015                 if (isdigit(*p))
1016                 {   /* Note that we don't allow ._1 and ._ as being
1017                      * valid floating point numbers.
1018                      */
1019                     p--;
1020                     t->value = inreal(t);
1021                 }
1022                 else if (p[0] == '.')
1023                 {
1024                     if (p[1] == '.')
1025                     {   p += 2;
1026                         t->value = TOKdotdotdot;
1027                     }
1028                     else
1029                     {   p++;
1030                         t->value = TOKslice;
1031                     }
1032                 }
1033                 else
1034                     t->value = TOKdot;
1035                 return;
1036
1037             case '&':
1038                 p++;
1039                 if (*p == '=')
1040                 {   p++;
1041                     t->value = TOKandass;
1042                 }
1043                 else if (*p == '&')
1044                 {   p++;
1045                     t->value = TOKandand;
1046                     if (dltSyntax)
1047                         error("Use 'and' instead of '&&'");
1048                 }
1049                 else
1050                     t->value = TOKand;
1051                 return;
1052
1053             case '|':
1054                 p++;
1055                 if (*p == '=')
1056                 {   p++;
1057                     t->value = TOKorass;
1058                 }
1059                 else if (*p == '|')
1060                 {   p++;
1061                     t->value = TOKoror;
1062                     if (dltSyntax)
1063                         error("Use 'or' instead of '||'");
1064                 }
1065                 else
1066                     t->value = TOKor;
1067                 return;
1068
1069             case '-':
1070                 p++;
1071                 if (*p == '=')
1072                 {   p++;
1073                     t->value = TOKminass;
1074                 }
1075 #if 0
1076                 else if (*p == '>')
1077                 {   p++;
1078                     t->value = TOKarrow;
1079                 }
1080 #endif
1081                 else if (*p == '-')
1082                 {   p++;
1083                     t->value = TOKminusminus;
1084                 }
1085                 else
1086                     t->value = TOKmin;
1087                 return;
1088
1089             case '+':
1090                 p++;
1091                 if (*p == '=')
1092                 {   p++;
1093                     t->value = TOKaddass;
1094                 }
1095                 else if (*p == '+')
1096                 {   p++;
1097                     t->value = TOKplusplus;
1098                 }
1099                 else
1100                     t->value = TOKadd;
1101                 return;
1102
1103             case '<':
1104                 p++;
1105                 if (*p == '=')
1106                 {   p++;
1107                     t->value = TOKle;                   // <=
1108                 }
1109                 else if (*p == '<')
1110                 {   p++;
1111                     if (*p == '=')
1112                     {   p++;
1113                         t->value = TOKshlass;           // <<=
1114                     }
1115                     else
1116                         t->value = TOKshl;              // <<
1117                 }
1118                 else if (*p == '>')
1119                 {   p++;
1120                     if (*p == '=')
1121                     {   p++;
1122                         t->value = TOKleg;              // <>=
1123                     }
1124                     else
1125                         t->value = TOKlg;               // <>
1126                 }
1127                 else
1128                     t->value = TOKlt;                   // <
1129                 return;
1130
1131             case '>':
1132                 p++;
1133                 if (*p == '=')
1134                 {   p++;
1135                     t->value = TOKge;                   // >=
1136                 }
1137                 else if (*p == '>')
1138                 {   p++;
1139                     if (*p == '=')
1140                     {   p++;
1141                         t->value = TOKshrass;           // >>=
1142                     }
1143                     else if (*p == '>')
1144                     {   p++;
1145                         if (*p == '=')
1146                         {   p++;
1147                             t->value = TOKushrass;      // >>>=
1148                         }
1149                         else
1150                             t->value = TOKushr;         // >>>
1151                     }
1152                     else
1153                         t->value = TOKshr;              // >>
1154                 }
1155                 else
1156                     t->value = TOKgt;                   // >
1157                 return;
1158
1159             case '!':
1160                 p++;
1161                 if (*p == '=')
1162                 {   p++;
1163                     if (*p == '=' && global.params.Dversion == 1)
1164                     {   p++;
1165                         t->value = TOKnotidentity;      // !==
1166                     }
1167                     else
1168                         t->value = TOKnotequal;         // !=
1169                 }
1170                 else if (*p == '<')
1171                 {   p++;
1172                     if (*p == '>')
1173                     {   p++;
1174                         if (*p == '=')
1175                         {   p++;
1176                             t->value = TOKunord; // !<>=
1177                         }
1178                         else
1179                             t->value = TOKue;   // !<>
1180                     }
1181                     else if (*p == '=')
1182                     {   p++;
1183                         t->value = TOKug;       // !<=
1184                     }
1185                     else
1186                         t->value = TOKuge;      // !<
1187                 }
1188                 else if (*p == '>')
1189                 {   p++;
1190                     if (*p == '=')
1191                     {   p++;
1192                         t->value = TOKul;       // !>=
1193                     }
1194                     else
1195                         t->value = TOKule;      // !>
1196                 }
1197                 else
1198                     t->value = TOKnot;          // !
1199                 return;
1200
1201             case '=':
1202                 p++;
1203                 if (*p == '=')
1204                 {   p++;
1205                     if (*p == '=' && global.params.Dversion == 1)
1206                     {   p++;
1207                         t->value = TOKidentity;         // ===
1208                     }
1209                     else
1210                         t->value = TOKequal;            // ==
1211                 }
1212                 else
1213                     t->value = TOKassign;               // =
1214                 return;
1215
1216             case '~':
1217                 p++;
1218                 if (*p == '=')
1219                 {   p++;
1220                     t->value = TOKcatass;               // ~=
1221                 }
1222                 else
1223                     t->value = TOKtilde;                // ~
1224                 return;
1225
1226 #define NESTED(cin,tokin,cout,tokout) \
1227             case cin: nesting++; p++; t->value = tokin; return;\
1228             case cout: if (nesting == 0) {error("Unexpected '%c'", cout);} else {nesting--;} p++; t->value = tokout; return;
1229
1230             NESTED('(', TOKlparen, ')', TOKrparen)
1231             NESTED('[', TOKlbracket, ']', TOKrbracket)
1232             NESTED('{', TOKlcurly, '}', TOKrcurly)
1233 #undef NESTED
1234
1235 #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1236             SINGLE('?', TOKquestion)
1237             SINGLE(',', TOKcomma)
1238             SINGLE(';', TOKsemicolon)
1239             SINGLE('$', TOKdollar)
1240             SINGLE('@', TOKat)
1241
1242 #undef SINGLE
1243
1244             case ':':
1245                 p++;
1246                 if (!nesting)
1247                         indent += 1;
1248                 t->value = TOKcolon;
1249                 return;
1250
1251 #define DOUBLE(c1,tok1,c2,tok2)         \
1252             case c1:                    \
1253                 p++;                    \
1254                 if (*p == c2)           \
1255                 {   p++;                \
1256                     t->value = tok2;    \
1257                 }                       \
1258                 else                    \
1259                     t->value = tok1;    \
1260                 return;
1261
1262             DOUBLE('*', TOKmul, '=', TOKmulass)
1263             DOUBLE('%', TOKmod, '=', TOKmodass)
1264             DOUBLE('^', TOKxor, '=', TOKxorass)
1265
1266 #undef DOUBLE
1267
1268             case '#':           // do # style comments and pragmas
1269                 if (dltSyntax)
1270                 {
1271                     do { p++; } while (*p != '\n');
1272                 }
1273                 else
1274                 {
1275                     p++;
1276                     pragma();
1277                 }
1278                 continue;
1279
1280             default:
1281             {   unsigned char c = *p;
1282
1283                 if (c & 0x80)
1284                 {   unsigned u = decodeUTF();
1285
1286                     // Check for start of unicode identifier
1287                     if (isUniAlpha(u))
1288                         goto case_ident;
1289
1290                     if (u == PS || u == LS)
1291                     {
1292                         loc.linnum++;
1293                         p++;
1294                         continue;
1295                     }
1296                 }
1297                 if (isprint(c))
1298                     error("unsupported char '%c'", c);
1299                 else
1300                     error("unsupported char 0x%02x", c);
1301                 p++;
1302                 continue;
1303             }
1304         }
1305     }
1306 }
1307
1308 /*******************************************
1309  * Parse escape sequence.
      *
      * Input:
      *      p is on the character that follows the backslash
      * Returns:
      *      the value of the escaped character
      *
      * Handles the single-character escapes, x/u/U followed by 2/4/8 hex
      * digits, &name; character entities and one to three octal digits.
      * At end of file a backslash is returned and p is not moved. For any
      * other character an error is reported and that character is returned
      * without being consumed.
1310  */
1311
1312 unsigned Lexer::escapeSequence()
1313 {   unsigned c;
1314     int n;
1315     int ndigits;
1316
1317     c = *p;
1318     switch (c)
1319     {
1320         case '\'':
1321         case '"':
1322         case '?':
1323         case '\\':
1324         Lconsume:
                     // Shared exit for all one-character escapes: step over the
                     // escape letter; c already holds the result.
1325                 p++;
1326                 break;
1327
1328         case 'a':       c = 7;          goto Lconsume;
1329         case 'b':       c = 8;          goto Lconsume;
1330         case 'f':       c = 12;         goto Lconsume;
1331         case 'n':       c = 10;         goto Lconsume;
1332         case 'r':       c = 13;         goto Lconsume;
1333         case 't':       c = 9;          goto Lconsume;
1334         case 'v':       c = 11;         goto Lconsume;
1335
1336         case 'u':
1337                 ndigits = 4;
1338                 goto Lhex;
1339         case 'U':
1340                 ndigits = 8;
1341                 goto Lhex;
1342         case 'x':
1343                 ndigits = 2;
1344         Lhex:
                     // Exactly ndigits hex digits are expected after the letter.
1345                 p++;
1346                 c = *p;
1347                 if (ishex(c))
1348                 {   unsigned v;
1349
1350                     n = 0;
1351                     v = 0;
1352                     while (1)
1353                     {
1354                         if (isdigit(c))
1355                             c -= '0';
1356                         else if (islower(c))
1357                             c -= 'a' - 10;
1358                         else
1359                             c -= 'A' - 10;
1360                         v = v * 16 + c;
1361                         c = *++p;
1362                         if (++n == ndigits)
1363                             break;
1364                         if (!ishex(c))
1365                         {   error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1366                             break;
1367                         }
1368                     }
                         // Only the u and U forms denote code points that must be
                         // valid; the two-digit x form is not checked.
1369                     if (ndigits != 2 && !utf_isValidDchar(v))
1370                         error("invalid UTF character \\U%08x", v);
1371                     c = v;
1372                 }
1373                 else
1374                     error("undefined escape hex sequence \\%c\n",c);
1375                 break;
1376
1377         case '&':                       // named character entity
1378                 for (unsigned char *idstart = ++p; 1; p++)
1379                 {
1380                     switch (*p)
1381                     {
1382                         case ';':
1383                             c = HtmlNamedEntity(idstart, p - idstart);
1384                             if (c == ~0)
1385                             {   error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1386                                 c = ' ';
1387                             }
1388                             p++;
1389                             break;
1390
1391                         default:
                                 // An entity name is made of letters, with digits
                                 // allowed everywhere except in the first position
                                 // (p == idstart).
1392                             if (isalpha(*p) ||
1393                                 (p != idstart && isdigit(*p)))
1394                                 continue;
1395                             error("unterminated named entity");
1396                             break;
1397                     }
1398                     break;
1399                 }
1400                 break;
1401
1402         case 0:
1403         case 0x1A:                      // end of file
1404                 c = '\\';
1405                 break;
1406
1407         default:
1408                 if (isoctal(c))
1409                 {   unsigned v;
1410
1411                     n = 0;
1412                     v = 0;
                         // Accumulate at most three octal digits.
1413                     do
1414                     {
1415                         v = v * 8 + (c - '0');
1416                         c = *++p;
1417                     } while (++n < 3 && isoctal(c));
1418                     c = v;
1419                     if (c > 0xFF)
1420                         error("0%03o is larger than a byte", c);
1421                 }
1422                 else
1423                     error("undefined escape sequence \\%c\n",c);
1424                 break;
1425     }
1426     return c;
1427 }
1428
1429 /**************************************
      * Lex a string in which no escape sequences are processed: the characters
      * up to the closing delimiter are stored as they appear, except that a
      * CR LF pair or a lone CR is stored as a single '\n'.
      *
      * Input:
      *      the first p++ skips the character p is on at entry (presumably the
      *      opening delimiter - confirm against the caller)
      * Params:
      *      t       token that receives ustring, len and postfix
      *      tc      closing delimiter; only '"' and '`' are recognised
      * Returns:
      *      TOKstring, also when the string is unterminated (t is then set to
      *      an empty string)
1430  */
1431
1432 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1433 {   unsigned c;
1434     Loc start = loc;
1435
1436     p++;
1437     stringbuffer.reset();
1438     while (1)
1439     {
1440         c = *p++;
1441         switch (c)
1442         {
1443             case '\n':
1444                 loc.linnum++;
1445                 break;
1446
1447             case '\r':
1448                 if (*p == '\n')
1449                     continue;   // ignore
1450                 c = '\n';       // treat EndOfLine as \n character
1451                 loc.linnum++;
1452                 break;
1453
1454             case 0:
1455             case 0x1A:
                     // c = *p++ above already moved p beyond the end-of-file
                     // character. Step back onto it so lexing cannot continue
                     // past the end of the source; escapeStringConstant rewinds
                     // in the same way.
                     p--;
1456                 error("unterminated string constant starting at %s", start.toChars());
1457                 t->ustring = (unsigned char *)"";
1458                 t->len = 0;
1459                 t->postfix = 0;
1460                 return TOKstring;
1461
1462             case '"':
1463             case '`':
                     // Only the delimiter that opened the string closes it; the
                     // other quote character is ordinary text.
1464                 if (c == tc)
1465                 {
                         // len excludes the terminating 0 that is appended
                         // before the buffer is copied.
1466                     t->len = stringbuffer.offset;
1467                     stringbuffer.writeByte(0);
1468                     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1469                     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1470                     stringPostfix(t);
1471                     return TOKstring;
1472                 }
1473                 break;
1474
1475             default:
1476                 if (c & 0x80)
                     // Non-ASCII: decode the sequence starting at its lead byte.
1477                 {   p--;
1478                     unsigned u = decodeUTF();
1479                     p++;
1480                     if (u == PS || u == LS)
1481                         loc.linnum++;
1482                     stringbuffer.writeUTF8(u);
1483                     continue;
1484                 }
1485                 break;
1486         }
1487         stringbuffer.writeByte(c);
1488     }
1489 }
1490
1491 /**************************************
1492  * Lex hex strings:
1493  *      x"0A ae 34FE BD"
      *
      * Each pair of hex digits yields one byte of the string; white space and
      * line breaks between digits are skipped.
      *
      * Input:
      *      the first p++ skips the character p is on at entry (presumably the
      *      opening quote - confirm against the caller)
      * Params:
      *      t       token that receives ustring, len and postfix
      * Returns:
      *      TOKstring, also when the string is unterminated (t is then set to
      *      an empty string)
1494  */
1495
1496 TOK Lexer::hexStringConstant(Token *t)
1497 {   unsigned c;
1498     Loc start = loc;
1499     unsigned n = 0;             // number of hex digits seen so far
1500     unsigned v;                 // high nibble, then the assembled byte
1501
1502     p++;
1503     stringbuffer.reset();
1504     while (1)
1505     {
1506         c = *p++;
1507         switch (c)
1508         {
1509             case ' ':
1510             case '\t':
1511             case '\v':
1512             case '\f':
1513                 continue;                       // skip white space
1514
1515             case '\r':
1516                 if (*p == '\n')
1517                     continue;                   // ignore
1518                 // Treat isolated '\r' as if it were a '\n'
1519             case '\n':
1520                 loc.linnum++;
1521                 continue;
1522
1523             case 0:
1524             case 0x1A:
                     // c = *p++ above already moved p beyond the end-of-file
                     // character. Step back onto it so lexing cannot continue
                     // past the end of the source; escapeStringConstant rewinds
                     // in the same way.
                     p--;
1525                 error("unterminated string constant starting at %s", start.toChars());
1526                 t->ustring = (unsigned char *)"";
1527                 t->len = 0;
1528                 t->postfix = 0;
1529                 return TOKstring;
1530
1531             case '"':
1532                 if (n & 1)
1533                 {   error("odd number (%d) of hex characters in hex string", n);
1534                     stringbuffer.writeByte(v);
1535                 }
1536                 t->len = stringbuffer.offset;
1537                 stringbuffer.writeByte(0);
1538                 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1539                 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1540                 stringPostfix(t);
1541                 return TOKstring;
1542
1543             default:
1544                 if (c >= '0' && c <= '9')
1545                     c -= '0';
1546                 else if (c >= 'a' && c <= 'f')
1547                     c -= 'a' - 10;
1548                 else if (c >= 'A' && c <= 'F')
1549                     c -= 'A' - 10;
1550                 else if (c & 0x80)
1551                 {   p--;
1552                     unsigned u = decodeUTF();
1553                     p++;
1554                     if (u == PS || u == LS)
                         {   // A Unicode line break only ends the line. Skip it
                             // like '\n' instead of falling through and counting
                             // its lead byte as a hex digit.
1555                         loc.linnum++;
                             continue;
                         }
1556                     else
1557                         error("non-hex character \\u%x", u);
1558                 }
1559                 else
1560                     error("non-hex character '%c'", c);
                     // After an error the offending character is still counted
                     // as a digit, so one bad character does not also trigger
                     // the odd-count error at the closing quote.
1561                 if (n & 1)
1562                 {   v = (v << 4) | c;
1563                     stringbuffer.writeByte(v);
1564                 }
1565                 else
1566                     v = c;
1567                 n++;
1568                 break;
1569         }
1570     }
1571 }
1572
1573
1574 #if V2
1575 /**************************************
1576  * Lex delimited strings:
1577  *      q"(foo(xxx))"   // "foo(xxx)"
1578  *      q"[foo(]"       // "foo("
1579  *      q"/foo]/"       // "foo]"
1580  *      q"HERE
1581  *      foo
1582  *      HERE"           // "foo\n"
1583  * Input:
1584  *      p is on the "
      * Params:
      *      t       token that receives ustring, len and postfix
      * Returns:
      *      TOKstring, also when the string is unterminated (t is then set to
      *      an empty string)
      *
      * The first character after the quote selects the form: ( [ { < nest with
      * their closing partner, an identifier start begins a heredoc, and any
      * other character is its own closing delimiter.
1585  */
1586
1587 TOK Lexer::delimitedStringConstant(Token *t)
1588 {   unsigned c;
1589     Loc start = loc;
1590     unsigned delimleft = 0;     // opening delimiter; 0 until it has been read
1591     unsigned delimright = 0;    // closing delimiter; stays 0 for a heredoc
1592     unsigned nest = 1;          // 1 if the delimiters nest, else 0
1593     unsigned nestcount;         // open nesting delimiters; used only if nest
1594     Identifier *hereid = NULL;  // heredoc identifier; NULL for other forms
1595     unsigned blankrol = 0;      // 1 while the rest of the heredoc line must be blank
1596     unsigned startline = 0;     // 1 when the next character starts a line
1597
1598     p++;
1599     stringbuffer.reset();
1600     while (1)
1601     {
1602         c = *p++;
1603         //printf("c = '%c'\n", c);
1604         switch (c)
1605         {
1606             case '\n':
1607             Lnextline:
1608                 loc.linnum++;
1609                 startline = 1;
1610                 if (blankrol)
1611                 {   blankrol = 0;
1612                     continue;
1613                 }
1614                 if (hereid)
1615                 {
1616                     stringbuffer.writeUTF8(c);
1617                     continue;
1618                 }
1619                 break;
1620
1621             case '\r':
1622                 if (*p == '\n')
1623                     continue;   // ignore
1624                 c = '\n';       // treat EndOfLine as \n character
1625                 goto Lnextline;
1626
1627             case 0:
1628             case 0x1A:
1629                 goto Lerror;
1630
1631             default:
1632                 if (c & 0x80)
1633                 {   p--;
1634                     c = decodeUTF();
1635                     p++;
1636                     if (c == PS || c == LS)
1637                         goto Lnextline;
1638                 }
1639                 break;
1640         }
1641         if (delimleft == 0)
1642         {   delimleft = c;
1643             nest = 1;
1644             nestcount = 1;
1645             if (c == '(')
1646                 delimright = ')';
1647             else if (c == '{')
1648                 delimright = '}';
1649             else if (c == '[')
1650                 delimright = ']';
1651             else if (c == '<')
1652                 delimright = '>';
                 // c may hold a decoded code point here; isalpha() is only
                 // defined for values that fit in an unsigned char, so it is
                 // applied to ASCII only.
1653             else if ((c < 0x80 && isalpha(c)) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1654             {   // Start of identifier; must be a heredoc
                     // (this local t hides the parameter t inside these braces)
1655                 Token t;
1656                 p--;
1657                 scan(&t);               // read in heredoc identifier
1658                 if (t.value != TOKidentifier)
1659                 {   error("identifier expected for heredoc, not %s", t.toChars());
1660                     delimright = c;
1661                 }
1662                 else
1663                 {   hereid = t.ident;
1664                     //printf("hereid = '%s'\n", hereid->toChars());
1665                     blankrol = 1;
1666                 }
1667                 nest = 0;
1668             }
1669             else
1670             {   delimright = c;
1671                 nest = 0;
1672             }
1673         }
1674         else
1675         {
1676             if (blankrol)
1677             {   error("heredoc rest of line should be blank");
1678                 blankrol = 0;
1679                 continue;
1680             }
1681             if (nest == 1)
1682             {
1683                 if (c == delimleft)
1684                     nestcount++;
1685                 else if (c == delimright)
1686                 {   nestcount--;
1687                     if (nestcount == 0)
1688                         goto Ldone;
1689                 }
1690             }
1691             else if (c == delimright)
1692                 goto Ldone;
                 // Look for the closing identifier only inside a heredoc:
                 // without the hereid test a NULL hereid would be handed to
                 // equals() for every line that starts with a letter. The
                 // c < 0x80 test keeps decoded code points away from isalpha().
1693             if (startline && hereid && c < 0x80 && isalpha(c))
1694             {   Token t;
1695                 unsigned char *psave = p;
1696                 p--;
1697                 scan(&t);               // read in possible heredoc identifier
1698                 //printf("endid = '%s'\n", t.ident->toChars());
1699                 if (t.value == TOKidentifier && t.ident->equals(hereid))
1700                 {   /* should check that rest of line is blank
1701                      */
1702                     goto Ldone;
1703                 }
1704                 p = psave;
1705             }
1706             stringbuffer.writeUTF8(c);
1707             startline = 0;
1708         }
1709     }
1710
1711 Ldone:
1712     if (*p == '"')
1713         p++;
          // A heredoc has no closing delimiter character (delimright is 0), so
          // name the identifier in the message instead.
          else if (hereid)
              error("delimited string must end in %s\"", hereid->toChars());
1714     else
1715         error("delimited string must end in %c\"", delimright);
1716     t->len = stringbuffer.offset;
1717     stringbuffer.writeByte(0);
1718     t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1719     memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1720     stringPostfix(t);
1721     return TOKstring;
1722
1723 Lerror:
          // c = *p++ already moved p beyond the end-of-file character. Step back
          // onto it so lexing cannot continue past the end of the source;
          // escapeStringConstant rewinds in the same way.
          p--;
1724     error("unterminated string constant starting at %s", start.toChars());
1725     t->ustring = (unsigned char *)"";
1726     t->len = 0;
1727     t->postfix = 0;
1728     return TOKstring;
1729 }
1730
1731 /**************************************
1732  * Lex token strings, whose body is a brace-balanced run of tokens:
1733  *      q{ foo(xxx) } // " foo(xxx) "
1734  *      q{foo(}       // "foo("
1735  *      q{{foo}"}"}   // "{foo}"}""
1736  * Input:
1737  *      p is one byte before the body (presumably on the opening {; confirm)
1738  */
1739
1740 TOK Lexer::tokenStringConstant(Token *t)
1741 {
1742     Loc start = loc;                    // where the literal began, for diagnostics
1743     unsigned char *pstart = ++p;        // first byte of the token text
1744     unsigned depth = 1;                 // brace depth; the literal's own brace counts
1745
1746     // Count that brace in the lexer's own bracket bookkeeping as well, since
1747     // scan() is going to see its closing partner.
1748     nesting++;
1749
1750     for (;;)
1751     {
1752         Token tok;
1753
1754         scan(&tok);
1755         if (tok.value == TOKeof)
1756         {
1757             // The source ended before the braces balanced.
1758             error("unterminated token string constant starting at %s", start.toChars());
1759             t->ustring = (unsigned char *)"";
1760             t->len = 0;
1761             t->postfix = 0;
1762             return TOKstring;
1763         }
1764
1765         if (tok.value == TOKlcurly)
1766         {
1767             depth++;
1768         }
1769         else if (tok.value == TOKrcurly)
1770         {
1771             depth--;
1772             if (depth == 0)
1773                 break;
1774         }
1775     }
1776
1777     // scan() left p just past the closing brace, which is not part of the text.
1778     t->len = p - 1 - pstart;
1779     t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1780     memcpy(t->ustring, pstart, t->len);
1781     t->ustring[t->len] = 0;
1782     stringPostfix(t);
1783     return TOKstring;
1784 }
1785
1786 #endif
1787
1788
1789 /**************************************
1790  * Lex a double-quoted string in which backslash escape sequences are decoded.
1791  */
1792 TOK Lexer::escapeStringConstant(Token *t, int wide)
1793 {
1794     // NOTE: the wide parameter is not referenced anywhere in this body.
1795     Loc start = loc;
1796
1797     p++;
1798     stringbuffer.reset();
1799     for (;;)
1800     {
1801         unsigned ch = *p++;
1802
1803         if (ch == '"')
1804         {
1805             // Closing quote: copy the buffered bytes, 0-terminated, into the token.
1806             t->len = stringbuffer.offset;
1807             stringbuffer.writeByte(0);
1808             t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1809             memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1810             stringPostfix(t);
1811             return TOKstring;
1812         }
1813
1814         if (ch == 0 || ch == 0x1A)
1815         {
1816             // End of file inside the literal: step back so p stays on the
1817             // end-of-file character, report it and yield an empty string.
1818             p--;
1819             error("unterminated string constant starting at %s", start.toChars());
1820             t->ustring = (unsigned char *)"";
1821             t->len = 0;
1822             t->postfix = 0;
1823             return TOKstring;
1824         }
1825
1826         if (ch == '\\')
1827         {
1828             // \u, \U and \& are stored UTF-8 encoded; other escapes as one byte.
1829             const unsigned char kind = *p;
1830             ch = escapeSequence();
1831             if (kind == 'u' || kind == 'U' || kind == '&')
1832             {
1833                 stringbuffer.writeUTF8(ch);
1834                 continue;
1835             }
1836         }
1837         else if (ch == '\n')
1838         {
1839             loc.linnum++;
1840         }
1841         else if (ch == '\r')
1842         {
1843             if (*p == '\n')
1844                 continue;       // drop the CR of a CR LF pair
1845             ch = '\n';          // a lone CR is stored as '\n'
1846             loc.linnum++;
1847         }
1848         else if (ch & 0x80)
1849         {
1850             // Non-ASCII: decode starting at the lead byte, store as UTF-8.
1851             p--;
1852             ch = decodeUTF();
1853             if (ch == LS || ch == PS)
1854             {
1855                 ch = '\n';      // Unicode line breaks are stored as '\n'
1856                 loc.linnum++;
1857             }
1858             p++;
1859             stringbuffer.writeUTF8(ch);
1860             continue;
1861         }
1862         stringbuffer.writeByte(ch);
1863     }
1864 }
1865
1866 /**************************************
      * Lex a character literal.
      *
      * Input:
      *      the first p++ skips the character p is on at entry (presumably the
      *      opening quote - confirm against the caller)
      * Params:
      *      t       token whose uns64value receives the character value
      *      wide    not referenced in this function
      * Returns:
      *      TOKwcharv for a \u escape and for non-ASCII characters below
      *      0xD800 or in 0xE000 .. 0xFFFD; TOKdcharv for \U and \& escapes and
      *      for all other non-ASCII characters; TOKcharv otherwise. After an
      *      "unterminated" error the kind determined so far is returned.
1867  */
1868
1869 TOK Lexer::charConstant(Token *t, int wide)
1870 {
1871     unsigned c;
1872     TOK tk = TOKcharv;
1873
1874     //printf("Lexer::charConstant\n");
1875     p++;
1876     c = *p++;
1877     switch (c)
1878     {
1879         case '\\':
1880             switch (*p)
1881             {
1882                 case 'u':
1883                     t->uns64value = escapeSequence();
1884                     tk = TOKwcharv;
1885                     break;
1886
1887                 case 'U':
1888                 case '&':
1889                     t->uns64value = escapeSequence();
1890                     tk = TOKdcharv;
1891                     break;
1892
1893                 default:
1894                     t->uns64value = escapeSequence();
1895                     break;
1896             }
1897             break;
1898
1899         case '\n':
1900         L1:
1901             loc.linnum++;
1902         case '\r':
                 // Line breaks must not run into the end-of-file rewind below.
                 goto Lunterminated;
     
1903         case 0:
1904         case 0x1A:
                 // c = *p++ above already moved p beyond the end-of-file
                 // character. Step back onto it so lexing cannot continue past
                 // the end of the source; escapeStringConstant rewinds in the
                 // same way.
                 p--;
1905         case '\'':
             Lunterminated:
1906             error("unterminated character constant");
1907             return tk;
1908
1909         default:
1910             if (c & 0x80)
1911             {
                     // Non-ASCII: decode the sequence starting at its lead byte.
1912                 p--;
1913                 c = decodeUTF();
1914                 p++;
1915                 if (c == LS || c == PS)
1916                     goto L1;
1917                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1918                     tk = TOKwcharv;
1919                 else
1920                     tk = TOKdcharv;
1921             }
1922             t->uns64value = c;
1923             break;
1924     }
1925
         // Exactly one character (or escape) is allowed before the closing quote.
1926     if (*p != '\'')
1927     {   error("unterminated character constant");
1928         return tk;
1929     }
1930     p++;
1931     return tk;
1932 }
1933
1934 /***************************************
1935  * Get postfix of string literal.
1936  */
1937
1938 void Lexer::stringPostfix(Token *t)
1939 {
1940     switch (*p)
1941     {
1942         case 'c':
1943         case 'w':
1944         case 'd':
1945             t->postfix = *p;
1946             p++;
1947             break;
1948
1949         default:
1950             t->postfix = 0;
1951             break;
1952     }
1953 }
1954
/***************************************
 * Read \u or \U unicode sequence
 * Input:
 *      u       'u' or 'U'
 * Returns:
 *      the value accumulated from the hex digits read
 *
 * NOTE: this function is compiled out by the surrounding #if 0.
 * It expects 8 hex digits when u is 'U' and 4 otherwise, and reports an
 * error (then stops) at the first character that is not a hex digit.
 */

#if 0
unsigned Lexer::wchar(unsigned u)
{
    unsigned value;
    unsigned n;
    unsigned char c;
    unsigned nchars;

    nchars = (u == 'U') ? 8 : 4;
    value = 0;
    for (n = 0; 1; n++)
    {
        // p is advanced before the digit count is tested, so it also
        // steps past the last digit once all nchars digits were read
        ++p;
        if (n == nchars)
            break;
        c = *p;
        if (!ishex(c))
        {   error("\\%c sequence must be followed by %d hex characters", u, nchars);
            break;
        }
        // Convert the hex digit character to its numeric value
        if (isdigit(c))
            c -= '0';
        else if (islower(c))
            c -= 'a' - 10;
        else
            c -= 'A' - 10;
        value <<= 4;
        value |= c;
    }
    return value;
}
#endif
1993
1994 /**************************************
1995  * Read in a number.
1996  * If it's an integer, store it in tok.TKutok.Vlong.
1997  *      integers can be decimal, octal or hex
1998  *      Handle the suffixes U, UL, LU, L, etc.
1999  * If it's double, store it in tok.TKutok.Vdouble.
2000  * Returns:
2001  *      TKnum
2002  *      TKdouble,...
2003  */
2004
2005 TOK Lexer::number(Token *t)
2006 {
2007     // We use a state machine to collect numbers
2008     enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
2009         STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
2010         STATE_hexh, STATE_error };
2011     enum STATE state;
2012
2013     enum FLAGS
2014     {   FLAGS_decimal  = 1,             // decimal
2015         FLAGS_unsigned = 2,             // u or U suffix
2016         FLAGS_long     = 4,             // l or L suffix
2017     };
2018     enum FLAGS flags = FLAGS_decimal;
2019
2020     int i;
2021     int base;
2022     unsigned c;
2023     unsigned char *start;
2024     TOK result;
2025
2026     //printf("Lexer::number()\n");
2027     state = STATE_initial;
2028     base = 0;
2029     stringbuffer.reset();
2030     start = p;
2031     while (1)
2032     {
2033         c = *p;
2034         switch (state)
2035         {
2036             case STATE_initial:         // opening state
2037                 if (c == '0')
2038                     state = STATE_0;
2039                 else
2040                     state = STATE_decimal;
2041                 break;
2042
2043             case STATE_0:
2044                 flags = (FLAGS) (flags & ~FLAGS_decimal);
2045                 switch (c)
2046                 {
2047 #if ZEROH
2048                     case 'H':                   // 0h
2049                     case 'h':
2050                         goto hexh;
2051 #endif
2052                     case 'X':
2053                     case 'x':
2054                         state = STATE_hex0;
2055                         break;
2056
2057                     case '.':
2058                         if (p[1] == '.')        // .. is a separate token
2059                             goto done;
2060                     case 'i':
2061                     case 'f':
2062                     case 'F':
2063                         goto real;
2064 #if ZEROH
2065                     case 'E':
2066                     case 'e':
2067                         goto case_hex;
2068 #endif
2069                     case 'B':
2070                     case 'b':
2071                         state = STATE_binary0;
2072                         break;
2073
2074                     case '0': case '1': case '2': case '3':
2075                     case '4': case '5': case '6': case '7':
2076                         state = STATE_octal;
2077                         break;
2078
2079 #if ZEROH
2080                     case '8': case '9': case 'A':
2081                     case 'C': case 'D': case 'F':
2082                     case 'a': case 'c': case 'd': case 'f':
2083                     case_hex:
2084                         state = STATE_hexh;
2085                         break;
2086 #endif
2087                     case '_':
2088                         state = STATE_octal;
2089                         p++;
2090                         continue;
2091
2092                     case 'L':
2093                         if (p[1] == 'i')
2094                             goto real;
2095                         goto done;
2096
2097                     default:
2098                         goto done;
2099                 }
2100                 break;
2101
2102             case STATE_decimal:         // reading decimal number
2103                 if (!isdigit(c))
2104                 {
2105 #if ZEROH
2106                     if (ishex(c)
2107                         || c == 'H' || c == 'h'
2108                        )
2109                         goto hexh;
2110 #endif
2111                     if (c == '_')               // ignore embedded _
2112                     {   p++;
2113                         continue;
2114                     }
2115                     if (c == '.' && p[1] != '.')
2116                         goto real;
2117                     else if (c == 'i' || c == 'f' || c == 'F' ||
2118                              c == 'e' || c == 'E')
2119                     {
2120             real:       // It's a real number. Back up and rescan as a real
2121                         p = start;
2122                         return inreal(t);
2123                     }
2124                     else if (c == 'L' && p[1] == 'i')
2125                         goto real;
2126                     goto done;
2127                 }
2128                 break;
2129
2130             case STATE_hex0:            // reading hex number
2131             case STATE_hex:
2132                 if (!ishex(c))
2133                 {
2134                     if (c == '_')               // ignore embedded _
2135                     {   p++;
2136                         continue;
2137                     }
2138                     if (c == '.' && p[1] != '.')
2139                         goto real;
2140                     if (c == 'P' || c == 'p' || c == 'i')
2141                         goto real;
2142                     if (state == STATE_hex0)
2143                         error("Hex digit expected, not '%c'", c);
2144                     goto done;
2145                 }
2146                 state = STATE_hex;
2147                 break;
2148
2149 #if ZEROH
2150             hexh:
2151                 state = STATE_hexh;
2152             case STATE_hexh:            // parse numbers like 0FFh
2153                 if (!ishex(c))
2154                 {
2155                     if (c == 'H' || c == 'h')
2156                     {
2157                         p++;
2158                         base = 16;
2159                         goto done;
2160                     }
2161                     else
2162                     {
2163                         // Check for something like 1E3 or 0E24
2164                         if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2165                             memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2166                             goto real;
2167                         error("Hex digit expected, not '%c'", c);
2168                         goto done;
2169                     }
2170                 }
2171                 break;
2172 #endif
2173
2174             case STATE_octal:           // reading octal number
2175             case STATE_octale:          // reading octal number with non-octal digits
2176                 if (!isoctal(c))
2177                 {
2178 #if ZEROH
2179                     if (ishex(c)
2180                         || c == 'H' || c == 'h'
2181                        )
2182                         goto hexh;
2183 #endif
2184                     if (c == '_')               // ignore embedded _
2185                     {   p++;
2186                         continue;
2187                     }
2188                     if (c == '.' && p[1] != '.')
2189                         goto real;
2190                     if (c == 'i')
2191                         goto real;
2192                     if (isdigit(c))
2193                     {
2194                         state = STATE_octale;
2195                     }
2196                     else
2197                         goto done;
2198                 }
2199                 break;
2200
2201             case STATE_binary0:         // starting binary number
2202             case STATE_binary:          // reading binary number
2203                 if (c != '0' && c != '1')
2204                 {
2205 #if ZEROH
2206                     if (ishex(c)
2207                         || c == 'H' || c == 'h'
2208                        )
2209                         goto hexh;
2210 #endif
2211                     if (c == '_')               // ignore embedded _
2212                     {   p++;
2213                         continue;
2214                     }
2215                     if (state == STATE_binary0)
2216                     {   error("binary digit expected");
2217                         state = STATE_error;
2218                         break;
2219                     }
2220                     else
2221                         goto done;
2222                 }
2223                 state = STATE_binary;
2224                 break;
2225
2226             case STATE_error:           // for error recovery
2227                 if (!isdigit(c))        // scan until non-digit
2228                     goto done;
2229                 break;
2230
2231             default:
2232                 assert(0);
2233         }
2234         stringbuffer.writeByte(c);
2235         p++;
2236     }
2237 done:
2238     stringbuffer.writeByte(0);          // terminate string
2239     if (state == STATE_octale)
2240         error("Octal digit expected");
2241
2242     uinteger_t n;                       // unsigned >=64 bit integer type
2243
2244     if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2245         n = stringbuffer.data[0] - '0';
2246     else
2247     {
2248         // Convert string to integer
2249 #if __DMC__
2250         errno = 0;
2251         n = strtoull((char *)stringbuffer.data,NULL,base);
2252         if (errno == ERANGE)
2253             error("integer overflow");
2254 #else
2255         // Not everybody implements strtoull()
2256         char *p = (char *)stringbuffer.data;
2257         int r = 10, d;
2258
2259         if (*p == '0')
2260         {
2261             if (p[1] == 'x' || p[1] == 'X')
2262                 p += 2, r = 16;
2263             else if (p[1] == 'b' || p[1] == 'B')
2264                 p += 2, r = 2;
2265             else if (isdigit(p[1]))
2266                 p += 1, r = 8;
2267         }
2268
2269         n = 0;
2270         while (1)
2271         {
2272             if (*p >= '0' && *p <= '9')
2273                 d = *p - '0';
2274             else if (*p >= 'a' && *p <= 'z')
2275                 d = *p - 'a' + 10;
2276             else if (*p >= 'A' && *p <= 'Z')
2277                 d = *p - 'A' + 10;
2278             else
2279                 break;
2280             if (d >= r)
2281                 break;
2282             if (n && n * r + d <= n)
2283             {
2284                 error ("integer overflow");
2285                 break;
2286             }
2287
2288             n = n * r + d;
2289             p++;
2290         }
2291 #endif
2292         if (sizeof(n) > 8 &&
2293             n > 0xFFFFFFFFFFFFFFFFULL)  // if n needs more than 64 bits
2294             error("integer overflow");
2295     }
2296
2297     // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2298     while (1)
2299     {   unsigned char f;
2300
2301         switch (*p)
2302         {   case 'U':
2303             case 'u':
2304                 f = FLAGS_unsigned;
2305                 goto L1;
2306
2307             case 'l':
2308                 if (1 || !global.params.useDeprecated)
2309                     error("'l' suffix is deprecated, use 'L' instead");
2310             case 'L':
2311                 f = FLAGS_long;
2312             L1:
2313                 p++;
2314                 if (flags & f)
2315                     error("unrecognized token");
2316                 flags = (FLAGS) (flags | f);
2317                 continue;
2318             default:
2319                 break;
2320         }
2321         break;
2322     }
2323
2324     switch (flags)
2325     {
2326         case 0:
2327             /* Octal or Hexadecimal constant.
2328              * First that fits: int, uint, long, ulong
2329              */
2330             if (n & 0x8000000000000000LL)
2331                     result = TOKuns64v;
2332             else if (n & 0xFFFFFFFF00000000LL)
2333                     result = TOKint64v;
2334             else if (n & 0x80000000)
2335                     result = TOKuns32v;
2336             else
2337                     result = TOKint32v;
2338             break;
2339
2340         case FLAGS_decimal:
2341             /* First that fits: int, long, long long
2342              */
2343             if (n & 0x8000000000000000LL)
2344             {       error("signed integer overflow");
2345                     result = TOKuns64v;
2346             }
2347             else if (n & 0xFFFFFFFF80000000LL)
2348                     result = TOKint64v;
2349             else
2350                     result = TOKint32v;
2351             break;
2352
2353         case FLAGS_unsigned:
2354         case FLAGS_decimal | FLAGS_unsigned:
2355             /* First that fits: uint, ulong
2356              */
2357             if (n & 0xFFFFFFFF00000000LL)
2358                     result = TOKuns64v;
2359             else
2360                     result = TOKuns32v;
2361             break;
2362
2363         case FLAGS_decimal | FLAGS_long:
2364             if (n & 0x8000000000000000LL)
2365             {       error("signed integer overflow");
2366                     result = TOKuns64v;
2367             }
2368             else
2369                     result = TOKint64v;
2370             break;
2371
2372         case FLAGS_long:
2373             if (n & 0x8000000000000000LL)
2374                     result = TOKuns64v;
2375             else
2376                     result = TOKint64v;
2377             break;
2378
2379         case FLAGS_unsigned | FLAGS_long:
2380         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2381             result = TOKuns64v;
2382             break;
2383
2384         default:
2385             #ifdef DEBUG
2386                 printf("%x\n",flags);
2387             #endif
2388             assert(0);
2389     }
2390     t->uns64value = n;
2391     return result;
2392 }
2393
/**************************************
 * Read in characters, converting them to real.
 * The characters of the literal are collected in stringbuffer (embedded
 * '_' characters are dropped), converted, and the value is stored in
 * t->float80value.
 * Returns:
 *      TOKfloat32v, TOKfloat64v or TOKfloat80v according to the suffix
 *      (f/F, none, l/L), or the matching TOKimaginary* value when an
 *      i/I suffix follows.
 * Bugs:
 *      Exponent overflow not detected.
 *      Too much requested precision is not detected.
 */

TOK Lexer::inreal(Token *t)
#ifdef __DMC__
__in
{
    assert(*p == '.' || isdigit(*p));
}
__out (result)
{
    switch (result)
    {
        case TOKfloat32v:
        case TOKfloat64v:
        case TOKfloat80v:
        case TOKimaginary32v:
        case TOKimaginary64v:
        case TOKimaginary80v:
            break;

        default:
            assert(0);
    }
}
__body
#endif /* __DMC__ */
{   int dblstate;
    unsigned c;
    char hex;                   // is this a hexadecimal-floating-constant?
    TOK result;

    //printf("Lexer::inreal()\n");
    stringbuffer.reset();
    dblstate = 0;
    hex = 0;
Lnext:
    while (1)
    {
        // Get next char from input
        c = *p++;
        //printf("dblstate = %d, c = '%c'\n", dblstate, c);
        // The inner loop lets a state hand the same character on to the
        // next state: 'dblstate++; continue;' re-runs the switch with c
        // unchanged.
        while (1)
        {
            switch (dblstate)
            {
                case 0:                 // opening state
                    if (c == '0')
                        dblstate = 9;
                    else if (c == '.')
                        dblstate = 3;
                    else
                        dblstate = 1;
                    break;

                case 9:                 // character right after a leading '0'
                    dblstate = 1;
                    if (c == 'X' || c == 'x')
                    {   hex++;
                        break;
                    }
                    // otherwise fall through and treat c as in state 1
                case 1:                 // digits to left of .
                case 3:                 // digits to right of .
                case 7:                 // continuing exponent digits
                    if (!isdigit(c) && !(hex && isxdigit(c)))
                    {
                        if (c == '_')
                            goto Lnext; // ignore embedded '_'
                        dblstate++;
                        continue;
                    }
                    break;

                case 2:                 // no more digits to left of .
                    if (c == '.')
                    {   dblstate++;
                        break;
                    }
                    // not a '.': fall through and look for an exponent
                case 4:                 // no more digits to right of .
                    if ((c == 'E' || c == 'e') ||
                        hex && (c == 'P' || c == 'p'))
                    {   dblstate = 5;
                        hex = 0;        // exponent is always decimal
                        break;
                    }
                    if (hex)
                        error("binary-exponent-part required");
                    goto done;

                case 5:                 // looking immediately to right of E
                    dblstate++;
                    if (c == '-' || c == '+')
                        break;
                    // no sign: fall through, c must be the first exponent digit
                case 6:                 // 1st exponent digit expected
                    if (!isdigit(c))
                        error("exponent expected");
                    dblstate++;
                    break;

                case 8:                 // past end of exponent digits
                    goto done;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
done:
    p--;                        // un-read the character that ended the number

    stringbuffer.writeByte(0);

#if _WIN32 && __DMC__
    // Temporarily force '.' as the decimal point for the conversions below
    char *save = __locale_decpoint;
    __locale_decpoint = ".";
#endif
#ifdef IN_GCC
    t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
#else
    t->float80value = strtold((char *)stringbuffer.data, NULL);
#endif
    // errno is cleared after the long double conversion above, so the
    // ERANGE test at the end only reflects the conversions made in the
    // switch below; the 'l'/'L' cases perform no conversion of their own.
    errno = 0;
    switch (*p)
    {
        case 'F':
        case 'f':
            // Result is discarded; the call is made only for its effect on errno
#ifdef IN_GCC
            real_t::parse((char *)stringbuffer.data, real_t::Float);
#else
            strtof((char *)stringbuffer.data, NULL);
#endif
            result = TOKfloat32v;
            p++;
            break;

        default:
            // No size suffix: the literal is a double; the suffix-less
            // character at p is left unconsumed
#ifdef IN_GCC
            real_t::parse((char *)stringbuffer.data, real_t::Double);
#else
            strtod((char *)stringbuffer.data, NULL);
#endif
            result = TOKfloat64v;
            break;

        case 'l':
            if (!global.params.useDeprecated)
                error("'l' suffix is deprecated, use 'L' instead");
            // fall through
        case 'L':
            result = TOKfloat80v;
            p++;
            break;
    }
    // An i/I suffix turns the real token into the imaginary one of the same size
    if (*p == 'i' || *p == 'I')
    {
        if (!global.params.useDeprecated && *p == 'I')
            error("'I' suffix is deprecated, use 'i' instead");
        p++;
        switch (result)
        {
            case TOKfloat32v:
                result = TOKimaginary32v;
                break;
            case TOKfloat64v:
                result = TOKimaginary64v;
                break;
            case TOKfloat80v:
                result = TOKimaginary80v;
                break;
        }
    }
#if _WIN32 && __DMC__
    __locale_decpoint = save;
#endif
    if (errno == ERANGE)
        error("number is not representable");
    return result;
}
2574
2575 /*********************************************
2576  * Do pragma.
2577  * Currently, the only pragma supported is:
2578  *      #line linnum [filespec]
2579  */
2580
2581 void Lexer::pragma()
2582 {
2583     Token tok;
2584     int linnum;
2585     char *filespec = NULL;
2586     Loc loc = this->loc;
2587
2588     while (isblank(*p)) p++;
2589     if (*p == '\n')
2590         goto Lerr;
2591
2592     scan(&tok);
2593     if (tok.value != TOKidentifier || tok.ident != Id::line)
2594         goto Lerr;
2595
2596     scan(&tok);
2597     if (tok.value == TOKint32v || tok.value == TOKint64v)
2598         linnum = tok.uns64value - 1;
2599     else
2600         goto Lerr;
2601
2602     while (1)
2603     {
2604         switch (*p)
2605         {
2606             case 0:
2607             case 0x1A:
2608             case '\n':
2609             Lnewline:
2610                 this->loc.linnum = linnum;
2611                 if (filespec)
2612                     this->loc.filename = filespec;
2613                 return;
2614
2615             case '\r':
2616                 p++;
2617                 if (*p != '\n')
2618                 {   p--;
2619                     goto Lnewline;
2620                 }
2621                 continue;
2622
2623             case ' ':
2624             case '\t':
2625             case '\v':
2626             case '\f':
2627                 p++;
2628                 continue;                       // skip white space
2629
2630             case '_':
2631                 if (mod && memcmp(p, "__FILE__", 8) == 0)
2632                 {
2633                     p += 8;
2634                     filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2635                 }
2636                 continue;
2637
2638             case '"':
2639                 if (filespec)
2640                     goto Lerr;
2641                 stringbuffer.reset();
2642                 p++;
2643                 while (1)
2644                 {   unsigned c;
2645
2646                     c = *p;
2647                     switch (c)
2648                     {
2649                         case '\n':
2650                         case '\r':
2651                         case 0:
2652                         case 0x1A:
2653                             goto Lerr;
2654
2655                         case '"':
2656                             stringbuffer.writeByte(0);
2657                             filespec = mem.strdup((char *)stringbuffer.data);
2658                             p++;
2659                             break;
2660
2661                         default:
2662                             if (c & 0x80)
2663                             {   unsigned u = decodeUTF();
2664                                 if (u == PS || u == LS)
2665                                     goto Lerr;
2666                             }
2667                             stringbuffer.writeByte(c);
2668                             p++;
2669                             continue;
2670                     }
2671                     break;
2672                 }
2673                 continue;
2674
2675             default:
2676                 if (*p & 0x80)
2677                 {   unsigned u = decodeUTF();
2678                     if (u == PS || u == LS)
2679                         goto Lnewline;
2680                 }
2681                 goto Lerr;
2682         }
2683     }
2684
2685 Lerr:
2686     error(loc, "#line integer [\"filespec\"]\\n expected");
2687 }
2688
2689
2690 /********************************************
2691  * Decode UTF character.
2692  * Issue error messages for invalid sequences.
2693  * Return decoded character, advance p to last character in UTF sequence.
2694  */
2695
2696 unsigned Lexer::decodeUTF()
2697 {
2698     dchar_t u;
2699     unsigned char c;
2700     unsigned char *s = p;
2701     size_t len;
2702     size_t idx;
2703     char *msg;
2704
2705     c = *s;
2706     assert(c & 0x80);
2707
2708     // Check length of remaining string up to 6 UTF-8 characters
2709     for (len = 1; len < 6 && s[len]; len++)
2710         ;
2711
2712     idx = 0;
2713     msg = utf_decodeChar(s, len, &idx, &u);
2714     p += idx - 1;
2715     if (msg)
2716     {
2717         error("%s", msg);
2718     }
2719     return u;
2720 }
2721
2722
2723 /***************************************************
2724  * Parse doc comment embedded between t->ptr and p.
2725  * Remove trailing blanks and tabs from lines.
2726  * Replace all newlines with \n.
2727  * Remove leading comment character from each line.
2728  * Decide if it's a lineComment or a blockComment.
2729  * Append to previous one for this token.
2730  */
2731
2732 void Lexer::getDocComment(Token *t, unsigned lineComment)
2733 {
2734     OutBuffer buf;
2735     unsigned char ct = t->ptr[2];
2736     unsigned char *q = t->ptr + 3;      // start of comment text
2737     int linestart = 0;
2738
2739     unsigned char *qend = p;
2740     if (ct == '*' || ct == '+')
2741         qend -= 2;
2742
2743     /* Scan over initial row of ****'s or ++++'s or ////'s
2744      */
2745     for (; q < qend; q++)
2746     {
2747         if (*q != ct)
2748             break;
2749     }
2750
2751     /* Remove trailing row of ****'s or ++++'s
2752      */
2753     if (ct != '/')
2754     {
2755         for (; q < qend; qend--)
2756         {
2757             if (qend[-1] != ct)
2758                 break;
2759         }
2760     }
2761
2762     for (; q < qend; q++)
2763     {
2764         unsigned char c = *q;
2765
2766         switch (c)
2767         {
2768             case '*':
2769             case '+':
2770                 if (linestart && c == ct)
2771                 {   linestart = 0;
2772                     /* Trim preceding whitespace up to preceding \n
2773                      */
2774                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2775                         buf.offset--;
2776                     continue;
2777                 }
2778                 break;
2779
2780             case ' ':
2781             case '\t':
2782                 break;
2783
2784             case '\r':
2785                 if (q[1] == '\n')
2786                     continue;           // skip the \r
2787                 goto Lnewline;
2788
2789             default:
2790                 if (c == 226)
2791                 {
2792                     // If LS or PS
2793                     if (q[1] == 128 &&
2794                         (q[2] == 168 || q[2] == 169))
2795                     {
2796                         q += 2;
2797                         goto Lnewline;
2798                     }
2799                 }
2800                 linestart = 0;
2801                 break;
2802
2803             Lnewline:
2804                 c = '\n';               // replace all newlines with \n
2805             case '\n':
2806                 linestart = 1;
2807
2808                 /* Trim trailing whitespace
2809                  */
2810                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2811                     buf.offset--;
2812
2813                 break;
2814         }
2815         buf.writeByte(c);
2816     }
2817
2818     // Always end with a newline
2819     if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2820         buf.writeByte('\n');
2821
2822     buf.writeByte(0);
2823
2824     // It's a line comment if the start of the doc comment comes
2825     // after other non-whitespace on the same line.
2826     unsigned char** dc = (lineComment && anyToken)
2827                          ? &t->lineComment
2828                          : &t->blockComment;
2829
2830     // Combine with previous doc comment, if any
2831     if (*dc)
2832         *dc = combineComments(*dc, (unsigned char *)buf.data);
2833     else
2834         *dc = (unsigned char *)buf.extractData();
2835 }
2836
2837 /********************************************
2838  * Combine two document comments into one.
2839  */
2840
2841 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2842 {
2843     unsigned char *c = c2;
2844
2845     if (c1)
2846     {   c = c1;
2847         if (c2)
2848         {   size_t len1 = strlen((char *)c1);
2849             size_t len2 = strlen((char *)c2);
2850
2851             c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2852             memcpy(c, c1, len1);
2853             c[len1] = '\n';
2854             memcpy(c + len1 + 1, c2, len2);
2855             c[len1 + 1 + len2] = 0;
2856         }
2857     }
2858     return c;
2859 }
2860
2861 /********************************************
2862  * Create an identifier in the string table.
2863  */
2864
2865 Identifier *Lexer::idPool(const char *s)
2866 {
2867     size_t len = strlen(s);
2868     StringValue *sv = stringtable.update(s, len);
2869     Identifier *id = (Identifier *) sv->ptrvalue;
2870     if (!id)
2871     {
2872         id = new Identifier(sv->lstring.string, TOKidentifier);
2873         sv->ptrvalue = id;
2874     }
2875     return id;
2876 }
2877
2878 /*********************************************
2879  * Create a unique identifier using the prefix s.
2880  */
2881
2882 Identifier *Lexer::uniqueId(const char *s, int num)
2883 {   char buffer[32];
2884     size_t slen = strlen(s);
2885
2886     assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2887     sprintf(buffer, "%s%d", s, num);
2888     return idPool(buffer);
2889 }
2890
2891 Identifier *Lexer::uniqueId(const char *s)
2892 {
2893     static int num;
2894     return uniqueId(s, ++num);
2895 }
2896
/****************************************
 * One entry of the keyword table: a keyword's spelling together with
 * the token value it maps to.
 */

struct Keyword
{   char *name;         // spelling of the keyword
    enum TOK value;     // token value for that spelling
};
2904
// Table pairing every keyword spelling with its token value.
// Lexer::initKeywords() walks this table in order to register the keywords
// in the string table and to seed Token::tochars[]; Token::isKeyword()
// searches it by token value.
static Keyword keywords[] =
{
//    { "",             TOK     },

    {   "this",         TOKthis         },
    {   "super",        TOKsuper        },
    {   "assert",       TOKassert       },
    {   "null",         TOKnull         },
    {   "true",         TOKtrue         },
    {   "false",        TOKfalse        },
    {   "cast",         TOKcast         },
    {   "new",          TOKnew          },
    {   "delete",       TOKdelete       },
    {   "throw",        TOKthrow        },
    {   "module",       TOKmodule       },
    {   "pragma",       TOKpragma       },
    {   "typeof",       TOKtypeof       },
    {   "typeid",       TOKtypeid       },

    {   "template",     TOKtemplate     },

    {   "void",         TOKvoid         },
    {   "byte",         TOKint8         },
    {   "ubyte",        TOKuns8         },
    {   "short",        TOKint16        },
    {   "ushort",       TOKuns16        },
    {   "int",          TOKint32        },
    {   "uint",         TOKuns32        },
    {   "long",         TOKint64        },
    {   "ulong",        TOKuns64        },
    {   "cent",         TOKcent,        },
    {   "ucent",        TOKucent,       },
    {   "float",        TOKfloat32      },
    {   "double",       TOKfloat64      },
    {   "real",         TOKfloat80      },

    {   "bool",         TOKbool         },
    {   "char",         TOKchar         },
    {   "wchar",        TOKwchar        },
    {   "dchar",        TOKdchar        },

    {   "ifloat",       TOKimaginary32  },
    {   "idouble",      TOKimaginary64  },
    {   "ireal",        TOKimaginary80  },

    {   "cfloat",       TOKcomplex32    },
    {   "cdouble",      TOKcomplex64    },
    {   "creal",        TOKcomplex80    },

    {   "delegate",     TOKdelegate     },
    {   "function",     TOKfunction     },

    {   "is",           TOKis           },
    {   "if",           TOKif           },
    {   "else",         TOKelse         },
    {   "while",        TOKwhile        },
    {   "for",          TOKfor          },
    {   "do",           TOKdo           },
    {   "switch",       TOKswitch       },
    {   "case",         TOKcase         },
    {   "default",      TOKdefault      },
    {   "break",        TOKbreak        },
    {   "continue",     TOKcontinue     },
    {   "synchronized", TOKsynchronized },
    {   "return",       TOKreturn       },
    {   "goto",         TOKgoto         },
    {   "try",          TOKtry          },
    {   "catch",        TOKcatch        },
    {   "finally",      TOKfinally      },
    {   "with",         TOKwith         },
    {   "asm",          TOKasm          },
    {   "foreach",      TOKforeach      },
    {   "foreach_reverse",      TOKforeach_reverse      },
    {   "reversed",     TOKreversed     },
    {   "scope",        TOKscope        },

    {   "struct",       TOKstruct       },
    {   "class",        TOKclass        },
    {   "interface",    TOKinterface    },
    {   "union",        TOKunion        },
    {   "enum",         TOKenum         },
    {   "import",       TOKimport       },
    {   "mixin",        TOKmixin        },
    {   "static",       TOKstatic       },
    {   "final",        TOKfinal        },
    {   "const",        TOKconst        },
    {   "typedef",      TOKtypedef      },
    {   "alias",        TOKalias        },
    {   "override",     TOKoverride     },
    {   "abstract",     TOKabstract     },
    {   "volatile",     TOKvolatile     },
    {   "debug",        TOKdebug        },
    {   "deprecated",   TOKdeprecated   },
    {   "in",           TOKin           },
    {   "out",          TOKout          },
    {   "inout",        TOKinout        },
    {   "lazy",         TOKlazy         },
    {   "auto",         TOKauto         },

    {   "align",        TOKalign        },
    {   "extern",       TOKextern       },
    {   "private",      TOKprivate      },
    {   "package",      TOKpackage      },
    {   "protected",    TOKprotected    },
    {   "public",       TOKpublic       },
    {   "export",       TOKexport       },

    {   "body",         TOKbody         },
    {   "invariant",    TOKinvariant    },
    {   "unittest",     TOKunittest     },
    {   "version",      TOKversion      },
    //{ "manifest",     TOKmanifest     },

    // Added after 1.0
    {   "ref",          TOKref          },
    {   "macro",        TOKmacro        },


    // TAL
    // "and", "or" and "not" reuse the token values whose operator
    // spellings are "&&", "||" and "!".
    {   "and",          TOKandand       },
    {   "or",           TOKoror         },
    {   "not",          TOKnot          },
    {   "extends",      TOKextends      },
    {   "implements",   TOKimplements   },
    {   "log_error",    TOKlog_error    },
    {   "log_warning",  TOKlog_warning  },
    {   "log_info",     TOKlog_info     },
    {   "log_trace",    TOKlog_trace    },
#if V2
    // Entries compiled in only when V2 is set.
    {   "pure",         TOKpure         },
    {   "nothrow",      TOKnothrow      },
    {   "__thread",     TOKtls          },
    {   "__traits",     TOKtraits       },
    {   "__overloadset", TOKoverloadset },
    {   "__FILE__",     TOKfile         },
    {   "__LINE__",     TOKline         },
#endif
};
3043
3044 int Token::isKeyword()
3045 {
3046     for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3047     {
3048         if (keywords[u].value == value)
3049             return 1;
3050     }
3051     return 0;
3052 }
3053
3054 void Lexer::initKeywords()
3055 {   StringValue *sv;
3056     unsigned u;
3057     enum TOK v;
3058     unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3059
3060     if (global.params.Dversion == 1)
3061         nkeywords -= 2;
3062
3063     cmtable_init();
3064
3065     for (u = 0; u < nkeywords; u++)
3066     {   char *s;
3067
3068         //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3069         s = keywords[u].name;
3070         v = keywords[u].value;
3071         sv = stringtable.insert(s, strlen(s));
3072         sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3073
3074         //printf("tochars[%d] = '%s'\n",v, s);
3075         Token::tochars[v] = s;
3076     }
3077
3078     Token::tochars[TOKeof]              = "EOF";
3079     Token::tochars[TOKlcurly]           = "{";
3080     Token::tochars[TOKrcurly]           = "}";
3081     Token::tochars[TOKlparen]           = "(";
3082     Token::tochars[TOKrparen]           = ")";
3083     Token::tochars[TOKlbracket]         = "[";
3084     Token::tochars[TOKrbracket]         = "]";
3085     Token::tochars[TOKsemicolon]        = ";";
3086     Token::tochars[TOKcolon]            = ":";
3087     Token::tochars[TOKcomma]            = ",";
3088     Token::tochars[TOKdot]              = ".";
3089     Token::tochars[TOKxor]              = "^";
3090     Token::tochars[TOKxorass]           = "^=";
3091     Token::tochars[TOKassign]           = "=";
3092     Token::tochars[TOKconstruct]        = "=";
3093 #if V2
3094     Token::tochars[TOKblit]             = "=";
3095 #endif
3096     Token::tochars[TOKlt]               = "<";
3097     Token::tochars[TOKgt]               = ">";
3098     Token::tochars[TOKle]               = "<=";
3099     Token::tochars[TOKge]               = ">=";
3100     Token::tochars[TOKequal]            = "==";
3101     Token::tochars[TOKnotequal]         = "!=";
3102     Token::tochars[TOKnotidentity]      = "!is";
3103     Token::tochars[TOKtobool]           = "!!";
3104     Token::tochars[TOKat]               = "@";
3105
3106     Token::tochars[TOKunord]            = "!<>=";
3107     Token::tochars[TOKue]               = "!<>";
3108     Token::tochars[TOKlg]               = "<>";
3109     Token::tochars[TOKleg]              = "<>=";
3110     Token::tochars[TOKule]              = "!>";
3111     Token::tochars[TOKul]               = "!>=";
3112     Token::tochars[TOKuge]              = "!<";
3113     Token::tochars[TOKug]               = "!<=";
3114
3115     Token::tochars[TOKnot]              = "!";
3116     Token::tochars[TOKtobool]           = "!!";
3117     Token::tochars[TOKshl]              = "<<";
3118     Token::tochars[TOKshr]              = ">>";
3119     Token::tochars[TOKushr]             = ">>>";
3120     Token::tochars[TOKadd]              = "+";
3121     Token::tochars[TOKmin]              = "-";
3122     Token::tochars[TOKmul]              = "*";
3123     Token::tochars[TOKdiv]              = "/";
3124     Token::tochars[TOKmod]              = "%";
3125     Token::tochars[TOKslice]            = "..";
3126     Token::tochars[TOKdotdotdot]        = "...";
3127     Token::tochars[TOKand]              = "&";
3128     Token::tochars[TOKandand]           = "&&";
3129     Token::tochars[TOKor]               = "|";
3130     Token::tochars[TOKoror]             = "||";
3131     Token::tochars[TOKarray]            = "[]";
3132     Token::tochars[TOKindex]            = "[i]";
3133     Token::tochars[TOKaddress]          = "&";
3134     Token::tochars[TOKstar]             = "*";
3135     Token::tochars[TOKtilde]            = "~";
3136     Token::tochars[TOKdollar]           = "$";
3137     Token::tochars[TOKcast]             = "cast";
3138     Token::tochars[TOKplusplus]         = "++";
3139     Token::tochars[TOKminusminus]       = "--";
3140     Token::tochars[TOKtype]             = "type";
3141     Token::tochars[TOKquestion]         = "?";
3142     Token::tochars[TOKneg]              = "-";
3143     Token::tochars[TOKuadd]             = "+";
3144     Token::tochars[TOKvar]              = "var";
3145     Token::tochars[TOKaddass]           = "+=";
3146     Token::tochars[TOKminass]           = "-=";
3147     Token::tochars[TOKmulass]           = "*=";
3148     Token::tochars[TOKdivass]           = "/=";
3149     Token::tochars[TOKmodass]           = "%=";
3150     Token::tochars[TOKshlass]           = "<<=";
3151     Token::tochars[TOKshrass]           = ">>=";
3152     Token::tochars[TOKushrass]          = ">>>=";
3153     Token::tochars[TOKandass]           = "&=";
3154     Token::tochars[TOKorass]            = "|=";
3155     Token::tochars[TOKcatass]           = "~=";
3156     Token::tochars[TOKcat]              = "~";
3157     Token::tochars[TOKcall]             = "call";
3158     Token::tochars[TOKidentity]         = "is";
3159     Token::tochars[TOKnotidentity]      = "!is";
3160     Token::tochars[TOKendline]          = "\\n";
3161
3162     Token::tochars[TOKorass]            = "|=";
3163     Token::tochars[TOKidentifier]       = "identifier";
3164
3165      // For debugging
3166     Token::tochars[TOKdotexp]           = "dotexp";
3167     Token::tochars[TOKdotti]            = "dotti";
3168     Token::tochars[TOKdotvar]           = "dotvar";
3169     Token::tochars[TOKdottype]          = "dottype";
3170     Token::tochars[TOKsymoff]           = "symoff";
3171     Token::tochars[TOKtypedot]          = "typedot";
3172     Token::tochars[TOKarraylength]      = "arraylength";
3173     Token::tochars[TOKarrayliteral]     = "arrayliteral";
3174     Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3175     Token::tochars[TOKstructliteral]    = "structliteral";
3176     Token::tochars[TOKstring]           = "string";
3177     Token::tochars[TOKdsymbol]          = "symbol";
3178     Token::tochars[TOKtuple]            = "tuple";
3179     Token::tochars[TOKdeclaration]      = "declaration";
3180     Token::tochars[TOKdottd]            = "dottd";
3181     Token::tochars[TOKlogger]           = "logger";
3182     Token::tochars[TOKon_scope_exit]    = "scope(exit)";
3183 }