dlls/vbscript/lex.c

   1 /*
   2  * Copyright 2011 Jacek Caban for CodeWeavers
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  17  */
  18
  19 #include <assert.h>
  20 #include <limits.h>
  21 #include <math.h>
  22
  23 #include "vbscript.h"
  24 #include "parse.h"
  25 #include "parser.tab.h"
  26
  27 #include "wine/debug.h"
  28
  29 WINE_DEFAULT_DEBUG_CHANNEL(vbscript);
  30
  31 static const struct {
  32     const WCHAR *word;
  33     int token;
  34 } keywords[] = {
  35     {L"and",       tAND},
  36     {L"byref",     tBYREF},
  37     {L"byval",     tBYVAL},
  38     {L"call",      tCALL},
  39     {L"case",      tCASE},
  40     {L"class",     tCLASS},
  41     {L"const",     tCONST},
  42     {L"default",   tDEFAULT},
  43     {L"dim",       tDIM},
  44     {L"do",        tDO},
  45     {L"each",      tEACH},
  46     {L"else",      tELSE},
  47     {L"elseif",    tELSEIF},
  48     {L"empty",     tEMPTY},
  49     {L"end",       tEND},
  50     {L"eqv",       tEQV},
  51     {L"error",     tERROR},
  52     {L"exit",      tEXIT},
  53     {L"explicit",  tEXPLICIT},
  54     {L"false",     tFALSE},
  55     {L"for",       tFOR},
  56     {L"function",  tFUNCTION},
  57     {L"get",       tGET},
  58     {L"goto",      tGOTO},
  59     {L"if",        tIF},
  60     {L"imp",       tIMP},
  61     {L"in",        tIN},
  62     {L"is",        tIS},
  63     {L"let",       tLET},
  64     {L"loop",      tLOOP},
  65     {L"me",        tME},
  66     {L"mod",       tMOD},
  67     {L"new",       tNEW},
  68     {L"next",      tNEXT},
  69     {L"not",       tNOT},
  70     {L"nothing",   tNOTHING},
  71     {L"null",      tNULL},
  72     {L"on",        tON},
  73     {L"option",    tOPTION},
  74     {L"or",        tOR},
  75     {L"preserve",  tPRESERVE},
  76     {L"private",   tPRIVATE},
  77     {L"property",  tPROPERTY},
  78     {L"public",    tPUBLIC},
  79     {L"redim",     tREDIM},
  80     {L"rem",       tREM},
  81     {L"resume",    tRESUME},
  82     {L"select",    tSELECT},
  83     {L"set",       tSET},
  84     {L"step",      tSTEP},
  85     {L"stop",      tSTOP},
  86     {L"sub",       tSUB},
  87     {L"then",      tTHEN},
  88     {L"to",        tTO},
  89     {L"true",      tTRUE},
  90     {L"until",     tUNTIL},
  91     {L"wend",      tWEND},
  92     {L"while",     tWHILE},
  93     {L"with",      tWITH},
  94     {L"xor",       tXOR}
  95 };
  96
  97 static inline BOOL is_identifier_char(WCHAR c)
  98 {
  99     return iswalnum(c) || c == '_';
 100 }
 101
 102 static int check_keyword(parser_ctx_t *ctx, const WCHAR *word, const WCHAR **lval)
 103 {
 104     const WCHAR *p1 = ctx->ptr;
 105     const WCHAR *p2 = word;
 106     WCHAR c;
 107
 108     while(p1 < ctx->end && *p2) {
 109         c = towlower(*p1);
 110         if(c != *p2)
 111             return c - *p2;
 112         p1++;
 113         p2++;
 114     }
 115
 116     if(*p2 || (p1 < ctx->end && is_identifier_char(*p1)))
 117         return 1;
 118
 119     ctx->ptr = p1;
 120     *lval = word;
 121     return 0;
 122 }
 123
 124 static int check_keywords(parser_ctx_t *ctx, const WCHAR **lval)
 125 {
 126     int min = 0, max = ARRAY_SIZE(keywords)-1, r, i;
 127
 128     while(min <= max) {
 129         i = (min+max)/2;
 130
 131         r = check_keyword(ctx, keywords[i].word, lval);
 132         if(!r)
 133             return keywords[i].token;
 134
 135         if(r > 0)
 136             min = i+1;
 137         else
 138             max = i-1;
 139     }
 140
 141     return 0;
 142 }
 143
 144 static int parse_identifier(parser_ctx_t *ctx, const WCHAR **ret)
 145 {
 146     const WCHAR *ptr = ctx->ptr++;
 147     WCHAR *str;
 148     int len;
 149
 150     while(ctx->ptr < ctx->end && is_identifier_char(*ctx->ptr))
 151         ctx->ptr++;
 152     len = ctx->ptr-ptr;
 153
 154     str = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
 155     if(!str)
 156         return 0;
 157
 158     memcpy(str, ptr, (len+1)*sizeof(WCHAR));
 159     str[len] = 0;
 160     *ret = str;
 161     return tIdentifier;
 162 }
 163
 164 static int parse_string_literal(parser_ctx_t *ctx, const WCHAR **ret)
 165 {
 166     const WCHAR *ptr = ++ctx->ptr;
 167     WCHAR *rptr;
 168     int len = 0;
 169
 170     while(ctx->ptr < ctx->end) {
 171         if(*ctx->ptr == '\n' || *ctx->ptr == '\r') {
 172             FIXME("newline inside string literal\n");
 173             return 0;
 174         }
 175
 176        if(*ctx->ptr == '"') {
 177             if(ctx->ptr[1] != '"')
 178                 break;
 179             len--;
 180             ctx->ptr++;
 181         }
 182         ctx->ptr++;
 183     }
 184
 185     if(ctx->ptr == ctx->end) {
 186         FIXME("unterminated string literal\n");
 187         return 0;
 188     }
 189
 190     len += ctx->ptr-ptr;
 191
 192     *ret = rptr = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
 193     if(!rptr)
 194         return 0;
 195
 196     while(ptr < ctx->ptr) {
 197         if(*ptr == '"')
 198             ptr++;
 199         *rptr++ = *ptr++;
 200     }
 201
 202     *rptr = 0;
 203     ctx->ptr++;
 204     return tString;
 205 }
 206
 207 static int parse_date_literal(parser_ctx_t *ctx, DATE *ret)
 208 {
 209     const WCHAR *ptr = ++ctx->ptr;
 210     WCHAR *rptr;
 211     int len = 0;
 212     HRESULT res;
 213
 214     while(ctx->ptr < ctx->end) {
 215         if(*ctx->ptr == '\n' || *ctx->ptr == '\r') {
 216             FIXME("newline inside date literal\n");
 217             return 0;
 218         }
 219
 220        if(*ctx->ptr == '#')
 221             break;
 222        ctx->ptr++;
 223     }
 224
 225     if(ctx->ptr == ctx->end) {
 226         FIXME("unterminated date literal\n");
 227         return 0;
 228     }
 229
 230     len += ctx->ptr-ptr;
 231
 232     rptr = malloc((len+1)*sizeof(WCHAR));
 233     if(!rptr)
 234         return 0;
 235
 236     memcpy( rptr, ptr, len * sizeof(WCHAR));
 237     rptr[len] = 0;
 238     res = VarDateFromStr(rptr, ctx->lcid, 0, ret);
 239     free(rptr);
 240     if (FAILED(res)) {
 241         FIXME("Invalid date literal\n");
 242         return 0;
 243     }
 244
 245     ctx->ptr++;
 246     return tDate;
 247 }
 248
 249 static int parse_numeric_literal(parser_ctx_t *ctx, void **ret)
 250 {
 251     BOOL use_int = TRUE;
 252     LONGLONG d = 0, hlp;
 253     int exp = 0;
 254     double r;
 255
 256     if(*ctx->ptr == '0' && !('0' <= ctx->ptr[1] && ctx->ptr[1] <= '9') && ctx->ptr[1] != '.')
 257         return *ctx->ptr++;
 258
 259     while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
 260         hlp = d*10 + *(ctx->ptr++) - '0';
 261         if(d>MAXLONGLONG/10 || hlp<0) {
 262             exp++;
 263             break;
 264         }
 265         else
 266             d = hlp;
 267     }
 268     while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
 269         exp++;
 270         ctx->ptr++;
 271     }
 272
 273     if(*ctx->ptr == '.') {
 274         use_int = FALSE;
 275         ctx->ptr++;
 276
 277         while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
 278             hlp = d*10 + *(ctx->ptr++) - '0';
 279             if(d>MAXLONGLONG/10 || hlp<0)
 280                 break;
 281
 282             d = hlp;
 283             exp--;
 284         }
 285         while(ctx->ptr < ctx->end && is_digit(*ctx->ptr))
 286             ctx->ptr++;
 287     }
 288
 289     if(*ctx->ptr == 'e' || *ctx->ptr == 'E') {
 290         int e = 0, sign = 1;
 291
 292         ctx->ptr++;
 293         if(*ctx->ptr == '-') {
 294             ctx->ptr++;
 295             sign = -1;
 296         }else if(*ctx->ptr == '+') {
 297             ctx->ptr++;
 298         }
 299
 300         if(!is_digit(*ctx->ptr)) {
 301             FIXME("Invalid numeric literal\n");
 302             return 0;
 303         }
 304
 305         use_int = FALSE;
 306
 307         do {
 308             e = e*10 + *(ctx->ptr++) - '0';
 309             if(sign == -1 && -e+exp < -(INT_MAX/100)) {
 310                 /* The literal will be rounded to 0 anyway. */
 311                 while(is_digit(*ctx->ptr))
 312                     ctx->ptr++;
 313                 *(double*)ret = 0;
 314                 return tDouble;
 315             }
 316
 317             if(sign*e + exp > INT_MAX/100) {
 318                 FIXME("Invalid numeric literal\n");
 319                 return 0;
 320             }
 321         } while(is_digit(*ctx->ptr));
 322
 323         exp += sign*e;
 324     }
 325
 326     if(use_int && (LONG)d == d) {
 327         *(LONG*)ret = d;
 328         return tInt;
 329     }
 330
 331     r = exp>=0 ? d*pow(10, exp) : d/pow(10, -exp);
 332     if(isinf(r)) {
 333         FIXME("Invalid numeric literal\n");
 334         return 0;
 335     }
 336
 337     *(double*)ret = r;
 338     return tDouble;
 339 }
 340
 341 static int hex_to_int(WCHAR c)
 342 {
 343     if('0' <= c && c <= '9')
 344         return c-'0';
 345     if('a' <= c && c <= 'f')
 346         return c+10-'a';
 347     if('A' <= c && c <= 'F')
 348         return c+10-'A';
 349     return -1;
 350 }
 351
 352 static int parse_hex_literal(parser_ctx_t *ctx, LONG *ret)
 353 {
 354     const WCHAR *begin = ctx->ptr;
 355     unsigned l = 0, d;
 356
 357     while((d = hex_to_int(*++ctx->ptr)) != -1)
 358         l = l*16 + d;
 359
 360     if(begin + 9 /* max digits+1 */ < ctx->ptr) {
 361         FIXME("invalid literal\n");
 362         return 0;
 363     }
 364
 365     if(*ctx->ptr == '&') {
 366         ctx->ptr++;
 367         *ret = l;
 368     }else {
 369         *ret = l == (UINT16)l ? (INT16)l : l;
 370     }
 371     return tInt;
 372 }
 373
 374 static void skip_spaces(parser_ctx_t *ctx)
 375 {
 376     while(*ctx->ptr == ' ' || *ctx->ptr == '\t')
 377         ctx->ptr++;
 378 }
 379
 380 static int comment_line(parser_ctx_t *ctx)
 381 {
 382     ctx->ptr = wcspbrk(ctx->ptr, L"\n\r");
 383     if(ctx->ptr)
 384         ctx->ptr++;
 385     else
 386         ctx->ptr = ctx->end;
 387     return tNL;
 388 }
 389
 390 static int parse_next_token(void *lval, unsigned *loc, parser_ctx_t *ctx)
 391 {
 392     WCHAR c;
 393
 394     skip_spaces(ctx);
 395     *loc = ctx->ptr - ctx->code;
 396     if(ctx->ptr == ctx->end)
 397         return ctx->last_token == tNL ? 0 : tNL;
 398
 399     c = *ctx->ptr;
 400
 401     if('0' <= c && c <= '9')
 402         return parse_numeric_literal(ctx, lval);
 403
 404     if(iswalpha(c)) {
 405         int ret = 0;
 406         if(ctx->last_token != '.' && ctx->last_token != tDOT)
 407             ret = check_keywords(ctx, lval);
 408         if(!ret)
 409             return parse_identifier(ctx, lval);
 410         if(ret != tREM)
 411             return ret;
 412         c = '\'';
 413     }
 414
 415     switch(c) {
 416     case '\n':
 417     case '\r':
 418         ctx->ptr++;
 419         return tNL;
 420     case '\'':
 421         return comment_line(ctx);
 422     case ':':
 423     case ')':
 424     case ',':
 425     case '+':
 426     case '*':
 427     case '/':
 428     case '^':
 429     case '\\':
 430     case '_':
 431         return *ctx->ptr++;
 432     case '.':
 433         /*
 434          * We need to distinguish between '.' used as part of a member expression and
 435          * a beginning of a dot expression (a member expression accessing with statement
 436          * expression) and a floating point number like ".2" .
 437          */
 438         c = ctx->ptr > ctx->code ? ctx->ptr[-1] : '\n';
 439         if (is_identifier_char(c) || c == ')') {
 440             ctx->ptr++;
 441             return '.';
 442         }
 443         c = ctx->ptr[1];
 444         if('0' <= c && c <= '9')
 445             return parse_numeric_literal(ctx, lval);
 446         ctx->ptr++;
 447         return tDOT;
 448     case '-':
 449         if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '>')
 450             return comment_line(ctx);
 451         ctx->ptr++;
 452         return '-';
 453     case '(':
 454         /* NOTE:
 455          * We resolve empty brackets in lexer instead of parser to avoid complex conflicts
 456          * in call statement special case |f()| without 'call' keyword
 457          */
 458         ctx->ptr++;
 459         skip_spaces(ctx);
 460         if(*ctx->ptr == ')') {
 461             ctx->ptr++;
 462             return tEMPTYBRACKETS;
 463         }
 464         /*
 465          * Parser can't predict if bracket is part of argument expression or an argument
 466          * in call expression. We predict it here instead.
 467          */
 468         if(ctx->last_token == tIdentifier || ctx->last_token == ')')
 469             return '(';
 470         return tEXPRLBRACKET;
 471     case '"':
 472         return parse_string_literal(ctx, lval);
 473     case '#':
 474         return parse_date_literal(ctx, lval);
 475     case '&':
 476         if((*++ctx->ptr == 'h' || *ctx->ptr == 'H') && hex_to_int(ctx->ptr[1]) != -1)
 477             return parse_hex_literal(ctx, lval);
 478         return '&';
 479     case '=':
 480         switch(*++ctx->ptr) {
 481         case '<':
 482             ctx->ptr++;
 483             return tLTEQ;
 484         case '>':
 485             ctx->ptr++;
 486             return tGTEQ;
 487         }
 488         return '=';
 489     case '<':
 490         switch(*++ctx->ptr) {
 491         case '>':
 492             ctx->ptr++;
 493             return tNEQ;
 494         case '=':
 495             ctx->ptr++;
 496             return tLTEQ;
 497         case '!':
 498             if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '-')
 499                 return comment_line(ctx);
 500         }
 501         return '<';
 502     case '>':
 503         switch(*++ctx->ptr) {
 504         case '=':
 505             ctx->ptr++;
 506             return tGTEQ;
 507         case '<':
 508             ctx->ptr++;
 509             return tNEQ;
 510         }
 511         return '>';
 512     default:
 513         FIXME("Unhandled char %c in %s\n", *ctx->ptr, debugstr_w(ctx->ptr));
 514     }
 515
 516     return 0;
 517 }
 518
 519 int parser_lex(void *lval, unsigned *loc, parser_ctx_t *ctx)
 520 {
 521     int ret;
 522
 523     if (ctx->last_token == tEXPRESSION)
 524     {
 525         ctx->last_token = tNL;
 526         return tEXPRESSION;
 527     }
 528
 529     while(1) {
 530         ret = parse_next_token(lval, loc, ctx);
 531         if(ret == '_') {
 532             skip_spaces(ctx);
 533             if(*ctx->ptr != '\n' && *ctx->ptr != '\r') {
 534                 FIXME("'_' not followed by newline\n");
 535                 return 0;
 536             }
 537             if(*ctx->ptr == '\r')
 538                 ctx->ptr++;
 539             if(*ctx->ptr == '\n')
 540                 ctx->ptr++;
 541             continue;
 542         }
 543         if(ret != tNL || ctx->last_token != tNL)
 544             break;
 545
 546         ctx->last_nl = ctx->ptr-ctx->code;
 547     }
 548
 549     return (ctx->last_token = ret);
 550 }