dlls/vbscript/lex.c

   1 /*
   2  * Copyright 2011 Jacek Caban for CodeWeavers
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  17  */
  18
  19 #include "config.h"
  20 #include "wine/port.h"
  21
  22 #include <assert.h>
  23 #include <limits.h>
  24
  25 #include "vbscript.h"
  26 #include "parse.h"
  27 #include "parser.tab.h"
  28
  29 #include "wine/debug.h"
  30
  31 WINE_DEFAULT_DEBUG_CHANNEL(vbscript);
  32
  33 static const WCHAR andW[] = {'a','n','d',0};
  34 static const WCHAR byrefW[] = {'b','y','r','e','f',0};
  35 static const WCHAR byvalW[] = {'b','y','v','a','l',0};
  36 static const WCHAR callW[] = {'c','a','l','l',0};
  37 static const WCHAR caseW[] = {'c','a','s','e',0};
  38 static const WCHAR classW[] = {'c','l','a','s','s',0};
  39 static const WCHAR constW[] = {'c','o','n','s','t',0};
  40 static const WCHAR defaultW[] = {'d','e','f','a','u','l','t',0};
  41 static const WCHAR dimW[] = {'d','i','m',0};
  42 static const WCHAR doW[] = {'d','o',0};
  43 static const WCHAR eachW[] = {'e','a','c','h',0};
  44 static const WCHAR elseW[] = {'e','l','s','e',0};
  45 static const WCHAR elseifW[] = {'e','l','s','e','i','f',0};
  46 static const WCHAR emptyW[] = {'e','m','p','t','y',0};
  47 static const WCHAR endW[] = {'e','n','d',0};
  48 static const WCHAR eqvW[] = {'e','q','v',0};
  49 static const WCHAR errorW[] = {'e','r','r','o','r',0};
  50 static const WCHAR exitW[] = {'e','x','i','t',0};
  51 static const WCHAR explicitW[] = {'e','x','p','l','i','c','i','t',0};
  52 static const WCHAR falseW[] = {'f','a','l','s','e',0};
  53 static const WCHAR forW[] = {'f','o','r',0};
  54 static const WCHAR functionW[] = {'f','u','n','c','t','i','o','n',0};
  55 static const WCHAR getW[] = {'g','e','t',0};
  56 static const WCHAR gotoW[] = {'g','o','t','o',0};
  57 static const WCHAR ifW[] = {'i','f',0};
  58 static const WCHAR impW[] = {'i','m','p',0};
  59 static const WCHAR inW[] = {'i','n',0};
  60 static const WCHAR isW[] = {'i','s',0};
  61 static const WCHAR letW[] = {'l','e','t',0};
  62 static const WCHAR loopW[] = {'l','o','o','p',0};
  63 static const WCHAR meW[] = {'m','e',0};
  64 static const WCHAR modW[] = {'m','o','d',0};
  65 static const WCHAR newW[] = {'n','e','w',0};
  66 static const WCHAR nextW[] = {'n','e','x','t',0};
  67 static const WCHAR notW[] = {'n','o','t',0};
  68 static const WCHAR nothingW[] = {'n','o','t','h','i','n','g',0};
  69 static const WCHAR nullW[] = {'n','u','l','l',0};
  70 static const WCHAR onW[] = {'o','n',0};
  71 static const WCHAR optionW[] = {'o','p','t','i','o','n',0};
  72 static const WCHAR orW[] = {'o','r',0};
  73 static const WCHAR privateW[] = {'p','r','i','v','a','t','e',0};
  74 static const WCHAR propertyW[] = {'p','r','o','p','e','r','t','y',0};
  75 static const WCHAR publicW[] = {'p','u','b','l','i','c',0};
  76 static const WCHAR remW[] = {'r','e','m',0};
  77 static const WCHAR resumeW[] = {'r','e','s','u','m','e',0};
  78 static const WCHAR selectW[] = {'s','e','l','e','c','t',0};
  79 static const WCHAR setW[] = {'s','e','t',0};
  80 static const WCHAR stepW[] = {'s','t','e','p',0};
  81 static const WCHAR stopW[] = {'s','t','o','p',0};
  82 static const WCHAR subW[] = {'s','u','b',0};
  83 static const WCHAR thenW[] = {'t','h','e','n',0};
  84 static const WCHAR toW[] = {'t','o',0};
  85 static const WCHAR trueW[] = {'t','r','u','e',0};
  86 static const WCHAR untilW[] = {'u','n','t','i','l',0};
  87 static const WCHAR wendW[] = {'w','e','n','d',0};
  88 static const WCHAR whileW[] = {'w','h','i','l','e',0};
  89 static const WCHAR xorW[] = {'x','o','r',0};
  90
  91 static const struct {
  92     const WCHAR *word;
  93     int token;
  94 } keywords[] = {
  95     {andW,       tAND},
  96     {byrefW,     tBYREF},
  97     {byvalW,     tBYVAL},
  98     {callW,      tCALL},
  99     {caseW,      tCASE},
 100     {classW,     tCLASS},
 101     {constW,     tCONST},
 102     {defaultW,   tDEFAULT},
 103     {dimW,       tDIM},
 104     {doW,        tDO},
 105     {eachW,      tEACH},
 106     {elseW,      tELSE},
 107     {elseifW,    tELSEIF},
 108     {emptyW,     tEMPTY},
 109     {endW,       tEND},
 110     {eqvW,       tEQV},
 111     {errorW,     tERROR},
 112     {exitW,      tEXIT},
 113     {explicitW,  tEXPLICIT},
 114     {falseW,     tFALSE},
 115     {forW,       tFOR},
 116     {functionW,  tFUNCTION},
 117     {getW,       tGET},
 118     {gotoW,      tGOTO},
 119     {ifW,        tIF},
 120     {impW,       tIMP},
 121     {inW,        tIN},
 122     {isW,        tIS},
 123     {letW,       tLET},
 124     {loopW,      tLOOP},
 125     {meW,        tME},
 126     {modW,       tMOD},
 127     {newW,       tNEW},
 128     {nextW,      tNEXT},
 129     {notW,       tNOT},
 130     {nothingW,   tNOTHING},
 131     {nullW,      tNULL},
 132     {onW,        tON},
 133     {optionW,    tOPTION},
 134     {orW,        tOR},
 135     {privateW,   tPRIVATE},
 136     {propertyW,  tPROPERTY},
 137     {publicW,    tPUBLIC},
 138     {remW,       tREM},
 139     {resumeW,    tRESUME},
 140     {selectW,    tSELECT},
 141     {setW,       tSET},
 142     {stepW,      tSTEP},
 143     {stopW,      tSTOP},
 144     {subW,       tSUB},
 145     {thenW,      tTHEN},
 146     {toW,        tTO},
 147     {trueW,      tTRUE},
 148     {untilW,     tUNTIL},
 149     {wendW,      tWEND},
 150     {whileW,     tWHILE},
 151     {xorW,       tXOR}
 152 };
 153
 154 static inline BOOL is_identifier_char(WCHAR c)
 155 {
 156     return isalnumW(c) || c == '_';
 157 }
 158
 159 static int check_keyword(parser_ctx_t *ctx, const WCHAR *word)
 160 {
 161     const WCHAR *p1 = ctx->ptr;
 162     const WCHAR *p2 = word;
 163     WCHAR c;
 164
 165     while(p1 < ctx->end && *p2) {
 166         c = tolowerW(*p1);
 167         if(c != *p2)
 168             return c - *p2;
 169         p1++;
 170         p2++;
 171     }
 172
 173     if(*p2 || (p1 < ctx->end && is_identifier_char(*p1)))
 174         return 1;
 175
 176     ctx->ptr = p1;
 177     return 0;
 178 }
 179
 180 static int check_keywords(parser_ctx_t *ctx)
 181 {
 182     int min = 0, max = sizeof(keywords)/sizeof(keywords[0])-1, r, i;
 183
 184     while(min <= max) {
 185         i = (min+max)/2;
 186
 187         r = check_keyword(ctx, keywords[i].word);
 188         if(!r)
 189             return keywords[i].token;
 190
 191         if(r > 0)
 192             min = i+1;
 193         else
 194             max = i-1;
 195     }
 196
 197     return 0;
 198 }
 199
 200 static int parse_identifier(parser_ctx_t *ctx, const WCHAR **ret)
 201 {
 202     const WCHAR *ptr = ctx->ptr++;
 203     WCHAR *str;
 204     int len;
 205
 206     while(ctx->ptr < ctx->end && is_identifier_char(*ctx->ptr))
 207         ctx->ptr++;
 208     len = ctx->ptr-ptr;
 209
 210     str = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
 211     if(!str)
 212         return 0;
 213
 214     memcpy(str, ptr, (len+1)*sizeof(WCHAR));
 215     str[len] = 0;
 216     *ret = str;
 217     return tIdentifier;
 218 }
 219
 220 static int parse_string_literal(parser_ctx_t *ctx, const WCHAR **ret)
 221 {
 222     const WCHAR *ptr = ++ctx->ptr;
 223     WCHAR *rptr;
 224     int len = 0;
 225
 226     while(ctx->ptr < ctx->end) {
 227         if(*ctx->ptr == '\n') {
 228             FIXME("newline inside string literal\n");
 229             return 0;
 230         }
 231
 232        if(*ctx->ptr == '"') {
 233             if(ctx->ptr[1] != '"')
 234                 break;
 235             len--;
 236             ctx->ptr++;
 237         }
 238         ctx->ptr++;
 239     }
 240
 241     if(ctx->ptr == ctx->end) {
 242         FIXME("unterminated string literal\n");
 243         return 0;
 244     }
 245
 246     len += ctx->ptr-ptr;
 247
 248     *ret = rptr = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
 249     if(!rptr)
 250         return 0;
 251
 252     while(ptr < ctx->ptr) {
 253         if(*ptr == '"')
 254             ptr++;
 255         *rptr++ = *ptr++;
 256     }
 257
 258     *rptr = 0;
 259     ctx->ptr++;
 260     return tString;
 261 }
 262
 263 static int parse_numeric_literal(parser_ctx_t *ctx, void **ret)
 264 {
 265     BOOL use_int = TRUE;
 266     LONGLONG d = 0, hlp;
 267     int exp = 0;
 268     double r;
 269
 270     if(*ctx->ptr == '0' && !('0' <= ctx->ptr[1] && ctx->ptr[1] <= '9') && ctx->ptr[1] != '.')
 271         return *ctx->ptr++;
 272
 273     while(ctx->ptr < ctx->end && isdigitW(*ctx->ptr)) {
 274         hlp = d*10 + *(ctx->ptr++) - '0';
 275         if(d>MAXLONGLONG/10 || hlp<0) {
 276             exp++;
 277             break;
 278         }
 279         else
 280             d = hlp;
 281     }
 282     while(ctx->ptr < ctx->end && isdigitW(*ctx->ptr)) {
 283         exp++;
 284         ctx->ptr++;
 285     }
 286
 287     if(*ctx->ptr == '.') {
 288         use_int = FALSE;
 289         ctx->ptr++;
 290
 291         while(ctx->ptr < ctx->end && isdigitW(*ctx->ptr)) {
 292             hlp = d*10 + *(ctx->ptr++) - '0';
 293             if(d>MAXLONGLONG/10 || hlp<0)
 294                 break;
 295
 296             d = hlp;
 297             exp--;
 298         }
 299         while(ctx->ptr < ctx->end && isdigitW(*ctx->ptr))
 300             ctx->ptr++;
 301     }
 302
 303     if(*ctx->ptr == 'e' || *ctx->ptr == 'E') {
 304         int e = 0, sign = 1;
 305
 306         if(*++ctx->ptr == '-') {
 307             ctx->ptr++;
 308             sign = -1;
 309         }
 310
 311         if(!isdigitW(*ctx->ptr)) {
 312             FIXME("Invalid numeric literal\n");
 313             return 0;
 314         }
 315
 316         use_int = FALSE;
 317
 318         do {
 319             e = e*10 + *(ctx->ptr++) - '0';
 320             if(sign == -1 && -e+exp < -(INT_MAX/100)) {
 321                 /* The literal will be rounded to 0 anyway. */
 322                 while(isdigitW(*ctx->ptr))
 323                     ctx->ptr++;
 324                 *(double*)ret = 0;
 325                 return tDouble;
 326             }
 327
 328             if(sign*e + exp > INT_MAX/100) {
 329                 FIXME("Invalid numeric literal\n");
 330                 return 0;
 331             }
 332         } while(isdigitW(*ctx->ptr));
 333
 334         exp += sign*e;
 335     }
 336
 337     if(use_int && (LONG)d == d) {
 338         LONG l = d;
 339         *(LONG*)ret = l;
 340         return (short)l == l ? tShort : tLong;
 341     }
 342
 343     r = exp>=0 ? d*pow(10, exp) : d/pow(10, -exp);
 344     if(isinf(r)) {
 345         FIXME("Invalid numeric literal\n");
 346         return 0;
 347     }
 348
 349     *(double*)ret = r;
 350     return tDouble;
 351 }
 352
 353 static int hex_to_int(WCHAR c)
 354 {
 355     if('0' <= c && c <= '9')
 356         return c-'0';
 357     if('a' <= c && c <= 'f')
 358         return c+10-'a';
 359     if('A' <= c && c <= 'F')
 360         return c+10-'A';
 361     return -1;
 362 }
 363
 364 static int parse_hex_literal(parser_ctx_t *ctx, LONG *ret)
 365 {
 366     const WCHAR *begin = ctx->ptr;
 367     LONG l = 0, d;
 368
 369     while((d = hex_to_int(*++ctx->ptr)) != -1)
 370         l = l*16 + d;
 371
 372     if(begin + 9 /* max digits+1 */ < ctx->ptr || (*ctx->ptr != '&' && is_identifier_char(*ctx->ptr))) {
 373         FIXME("invalid literal\n");
 374         return 0;
 375     }
 376
 377     if(*ctx->ptr == '&')
 378         ctx->ptr++;
 379
 380     *ret = l;
 381     return (short)l == l ? tShort : tLong;
 382 }
 383
 384 static void skip_spaces(parser_ctx_t *ctx)
 385 {
 386     while(*ctx->ptr == ' ' || *ctx->ptr == '\t' || *ctx->ptr == '\r')
 387         ctx->ptr++;
 388 }
 389
 390 static int comment_line(parser_ctx_t *ctx)
 391 {
 392     ctx->ptr = strchrW(ctx->ptr, '\n');
 393     if(ctx->ptr)
 394         ctx->ptr++;
 395     else
 396         ctx->ptr = ctx->end;
 397     return tNL;
 398 }
 399
 400 static int parse_next_token(void *lval, parser_ctx_t *ctx)
 401 {
 402     WCHAR c;
 403
 404     skip_spaces(ctx);
 405     if(ctx->ptr == ctx->end)
 406         return ctx->last_token == tNL ? tEOF : tNL;
 407
 408     c = *ctx->ptr;
 409
 410     if('0' <= c && c <= '9')
 411         return parse_numeric_literal(ctx, lval);
 412
 413     if(isalphaW(c)) {
 414         int ret = check_keywords(ctx);
 415         if(!ret)
 416             return parse_identifier(ctx, lval);
 417         if(ret != tREM)
 418             return ret;
 419         c = '\'';
 420     }
 421
 422     switch(c) {
 423     case '\n':
 424         ctx->ptr++;
 425         return tNL;
 426     case '\'':
 427         return comment_line(ctx);
 428     case ':':
 429     case ')':
 430     case ',':
 431     case '=':
 432     case '+':
 433     case '*':
 434     case '/':
 435     case '^':
 436     case '\\':
 437     case '.':
 438     case '_':
 439         return *ctx->ptr++;
 440     case '-':
 441         if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '>')
 442             return comment_line(ctx);
 443         ctx->ptr++;
 444         return '-';
 445     case '(':
 446         /* NOTE:
 447          * We resolve empty brackets in lexer instead of parser to avoid complex conflicts
 448          * in call statement special case |f()| without 'call' keyword
 449          */
 450         ctx->ptr++;
 451         skip_spaces(ctx);
 452         if(*ctx->ptr == ')') {
 453             ctx->ptr++;
 454             return tEMPTYBRACKETS;
 455         }
 456         return '(';
 457     case '"':
 458         return parse_string_literal(ctx, lval);
 459     case '&':
 460         if(*++ctx->ptr == 'h' || *ctx->ptr == 'H')
 461             return parse_hex_literal(ctx, lval);
 462         return '&';
 463     case '<':
 464         switch(*++ctx->ptr) {
 465         case '>':
 466             ctx->ptr++;
 467             return tNEQ;
 468         case '=':
 469             ctx->ptr++;
 470             return tLTEQ;
 471         case '!':
 472             if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '-')
 473                 return comment_line(ctx);
 474         }
 475         return '<';
 476     case '>':
 477         if(*++ctx->ptr == '=') {
 478             ctx->ptr++;
 479             return tGTEQ;
 480         }
 481         return '>';
 482     default:
 483         FIXME("Unhandled char %c in %s\n", *ctx->ptr, debugstr_w(ctx->ptr));
 484     }
 485
 486     return 0;
 487 }
 488
 489 int parser_lex(void *lval, parser_ctx_t *ctx)
 490 {
 491     int ret;
 492
 493     while(1) {
 494         ret = parse_next_token(lval, ctx);
 495         if(ret == '_') {
 496             skip_spaces(ctx);
 497             if(*ctx->ptr != '\n') {
 498                 FIXME("'_' not followed by newline\n");
 499                 return 0;
 500             }
 501             ctx->ptr++;
 502             continue;
 503         }
 504         if(ret != tNL || ctx->last_token != tNL)
 505             break;
 506
 507         ctx->last_nl = ctx->ptr-ctx->code;
 508     }
 509
 510     return (ctx->last_token = ret);
 511 }