dlls/jscript/regexp.c

   1 /*
   2  * Copyright 2008 Jacek Caban for CodeWeavers
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  17  */
  18
  19 /*
  20  * Code in this file is based on files:
  21  * js/src/jsregexp.h
  22  * js/src/jsregexp.c
  23  * from Mozilla project, released under LGPL 2.1 or later.
  24  *
  25  * The Original Code is Mozilla Communicator client code, released
  26  * March 31, 1998.
  27  *
  28  * The Initial Developer of the Original Code is
  29  * Netscape Communications Corporation.
  30  * Portions created by the Initial Developer are Copyright (C) 1998
  31  * the Initial Developer. All Rights Reserved.
  32  */
  33
  34 #include <assert.h>
  35
  36 #include "jscript.h"
  37 #include "regexp.h"
  38
  39 #include "wine/debug.h"
  40
  41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
  42
  43 /* FIXME: Better error handling */
  44 #define ReportRegExpError(a,b,c) throw_error((a)->context, E_FAIL, L"")
  45 #define ReportRegExpErrorHelper(a,b,c,d) throw_error((a)->context, E_FAIL, L"")
  46 #define JS_ReportErrorNumber(a,b,c,d) throw_error((a), E_FAIL, L"")
  47 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f) throw_error((a), E_FAIL, L"")
  48 #define JS_COUNT_OPERATION(a,b) throw_error((a), E_FAIL, L"")
  49
  50
  51 typedef BYTE JSPackedBool;
  52
  53 /*
  54  * This struct holds a bitmap representation of a class from a regexp.
  55  * There's a list of these referenced by the classList field in the regexp_t
  56  * struct below. The initial state has startIndex set to the offset in the
  57  * original regexp source of the beginning of the class contents. The first
  58  * use of the class converts the source representation into a bitmap.
  59  *
  60  */
  61 typedef struct RECharSet {
  62     JSPackedBool    converted;
  63     JSPackedBool    sense;
  64     WORD            length;
  65     union {
  66         BYTE        *bits;
  67         struct {
  68             size_t  startIndex;
  69             size_t  length;
  70         } src;
  71     } u;
  72 } RECharSet;
  73
  74 #define JSMSG_MIN_TOO_BIG 47
  75 #define JSMSG_MAX_TOO_BIG 48
  76 #define JSMSG_OUT_OF_ORDER 49
  77 #define JSMSG_OUT_OF_MEMORY 137
  78
  79 #define LINE_SEPARATOR  0x2028
  80 #define PARA_SEPARATOR  0x2029
  81
  82 #define RE_IS_LETTER(c)     (((c >= 'A') && (c <= 'Z')) ||                    \
  83                              ((c >= 'a') && (c <= 'z')) )
  84 #define RE_IS_LINE_TERM(c)  ((c == '\n') || (c == '\r') ||                    \
  85                              (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
  86
  87 #define JS_ISWORD(c)    ((c) < 128 && (isalnum(c) || (c) == '_'))
  88
  89 #define JS7_ISDEC(c)    ((((unsigned)(c)) - '0') <= 9)
  90 #define JS7_UNDEC(c)    ((c) - '0')
  91
  92 typedef enum REOp {
  93     REOP_EMPTY,
  94     REOP_BOL,
  95     REOP_EOL,
  96     REOP_WBDRY,
  97     REOP_WNONBDRY,
  98     REOP_DOT,
  99     REOP_DIGIT,
 100     REOP_NONDIGIT,
 101     REOP_ALNUM,
 102     REOP_NONALNUM,
 103     REOP_SPACE,
 104     REOP_NONSPACE,
 105     REOP_BACKREF,
 106     REOP_FLAT,
 107     REOP_FLAT1,
 108     REOP_FLATi,
 109     REOP_FLAT1i,
 110     REOP_UCFLAT1,
 111     REOP_UCFLAT1i,
 112     REOP_UCFLAT,
 113     REOP_UCFLATi,
 114     REOP_CLASS,
 115     REOP_NCLASS,
 116     REOP_ALT,
 117     REOP_QUANT,
 118     REOP_STAR,
 119     REOP_PLUS,
 120     REOP_OPT,
 121     REOP_LPAREN,
 122     REOP_RPAREN,
 123     REOP_JUMP,
 124     REOP_DOTSTAR,
 125     REOP_LPARENNON,
 126     REOP_ASSERT,
 127     REOP_ASSERT_NOT,
 128     REOP_ASSERTTEST,
 129     REOP_ASSERTNOTTEST,
 130     REOP_MINIMALSTAR,
 131     REOP_MINIMALPLUS,
 132     REOP_MINIMALOPT,
 133     REOP_MINIMALQUANT,
 134     REOP_ENDCHILD,
 135     REOP_REPEAT,
 136     REOP_MINIMALREPEAT,
 137     REOP_ALTPREREQ,
 138     REOP_ALTPREREQ2,
 139     REOP_ENDALT,
 140     REOP_CONCAT,
 141     REOP_END,
 142     REOP_LIMIT /* META: no operator >= to this */
 143 } REOp;
 144
 145 #define REOP_IS_SIMPLE(op)  ((op) <= REOP_NCLASS)
 146
 147 static const char *reop_names[] = {
 148     "empty",
 149     "bol",
 150     "eol",
 151     "wbdry",
 152     "wnonbdry",
 153     "dot",
 154     "digit",
 155     "nondigit",
 156     "alnum",
 157     "nonalnum",
 158     "space",
 159     "nonspace",
 160     "backref",
 161     "flat",
 162     "flat1",
 163     "flati",
 164     "flat1i",
 165     "ucflat1",
 166     "ucflat1i",
 167     "ucflat",
 168     "ucflati",
 169     "class",
 170     "nclass",
 171     "alt",
 172     "quant",
 173     "star",
 174     "plus",
 175     "opt",
 176     "lparen",
 177     "rparen",
 178     "jump",
 179     "dotstar",
 180     "lparennon",
 181     "assert",
 182     "assert_not",
 183     "asserttest",
 184     "assertnottest",
 185     "minimalstar",
 186     "minimalplus",
 187     "minimalopt",
 188     "minimalquant",
 189     "endchild",
 190     "repeat",
 191     "minimalrepeat",
 192     "altprereq",
 193     "alrprereq2",
 194     "endalt",
 195     "concat",
 196     "end",
 197     NULL
 198 };
 199
 200 typedef struct REProgState {
 201     jsbytecode *continue_pc;        /* current continuation data */
 202     jsbytecode continue_op;
 203     ptrdiff_t index;                /* progress in text */
 204     size_t parenSoFar;              /* highest indexed paren started */
 205     union {
 206         struct {
 207             UINT min;               /* current quantifier limits */
 208             UINT max;
 209         } quantifier;
 210         struct {
 211             size_t top;             /* backtrack stack state */
 212             size_t sz;
 213         } assertion;
 214     } u;
 215 } REProgState;
 216
 217 typedef struct REBackTrackData {
 218     size_t sz;                      /* size of previous stack entry */
 219     jsbytecode *backtrack_pc;       /* where to backtrack to */
 220     jsbytecode backtrack_op;
 221     const WCHAR *cp;                /* index in text of match at backtrack */
 222     size_t parenIndex;              /* start index of saved paren contents */
 223     size_t parenCount;              /* # of saved paren contents */
 224     size_t saveStateStackTop;       /* number of parent states */
 225     /* saved parent states follow */
 226     /* saved paren contents follow */
 227 } REBackTrackData;
 228
 229 #define INITIAL_STATESTACK  100
 230 #define INITIAL_BACKTRACK   8000
 231
 232 typedef struct REGlobalData {
 233     void *cx;
 234     regexp_t *regexp;               /* the RE in execution */
 235     BOOL ok;                        /* runtime error (out_of_memory only?) */
 236     size_t start;                   /* offset to start at */
 237     ptrdiff_t skipped;              /* chars skipped anchoring this r.e. */
 238     const WCHAR    *cpbegin;        /* text base address */
 239     const WCHAR    *cpend;          /* text limit address */
 240
 241     REProgState *stateStack;        /* stack of state of current parents */
 242     size_t stateStackTop;
 243     size_t stateStackLimit;
 244
 245     REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
 246     REBackTrackData *backTrackSP;
 247     size_t backTrackStackSize;
 248     size_t cursz;                   /* size of current stack entry */
 249     size_t backTrackCount;          /* how many times we've backtracked */
 250     size_t backTrackLimit;          /* upper limit on backtrack states */
 251
 252     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 253                                        than to malloc/free the three items
 254                                        that are allocated from this pool */
 255 } REGlobalData;
 256
 257 typedef struct RENode RENode;
 258 struct RENode {
 259     REOp            op;         /* r.e. op bytecode */
 260     RENode          *next;      /* next in concatenation order */
 261     void            *kid;       /* first operand */
 262     union {
 263         void        *kid2;      /* second operand */
 264         INT         num;        /* could be a number */
 265         size_t      parenIndex; /* or a parenthesis index */
 266         struct {                /* or a quantifier range */
 267             UINT  min;
 268             UINT  max;
 269             JSPackedBool greedy;
 270         } range;
 271         struct {                /* or a character class */
 272             size_t  startIndex;
 273             size_t  kidlen;     /* length of string at kid, in jschars */
 274             size_t  index;      /* index into class list */
 275             WORD  bmsize;       /* bitmap size, based on max char code */
 276             JSPackedBool sense;
 277         } ucclass;
 278         struct {                /* or a literal sequence */
 279             WCHAR   chr;        /* of one character */
 280             size_t  length;     /* or many (via the kid) */
 281         } flat;
 282         struct {
 283             RENode  *kid2;      /* second operand from ALT */
 284             WCHAR   ch1;        /* match char for ALTPREREQ */
 285             WCHAR   ch2;        /* ditto, or class index for ALTPREREQ2 */
 286         } altprereq;
 287     } u;
 288 };
 289
 290 #define CLASS_CACHE_SIZE    4
 291
 292 typedef struct CompilerState {
 293     script_ctx_t    *context;
 294     const WCHAR     *cpbegin;
 295     const WCHAR     *cpend;
 296     const WCHAR     *cp;
 297     size_t          parenCount;
 298     size_t          classCount;   /* number of [] encountered */
 299     size_t          treeDepth;    /* maximum depth of parse tree */
 300     size_t          progLength;   /* estimated bytecode length */
 301     RENode          *result;
 302     size_t          classBitmapsMem; /* memory to hold all class bitmaps */
 303     struct {
 304         const WCHAR *start;         /* small cache of class strings */
 305         size_t length;              /* since they're often the same */
 306         size_t index;
 307     } classCache[CLASS_CACHE_SIZE];
 308     WORD          flags;
 309
 310     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 311                                        than to malloc/free */
 312 } CompilerState;
 313
 314 typedef struct EmitStateStackEntry {
 315     jsbytecode      *altHead;       /* start of REOP_ALT* opcode */
 316     jsbytecode      *nextAltFixup;  /* fixup pointer to next-alt offset */
 317     jsbytecode      *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
 318     jsbytecode      *endTermFixup;  /* fixup ptr. to REOPT_ALTPREREQ* offset */
 319     RENode          *continueNode;  /* original REOP_ALT* node being stacked */
 320     jsbytecode      continueOp;     /* REOP_JUMP or REOP_ENDALT continuation */
 321     JSPackedBool    jumpToJumpFlag; /* true if we've patched jump-to-jump to
 322                                        avoid 16-bit unsigned offset overflow */
 323 } EmitStateStackEntry;
 324
 325 /*
 326  * Immediate operand sizes and getter/setters.  Unlike the ones in jsopcode.h,
 327  * the getters and setters take the pc of the offset, not of the opcode before
 328  * the offset.
 329  */
 330 #define ARG_LEN             2
 331 #define GET_ARG(pc)         ((WORD)(((pc)[0] << 8) | (pc)[1]))
 332 #define SET_ARG(pc, arg)    ((pc)[0] = (jsbytecode) ((arg) >> 8),       \
 333                              (pc)[1] = (jsbytecode) (arg))
 334
 335 #define OFFSET_LEN          ARG_LEN
 336 #define OFFSET_MAX          ((1 << (ARG_LEN * 8)) - 1)
 337 #define GET_OFFSET(pc)      GET_ARG(pc)
 338
 339 static BOOL ParseRegExp(CompilerState*);
 340
 341 /*
 342  * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
 343  * For sanity, we limit it to 2^24 bytes.
 344  */
 345 #define TREE_DEPTH_MAX  ((1 << 24) / sizeof(EmitStateStackEntry))
 346
 347 /*
 348  * The maximum memory that can be allocated for class bitmaps.
 349  * For sanity, we limit it to 2^24 bytes.
 350  */
 351 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
 352
 353 /*
 354  * Functions to get size and write/read bytecode that represent small indexes
 355  * compactly.
 356  * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
 357  * indicates that the following byte brings more bits to the index. Otherwise
 358  * this is the last byte in the index bytecode representing highest index bits.
 359  */
 360 static size_t
 361 GetCompactIndexWidth(size_t index)
 362 {
 363     size_t width;
 364
 365     for (width = 1; (index >>= 7) != 0; ++width) { }
 366     return width;
 367 }
 368
 369 static inline jsbytecode *
 370 WriteCompactIndex(jsbytecode *pc, size_t index)
 371 {
 372     size_t next;
 373
 374     while ((next = index >> 7) != 0) {
 375         *pc++ = (jsbytecode)(index | 0x80);
 376         index = next;
 377     }
 378     *pc++ = (jsbytecode)index;
 379     return pc;
 380 }
 381
 382 static inline jsbytecode *
 383 ReadCompactIndex(jsbytecode *pc, size_t *result)
 384 {
 385     size_t nextByte;
 386
 387     nextByte = *pc++;
 388     if ((nextByte & 0x80) == 0) {
 389         /*
 390          * Short-circuit the most common case when compact index <= 127.
 391          */
 392         *result = nextByte;
 393     } else {
 394         size_t shift = 7;
 395         *result = 0x7F & nextByte;
 396         do {
 397             nextByte = *pc++;
 398             *result |= (nextByte & 0x7F) << shift;
 399             shift += 7;
 400         } while ((nextByte & 0x80) != 0);
 401     }
 402     return pc;
 403 }
 404
 405 /* Construct and initialize an RENode, returning NULL for out-of-memory */
 406 static RENode *
 407 NewRENode(CompilerState *state, REOp op)
 408 {
 409     RENode *ren;
 410
 411     ren = heap_pool_alloc(state->pool, sizeof(*ren));
 412     if (!ren) {
 413         throw_error(state->context, E_OUTOFMEMORY, L"");
 414         return NULL;
 415     }
 416     ren->op = op;
 417     ren->next = NULL;
 418     ren->kid = NULL;
 419     return ren;
 420 }
 421
 422 /*
 423  * Validates and converts hex ascii value.
 424  */
 425 static BOOL
 426 isASCIIHexDigit(WCHAR c, UINT *digit)
 427 {
 428     UINT cv = c;
 429
 430     if (cv < '0')
 431         return FALSE;
 432     if (cv <= '9') {
 433         *digit = cv - '0';
 434         return TRUE;
 435     }
 436     cv |= 0x20;
 437     if (cv >= 'a' && cv <= 'f') {
 438         *digit = cv - 'a' + 10;
 439         return TRUE;
 440     }
 441     return FALSE;
 442 }
 443
 444 typedef struct {
 445     REOp op;
 446     const WCHAR *errPos;
 447     size_t parenIndex;
 448 } REOpData;
 449
 450 #define JUMP_OFFSET_HI(off)     ((jsbytecode)((off) >> 8))
 451 #define JUMP_OFFSET_LO(off)     ((jsbytecode)(off))
 452
 453 static BOOL
 454 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
 455 {
 456     ptrdiff_t offset = target - jump;
 457
 458     /* Check that target really points forward. */
 459     assert(offset >= 2);
 460     if ((size_t)offset > OFFSET_MAX)
 461         return FALSE;
 462
 463     jump[0] = JUMP_OFFSET_HI(offset);
 464     jump[1] = JUMP_OFFSET_LO(offset);
 465     return TRUE;
 466 }
 467
 468 /*
 469  * Generate bytecode for the tree rooted at t using an explicit stack instead
 470  * of recursion.
 471  */
 472 static jsbytecode *
 473 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
 474                jsbytecode *pc, RENode *t)
 475 {
 476     EmitStateStackEntry *emitStateSP, *emitStateStack;
 477     RECharSet *charSet;
 478     REOp op;
 479
 480     if (treeDepth == 0) {
 481         emitStateStack = NULL;
 482     } else {
 483         emitStateStack = malloc(sizeof(EmitStateStackEntry) * treeDepth);
 484         if (!emitStateStack)
 485             return NULL;
 486     }
 487     emitStateSP = emitStateStack;
 488     op = t->op;
 489     assert(op < REOP_LIMIT);
 490
 491     for (;;) {
 492         *pc++ = op;
 493         switch (op) {
 494           case REOP_EMPTY:
 495             --pc;
 496             break;
 497
 498           case REOP_ALTPREREQ2:
 499           case REOP_ALTPREREQ:
 500             assert(emitStateSP);
 501             emitStateSP->altHead = pc - 1;
 502             emitStateSP->endTermFixup = pc;
 503             pc += OFFSET_LEN;
 504             SET_ARG(pc, t->u.altprereq.ch1);
 505             pc += ARG_LEN;
 506             SET_ARG(pc, t->u.altprereq.ch2);
 507             pc += ARG_LEN;
 508
 509             emitStateSP->nextAltFixup = pc;    /* offset to next alternate */
 510             pc += OFFSET_LEN;
 511
 512             emitStateSP->continueNode = t;
 513             emitStateSP->continueOp = REOP_JUMP;
 514             emitStateSP->jumpToJumpFlag = FALSE;
 515             ++emitStateSP;
 516             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 517             t = t->kid;
 518             op = t->op;
 519             assert(op < REOP_LIMIT);
 520             continue;
 521
 522           case REOP_JUMP:
 523             emitStateSP->nextTermFixup = pc;    /* offset to following term */
 524             pc += OFFSET_LEN;
 525             if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
 526                 goto jump_too_big;
 527             emitStateSP->continueOp = REOP_ENDALT;
 528             ++emitStateSP;
 529             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 530             t = t->u.kid2;
 531             op = t->op;
 532             assert(op < REOP_LIMIT);
 533             continue;
 534
 535           case REOP_ENDALT:
 536             /*
 537              * If we already patched emitStateSP->nextTermFixup to jump to
 538              * a nearer jump, to avoid 16-bit immediate offset overflow, we
 539              * are done here.
 540              */
 541             if (emitStateSP->jumpToJumpFlag)
 542                 break;
 543
 544             /*
 545              * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
 546              * REOP_ENDALT is executed only on successful match of the last
 547              * alternate in a group.
 548              */
 549             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 550                 goto jump_too_big;
 551             if (t->op != REOP_ALT) {
 552                 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
 553                     goto jump_too_big;
 554             }
 555
 556             /*
 557              * If the program is bigger than the REOP_JUMP offset range, then
 558              * we must check for alternates before this one that are part of
 559              * the same group, and fix up their jump offsets to target jumps
 560              * close enough to fit in a 16-bit unsigned offset immediate.
 561              */
 562             if ((size_t)(pc - re->program) > OFFSET_MAX &&
 563                 emitStateSP > emitStateStack) {
 564                 EmitStateStackEntry *esp, *esp2;
 565                 jsbytecode *alt, *jump;
 566                 ptrdiff_t span, header;
 567
 568                 esp2 = emitStateSP;
 569                 alt = esp2->altHead;
 570                 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
 571                     if (esp->continueOp == REOP_ENDALT &&
 572                         !esp->jumpToJumpFlag &&
 573                         esp->nextTermFixup + OFFSET_LEN == alt &&
 574                         (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
 575                                        ? esp->endTermFixup
 576                                        : esp->nextTermFixup)) > OFFSET_MAX) {
 577                         alt = esp->altHead;
 578                         jump = esp->nextTermFixup;
 579
 580                         /*
 581                          * The span must be 1 less than the distance from
 582                          * jump offset to jump offset, so we actually jump
 583                          * to a REOP_JUMP bytecode, not to its offset!
 584                          */
 585                         for (;;) {
 586                             assert(jump < esp2->nextTermFixup);
 587                             span = esp2->nextTermFixup - jump - 1;
 588                             if ((size_t)span <= OFFSET_MAX)
 589                                 break;
 590                             do {
 591                                 if (--esp2 == esp)
 592                                     goto jump_too_big;
 593                             } while (esp2->continueOp != REOP_ENDALT);
 594                         }
 595
 596                         jump[0] = JUMP_OFFSET_HI(span);
 597                         jump[1] = JUMP_OFFSET_LO(span);
 598
 599                         if (esp->continueNode->op != REOP_ALT) {
 600                             /*
 601                              * We must patch the offset at esp->endTermFixup
 602                              * as well, for the REOP_ALTPREREQ{,2} opcodes.
 603                              * If we're unlucky and endTermFixup is more than
 604                              * OFFSET_MAX bytes from its target, we cheat by
 605                              * jumping 6 bytes to the jump whose offset is at
 606                              * esp->nextTermFixup, which has the same target.
 607                              */
 608                             jump = esp->endTermFixup;
 609                             header = esp->nextTermFixup - jump;
 610                             span += header;
 611                             if ((size_t)span > OFFSET_MAX)
 612                                 span = header;
 613
 614                             jump[0] = JUMP_OFFSET_HI(span);
 615                             jump[1] = JUMP_OFFSET_LO(span);
 616                         }
 617
 618                         esp->jumpToJumpFlag = TRUE;
 619                     }
 620                 }
 621             }
 622             break;
 623
 624           case REOP_ALT:
 625             assert(emitStateSP);
 626             emitStateSP->altHead = pc - 1;
 627             emitStateSP->nextAltFixup = pc;     /* offset to next alternate */
 628             pc += OFFSET_LEN;
 629             emitStateSP->continueNode = t;
 630             emitStateSP->continueOp = REOP_JUMP;
 631             emitStateSP->jumpToJumpFlag = FALSE;
 632             ++emitStateSP;
 633             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 634             t = t->kid;
 635             op = t->op;
 636             assert(op < REOP_LIMIT);
 637             continue;
 638
 639           case REOP_FLAT:
 640             /*
 641              * Coalesce FLATs if possible and if it would not increase bytecode
 642              * beyond preallocated limit. The latter happens only when bytecode
 643              * size for coalesced string with offset p and length 2 exceeds 6
 644              * bytes preallocated for 2 single char nodes, i.e. when
 645              * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
 646              * GetCompactIndexWidth(p) > 4.
 647              * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
 648              * nodes strictly decreases bytecode size, the check has to be
 649              * done only for the first coalescing.
 650              */
 651             if (t->kid &&
 652                 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
 653             {
 654                 while (t->next &&
 655                        t->next->op == REOP_FLAT &&
 656                        (WCHAR*)t->kid + t->u.flat.length ==
 657                        t->next->kid) {
 658                     t->u.flat.length += t->next->u.flat.length;
 659                     t->next = t->next->next;
 660                 }
 661             }
 662             if (t->kid && t->u.flat.length > 1) {
 663                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
 664                 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
 665                 pc = WriteCompactIndex(pc, t->u.flat.length);
 666             } else if (t->u.flat.chr < 256) {
 667                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
 668                 *pc++ = (jsbytecode) t->u.flat.chr;
 669             } else {
 670                 pc[-1] = (state->flags & REG_FOLD)
 671                          ? REOP_UCFLAT1i
 672                          : REOP_UCFLAT1;
 673                 SET_ARG(pc, t->u.flat.chr);
 674                 pc += ARG_LEN;
 675             }
 676             break;
 677
 678           case REOP_LPAREN:
 679             assert(emitStateSP);
 680             pc = WriteCompactIndex(pc, t->u.parenIndex);
 681             emitStateSP->continueNode = t;
 682             emitStateSP->continueOp = REOP_RPAREN;
 683             ++emitStateSP;
 684             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 685             t = t->kid;
 686             op = t->op;
 687             continue;
 688
 689           case REOP_RPAREN:
 690             pc = WriteCompactIndex(pc, t->u.parenIndex);
 691             break;
 692
 693           case REOP_BACKREF:
 694             pc = WriteCompactIndex(pc, t->u.parenIndex);
 695             break;
 696
 697           case REOP_ASSERT:
 698             assert(emitStateSP);
 699             emitStateSP->nextTermFixup = pc;
 700             pc += OFFSET_LEN;
 701             emitStateSP->continueNode = t;
 702             emitStateSP->continueOp = REOP_ASSERTTEST;
 703             ++emitStateSP;
 704             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 705             t = t->kid;
 706             op = t->op;
 707             continue;
 708
 709           case REOP_ASSERTTEST:
 710           case REOP_ASSERTNOTTEST:
 711             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 712                 goto jump_too_big;
 713             break;
 714
 715           case REOP_ASSERT_NOT:
 716             assert(emitStateSP);
 717             emitStateSP->nextTermFixup = pc;
 718             pc += OFFSET_LEN;
 719             emitStateSP->continueNode = t;
 720             emitStateSP->continueOp = REOP_ASSERTNOTTEST;
 721             ++emitStateSP;
 722             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 723             t = t->kid;
 724             op = t->op;
 725             continue;
 726
 727           case REOP_QUANT:
 728             assert(emitStateSP);
 729             if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
 730                 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
 731             } else if (t->u.range.min == 0 && t->u.range.max == 1) {
 732                 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
 733             } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
 734                 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
 735             } else {
 736                 if (!t->u.range.greedy)
 737                     pc[-1] = REOP_MINIMALQUANT;
 738                 pc = WriteCompactIndex(pc, t->u.range.min);
 739                 /*
 740                  * Write max + 1 to avoid using size_t(max) + 1 bytes
 741                  * for (UINT)-1 sentinel.
 742                  */
 743                 pc = WriteCompactIndex(pc, t->u.range.max + 1);
 744             }
 745             emitStateSP->nextTermFixup = pc;
 746             pc += OFFSET_LEN;
 747             emitStateSP->continueNode = t;
 748             emitStateSP->continueOp = REOP_ENDCHILD;
 749             ++emitStateSP;
 750             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 751             t = t->kid;
 752             op = t->op;
 753             continue;
 754
 755           case REOP_ENDCHILD:
 756             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 757                 goto jump_too_big;
 758             break;
 759
 760           case REOP_CLASS:
 761             if (!t->u.ucclass.sense)
 762                 pc[-1] = REOP_NCLASS;
 763             pc = WriteCompactIndex(pc, t->u.ucclass.index);
 764             charSet = &re->classList[t->u.ucclass.index];
 765             charSet->converted = FALSE;
 766             charSet->length = t->u.ucclass.bmsize;
 767             charSet->u.src.startIndex = t->u.ucclass.startIndex;
 768             charSet->u.src.length = t->u.ucclass.kidlen;
 769             charSet->sense = t->u.ucclass.sense;
 770             break;
 771
 772           default:
 773             break;
 774         }
 775
 776         t = t->next;
 777         if (t) {
 778             op = t->op;
 779         } else {
 780             if (emitStateSP == emitStateStack)
 781                 break;
 782             --emitStateSP;
 783             t = emitStateSP->continueNode;
 784             op = (REOp) emitStateSP->continueOp;
 785         }
 786     }
 787
 788   cleanup:
 789     free(emitStateStack);
 790     return pc;
 791
 792   jump_too_big:
 793     ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 794     pc = NULL;
 795     goto cleanup;
 796 }
 797
 798 /*
 799  * Process the op against the two top operands, reducing them to a single
 800  * operand in the penultimate slot. Update progLength and treeDepth.
 801  */
 802 static BOOL
 803 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
 804           INT operandSP)
 805 {
 806     RENode *result;
 807
 808     switch (opData->op) {
 809       case REOP_ALT:
 810         result = NewRENode(state, REOP_ALT);
 811         if (!result)
 812             return FALSE;
 813         result->kid = operandStack[operandSP - 2];
 814         result->u.kid2 = operandStack[operandSP - 1];
 815         operandStack[operandSP - 2] = result;
 816
 817         if (state->treeDepth == TREE_DEPTH_MAX) {
 818             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 819             return FALSE;
 820         }
 821         ++state->treeDepth;
 822
 823         /*
 824          * Look at both alternates to see if there's a FLAT or a CLASS at
 825          * the start of each. If so, use a prerequisite match.
 826          */
 827         if (((RENode *) result->kid)->op == REOP_FLAT &&
 828             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 829             (state->flags & REG_FOLD) == 0) {
 830             result->op = REOP_ALTPREREQ;
 831             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 832             result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
 833             /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
 834                                             JUMP, <end> ... ENDALT */
 835             state->progLength += 13;
 836         }
 837         else
 838         if (((RENode *) result->kid)->op == REOP_CLASS &&
 839             ((RENode *) result->kid)->u.ucclass.index < 256 &&
 840             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 841             (state->flags & REG_FOLD) == 0) {
 842             result->op = REOP_ALTPREREQ2;
 843             result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
 844             result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
 845             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 846                                             JUMP, <end> ... ENDALT */
 847             state->progLength += 13;
 848         }
 849         else
 850         if (((RENode *) result->kid)->op == REOP_FLAT &&
 851             ((RENode *) result->u.kid2)->op == REOP_CLASS &&
 852             ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
 853             (state->flags & REG_FOLD) == 0) {
 854             result->op = REOP_ALTPREREQ2;
 855             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 856             result->u.altprereq.ch2 =
 857                 ((RENode *) result->u.kid2)->u.ucclass.index;
 858             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 859                                           JUMP, <end> ... ENDALT */
 860             state->progLength += 13;
 861         }
 862         else {
 863             /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
 864             state->progLength += 7;
 865         }
 866         break;
 867
 868       case REOP_CONCAT:
 869         result = operandStack[operandSP - 2];
 870         while (result->next)
 871             result = result->next;
 872         result->next = operandStack[operandSP - 1];
 873         break;
 874
 875       case REOP_ASSERT:
 876       case REOP_ASSERT_NOT:
 877       case REOP_LPARENNON:
 878       case REOP_LPAREN:
 879         /* These should have been processed by a close paren. */
 880         ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
 881                                 opData->errPos);
 882         return FALSE;
 883
 884       default:;
 885     }
 886     return TRUE;
 887 }
 888
 889 /*
 890  * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
 891  * it being on the stack, and to propagate errors to its callers.
 892  */
 893 #define JSREG_FIND_PAREN_COUNT  0x8000
 894 #define JSREG_FIND_PAREN_ERROR  0x4000
 895
 896 /*
 897  * Magic return value from FindParenCount and GetDecimalValue, to indicate
 898  * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
 899  * its findMax parameter is non-null.
 900  */
 901 #define OVERFLOW_VALUE          ((UINT)-1)
 902
 903 static UINT
 904 FindParenCount(CompilerState *state)
 905 {
 906     CompilerState temp;
 907     int i;
 908
 909     if (state->flags & JSREG_FIND_PAREN_COUNT)
 910         return OVERFLOW_VALUE;
 911
 912     /*
 913      * Copy state into temp, flag it so we never report an invalid backref,
 914      * and reset its members to parse the entire regexp.  This is obviously
 915      * suboptimal, but GetDecimalValue calls us only if a backref appears to
 916      * refer to a forward parenthetical, which is rare.
 917      */
 918     temp = *state;
 919     temp.flags |= JSREG_FIND_PAREN_COUNT;
 920     temp.cp = temp.cpbegin;
 921     temp.parenCount = 0;
 922     temp.classCount = 0;
 923     temp.progLength = 0;
 924     temp.treeDepth = 0;
 925     temp.classBitmapsMem = 0;
 926     for (i = 0; i < CLASS_CACHE_SIZE; i++)
 927         temp.classCache[i].start = NULL;
 928
 929     if (!ParseRegExp(&temp)) {
 930         state->flags |= JSREG_FIND_PAREN_ERROR;
 931         return OVERFLOW_VALUE;
 932     }
 933     return temp.parenCount;
 934 }
 935
 936 /*
 937  * Extract and return a decimal value at state->cp.  The initial character c
 938  * has already been read.  Return OVERFLOW_VALUE if the result exceeds max.
 939  * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
 940  * state->flags to discover whether an error occurred under findMax.
 941  */
 942 static UINT
 943 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
 944                 CompilerState *state)
 945 {
 946     UINT value = JS7_UNDEC(c);
 947     BOOL overflow = (value > max && (!findMax || value > findMax(state)));
 948
 949     /* The following restriction allows simpler overflow checks. */
 950     assert(max <= ((UINT)-1 - 9) / 10);
 951     while (state->cp < state->cpend) {
 952         c = *state->cp;
 953         if (!JS7_ISDEC(c))
 954             break;
 955         value = 10 * value + JS7_UNDEC(c);
 956         if (!overflow && value > max && (!findMax || value > findMax(state)))
 957             overflow = TRUE;
 958         ++state->cp;
 959     }
 960     return overflow ? OVERFLOW_VALUE : value;
 961 }
 962
 963 /*
 964  * Calculate the total size of the bitmap required for a class expression.
 965  */
 966 static BOOL
 967 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
 968                     const WCHAR *end)
 969 {
 970     UINT max = 0;
 971     BOOL inRange = FALSE;
 972     WCHAR c, rangeStart = 0;
 973     UINT n, digit, nDigits, i;
 974
 975     target->u.ucclass.bmsize = 0;
 976     target->u.ucclass.sense = TRUE;
 977
 978     if (src == end)
 979         return TRUE;
 980
 981     if (*src == '^') {
 982         ++src;
 983         target->u.ucclass.sense = FALSE;
 984     }
 985
 986     while (src != end) {
 987         BOOL canStartRange = TRUE;
 988         UINT localMax = 0;
 989
 990         switch (*src) {
 991           case '\\':
 992             ++src;
 993             c = *src++;
 994             switch (c) {
 995               case 'b':
 996                 localMax = 0x8;
 997                 break;
 998               case 'f':
 999                 localMax = 0xC;
1000                 break;
1001               case 'n':
1002                 localMax = 0xA;
1003                 break;
1004               case 'r':
1005                 localMax = 0xD;
1006                 break;
1007               case 't':
1008                 localMax = 0x9;
1009                 break;
1010               case 'v':
1011                 localMax = 0xB;
1012                 break;
1013               case 'c':
1014                 if (src < end && RE_IS_LETTER(*src)) {
1015                     localMax = (UINT) (*src++) & 0x1F;
1016                 } else {
1017                     --src;
1018                     localMax = '\\';
1019                 }
1020                 break;
1021               case 'x':
1022                 nDigits = 2;
1023                 goto lexHex;
1024               case 'u':
1025                 nDigits = 4;
1026 lexHex:
1027                 n = 0;
1028                 for (i = 0; (i < nDigits) && (src < end); i++) {
1029                     c = *src++;
1030                     if (!isASCIIHexDigit(c, &digit)) {
1031                         /*
1032                          * Back off to accepting the original
1033                          *'\' as a literal.
1034                          */
1035                         src -= i + 1;
1036                         n = '\\';
1037                         break;
1038                     }
1039                     n = (n << 4) | digit;
1040                 }
1041                 localMax = n;
1042                 break;
1043               case 'd':
1044                 canStartRange = FALSE;
1045                 if (inRange) {
1046                     JS_ReportErrorNumber(state->context,
1047                                          js_GetErrorMessage, NULL,
1048                                          JSMSG_BAD_CLASS_RANGE);
1049                     return FALSE;
1050                 }
1051                 localMax = '9';
1052                 break;
1053               case 'D':
1054               case 's':
1055               case 'S':
1056               case 'w':
1057               case 'W':
1058                 canStartRange = FALSE;
1059                 if (inRange) {
1060                     JS_ReportErrorNumber(state->context,
1061                                          js_GetErrorMessage, NULL,
1062                                          JSMSG_BAD_CLASS_RANGE);
1063                     return FALSE;
1064                 }
1065                 max = 65535;
1066
1067                 /*
1068                  * If this is the start of a range, ensure that it's less than
1069                  * the end.
1070                  */
1071                 localMax = 0;
1072                 break;
1073               case '0':
1074               case '1':
1075               case '2':
1076               case '3':
1077               case '4':
1078               case '5':
1079               case '6':
1080               case '7':
1081                 /*
1082                  *  This is a non-ECMA extension - decimal escapes (in this
1083                  *  case, octal!) are supposed to be an error inside class
1084                  *  ranges, but supported here for backwards compatibility.
1085                  *
1086                  */
1087                 n = JS7_UNDEC(c);
1088                 c = *src;
1089                 if ('0' <= c && c <= '7') {
1090                     src++;
1091                     n = 8 * n + JS7_UNDEC(c);
1092                     c = *src;
1093                     if ('0' <= c && c <= '7') {
1094                         src++;
1095                         i = 8 * n + JS7_UNDEC(c);
1096                         if (i <= 0377)
1097                             n = i;
1098                         else
1099                             src--;
1100                     }
1101                 }
1102                 localMax = n;
1103                 break;
1104
1105               default:
1106                 localMax = c;
1107                 break;
1108             }
1109             break;
1110           default:
1111             localMax = *src++;
1112             break;
1113         }
1114
1115         if (inRange) {
1116             /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1117             if (rangeStart > localMax) {
1118                 JS_ReportErrorNumber(state->context,
1119                                      js_GetErrorMessage, NULL,
1120                                      JSMSG_BAD_CLASS_RANGE);
1121                 return FALSE;
1122             }
1123             inRange = FALSE;
1124         } else {
1125             if (canStartRange && src < end - 1) {
1126                 if (*src == '-') {
1127                     ++src;
1128                     inRange = TRUE;
1129                     rangeStart = (WCHAR)localMax;
1130                     continue;
1131                 }
1132             }
1133             if (state->flags & REG_FOLD)
1134                 rangeStart = localMax;   /* one run of the uc/dc loop below */
1135         }
1136
1137         if (state->flags & REG_FOLD) {
1138             WCHAR maxch = localMax;
1139
1140             for (i = rangeStart; i <= localMax; i++) {
1141                 WCHAR uch, dch;
1142
1143                 uch = towupper(i);
1144                 dch = towlower(i);
1145                 if(maxch < uch)
1146                     maxch = uch;
1147                 if(maxch < dch)
1148                     maxch = dch;
1149             }
1150             localMax = maxch;
1151         }
1152
1153         if (localMax > max)
1154             max = localMax;
1155     }
1156     target->u.ucclass.bmsize = max;
1157     return TRUE;
1158 }
1159
1160 static INT
1161 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1162 {
1163     UINT min, max;
1164     WCHAR c;
1165     const WCHAR *errp = state->cp++;
1166
1167     c = *state->cp;
1168     if (JS7_ISDEC(c)) {
1169         ++state->cp;
1170         min = GetDecimalValue(c, 0xFFFF, NULL, state);
1171         c = *state->cp;
1172
1173         if (!ignoreValues && min == OVERFLOW_VALUE)
1174             return JSMSG_MIN_TOO_BIG;
1175
1176         if (c == ',') {
1177             c = *++state->cp;
1178             if (JS7_ISDEC(c)) {
1179                 ++state->cp;
1180                 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1181                 c = *state->cp;
1182                 if (!ignoreValues && max == OVERFLOW_VALUE)
1183                     return JSMSG_MAX_TOO_BIG;
1184                 if (!ignoreValues && min > max)
1185                     return JSMSG_OUT_OF_ORDER;
1186             } else {
1187                 max = (UINT)-1;
1188             }
1189         } else {
1190             max = min;
1191         }
1192         if (c == '}') {
1193             state->result = NewRENode(state, REOP_QUANT);
1194             if (!state->result)
1195                 return JSMSG_OUT_OF_MEMORY;
1196             state->result->u.range.min = min;
1197             state->result->u.range.max = max;
1198             /*
1199              * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1200              * where <max> is written as compact(max+1) to make
1201              * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1202              */
1203             state->progLength += (1 + GetCompactIndexWidth(min)
1204                                   + GetCompactIndexWidth(max + 1)
1205                                   +3);
1206             return 0;
1207         }
1208     }
1209
1210     state->cp = errp;
1211     return -1;
1212 }
1213
1214 static BOOL
1215 ParseQuantifier(CompilerState *state)
1216 {
1217     RENode *term;
1218     term = state->result;
1219     if (state->cp < state->cpend) {
1220         switch (*state->cp) {
1221           case '+':
1222             state->result = NewRENode(state, REOP_QUANT);
1223             if (!state->result)
1224                 return FALSE;
1225             state->result->u.range.min = 1;
1226             state->result->u.range.max = (UINT)-1;
1227             /* <PLUS>, <next> ... <ENDCHILD> */
1228             state->progLength += 4;
1229             goto quantifier;
1230           case '*':
1231             state->result = NewRENode(state, REOP_QUANT);
1232             if (!state->result)
1233                 return FALSE;
1234             state->result->u.range.min = 0;
1235             state->result->u.range.max = (UINT)-1;
1236             /* <STAR>, <next> ... <ENDCHILD> */
1237             state->progLength += 4;
1238             goto quantifier;
1239           case '?':
1240             state->result = NewRENode(state, REOP_QUANT);
1241             if (!state->result)
1242                 return FALSE;
1243             state->result->u.range.min = 0;
1244             state->result->u.range.max = 1;
1245             /* <OPT>, <next> ... <ENDCHILD> */
1246             state->progLength += 4;
1247             goto quantifier;
1248           case '{':       /* balance '}' */
1249           {
1250             INT err;
1251
1252             err = ParseMinMaxQuantifier(state, FALSE);
1253             if (err == 0)
1254                 goto quantifier;
1255             if (err == -1)
1256                 return TRUE;
1257
1258             ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1259             return FALSE;
1260           }
1261           default:;
1262         }
1263     }
1264     return TRUE;
1265
1266 quantifier:
1267     if (state->treeDepth == TREE_DEPTH_MAX) {
1268         ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1269         return FALSE;
1270     }
1271
1272     ++state->treeDepth;
1273     ++state->cp;
1274     state->result->kid = term;
1275     if (state->cp < state->cpend && *state->cp == '?') {
1276         ++state->cp;
1277         state->result->u.range.greedy = FALSE;
1278     } else {
1279         state->result->u.range.greedy = TRUE;
1280     }
1281     return TRUE;
1282 }
1283
1284 /*
1285  *  item:       assertion               An item is either an assertion or
1286  *              quantatom               a quantified atom.
1287  *
1288  *  assertion:  '^'                     Assertions match beginning of string
1289  *                                      (or line if the class static property
1290  *                                      RegExp.multiline is true).
1291  *              '$'                     End of string (or line if the class
1292  *                                      static property RegExp.multiline is
1293  *                                      true).
1294  *              '\b'                    Word boundary (between \w and \W).
1295  *              '\B'                    Word non-boundary.
1296  *
1297  *  quantatom:  atom                    An unquantified atom.
1298  *              quantatom '{' n ',' m '}'
1299  *                                      Atom must occur between n and m times.
1300  *              quantatom '{' n ',' '}' Atom must occur at least n times.
1301  *              quantatom '{' n '}'     Atom must occur exactly n times.
1302  *              quantatom '*'           Zero or more times (same as {0,}).
1303  *              quantatom '+'           One or more times (same as {1,}).
1304  *              quantatom '?'           Zero or one time (same as {0,1}).
1305  *
1306  *              any of which can be optionally followed by '?' for ungreedy
1307  *
1308  *  atom:       '(' regexp ')'          A parenthesized regexp (what matched
1309  *                                      can be addressed using a backreference,
1310  *                                      see '\' n below).
1311  *              '.'                     Matches any char except '\n'.
1312  *              '[' classlist ']'       A character class.
1313  *              '[' '^' classlist ']'   A negated character class.
1314  *              '\f'                    Form Feed.
1315  *              '\n'                    Newline (Line Feed).
1316  *              '\r'                    Carriage Return.
1317  *              '\t'                    Horizontal Tab.
1318  *              '\v'                    Vertical Tab.
1319  *              '\d'                    A digit (same as [0-9]).
1320  *              '\D'                    A non-digit.
1321  *              '\w'                    A word character, [0-9a-z_A-Z].
1322  *              '\W'                    A non-word character.
1323  *              '\s'                    A whitespace character, [ \b\f\n\r\t\v].
1324  *              '\S'                    A non-whitespace character.
1325  *              '\' n                   A backreference to the nth (n decimal
1326  *                                      and positive) parenthesized expression.
1327  *              '\' octal               An octal escape sequence (octal must be
1328  *                                      two or three digits long, unless it is
1329  *                                      0 for the null character).
1330  *              '\x' hex                A hex escape (hex must be two digits).
1331  *              '\u' unicode            A unicode escape (must be four digits).
1332  *              '\c' ctrl               A control character, ctrl is a letter.
1333  *              '\' literalatomchar     Any character except one of the above
1334  *                                      that follow '\' in an atom.
1335  *              otheratomchar           Any character not first among the other
1336  *                                      atom right-hand sides.
1337  */
1338 static BOOL
1339 ParseTerm(CompilerState *state)
1340 {
1341     WCHAR c = *state->cp++;
1342     UINT nDigits;
1343     UINT num, tmp, n, i;
1344     const WCHAR *termStart;
1345
1346     switch (c) {
1347     /* assertions and atoms */
1348       case '^':
1349         state->result = NewRENode(state, REOP_BOL);
1350         if (!state->result)
1351             return FALSE;
1352         state->progLength++;
1353         return TRUE;
1354       case '$':
1355         state->result = NewRENode(state, REOP_EOL);
1356         if (!state->result)
1357             return FALSE;
1358         state->progLength++;
1359         return TRUE;
1360       case '\\':
1361         if (state->cp >= state->cpend) {
1362             /* a trailing '\' is an error */
1363             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1364             return FALSE;
1365         }
1366         c = *state->cp++;
1367         switch (c) {
1368         /* assertion escapes */
1369           case 'b' :
1370             state->result = NewRENode(state, REOP_WBDRY);
1371             if (!state->result)
1372                 return FALSE;
1373             state->progLength++;
1374             return TRUE;
1375           case 'B':
1376             state->result = NewRENode(state, REOP_WNONBDRY);
1377             if (!state->result)
1378                 return FALSE;
1379             state->progLength++;
1380             return TRUE;
1381           /* Decimal escape */
1382           case '0':
1383               /* Give a strict warning. See also the note below. */
1384               WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1385      doOctal:
1386             num = 0;
1387             while (state->cp < state->cpend) {
1388                 c = *state->cp;
1389                 if (c < '0' || '7' < c)
1390                     break;
1391                 state->cp++;
1392                 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1393                 if (tmp > 0377)
1394                     break;
1395                 num = tmp;
1396             }
1397             c = (WCHAR)num;
1398     doFlat:
1399             state->result = NewRENode(state, REOP_FLAT);
1400             if (!state->result)
1401                 return FALSE;
1402             state->result->u.flat.chr = c;
1403             state->result->u.flat.length = 1;
1404             state->progLength += 3;
1405             break;
1406           case '1':
1407           case '2':
1408           case '3':
1409           case '4':
1410           case '5':
1411           case '6':
1412           case '7':
1413           case '8':
1414           case '9':
1415             termStart = state->cp - 1;
1416             num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1417             if (state->flags & JSREG_FIND_PAREN_ERROR)
1418                 return FALSE;
1419             if (num == OVERFLOW_VALUE) {
1420                 /* Give a strict mode warning. */
1421                 WARN("back-reference exceeds number of capturing parentheses\n");
1422
1423                 /*
1424                  * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1425                  * error here. However, for compatibility with IE, we treat the
1426                  * whole backref as flat if the first character in it is not a
1427                  * valid octal character, and as an octal escape otherwise.
1428                  */
1429                 state->cp = termStart;
1430                 if (c >= '8') {
1431                     /* Treat this as flat. termStart - 1 is the \. */
1432                     c = '\\';
1433                     goto asFlat;
1434                 }
1435
1436                 /* Treat this as an octal escape. */
1437                 goto doOctal;
1438             }
1439             assert(1 <= num && num <= 0x10000);
1440             state->result = NewRENode(state, REOP_BACKREF);
1441             if (!state->result)
1442                 return FALSE;
1443             state->result->u.parenIndex = num - 1;
1444             state->progLength
1445                 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1446             break;
1447           /* Control escape */
1448           case 'f':
1449             c = 0xC;
1450             goto doFlat;
1451           case 'n':
1452             c = 0xA;
1453             goto doFlat;
1454           case 'r':
1455             c = 0xD;
1456             goto doFlat;
1457           case 't':
1458             c = 0x9;
1459             goto doFlat;
1460           case 'v':
1461             c = 0xB;
1462             goto doFlat;
1463           /* Control letter */
1464           case 'c':
1465             if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1466                 c = (WCHAR) (*state->cp++ & 0x1F);
1467             } else {
1468                 /* back off to accepting the original '\' as a literal */
1469                 --state->cp;
1470                 c = '\\';
1471             }
1472             goto doFlat;
1473           /* HexEscapeSequence */
1474           case 'x':
1475             nDigits = 2;
1476             goto lexHex;
1477           /* UnicodeEscapeSequence */
1478           case 'u':
1479             nDigits = 4;
1480 lexHex:
1481             n = 0;
1482             for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1483                 UINT digit;
1484                 c = *state->cp++;
1485                 if (!isASCIIHexDigit(c, &digit)) {
1486                     /*
1487                      * Back off to accepting the original 'u' or 'x' as a
1488                      * literal.
1489                      */
1490                     state->cp -= i + 2;
1491                     n = *state->cp++;
1492                     break;
1493                 }
1494                 n = (n << 4) | digit;
1495             }
1496             c = (WCHAR) n;
1497             goto doFlat;
1498           /* Character class escapes */
1499           case 'd':
1500             state->result = NewRENode(state, REOP_DIGIT);
1501 doSimple:
1502             if (!state->result)
1503                 return FALSE;
1504             state->progLength++;
1505             break;
1506           case 'D':
1507             state->result = NewRENode(state, REOP_NONDIGIT);
1508             goto doSimple;
1509           case 's':
1510             state->result = NewRENode(state, REOP_SPACE);
1511             goto doSimple;
1512           case 'S':
1513             state->result = NewRENode(state, REOP_NONSPACE);
1514             goto doSimple;
1515           case 'w':
1516             state->result = NewRENode(state, REOP_ALNUM);
1517             goto doSimple;
1518           case 'W':
1519             state->result = NewRENode(state, REOP_NONALNUM);
1520             goto doSimple;
1521           /* IdentityEscape */
1522           default:
1523             state->result = NewRENode(state, REOP_FLAT);
1524             if (!state->result)
1525                 return FALSE;
1526             state->result->u.flat.chr = c;
1527             state->result->u.flat.length = 1;
1528             state->result->kid = (void *) (state->cp - 1);
1529             state->progLength += 3;
1530             break;
1531         }
1532         break;
1533       case '[':
1534         state->result = NewRENode(state, REOP_CLASS);
1535         if (!state->result)
1536             return FALSE;
1537         termStart = state->cp;
1538         state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1539         for (;;) {
1540             if (state->cp == state->cpend) {
1541                 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1542                                         JSMSG_UNTERM_CLASS, termStart);
1543
1544                 return FALSE;
1545             }
1546             if (*state->cp == '\\') {
1547                 state->cp++;
1548                 if (state->cp != state->cpend)
1549                     state->cp++;
1550                 continue;
1551             }
1552             if (*state->cp == ']') {
1553                 state->result->u.ucclass.kidlen = state->cp - termStart;
1554                 break;
1555             }
1556             state->cp++;
1557         }
1558         for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1559             if (!state->classCache[i].start) {
1560                 state->classCache[i].start = termStart;
1561                 state->classCache[i].length = state->result->u.ucclass.kidlen;
1562                 state->classCache[i].index = state->classCount;
1563                 break;
1564             }
1565             if (state->classCache[i].length ==
1566                 state->result->u.ucclass.kidlen) {
1567                 for (n = 0; ; n++) {
1568                     if (n == state->classCache[i].length) {
1569                         state->result->u.ucclass.index
1570                             = state->classCache[i].index;
1571                         goto claim;
1572                     }
1573                     if (state->classCache[i].start[n] != termStart[n])
1574                         break;
1575                 }
1576             }
1577         }
1578         state->result->u.ucclass.index = state->classCount++;
1579
1580     claim:
1581         /*
1582          * Call CalculateBitmapSize now as we want any errors it finds
1583          * to be reported during the parse phase, not at execution.
1584          */
1585         if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1586             return FALSE;
1587         /*
1588          * Update classBitmapsMem with number of bytes to hold bmsize bits,
1589          * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1590          * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1591          */
1592         n = (state->result->u.ucclass.bmsize >> 3) + 1;
1593         if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1594             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1595             return FALSE;
1596         }
1597         state->classBitmapsMem += n;
1598         /* CLASS, <index> */
1599         state->progLength
1600             += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1601         break;
1602
1603       case '.':
1604         state->result = NewRENode(state, REOP_DOT);
1605         goto doSimple;
1606
1607       case '{':
1608       {
1609         const WCHAR *errp = state->cp--;
1610         INT err;
1611
1612         err = ParseMinMaxQuantifier(state, TRUE);
1613         state->cp = errp;
1614
1615         if (err < 0)
1616             goto asFlat;
1617
1618         /* FALL THROUGH */
1619       }
1620       case '*':
1621       case '+':
1622       case '?':
1623         throw_error(state->context, state->context->version < SCRIPTLANGUAGEVERSION_ES5 ? JS_E_REGEXP_SYNTAX : JS_E_UNEXPECTED_QUANTIFIER, L"");
1624         return FALSE;
1625       default:
1626 asFlat:
1627         state->result = NewRENode(state, REOP_FLAT);
1628         if (!state->result)
1629             return FALSE;
1630         state->result->u.flat.chr = c;
1631         state->result->u.flat.length = 1;
1632         state->result->kid = (void *) (state->cp - 1);
1633         state->progLength += 3;
1634         break;
1635     }
1636     return ParseQuantifier(state);
1637 }
1638
1639 /*
1640  * Top-down regular expression grammar, based closely on Perl4.
1641  *
1642  *  regexp:     altern                  A regular expression is one or more
1643  *              altern '|' regexp       alternatives separated by vertical bar.
1644  */
1645 #define INITIAL_STACK_SIZE  128
1646
1647 static BOOL
1648 ParseRegExp(CompilerState *state)
1649 {
1650     size_t parenIndex;
1651     RENode *operand;
1652     REOpData *operatorStack;
1653     RENode **operandStack;
1654     REOp op;
1655     INT i;
1656     BOOL result = FALSE;
1657
1658     INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1659     INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1660
1661     /* Watch out for empty regexp */
1662     if (state->cp == state->cpend) {
1663         state->result = NewRENode(state, REOP_EMPTY);
1664         return (state->result != NULL);
1665     }
1666
1667     operatorStack = malloc(sizeof(REOpData) * operatorStackSize);
1668     if (!operatorStack)
1669         return FALSE;
1670
1671     operandStack = malloc(sizeof(RENode *) * operandStackSize);
1672     if (!operandStack)
1673         goto out;
1674
1675     for (;;) {
1676         parenIndex = state->parenCount;
1677         if (state->cp == state->cpend) {
1678             /*
1679              * If we are at the end of the regexp and we're short one or more
1680              * operands, the regexp must have the form /x|/ or some such, with
1681              * left parentheses making us short more than one operand.
1682              */
1683             if (operatorSP >= operandSP) {
1684                 operand = NewRENode(state, REOP_EMPTY);
1685                 if (!operand)
1686                     goto out;
1687                 goto pushOperand;
1688             }
1689         } else {
1690             switch (*state->cp) {
1691               case '(':
1692                 ++state->cp;
1693                 if (state->cp + 1 < state->cpend &&
1694                     *state->cp == '?' &&
1695                     (state->cp[1] == '=' ||
1696                      state->cp[1] == '!' ||
1697                      state->cp[1] == ':')) {
1698                     switch (state->cp[1]) {
1699                       case '=':
1700                         op = REOP_ASSERT;
1701                         /* ASSERT, <next>, ... ASSERTTEST */
1702                         state->progLength += 4;
1703                         break;
1704                       case '!':
1705                         op = REOP_ASSERT_NOT;
1706                         /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1707                         state->progLength += 4;
1708                         break;
1709                       default:
1710                         op = REOP_LPARENNON;
1711                         break;
1712                     }
1713                     state->cp += 2;
1714                 } else {
1715                     op = REOP_LPAREN;
1716                     /* LPAREN, <index>, ... RPAREN, <index> */
1717                     state->progLength
1718                         += 2 * (1 + GetCompactIndexWidth(parenIndex));
1719                     state->parenCount++;
1720                     if (state->parenCount == 65535) {
1721                         ReportRegExpError(state, JSREPORT_ERROR,
1722                                           JSMSG_TOO_MANY_PARENS);
1723                         goto out;
1724                     }
1725                 }
1726                 goto pushOperator;
1727
1728               case ')':
1729                 /*
1730                  * If there's no stacked open parenthesis, throw syntax error.
1731                  */
1732                 for (i = operatorSP - 1; ; i--) {
1733                     if (i < 0) {
1734                         ReportRegExpError(state, JSREPORT_ERROR,
1735                                           JSMSG_UNMATCHED_RIGHT_PAREN);
1736                         goto out;
1737                     }
1738                     if (operatorStack[i].op == REOP_ASSERT ||
1739                         operatorStack[i].op == REOP_ASSERT_NOT ||
1740                         operatorStack[i].op == REOP_LPARENNON ||
1741                         operatorStack[i].op == REOP_LPAREN) {
1742                         break;
1743                     }
1744                 }
1745                 /* FALL THROUGH */
1746
1747               case '|':
1748                 /* Expected an operand before these, so make an empty one */
1749                 operand = NewRENode(state, REOP_EMPTY);
1750                 if (!operand)
1751                     goto out;
1752                 goto pushOperand;
1753
1754               default:
1755                 if (!ParseTerm(state))
1756                     goto out;
1757                 operand = state->result;
1758 pushOperand:
1759                 if (operandSP == operandStackSize) {
1760                     RENode **tmp;
1761                     operandStackSize += operandStackSize;
1762                     tmp = realloc(operandStack, sizeof(RENode *) * operandStackSize);
1763                     if (!tmp)
1764                         goto out;
1765                     operandStack = tmp;
1766                 }
1767                 operandStack[operandSP++] = operand;
1768                 break;
1769             }
1770         }
1771
1772         /* At the end; process remaining operators. */
1773 restartOperator:
1774         if (state->cp == state->cpend) {
1775             while (operatorSP) {
1776                 --operatorSP;
1777                 if (!ProcessOp(state, &operatorStack[operatorSP],
1778                                operandStack, operandSP))
1779                     goto out;
1780                 --operandSP;
1781             }
1782             assert(operandSP == 1);
1783             state->result = operandStack[0];
1784             result = TRUE;
1785             goto out;
1786         }
1787
1788         switch (*state->cp) {
1789           case '|':
1790             /* Process any stacked 'concat' operators */
1791             ++state->cp;
1792             while (operatorSP &&
1793                    operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1794                 --operatorSP;
1795                 if (!ProcessOp(state, &operatorStack[operatorSP],
1796                                operandStack, operandSP)) {
1797                     goto out;
1798                 }
1799                 --operandSP;
1800             }
1801             op = REOP_ALT;
1802             goto pushOperator;
1803
1804           case ')':
1805             /*
1806              * If there's no stacked open parenthesis, throw syntax error.
1807              */
1808             for (i = operatorSP - 1; ; i--) {
1809                 if (i < 0) {
1810                     ReportRegExpError(state, JSREPORT_ERROR,
1811                                       JSMSG_UNMATCHED_RIGHT_PAREN);
1812                     goto out;
1813                 }
1814                 if (operatorStack[i].op == REOP_ASSERT ||
1815                     operatorStack[i].op == REOP_ASSERT_NOT ||
1816                     operatorStack[i].op == REOP_LPARENNON ||
1817                     operatorStack[i].op == REOP_LPAREN) {
1818                     break;
1819                 }
1820             }
1821             ++state->cp;
1822
1823             /* Process everything on the stack until the open parenthesis. */
1824             for (;;) {
1825                 assert(operatorSP);
1826                 --operatorSP;
1827                 switch (operatorStack[operatorSP].op) {
1828                   case REOP_ASSERT:
1829                   case REOP_ASSERT_NOT:
1830                   case REOP_LPAREN:
1831                     operand = NewRENode(state, operatorStack[operatorSP].op);
1832                     if (!operand)
1833                         goto out;
1834                     operand->u.parenIndex =
1835                         operatorStack[operatorSP].parenIndex;
1836                     assert(operandSP);
1837                     operand->kid = operandStack[operandSP - 1];
1838                     operandStack[operandSP - 1] = operand;
1839                     if (state->treeDepth == TREE_DEPTH_MAX) {
1840                         ReportRegExpError(state, JSREPORT_ERROR,
1841                                           JSMSG_REGEXP_TOO_COMPLEX);
1842                         goto out;
1843                     }
1844                     ++state->treeDepth;
1845                     /* FALL THROUGH */
1846
1847                   case REOP_LPARENNON:
1848                     state->result = operandStack[operandSP - 1];
1849                     if (!ParseQuantifier(state))
1850                         goto out;
1851                     operandStack[operandSP - 1] = state->result;
1852                     goto restartOperator;
1853                   default:
1854                     if (!ProcessOp(state, &operatorStack[operatorSP],
1855                                    operandStack, operandSP))
1856                         goto out;
1857                     --operandSP;
1858                     break;
1859                 }
1860             }
1861             break;
1862
1863           case '{':
1864           {
1865             const WCHAR *errp = state->cp;
1866
1867             if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1868                 /*
1869                  * This didn't even scan correctly as a quantifier, so we should
1870                  * treat it as flat.
1871                  */
1872                 op = REOP_CONCAT;
1873                 goto pushOperator;
1874             }
1875
1876             state->cp = errp;
1877             /* FALL THROUGH */
1878           }
1879
1880           case '+':
1881           case '*':
1882           case '?':
1883             ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1884                                     state->cp);
1885             result = FALSE;
1886             goto out;
1887
1888           default:
1889             /* Anything else is the start of the next term. */
1890             op = REOP_CONCAT;
1891 pushOperator:
1892             if (operatorSP == operatorStackSize) {
1893                 REOpData *tmp;
1894                 operatorStackSize += operatorStackSize;
1895                 tmp = realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1896                 if (!tmp)
1897                     goto out;
1898                 operatorStack = tmp;
1899             }
1900             operatorStack[operatorSP].op = op;
1901             operatorStack[operatorSP].errPos = state->cp;
1902             operatorStack[operatorSP++].parenIndex = parenIndex;
1903             break;
1904         }
1905     }
1906 out:
1907     free(operatorStack);
1908     free(operandStack);
1909     return result;
1910 }
1911
1912 /*
1913  * Save the current state of the match - the position in the input
1914  * text as well as the position in the bytecode. The state of any
1915  * parent expressions is also saved (preceding state).
1916  * Contents of parenCount parentheses from parenIndex are also saved.
1917  */
1918 static REBackTrackData *
1919 PushBackTrackState(REGlobalData *gData, REOp op,
1920                    jsbytecode *target, match_state_t *x, const WCHAR *cp,
1921                    size_t parenIndex, size_t parenCount)
1922 {
1923     size_t i;
1924     REBackTrackData *result =
1925         (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1926
1927     size_t sz = sizeof(REBackTrackData) +
1928                 gData->stateStackTop * sizeof(REProgState) +
1929                 parenCount * sizeof(RECapture);
1930
1931     ptrdiff_t btsize = gData->backTrackStackSize;
1932     ptrdiff_t btincr = ((char *)result + sz) -
1933                        ((char *)gData->backTrackStack + btsize);
1934
1935     TRACE("\tBT_Push: %Iu,%Iu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1936
1937     JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1938     if (btincr > 0) {
1939         ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1940
1941         JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1942         btincr = ((btincr+btsize-1)/btsize)*btsize;
1943         gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1944         if (!gData->backTrackStack) {
1945             throw_error(gData->cx, E_OUTOFMEMORY, L"");
1946             gData->ok = FALSE;
1947             return NULL;
1948         }
1949         gData->backTrackStackSize = btsize + btincr;
1950         result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1951     }
1952     gData->backTrackSP = result;
1953     result->sz = gData->cursz;
1954     gData->cursz = sz;
1955
1956     result->backtrack_op = op;
1957     result->backtrack_pc = target;
1958     result->cp = cp;
1959     result->parenCount = parenCount;
1960     result->parenIndex = parenIndex;
1961
1962     result->saveStateStackTop = gData->stateStackTop;
1963     assert(gData->stateStackTop);
1964     memcpy(result + 1, gData->stateStack,
1965            sizeof(REProgState) * result->saveStateStackTop);
1966
1967     if (parenCount != 0) {
1968         memcpy((char *)(result + 1) +
1969                sizeof(REProgState) * result->saveStateStackTop,
1970                &x->parens[parenIndex],
1971                sizeof(RECapture) * parenCount);
1972         for (i = 0; i != parenCount; i++)
1973             x->parens[parenIndex + i].index = -1;
1974     }
1975
1976     return result;
1977 }
1978
1979 static inline match_state_t *
1980 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1981               size_t length)
1982 {
1983     size_t i;
1984     assert(gData->cpend >= x->cp);
1985     if (length > (size_t)(gData->cpend - x->cp))
1986         return NULL;
1987     for (i = 0; i != length; i++) {
1988         if (towupper(matchChars[i]) != towupper(x->cp[i]))
1989             return NULL;
1990     }
1991     x->cp += length;
1992     return x;
1993 }
1994
1995 /*
1996  * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1997  * 2. If E is not a character then go to step 6.
1998  * 3. Let ch be E's character.
1999  * 4. Let A be a one-element RECharSet containing the character ch.
2000  * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2001  * 6. E must be an integer. Let n be that integer.
2002  * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2003  * 8. Return an internal Matcher closure that takes two arguments, a State x
2004  *    and a Continuation c, and performs the following:
2005  *     1. Let cap be x's captures internal array.
2006  *     2. Let s be cap[n].
2007  *     3. If s is undefined, then call c(x) and return its result.
2008  *     4. Let e be x's endIndex.
2009  *     5. Let len be s's length.
2010  *     6. Let f be e+len.
2011  *     7. If f>InputLength, return failure.
2012  *     8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2013  *        such that Canonicalize(s[i]) is not the same character as
2014  *        Canonicalize(Input [e+i]), then return failure.
2015  *     9. Let y be the State (f, cap).
2016  *     10. Call c(y) and return its result.
2017  */
2018 static match_state_t *
2019 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2020 {
2021     size_t len, i;
2022     const WCHAR *parenContent;
2023     RECapture *cap = &x->parens[parenIndex];
2024
2025     if (cap->index == -1)
2026         return x;
2027
2028     len = cap->length;
2029     if (x->cp + len > gData->cpend)
2030         return NULL;
2031
2032     parenContent = &gData->cpbegin[cap->index];
2033     if (gData->regexp->flags & REG_FOLD) {
2034         for (i = 0; i < len; i++) {
2035             if (towupper(parenContent[i]) != towupper(x->cp[i]))
2036                 return NULL;
2037         }
2038     } else {
2039         for (i = 0; i < len; i++) {
2040             if (parenContent[i] != x->cp[i])
2041                 return NULL;
2042         }
2043     }
2044     x->cp += len;
2045     return x;
2046 }
2047
2048 /* Add a single character to the RECharSet */
2049 static void
2050 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2051 {
2052     UINT byteIndex = (UINT)(c >> 3);
2053     assert(c <= cs->length);
2054     cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2055 }
2056
2057
2058 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2059 static void
2060 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2061 {
2062     UINT i;
2063
2064     UINT byteIndex1 = c1 >> 3;
2065     UINT byteIndex2 = c2 >> 3;
2066
2067     assert(c2 <= cs->length && c1 <= c2);
2068
2069     c1 &= 0x7;
2070     c2 &= 0x7;
2071
2072     if (byteIndex1 == byteIndex2) {
2073         cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2074     } else {
2075         cs->u.bits[byteIndex1] |= 0xFF << c1;
2076         for (i = byteIndex1 + 1; i < byteIndex2; i++)
2077             cs->u.bits[i] = 0xFF;
2078         cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2079     }
2080 }
2081
2082 /* Compile the source of the class into a RECharSet */
2083 static BOOL
2084 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2085 {
2086     const WCHAR *src, *end;
2087     BOOL inRange = FALSE;
2088     WCHAR rangeStart = 0;
2089     UINT byteLength, n;
2090     WCHAR c, thisCh;
2091     INT nDigits, i;
2092
2093     assert(!charSet->converted);
2094     /*
2095      * Assert that startIndex and length points to chars inside [] inside
2096      * source string.
2097      */
2098     assert(1 <= charSet->u.src.startIndex);
2099     assert(charSet->u.src.startIndex < gData->regexp->source_len);
2100     assert(charSet->u.src.length <= gData->regexp->source_len
2101             - 1 - charSet->u.src.startIndex);
2102
2103     charSet->converted = TRUE;
2104     src = gData->regexp->source + charSet->u.src.startIndex;
2105
2106     end = src + charSet->u.src.length;
2107
2108     assert(src[-1] == '[' && end[0] == ']');
2109
2110     byteLength = (charSet->length >> 3) + 1;
2111     charSet->u.bits = malloc(byteLength);
2112     if (!charSet->u.bits) {
2113         throw_error(gData->cx, E_OUTOFMEMORY, L"");
2114         gData->ok = FALSE;
2115         return FALSE;
2116     }
2117     memset(charSet->u.bits, 0, byteLength);
2118
2119     if (src == end)
2120         return TRUE;
2121
2122     if (*src == '^') {
2123         assert(charSet->sense == FALSE);
2124         ++src;
2125     } else {
2126         assert(charSet->sense == TRUE);
2127     }
2128
2129     while (src != end) {
2130         switch (*src) {
2131           case '\\':
2132             ++src;
2133             c = *src++;
2134             switch (c) {
2135               case 'b':
2136                 thisCh = 0x8;
2137                 break;
2138               case 'f':
2139                 thisCh = 0xC;
2140                 break;
2141               case 'n':
2142                 thisCh = 0xA;
2143                 break;
2144               case 'r':
2145                 thisCh = 0xD;
2146                 break;
2147               case 't':
2148                 thisCh = 0x9;
2149                 break;
2150               case 'v':
2151                 thisCh = 0xB;
2152                 break;
2153               case 'c':
2154                 if (src < end && JS_ISWORD(*src)) {
2155                     thisCh = (WCHAR)(*src++ & 0x1F);
2156                 } else {
2157                     --src;
2158                     thisCh = '\\';
2159                 }
2160                 break;
2161               case 'x':
2162                 nDigits = 2;
2163                 goto lexHex;
2164               case 'u':
2165                 nDigits = 4;
2166             lexHex:
2167                 n = 0;
2168                 for (i = 0; (i < nDigits) && (src < end); i++) {
2169                     UINT digit;
2170                     c = *src++;
2171                     if (!isASCIIHexDigit(c, &digit)) {
2172                         /*
2173                          * Back off to accepting the original '\'
2174                          * as a literal
2175                          */
2176                         src -= i + 1;
2177                         n = '\\';
2178                         break;
2179                     }
2180                     n = (n << 4) | digit;
2181                 }
2182                 thisCh = (WCHAR)n;
2183                 break;
2184               case '0':
2185               case '1':
2186               case '2':
2187               case '3':
2188               case '4':
2189               case '5':
2190               case '6':
2191               case '7':
2192                 /*
2193                  *  This is a non-ECMA extension - decimal escapes (in this
2194                  *  case, octal!) are supposed to be an error inside class
2195                  *  ranges, but supported here for backwards compatibility.
2196                  */
2197                 n = JS7_UNDEC(c);
2198                 c = *src;
2199                 if ('0' <= c && c <= '7') {
2200                     src++;
2201                     n = 8 * n + JS7_UNDEC(c);
2202                     c = *src;
2203                     if ('0' <= c && c <= '7') {
2204                         src++;
2205                         i = 8 * n + JS7_UNDEC(c);
2206                         if (i <= 0377)
2207                             n = i;
2208                         else
2209                             src--;
2210                     }
2211                 }
2212                 thisCh = (WCHAR)n;
2213                 break;
2214
2215               case 'd':
2216                 AddCharacterRangeToCharSet(charSet, '0', '9');
2217                 continue;   /* don't need range processing */
2218               case 'D':
2219                 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2220                 AddCharacterRangeToCharSet(charSet,
2221                                            (WCHAR)('9' + 1),
2222                                            (WCHAR)charSet->length);
2223                 continue;
2224               case 's':
2225                 for (i = (INT)charSet->length; i >= 0; i--)
2226                     if (iswspace(i))
2227                         AddCharacterToCharSet(charSet, (WCHAR)i);
2228                 continue;
2229               case 'S':
2230                 for (i = (INT)charSet->length; i >= 0; i--)
2231                     if (!iswspace(i))
2232                         AddCharacterToCharSet(charSet, (WCHAR)i);
2233                 continue;
2234               case 'w':
2235                 for (i = (INT)charSet->length; i >= 0; i--)
2236                     if (JS_ISWORD(i))
2237                         AddCharacterToCharSet(charSet, (WCHAR)i);
2238                 continue;
2239               case 'W':
2240                 for (i = (INT)charSet->length; i >= 0; i--)
2241                     if (!JS_ISWORD(i))
2242                         AddCharacterToCharSet(charSet, (WCHAR)i);
2243                 continue;
2244               default:
2245                 thisCh = c;
2246                 break;
2247
2248             }
2249             break;
2250
2251           default:
2252             thisCh = *src++;
2253             break;
2254
2255         }
2256         if (inRange) {
2257             if (gData->regexp->flags & REG_FOLD) {
2258                 assert(rangeStart <= thisCh);
2259                 for (i = rangeStart; i <= thisCh; i++) {
2260                     WCHAR uch, dch;
2261
2262                     AddCharacterToCharSet(charSet, i);
2263                     uch = towupper(i);
2264                     dch = towlower(i);
2265                     if (i != uch)
2266                         AddCharacterToCharSet(charSet, uch);
2267                     if (i != dch)
2268                         AddCharacterToCharSet(charSet, dch);
2269                 }
2270             } else {
2271                 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2272             }
2273             inRange = FALSE;
2274         } else {
2275             if (gData->regexp->flags & REG_FOLD) {
2276                 AddCharacterToCharSet(charSet, towupper(thisCh));
2277                 AddCharacterToCharSet(charSet, towlower(thisCh));
2278             } else {
2279                 AddCharacterToCharSet(charSet, thisCh);
2280             }
2281             if (src < end - 1) {
2282                 if (*src == '-') {
2283                     ++src;
2284                     inRange = TRUE;
2285                     rangeStart = thisCh;
2286                 }
2287             }
2288         }
2289     }
2290     return TRUE;
2291 }
2292
2293 static BOOL
2294 ReallocStateStack(REGlobalData *gData)
2295 {
2296     size_t limit = gData->stateStackLimit;
2297     size_t sz = sizeof(REProgState) * limit;
2298
2299     gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2300     if (!gData->stateStack) {
2301         throw_error(gData->cx, E_OUTOFMEMORY, L"");
2302         gData->ok = FALSE;
2303         return FALSE;
2304     }
2305     gData->stateStackLimit = limit + limit;
2306     return TRUE;
2307 }
2308
2309 #define PUSH_STATE_STACK(data)                                                \
2310     do {                                                                      \
2311         ++(data)->stateStackTop;                                              \
2312         if ((data)->stateStackTop == (data)->stateStackLimit &&               \
2313             !ReallocStateStack((data))) {                                     \
2314             return NULL;                                                      \
2315         }                                                                     \
2316     }while(0)
2317
2318 /*
2319  * Apply the current op against the given input to see if it's going to match
2320  * or fail. Return false if we don't get a match, true if we do. If updatecp is
2321  * true, then update the current state's cp. Always update startpc to the next
2322  * op.
2323  */
2324 static inline match_state_t *
2325 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2326             jsbytecode **startpc, BOOL updatecp)
2327 {
2328     match_state_t *result = NULL;
2329     WCHAR matchCh;
2330     size_t parenIndex;
2331     size_t offset, length, index;
2332     jsbytecode *pc = *startpc;  /* pc has already been incremented past op */
2333     const WCHAR *source;
2334     const WCHAR *startcp = x->cp;
2335     WCHAR ch;
2336     RECharSet *charSet;
2337
2338     const char *opname = reop_names[op];
2339     TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2340           (int)gData->stateStackTop * 2, "", opname);
2341
2342     switch (op) {
2343       case REOP_EMPTY:
2344         result = x;
2345         break;
2346       case REOP_BOL:
2347         if (x->cp != gData->cpbegin) {
2348             if (/*!gData->cx->regExpStatics.multiline &&  FIXME !!! */
2349                 !(gData->regexp->flags & REG_MULTILINE)) {
2350                 break;
2351             }
2352             if (!RE_IS_LINE_TERM(x->cp[-1]))
2353                 break;
2354         }
2355         result = x;
2356         break;
2357       case REOP_EOL:
2358         if (x->cp != gData->cpend) {
2359             if (/*!gData->cx->regExpStatics.multiline &&*/
2360                 !(gData->regexp->flags & REG_MULTILINE)) {
2361                 break;
2362             }
2363             if (!RE_IS_LINE_TERM(*x->cp))
2364                 break;
2365         }
2366         result = x;
2367         break;
2368       case REOP_WBDRY:
2369         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2370             !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2371             result = x;
2372         }
2373         break;
2374       case REOP_WNONBDRY:
2375         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2376             (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2377             result = x;
2378         }
2379         break;
2380       case REOP_DOT:
2381         if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2382             result = x;
2383             result->cp++;
2384         }
2385         break;
2386       case REOP_DIGIT:
2387         if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2388             result = x;
2389             result->cp++;
2390         }
2391         break;
2392       case REOP_NONDIGIT:
2393         if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2394             result = x;
2395             result->cp++;
2396         }
2397         break;
2398       case REOP_ALNUM:
2399         if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2400             result = x;
2401             result->cp++;
2402         }
2403         break;
2404       case REOP_NONALNUM:
2405         if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2406             result = x;
2407             result->cp++;
2408         }
2409         break;
2410       case REOP_SPACE:
2411         if (x->cp != gData->cpend && iswspace(*x->cp)) {
2412             result = x;
2413             result->cp++;
2414         }
2415         break;
2416       case REOP_NONSPACE:
2417         if (x->cp != gData->cpend && !iswspace(*x->cp)) {
2418             result = x;
2419             result->cp++;
2420         }
2421         break;
2422       case REOP_BACKREF:
2423         pc = ReadCompactIndex(pc, &parenIndex);
2424         assert(parenIndex < gData->regexp->parenCount);
2425         result = BackrefMatcher(gData, x, parenIndex);
2426         break;
2427       case REOP_FLAT:
2428         pc = ReadCompactIndex(pc, &offset);
2429         assert(offset < gData->regexp->source_len);
2430         pc = ReadCompactIndex(pc, &length);
2431         assert(1 <= length);
2432         assert(length <= gData->regexp->source_len - offset);
2433         if (length <= (size_t)(gData->cpend - x->cp)) {
2434             source = gData->regexp->source + offset;
2435             TRACE("%s\n", debugstr_wn(source, length));
2436             for (index = 0; index != length; index++) {
2437                 if (source[index] != x->cp[index])
2438                     return NULL;
2439             }
2440             x->cp += length;
2441             result = x;
2442         }
2443         break;
2444       case REOP_FLAT1:
2445         matchCh = *pc++;
2446         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2447         if (x->cp != gData->cpend && *x->cp == matchCh) {
2448             result = x;
2449             result->cp++;
2450         }
2451         break;
2452       case REOP_FLATi:
2453         pc = ReadCompactIndex(pc, &offset);
2454         assert(offset < gData->regexp->source_len);
2455         pc = ReadCompactIndex(pc, &length);
2456         assert(1 <= length);
2457         assert(length <= gData->regexp->source_len - offset);
2458         source = gData->regexp->source;
2459         result = FlatNIMatcher(gData, x, source + offset, length);
2460         break;
2461       case REOP_FLAT1i:
2462         matchCh = *pc++;
2463         if (x->cp != gData->cpend && towupper(*x->cp) == towupper(matchCh)) {
2464             result = x;
2465             result->cp++;
2466         }
2467         break;
2468       case REOP_UCFLAT1:
2469         matchCh = GET_ARG(pc);
2470         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2471         pc += ARG_LEN;
2472         if (x->cp != gData->cpend && *x->cp == matchCh) {
2473             result = x;
2474             result->cp++;
2475         }
2476         break;
2477       case REOP_UCFLAT1i:
2478         matchCh = GET_ARG(pc);
2479         pc += ARG_LEN;
2480         if (x->cp != gData->cpend && towupper(*x->cp) == towupper(matchCh)) {
2481             result = x;
2482             result->cp++;
2483         }
2484         break;
2485       case REOP_CLASS:
2486         pc = ReadCompactIndex(pc, &index);
2487         assert(index < gData->regexp->classCount);
2488         if (x->cp != gData->cpend) {
2489             charSet = &gData->regexp->classList[index];
2490             assert(charSet->converted);
2491             ch = *x->cp;
2492             index = ch >> 3;
2493             if (charSet->length != 0 &&
2494                 ch <= charSet->length &&
2495                 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2496                 result = x;
2497                 result->cp++;
2498             }
2499         }
2500         break;
2501       case REOP_NCLASS:
2502         pc = ReadCompactIndex(pc, &index);
2503         assert(index < gData->regexp->classCount);
2504         if (x->cp != gData->cpend) {
2505             charSet = &gData->regexp->classList[index];
2506             assert(charSet->converted);
2507             ch = *x->cp;
2508             index = ch >> 3;
2509             if (charSet->length == 0 ||
2510                 ch > charSet->length ||
2511                 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2512                 result = x;
2513                 result->cp++;
2514             }
2515         }
2516         break;
2517
2518       default:
2519         assert(FALSE);
2520     }
2521     if (result) {
2522         if (!updatecp)
2523             x->cp = startcp;
2524         *startpc = pc;
2525         TRACE(" *\n");
2526         return result;
2527     }
2528     x->cp = startcp;
2529     return NULL;
2530 }
2531
2532 static inline match_state_t *
2533 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2534 {
2535     match_state_t *result = NULL;
2536     REBackTrackData *backTrackData;
2537     jsbytecode *nextpc, *testpc;
2538     REOp nextop;
2539     RECapture *cap;
2540     REProgState *curState;
2541     const WCHAR *startcp;
2542     size_t parenIndex, k;
2543     size_t parenSoFar = 0;
2544
2545     WCHAR matchCh1, matchCh2;
2546     RECharSet *charSet;
2547
2548     BOOL anchor;
2549     jsbytecode *pc = gData->regexp->program;
2550     REOp op = (REOp) *pc++;
2551
2552     /*
2553      * If the first node is a simple match, step the index into the string
2554      * until that match is made, or fail if it can't be found at all.
2555      */
2556     if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2557         anchor = FALSE;
2558         while (x->cp <= gData->cpend) {
2559             nextpc = pc;    /* reset back to start each time */
2560             result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2561             if (result) {
2562                 anchor = TRUE;
2563                 x = result;
2564                 pc = nextpc;    /* accept skip to next opcode */
2565                 op = (REOp) *pc++;
2566                 assert(op < REOP_LIMIT);
2567                 break;
2568             }
2569             gData->skipped++;
2570             x->cp++;
2571         }
2572         if (!anchor)
2573             goto bad;
2574     }
2575
2576     for (;;) {
2577         const char *opname = reop_names[op];
2578         TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2579               (int)gData->stateStackTop * 2, "", opname);
2580
2581         if (REOP_IS_SIMPLE(op)) {
2582             result = SimpleMatch(gData, x, op, &pc, TRUE);
2583         } else {
2584             curState = &gData->stateStack[gData->stateStackTop];
2585             switch (op) {
2586               case REOP_END:
2587                 goto good;
2588               case REOP_ALTPREREQ2:
2589                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2590                 pc += ARG_LEN;
2591                 matchCh2 = GET_ARG(pc);
2592                 pc += ARG_LEN;
2593                 k = GET_ARG(pc);
2594                 pc += ARG_LEN;
2595
2596                 if (x->cp != gData->cpend) {
2597                     if (*x->cp == matchCh2)
2598                         goto doAlt;
2599
2600                     charSet = &gData->regexp->classList[k];
2601                     if (!charSet->converted && !ProcessCharSet(gData, charSet))
2602                         goto bad;
2603                     matchCh1 = *x->cp;
2604                     k = matchCh1 >> 3;
2605                     if ((charSet->length == 0 ||
2606                          matchCh1 > charSet->length ||
2607                          !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2608                         charSet->sense) {
2609                         goto doAlt;
2610                     }
2611                 }
2612                 result = NULL;
2613                 break;
2614
2615               case REOP_ALTPREREQ:
2616                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2617                 pc += ARG_LEN;
2618                 matchCh1 = GET_ARG(pc);
2619                 pc += ARG_LEN;
2620                 matchCh2 = GET_ARG(pc);
2621                 pc += ARG_LEN;
2622                 if (x->cp == gData->cpend ||
2623                     (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2624                     result = NULL;
2625                     break;
2626                 }
2627                 /* else fall through... */
2628
2629               case REOP_ALT:
2630               doAlt:
2631                 nextpc = pc + GET_OFFSET(pc);   /* start of next alternate */
2632                 pc += ARG_LEN;                  /* start of this alternate */
2633                 curState->parenSoFar = parenSoFar;
2634                 PUSH_STATE_STACK(gData);
2635                 op = (REOp) *pc++;
2636                 startcp = x->cp;
2637                 if (REOP_IS_SIMPLE(op)) {
2638                     if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2639                         op = (REOp) *nextpc++;
2640                         pc = nextpc;
2641                         continue;
2642                     }
2643                     result = x;
2644                     op = (REOp) *pc++;
2645                 }
2646                 nextop = (REOp) *nextpc++;
2647                 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2648                     goto bad;
2649                 continue;
2650
2651               /*
2652                * Occurs at (successful) end of REOP_ALT,
2653                */
2654               case REOP_JUMP:
2655                 /*
2656                  * If we have not gotten a result here, it is because of an
2657                  * empty match.  Do the same thing REOP_EMPTY would do.
2658                  */
2659                 if (!result)
2660                     result = x;
2661
2662                 --gData->stateStackTop;
2663                 pc += GET_OFFSET(pc);
2664                 op = (REOp) *pc++;
2665                 continue;
2666
2667               /*
2668                * Occurs at last (successful) end of REOP_ALT,
2669                */
2670               case REOP_ENDALT:
2671                 /*
2672                  * If we have not gotten a result here, it is because of an
2673                  * empty match.  Do the same thing REOP_EMPTY would do.
2674                  */
2675                 if (!result)
2676                     result = x;
2677
2678                 --gData->stateStackTop;
2679                 op = (REOp) *pc++;
2680                 continue;
2681
2682               case REOP_LPAREN:
2683                 pc = ReadCompactIndex(pc, &parenIndex);
2684                 TRACE("[ %Iu ]\n", (ULONG_PTR)parenIndex);
2685                 assert(parenIndex < gData->regexp->parenCount);
2686                 if (parenIndex + 1 > parenSoFar)
2687                     parenSoFar = parenIndex + 1;
2688                 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2689                 x->parens[parenIndex].length = 0;
2690                 op = (REOp) *pc++;
2691                 continue;
2692
2693               case REOP_RPAREN:
2694               {
2695                 ptrdiff_t delta;
2696
2697                 pc = ReadCompactIndex(pc, &parenIndex);
2698                 assert(parenIndex < gData->regexp->parenCount);
2699                 cap = &x->parens[parenIndex];
2700                 delta = x->cp - (gData->cpbegin + cap->index);
2701                 cap->length = (delta < 0) ? 0 : (size_t) delta;
2702                 op = (REOp) *pc++;
2703
2704                 if (!result)
2705                     result = x;
2706                 continue;
2707               }
2708               case REOP_ASSERT:
2709                 nextpc = pc + GET_OFFSET(pc);  /* start of term after ASSERT */
2710                 pc += ARG_LEN;                 /* start of ASSERT child */
2711                 op = (REOp) *pc++;
2712                 testpc = pc;
2713                 if (REOP_IS_SIMPLE(op) &&
2714                     !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2715                     result = NULL;
2716                     break;
2717                 }
2718                 curState->u.assertion.top =
2719                     (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2720                 curState->u.assertion.sz = gData->cursz;
2721                 curState->index = x->cp - gData->cpbegin;
2722                 curState->parenSoFar = parenSoFar;
2723                 PUSH_STATE_STACK(gData);
2724                 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2725                                         nextpc, x, x->cp, 0, 0)) {
2726                     goto bad;
2727                 }
2728                 continue;
2729
2730               case REOP_ASSERT_NOT:
2731                 nextpc = pc + GET_OFFSET(pc);
2732                 pc += ARG_LEN;
2733                 op = (REOp) *pc++;
2734                 testpc = pc;
2735                 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2736                     SimpleMatch(gData, x, op, &testpc, FALSE) &&
2737                     *testpc == REOP_ASSERTNOTTEST) {
2738                     result = NULL;
2739                     break;
2740                 }
2741                 curState->u.assertion.top
2742                     = (char *)gData->backTrackSP -
2743                       (char *)gData->backTrackStack;
2744                 curState->u.assertion.sz = gData->cursz;
2745                 curState->index = x->cp - gData->cpbegin;
2746                 curState->parenSoFar = parenSoFar;
2747                 PUSH_STATE_STACK(gData);
2748                 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2749                                         nextpc, x, x->cp, 0, 0)) {
2750                     goto bad;
2751                 }
2752                 continue;
2753
2754               case REOP_ASSERTTEST:
2755                 --gData->stateStackTop;
2756                 --curState;
2757                 x->cp = gData->cpbegin + curState->index;
2758                 gData->backTrackSP =
2759                     (REBackTrackData *) ((char *)gData->backTrackStack +
2760                                          curState->u.assertion.top);
2761                 gData->cursz = curState->u.assertion.sz;
2762                 if (result)
2763                     result = x;
2764                 break;
2765
2766               case REOP_ASSERTNOTTEST:
2767                 --gData->stateStackTop;
2768                 --curState;
2769                 x->cp = gData->cpbegin + curState->index;
2770                 gData->backTrackSP =
2771                     (REBackTrackData *) ((char *)gData->backTrackStack +
2772                                          curState->u.assertion.top);
2773                 gData->cursz = curState->u.assertion.sz;
2774                 result = (!result) ? x : NULL;
2775                 break;
2776               case REOP_STAR:
2777                 curState->u.quantifier.min = 0;
2778                 curState->u.quantifier.max = (UINT)-1;
2779                 goto quantcommon;
2780               case REOP_PLUS:
2781                 curState->u.quantifier.min = 1;
2782                 curState->u.quantifier.max = (UINT)-1;
2783                 goto quantcommon;
2784               case REOP_OPT:
2785                 curState->u.quantifier.min = 0;
2786                 curState->u.quantifier.max = 1;
2787                 goto quantcommon;
2788               case REOP_QUANT:
2789                 pc = ReadCompactIndex(pc, &k);
2790                 curState->u.quantifier.min = k;
2791                 pc = ReadCompactIndex(pc, &k);
2792                 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2793                 curState->u.quantifier.max = k - 1;
2794                 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2795               quantcommon:
2796                 if (curState->u.quantifier.max == 0) {
2797                     pc = pc + GET_OFFSET(pc);
2798                     op = (REOp) *pc++;
2799                     result = x;
2800                     continue;
2801                 }
2802                 /* Step over <next> */
2803                 nextpc = pc + ARG_LEN;
2804                 op = (REOp) *nextpc++;
2805                 startcp = x->cp;
2806                 if (REOP_IS_SIMPLE(op)) {
2807                     if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2808                         if (curState->u.quantifier.min == 0)
2809                             result = x;
2810                         else
2811                             result = NULL;
2812                         pc = pc + GET_OFFSET(pc);
2813                         break;
2814                     }
2815                     op = (REOp) *nextpc++;
2816                     result = x;
2817                 }
2818                 curState->index = startcp - gData->cpbegin;
2819                 curState->continue_op = REOP_REPEAT;
2820                 curState->continue_pc = pc;
2821                 curState->parenSoFar = parenSoFar;
2822                 PUSH_STATE_STACK(gData);
2823                 if (curState->u.quantifier.min == 0 &&
2824                     !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2825                                         0, 0)) {
2826                     goto bad;
2827                 }
2828                 pc = nextpc;
2829                 continue;
2830
2831               case REOP_ENDCHILD: /* marks the end of a quantifier child */
2832                 pc = curState[-1].continue_pc;
2833                 op = (REOp) curState[-1].continue_op;
2834
2835                 if (!result)
2836                     result = x;
2837                 continue;
2838
2839               case REOP_REPEAT:
2840                 --curState;
2841                 do {
2842                     --gData->stateStackTop;
2843                     if (!result) {
2844                         /* Failed, see if we have enough children. */
2845                         if (curState->u.quantifier.min == 0)
2846                             goto repeatDone;
2847                         goto break_switch;
2848                     }
2849                     if (curState->u.quantifier.min == 0 &&
2850                         x->cp == gData->cpbegin + curState->index) {
2851                         /* matched an empty string, that'll get us nowhere */
2852                         result = NULL;
2853                         goto break_switch;
2854                     }
2855                     if (curState->u.quantifier.min != 0)
2856                         curState->u.quantifier.min--;
2857                     if (curState->u.quantifier.max != (UINT) -1)
2858                         curState->u.quantifier.max--;
2859                     if (curState->u.quantifier.max == 0)
2860                         goto repeatDone;
2861                     nextpc = pc + ARG_LEN;
2862                     nextop = (REOp) *nextpc;
2863                     startcp = x->cp;
2864                     if (REOP_IS_SIMPLE(nextop)) {
2865                         nextpc++;
2866                         if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2867                             if (curState->u.quantifier.min == 0)
2868                                 goto repeatDone;
2869                             result = NULL;
2870                             goto break_switch;
2871                         }
2872                         result = x;
2873                     }
2874                     curState->index = startcp - gData->cpbegin;
2875                     PUSH_STATE_STACK(gData);
2876                     if (curState->u.quantifier.min == 0 &&
2877                         !PushBackTrackState(gData, REOP_REPEAT,
2878                                             pc, x, startcp,
2879                                             curState->parenSoFar,
2880                                             parenSoFar -
2881                                             curState->parenSoFar)) {
2882                         goto bad;
2883                     }
2884                 } while (*nextpc == REOP_ENDCHILD);
2885                 pc = nextpc;
2886                 op = (REOp) *pc++;
2887                 parenSoFar = curState->parenSoFar;
2888                 continue;
2889
2890               repeatDone:
2891                 result = x;
2892                 pc += GET_OFFSET(pc);
2893                 goto break_switch;
2894
2895               case REOP_MINIMALSTAR:
2896                 curState->u.quantifier.min = 0;
2897                 curState->u.quantifier.max = (UINT)-1;
2898                 goto minimalquantcommon;
2899               case REOP_MINIMALPLUS:
2900                 curState->u.quantifier.min = 1;
2901                 curState->u.quantifier.max = (UINT)-1;
2902                 goto minimalquantcommon;
2903               case REOP_MINIMALOPT:
2904                 curState->u.quantifier.min = 0;
2905                 curState->u.quantifier.max = 1;
2906                 goto minimalquantcommon;
2907               case REOP_MINIMALQUANT:
2908                 pc = ReadCompactIndex(pc, &k);
2909                 curState->u.quantifier.min = k;
2910                 pc = ReadCompactIndex(pc, &k);
2911                 /* See REOP_QUANT comments about k - 1. */
2912                 curState->u.quantifier.max = k - 1;
2913                 assert(curState->u.quantifier.min
2914                           <= curState->u.quantifier.max);
2915               minimalquantcommon:
2916                 curState->index = x->cp - gData->cpbegin;
2917                 curState->parenSoFar = parenSoFar;
2918                 PUSH_STATE_STACK(gData);
2919                 if (curState->u.quantifier.min != 0) {
2920                     curState->continue_op = REOP_MINIMALREPEAT;
2921                     curState->continue_pc = pc;
2922                     /* step over <next> */
2923                     pc += OFFSET_LEN;
2924                     op = (REOp) *pc++;
2925                 } else {
2926                     if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2927                                             pc, x, x->cp, 0, 0)) {
2928                         goto bad;
2929                     }
2930                     --gData->stateStackTop;
2931                     pc = pc + GET_OFFSET(pc);
2932                     op = (REOp) *pc++;
2933                 }
2934                 continue;
2935
2936               case REOP_MINIMALREPEAT:
2937                 --gData->stateStackTop;
2938                 --curState;
2939
2940                 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2941 #define PREPARE_REPEAT()                                                      \
2942     do {                                                                      \
2943         curState->index = x->cp - gData->cpbegin;                          \
2944         curState->continue_op = REOP_MINIMALREPEAT;                           \
2945         curState->continue_pc = pc;                                           \
2946         pc += ARG_LEN;                                                        \
2947         for (k = curState->parenSoFar; k < parenSoFar; k++)                   \
2948             x->parens[k].index = -1;                                          \
2949         PUSH_STATE_STACK(gData);                                              \
2950         op = (REOp) *pc++;                                                    \
2951         assert(op < REOP_LIMIT);                                              \
2952     }while(0)
2953
2954                 if (!result) {
2955                     TRACE(" -\n");
2956                     /*
2957                      * Non-greedy failure - try to consume another child.
2958                      */
2959                     if (curState->u.quantifier.max == (UINT) -1 ||
2960                         curState->u.quantifier.max > 0) {
2961                         PREPARE_REPEAT();
2962                         continue;
2963                     }
2964                     /* Don't need to adjust pc since we're going to pop. */
2965                     break;
2966                 }
2967                 if (curState->u.quantifier.min == 0 &&
2968                     x->cp == gData->cpbegin + curState->index) {
2969                     /* Matched an empty string, that'll get us nowhere. */
2970                     result = NULL;
2971                     break;
2972                 }
2973                 if (curState->u.quantifier.min != 0)
2974                     curState->u.quantifier.min--;
2975                 if (curState->u.quantifier.max != (UINT) -1)
2976                     curState->u.quantifier.max--;
2977                 if (curState->u.quantifier.min != 0) {
2978                     PREPARE_REPEAT();
2979                     continue;
2980                 }
2981                 curState->index = x->cp - gData->cpbegin;
2982                 curState->parenSoFar = parenSoFar;
2983                 PUSH_STATE_STACK(gData);
2984                 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2985                                         pc, x, x->cp,
2986                                         curState->parenSoFar,
2987                                         parenSoFar - curState->parenSoFar)) {
2988                     goto bad;
2989                 }
2990                 --gData->stateStackTop;
2991                 pc = pc + GET_OFFSET(pc);
2992                 op = (REOp) *pc++;
2993                 assert(op < REOP_LIMIT);
2994                 continue;
2995               default:
2996                 assert(FALSE);
2997                 result = NULL;
2998             }
2999           break_switch:;
3000         }
3001
3002         /*
3003          *  If the match failed and there's a backtrack option, take it.
3004          *  Otherwise this is a complete and utter failure.
3005          */
3006         if (!result) {
3007             if (gData->cursz == 0)
3008                 return NULL;
3009
3010             /* Potentially detect explosive regex here. */
3011             gData->backTrackCount++;
3012             if (gData->backTrackLimit &&
3013                 gData->backTrackCount >= gData->backTrackLimit) {
3014                 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3015                                      JSMSG_REGEXP_TOO_COMPLEX);
3016                 gData->ok = FALSE;
3017                 return NULL;
3018             }
3019
3020             backTrackData = gData->backTrackSP;
3021             gData->cursz = backTrackData->sz;
3022             gData->backTrackSP =
3023                 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3024             x->cp = backTrackData->cp;
3025             pc = backTrackData->backtrack_pc;
3026             op = (REOp) backTrackData->backtrack_op;
3027             assert(op < REOP_LIMIT);
3028             gData->stateStackTop = backTrackData->saveStateStackTop;
3029             assert(gData->stateStackTop);
3030
3031             memcpy(gData->stateStack, backTrackData + 1,
3032                    sizeof(REProgState) * backTrackData->saveStateStackTop);
3033             curState = &gData->stateStack[gData->stateStackTop - 1];
3034
3035             if (backTrackData->parenCount) {
3036                 memcpy(&x->parens[backTrackData->parenIndex],
3037                        (char *)(backTrackData + 1) +
3038                        sizeof(REProgState) * backTrackData->saveStateStackTop,
3039                        sizeof(RECapture) * backTrackData->parenCount);
3040                 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3041             } else {
3042                 for (k = curState->parenSoFar; k < parenSoFar; k++)
3043                     x->parens[k].index = -1;
3044                 parenSoFar = curState->parenSoFar;
3045             }
3046
3047             TRACE("\tBT_Pop: %Id,%Id\n",
3048                      (ULONG_PTR)backTrackData->parenIndex,
3049                      (ULONG_PTR)backTrackData->parenCount);
3050             continue;
3051         }
3052         x = result;
3053
3054         /*
3055          *  Continue with the expression.
3056          */
3057         op = (REOp)*pc++;
3058         assert(op < REOP_LIMIT);
3059     }
3060
3061 bad:
3062     TRACE("\n");
3063     return NULL;
3064
3065 good:
3066     TRACE("\n");
3067     return x;
3068 }
3069
3070 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3071 {
3072     match_state_t *result;
3073     const WCHAR *cp = x->cp;
3074     const WCHAR *cp2;
3075     UINT j;
3076
3077     /*
3078      * Have to include the position beyond the last character
3079      * in order to detect end-of-input/line condition.
3080      */
3081     for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3082         gData->skipped = cp2 - cp;
3083         x->cp = cp2;
3084         for (j = 0; j < gData->regexp->parenCount; j++)
3085             x->parens[j].index = -1;
3086         result = ExecuteREBytecode(gData, x);
3087         if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3088             return result;
3089         gData->backTrackSP = gData->backTrackStack;
3090         gData->cursz = 0;
3091         gData->stateStackTop = 0;
3092         cp2 = cp + gData->skipped;
3093     }
3094     return NULL;
3095 }
3096
3097 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3098 {
3099     UINT i;
3100
3101     gData->backTrackStackSize = INITIAL_BACKTRACK;
3102     gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3103     if (!gData->backTrackStack)
3104         goto bad;
3105
3106     gData->backTrackSP = gData->backTrackStack;
3107     gData->cursz = 0;
3108     gData->backTrackCount = 0;
3109     gData->backTrackLimit = 0;
3110
3111     gData->stateStackLimit = INITIAL_STATESTACK;
3112     gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3113     if (!gData->stateStack)
3114         goto bad;
3115
3116     gData->stateStackTop = 0;
3117     gData->cx = cx;
3118     gData->pool = pool;
3119     gData->regexp = re;
3120     gData->ok = TRUE;
3121
3122     for (i = 0; i < re->classCount; i++) {
3123         if (!re->classList[i].converted &&
3124                 !ProcessCharSet(gData, &re->classList[i])) {
3125             return E_FAIL;
3126         }
3127     }
3128
3129     return S_OK;
3130
3131 bad:
3132     gData->ok = FALSE;
3133     return E_OUTOFMEMORY;
3134 }
3135
3136 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3137         const WCHAR *str, DWORD str_len, match_state_t *result)
3138 {
3139     match_state_t *res;
3140     REGlobalData gData;
3141     heap_pool_t *mark = heap_pool_mark(pool);
3142     const WCHAR *str_beg = result->cp;
3143     HRESULT hres;
3144
3145     assert(result->cp != NULL);
3146
3147     gData.cpbegin = str;
3148     gData.cpend = str+str_len;
3149     gData.start = result->cp-str;
3150     gData.skipped = 0;
3151     gData.pool = pool;
3152
3153     hres = InitMatch(regexp, cx, pool, &gData);
3154     if(FAILED(hres)) {
3155         WARN("InitMatch failed\n");
3156         heap_pool_clear(mark);
3157         return hres;
3158     }
3159
3160     res = MatchRegExp(&gData, result);
3161     heap_pool_clear(mark);
3162     if(!gData.ok) {
3163         WARN("MatchRegExp failed\n");
3164         return E_FAIL;
3165     }
3166
3167     if(!res) {
3168         result->match_len = 0;
3169         return S_FALSE;
3170     }
3171
3172     result->match_len = (result->cp-str_beg) - gData.skipped;
3173     result->paren_count = regexp->parenCount;
3174     return S_OK;
3175 }
3176
3177 void regexp_destroy(regexp_t *re)
3178 {
3179     if (re->classList) {
3180         UINT i;
3181         for (i = 0; i < re->classCount; i++) {
3182             if (re->classList[i].converted)
3183                 free(re->classList[i].u.bits);
3184             re->classList[i].u.bits = NULL;
3185         }
3186         free(re->classList);
3187     }
3188     free(re);
3189 }
3190
3191 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3192         DWORD str_len, WORD flags, BOOL flat)
3193 {
3194     regexp_t *re;
3195     heap_pool_t *mark;
3196     CompilerState state;
3197     size_t resize;
3198     jsbytecode *endPC;
3199     UINT i;
3200     size_t len;
3201
3202     re = NULL;
3203     mark = heap_pool_mark(pool);
3204     len = str_len;
3205
3206     state.context = cx;
3207     state.pool = pool;
3208     state.cp = str;
3209     if (!state.cp)
3210         goto out;
3211     state.cpbegin = state.cp;
3212     state.cpend = state.cp + len;
3213     state.flags = flags;
3214     state.parenCount = 0;
3215     state.classCount = 0;
3216     state.progLength = 0;
3217     state.treeDepth = 0;
3218     state.classBitmapsMem = 0;
3219     for (i = 0; i < CLASS_CACHE_SIZE; i++)
3220         state.classCache[i].start = NULL;
3221
3222     if (len != 0 && flat) {
3223         state.result = NewRENode(&state, REOP_FLAT);
3224         if (!state.result)
3225             goto out;
3226         state.result->u.flat.chr = *state.cpbegin;
3227         state.result->u.flat.length = len;
3228         state.result->kid = (void *) state.cpbegin;
3229         /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3230         state.progLength += 1 + GetCompactIndexWidth(0)
3231                           + GetCompactIndexWidth(len);
3232     } else {
3233         if (!ParseRegExp(&state))
3234             goto out;
3235     }
3236     resize = offsetof(regexp_t, program) + state.progLength + 1;
3237     re = malloc(resize);
3238     if (!re)
3239         goto out;
3240
3241     assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3242     re->classCount = state.classCount;
3243     if (re->classCount) {
3244         re->classList = malloc(re->classCount * sizeof(RECharSet));
3245         if (!re->classList) {
3246             regexp_destroy(re);
3247             re = NULL;
3248             goto out;
3249         }
3250         for (i = 0; i < re->classCount; i++)
3251             re->classList[i].converted = FALSE;
3252     } else {
3253         re->classList = NULL;
3254     }
3255     endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3256     if (!endPC) {
3257         regexp_destroy(re);
3258         re = NULL;
3259         goto out;
3260     }
3261     *endPC++ = REOP_END;
3262     /*
3263      * Check whether size was overestimated and shrink using realloc.
3264      * This is safe since no pointers to newly parsed regexp or its parts
3265      * besides re exist here.
3266      */
3267     if ((size_t)(endPC - re->program) != state.progLength + 1) {
3268         regexp_t *tmp;
3269         assert((size_t)(endPC - re->program) < state.progLength + 1);
3270         resize = offsetof(regexp_t, program) + (endPC - re->program);
3271         tmp = realloc(re, resize);
3272         if (tmp)
3273             re = tmp;
3274     }
3275
3276     re->flags = flags;
3277     re->parenCount = state.parenCount;
3278     re->source = str;
3279     re->source_len = str_len;
3280
3281 out:
3282     heap_pool_clear(mark);
3283     return re;
3284 }