dlls/jscript/regexp.c

   1 /*
   2  * Copyright 2008 Jacek Caban for CodeWeavers
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  17  */
  18
  19 /*
  20  * Code in this file is based on files:
  21  * js/src/jsregexp.h
  22  * js/src/jsregexp.c
  23  * from Mozilla project, released under LGPL 2.1 or later.
  24  *
  25  * The Original Code is Mozilla Communicator client code, released
  26  * March 31, 1998.
  27  *
  28  * The Initial Developer of the Original Code is
  29  * Netscape Communications Corporation.
  30  * Portions created by the Initial Developer are Copyright (C) 1998
  31  * the Initial Developer. All Rights Reserved.
  32  */
  33
  34 #include <assert.h>
  35
  36 #include "jscript.h"
  37 #include "regexp.h"
  38
  39 #include "wine/debug.h"
  40
  41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
  42
  43 /* FIXME: Better error handling */
  44 #define ReportRegExpError(a,b,c)
  45 #define ReportRegExpErrorHelper(a,b,c,d)
  46 #define JS_ReportErrorNumber(a,b,c,d)
  47 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
  48 #define js_ReportOutOfScriptQuota(a)
  49 #define JS_ReportOutOfMemory(a)
  50 #define JS_COUNT_OPERATION(a,b)
  51
  52
  53 typedef BYTE JSPackedBool;
  54
  55 /*
  56  * This struct holds a bitmap representation of a class from a regexp.
  57  * There's a list of these referenced by the classList field in the regexp_t
  58  * struct below. The initial state has startIndex set to the offset in the
  59  * original regexp source of the beginning of the class contents. The first
  60  * use of the class converts the source representation into a bitmap.
  61  *
  62  */
  63 typedef struct RECharSet {
  64     JSPackedBool    converted;
  65     JSPackedBool    sense;
  66     WORD            length;
  67     union {
  68         BYTE        *bits;
  69         struct {
  70             size_t  startIndex;
  71             size_t  length;
  72         } src;
  73     } u;
  74 } RECharSet;
  75
  76 #define JSMSG_MIN_TOO_BIG 47
  77 #define JSMSG_MAX_TOO_BIG 48
  78 #define JSMSG_OUT_OF_ORDER 49
  79 #define JSMSG_OUT_OF_MEMORY 137
  80
  81 #define LINE_SEPARATOR  0x2028
  82 #define PARA_SEPARATOR  0x2029
  83
  84 #define RE_IS_LETTER(c)     (((c >= 'A') && (c <= 'Z')) ||                    \
  85                              ((c >= 'a') && (c <= 'z')) )
  86 #define RE_IS_LINE_TERM(c)  ((c == '\n') || (c == '\r') ||                    \
  87                              (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
  88
  89 #define JS_ISWORD(c)    ((c) < 128 && (isalnum(c) || (c) == '_'))
  90
  91 #define JS7_ISDEC(c)    ((((unsigned)(c)) - '0') <= 9)
  92 #define JS7_UNDEC(c)    ((c) - '0')
  93
  94 typedef enum REOp {
  95     REOP_EMPTY,
  96     REOP_BOL,
  97     REOP_EOL,
  98     REOP_WBDRY,
  99     REOP_WNONBDRY,
 100     REOP_DOT,
 101     REOP_DIGIT,
 102     REOP_NONDIGIT,
 103     REOP_ALNUM,
 104     REOP_NONALNUM,
 105     REOP_SPACE,
 106     REOP_NONSPACE,
 107     REOP_BACKREF,
 108     REOP_FLAT,
 109     REOP_FLAT1,
 110     REOP_FLATi,
 111     REOP_FLAT1i,
 112     REOP_UCFLAT1,
 113     REOP_UCFLAT1i,
 114     REOP_UCFLAT,
 115     REOP_UCFLATi,
 116     REOP_CLASS,
 117     REOP_NCLASS,
 118     REOP_ALT,
 119     REOP_QUANT,
 120     REOP_STAR,
 121     REOP_PLUS,
 122     REOP_OPT,
 123     REOP_LPAREN,
 124     REOP_RPAREN,
 125     REOP_JUMP,
 126     REOP_DOTSTAR,
 127     REOP_LPARENNON,
 128     REOP_ASSERT,
 129     REOP_ASSERT_NOT,
 130     REOP_ASSERTTEST,
 131     REOP_ASSERTNOTTEST,
 132     REOP_MINIMALSTAR,
 133     REOP_MINIMALPLUS,
 134     REOP_MINIMALOPT,
 135     REOP_MINIMALQUANT,
 136     REOP_ENDCHILD,
 137     REOP_REPEAT,
 138     REOP_MINIMALREPEAT,
 139     REOP_ALTPREREQ,
 140     REOP_ALTPREREQ2,
 141     REOP_ENDALT,
 142     REOP_CONCAT,
 143     REOP_END,
 144     REOP_LIMIT /* META: no operator >= to this */
 145 } REOp;
 146
 147 #define REOP_IS_SIMPLE(op)  ((op) <= REOP_NCLASS)
 148
 149 static const char *reop_names[] = {
 150     "empty",
 151     "bol",
 152     "eol",
 153     "wbdry",
 154     "wnonbdry",
 155     "dot",
 156     "digit",
 157     "nondigit",
 158     "alnum",
 159     "nonalnum",
 160     "space",
 161     "nonspace",
 162     "backref",
 163     "flat",
 164     "flat1",
 165     "flati",
 166     "flat1i",
 167     "ucflat1",
 168     "ucflat1i",
 169     "ucflat",
 170     "ucflati",
 171     "class",
 172     "nclass",
 173     "alt",
 174     "quant",
 175     "star",
 176     "plus",
 177     "opt",
 178     "lparen",
 179     "rparen",
 180     "jump",
 181     "dotstar",
 182     "lparennon",
 183     "assert",
 184     "assert_not",
 185     "asserttest",
 186     "assertnottest",
 187     "minimalstar",
 188     "minimalplus",
 189     "minimalopt",
 190     "minimalquant",
 191     "endchild",
 192     "repeat",
 193     "minimalrepeat",
 194     "altprereq",
 195     "alrprereq2",
 196     "endalt",
 197     "concat",
 198     "end",
 199     NULL
 200 };
 201
 202 typedef struct REProgState {
 203     jsbytecode *continue_pc;        /* current continuation data */
 204     jsbytecode continue_op;
 205     ptrdiff_t index;                /* progress in text */
 206     size_t parenSoFar;              /* highest indexed paren started */
 207     union {
 208         struct {
 209             UINT min;               /* current quantifier limits */
 210             UINT max;
 211         } quantifier;
 212         struct {
 213             size_t top;             /* backtrack stack state */
 214             size_t sz;
 215         } assertion;
 216     } u;
 217 } REProgState;
 218
 219 typedef struct REBackTrackData {
 220     size_t sz;                      /* size of previous stack entry */
 221     jsbytecode *backtrack_pc;       /* where to backtrack to */
 222     jsbytecode backtrack_op;
 223     const WCHAR *cp;                /* index in text of match at backtrack */
 224     size_t parenIndex;              /* start index of saved paren contents */
 225     size_t parenCount;              /* # of saved paren contents */
 226     size_t saveStateStackTop;       /* number of parent states */
 227     /* saved parent states follow */
 228     /* saved paren contents follow */
 229 } REBackTrackData;
 230
 231 #define INITIAL_STATESTACK  100
 232 #define INITIAL_BACKTRACK   8000
 233
 234 typedef struct REGlobalData {
 235     void *cx;
 236     regexp_t *regexp;               /* the RE in execution */
 237     BOOL ok;                        /* runtime error (out_of_memory only?) */
 238     size_t start;                   /* offset to start at */
 239     ptrdiff_t skipped;              /* chars skipped anchoring this r.e. */
 240     const WCHAR    *cpbegin;        /* text base address */
 241     const WCHAR    *cpend;          /* text limit address */
 242
 243     REProgState *stateStack;        /* stack of state of current parents */
 244     size_t stateStackTop;
 245     size_t stateStackLimit;
 246
 247     REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
 248     REBackTrackData *backTrackSP;
 249     size_t backTrackStackSize;
 250     size_t cursz;                   /* size of current stack entry */
 251     size_t backTrackCount;          /* how many times we've backtracked */
 252     size_t backTrackLimit;          /* upper limit on backtrack states */
 253
 254     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 255                                        than to malloc/free the three items
 256                                        that are allocated from this pool */
 257 } REGlobalData;
 258
 259 typedef struct RENode RENode;
 260 struct RENode {
 261     REOp            op;         /* r.e. op bytecode */
 262     RENode          *next;      /* next in concatenation order */
 263     void            *kid;       /* first operand */
 264     union {
 265         void        *kid2;      /* second operand */
 266         INT         num;        /* could be a number */
 267         size_t      parenIndex; /* or a parenthesis index */
 268         struct {                /* or a quantifier range */
 269             UINT  min;
 270             UINT  max;
 271             JSPackedBool greedy;
 272         } range;
 273         struct {                /* or a character class */
 274             size_t  startIndex;
 275             size_t  kidlen;     /* length of string at kid, in jschars */
 276             size_t  index;      /* index into class list */
 277             WORD  bmsize;       /* bitmap size, based on max char code */
 278             JSPackedBool sense;
 279         } ucclass;
 280         struct {                /* or a literal sequence */
 281             WCHAR   chr;        /* of one character */
 282             size_t  length;     /* or many (via the kid) */
 283         } flat;
 284         struct {
 285             RENode  *kid2;      /* second operand from ALT */
 286             WCHAR   ch1;        /* match char for ALTPREREQ */
 287             WCHAR   ch2;        /* ditto, or class index for ALTPREREQ2 */
 288         } altprereq;
 289     } u;
 290 };
 291
 292 #define CLASS_CACHE_SIZE    4
 293
 294 typedef struct CompilerState {
 295     void            *context;
 296     const WCHAR     *cpbegin;
 297     const WCHAR     *cpend;
 298     const WCHAR     *cp;
 299     size_t          parenCount;
 300     size_t          classCount;   /* number of [] encountered */
 301     size_t          treeDepth;    /* maximum depth of parse tree */
 302     size_t          progLength;   /* estimated bytecode length */
 303     RENode          *result;
 304     size_t          classBitmapsMem; /* memory to hold all class bitmaps */
 305     struct {
 306         const WCHAR *start;         /* small cache of class strings */
 307         size_t length;              /* since they're often the same */
 308         size_t index;
 309     } classCache[CLASS_CACHE_SIZE];
 310     WORD          flags;
 311
 312     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 313                                        than to malloc/free */
 314 } CompilerState;
 315
 316 typedef struct EmitStateStackEntry {
 317     jsbytecode      *altHead;       /* start of REOP_ALT* opcode */
 318     jsbytecode      *nextAltFixup;  /* fixup pointer to next-alt offset */
 319     jsbytecode      *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
 320     jsbytecode      *endTermFixup;  /* fixup ptr. to REOPT_ALTPREREQ* offset */
 321     RENode          *continueNode;  /* original REOP_ALT* node being stacked */
 322     jsbytecode      continueOp;     /* REOP_JUMP or REOP_ENDALT continuation */
 323     JSPackedBool    jumpToJumpFlag; /* true if we've patched jump-to-jump to
 324                                        avoid 16-bit unsigned offset overflow */
 325 } EmitStateStackEntry;
 326
 327 /*
 328  * Immediate operand sizes and getter/setters.  Unlike the ones in jsopcode.h,
 329  * the getters and setters take the pc of the offset, not of the opcode before
 330  * the offset.
 331  */
 332 #define ARG_LEN             2
 333 #define GET_ARG(pc)         ((WORD)(((pc)[0] << 8) | (pc)[1]))
 334 #define SET_ARG(pc, arg)    ((pc)[0] = (jsbytecode) ((arg) >> 8),       \
 335                              (pc)[1] = (jsbytecode) (arg))
 336
 337 #define OFFSET_LEN          ARG_LEN
 338 #define OFFSET_MAX          ((1 << (ARG_LEN * 8)) - 1)
 339 #define GET_OFFSET(pc)      GET_ARG(pc)
 340
 341 static BOOL ParseRegExp(CompilerState*);
 342
 343 /*
 344  * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
 345  * For sanity, we limit it to 2^24 bytes.
 346  */
 347 #define TREE_DEPTH_MAX  ((1 << 24) / sizeof(EmitStateStackEntry))
 348
 349 /*
 350  * The maximum memory that can be allocated for class bitmaps.
 351  * For sanity, we limit it to 2^24 bytes.
 352  */
 353 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
 354
 355 /*
 356  * Functions to get size and write/read bytecode that represent small indexes
 357  * compactly.
 358  * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
 359  * indicates that the following byte brings more bits to the index. Otherwise
 360  * this is the last byte in the index bytecode representing highest index bits.
 361  */
 362 static size_t
 363 GetCompactIndexWidth(size_t index)
 364 {
 365     size_t width;
 366
 367     for (width = 1; (index >>= 7) != 0; ++width) { }
 368     return width;
 369 }
 370
 371 static inline jsbytecode *
 372 WriteCompactIndex(jsbytecode *pc, size_t index)
 373 {
 374     size_t next;
 375
 376     while ((next = index >> 7) != 0) {
 377         *pc++ = (jsbytecode)(index | 0x80);
 378         index = next;
 379     }
 380     *pc++ = (jsbytecode)index;
 381     return pc;
 382 }
 383
 384 static inline jsbytecode *
 385 ReadCompactIndex(jsbytecode *pc, size_t *result)
 386 {
 387     size_t nextByte;
 388
 389     nextByte = *pc++;
 390     if ((nextByte & 0x80) == 0) {
 391         /*
 392          * Short-circuit the most common case when compact index <= 127.
 393          */
 394         *result = nextByte;
 395     } else {
 396         size_t shift = 7;
 397         *result = 0x7F & nextByte;
 398         do {
 399             nextByte = *pc++;
 400             *result |= (nextByte & 0x7F) << shift;
 401             shift += 7;
 402         } while ((nextByte & 0x80) != 0);
 403     }
 404     return pc;
 405 }
 406
 407 /* Construct and initialize an RENode, returning NULL for out-of-memory */
 408 static RENode *
 409 NewRENode(CompilerState *state, REOp op)
 410 {
 411     RENode *ren;
 412
 413     ren = heap_pool_alloc(state->pool, sizeof(*ren));
 414     if (!ren) {
 415         /* js_ReportOutOfScriptQuota(cx); */
 416         return NULL;
 417     }
 418     ren->op = op;
 419     ren->next = NULL;
 420     ren->kid = NULL;
 421     return ren;
 422 }
 423
 424 /*
 425  * Validates and converts hex ascii value.
 426  */
 427 static BOOL
 428 isASCIIHexDigit(WCHAR c, UINT *digit)
 429 {
 430     UINT cv = c;
 431
 432     if (cv < '0')
 433         return FALSE;
 434     if (cv <= '9') {
 435         *digit = cv - '0';
 436         return TRUE;
 437     }
 438     cv |= 0x20;
 439     if (cv >= 'a' && cv <= 'f') {
 440         *digit = cv - 'a' + 10;
 441         return TRUE;
 442     }
 443     return FALSE;
 444 }
 445
 446 typedef struct {
 447     REOp op;
 448     const WCHAR *errPos;
 449     size_t parenIndex;
 450 } REOpData;
 451
 452 #define JUMP_OFFSET_HI(off)     ((jsbytecode)((off) >> 8))
 453 #define JUMP_OFFSET_LO(off)     ((jsbytecode)(off))
 454
 455 static BOOL
 456 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
 457 {
 458     ptrdiff_t offset = target - jump;
 459
 460     /* Check that target really points forward. */
 461     assert(offset >= 2);
 462     if ((size_t)offset > OFFSET_MAX)
 463         return FALSE;
 464
 465     jump[0] = JUMP_OFFSET_HI(offset);
 466     jump[1] = JUMP_OFFSET_LO(offset);
 467     return TRUE;
 468 }
 469
 470 /*
 471  * Generate bytecode for the tree rooted at t using an explicit stack instead
 472  * of recursion.
 473  */
 474 static jsbytecode *
 475 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
 476                jsbytecode *pc, RENode *t)
 477 {
 478     EmitStateStackEntry *emitStateSP, *emitStateStack;
 479     RECharSet *charSet;
 480     REOp op;
 481
 482     if (treeDepth == 0) {
 483         emitStateStack = NULL;
 484     } else {
 485         emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
 486         if (!emitStateStack)
 487             return NULL;
 488     }
 489     emitStateSP = emitStateStack;
 490     op = t->op;
 491     assert(op < REOP_LIMIT);
 492
 493     for (;;) {
 494         *pc++ = op;
 495         switch (op) {
 496           case REOP_EMPTY:
 497             --pc;
 498             break;
 499
 500           case REOP_ALTPREREQ2:
 501           case REOP_ALTPREREQ:
 502             assert(emitStateSP);
 503             emitStateSP->altHead = pc - 1;
 504             emitStateSP->endTermFixup = pc;
 505             pc += OFFSET_LEN;
 506             SET_ARG(pc, t->u.altprereq.ch1);
 507             pc += ARG_LEN;
 508             SET_ARG(pc, t->u.altprereq.ch2);
 509             pc += ARG_LEN;
 510
 511             emitStateSP->nextAltFixup = pc;    /* offset to next alternate */
 512             pc += OFFSET_LEN;
 513
 514             emitStateSP->continueNode = t;
 515             emitStateSP->continueOp = REOP_JUMP;
 516             emitStateSP->jumpToJumpFlag = FALSE;
 517             ++emitStateSP;
 518             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 519             t = t->kid;
 520             op = t->op;
 521             assert(op < REOP_LIMIT);
 522             continue;
 523
 524           case REOP_JUMP:
 525             emitStateSP->nextTermFixup = pc;    /* offset to following term */
 526             pc += OFFSET_LEN;
 527             if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
 528                 goto jump_too_big;
 529             emitStateSP->continueOp = REOP_ENDALT;
 530             ++emitStateSP;
 531             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 532             t = t->u.kid2;
 533             op = t->op;
 534             assert(op < REOP_LIMIT);
 535             continue;
 536
 537           case REOP_ENDALT:
 538             /*
 539              * If we already patched emitStateSP->nextTermFixup to jump to
 540              * a nearer jump, to avoid 16-bit immediate offset overflow, we
 541              * are done here.
 542              */
 543             if (emitStateSP->jumpToJumpFlag)
 544                 break;
 545
 546             /*
 547              * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
 548              * REOP_ENDALT is executed only on successful match of the last
 549              * alternate in a group.
 550              */
 551             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 552                 goto jump_too_big;
 553             if (t->op != REOP_ALT) {
 554                 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
 555                     goto jump_too_big;
 556             }
 557
 558             /*
 559              * If the program is bigger than the REOP_JUMP offset range, then
 560              * we must check for alternates before this one that are part of
 561              * the same group, and fix up their jump offsets to target jumps
 562              * close enough to fit in a 16-bit unsigned offset immediate.
 563              */
 564             if ((size_t)(pc - re->program) > OFFSET_MAX &&
 565                 emitStateSP > emitStateStack) {
 566                 EmitStateStackEntry *esp, *esp2;
 567                 jsbytecode *alt, *jump;
 568                 ptrdiff_t span, header;
 569
 570                 esp2 = emitStateSP;
 571                 alt = esp2->altHead;
 572                 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
 573                     if (esp->continueOp == REOP_ENDALT &&
 574                         !esp->jumpToJumpFlag &&
 575                         esp->nextTermFixup + OFFSET_LEN == alt &&
 576                         (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
 577                                        ? esp->endTermFixup
 578                                        : esp->nextTermFixup)) > OFFSET_MAX) {
 579                         alt = esp->altHead;
 580                         jump = esp->nextTermFixup;
 581
 582                         /*
 583                          * The span must be 1 less than the distance from
 584                          * jump offset to jump offset, so we actually jump
 585                          * to a REOP_JUMP bytecode, not to its offset!
 586                          */
 587                         for (;;) {
 588                             assert(jump < esp2->nextTermFixup);
 589                             span = esp2->nextTermFixup - jump - 1;
 590                             if ((size_t)span <= OFFSET_MAX)
 591                                 break;
 592                             do {
 593                                 if (--esp2 == esp)
 594                                     goto jump_too_big;
 595                             } while (esp2->continueOp != REOP_ENDALT);
 596                         }
 597
 598                         jump[0] = JUMP_OFFSET_HI(span);
 599                         jump[1] = JUMP_OFFSET_LO(span);
 600
 601                         if (esp->continueNode->op != REOP_ALT) {
 602                             /*
 603                              * We must patch the offset at esp->endTermFixup
 604                              * as well, for the REOP_ALTPREREQ{,2} opcodes.
 605                              * If we're unlucky and endTermFixup is more than
 606                              * OFFSET_MAX bytes from its target, we cheat by
 607                              * jumping 6 bytes to the jump whose offset is at
 608                              * esp->nextTermFixup, which has the same target.
 609                              */
 610                             jump = esp->endTermFixup;
 611                             header = esp->nextTermFixup - jump;
 612                             span += header;
 613                             if ((size_t)span > OFFSET_MAX)
 614                                 span = header;
 615
 616                             jump[0] = JUMP_OFFSET_HI(span);
 617                             jump[1] = JUMP_OFFSET_LO(span);
 618                         }
 619
 620                         esp->jumpToJumpFlag = TRUE;
 621                     }
 622                 }
 623             }
 624             break;
 625
 626           case REOP_ALT:
 627             assert(emitStateSP);
 628             emitStateSP->altHead = pc - 1;
 629             emitStateSP->nextAltFixup = pc;     /* offset to next alternate */
 630             pc += OFFSET_LEN;
 631             emitStateSP->continueNode = t;
 632             emitStateSP->continueOp = REOP_JUMP;
 633             emitStateSP->jumpToJumpFlag = FALSE;
 634             ++emitStateSP;
 635             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 636             t = t->kid;
 637             op = t->op;
 638             assert(op < REOP_LIMIT);
 639             continue;
 640
 641           case REOP_FLAT:
 642             /*
 643              * Coalesce FLATs if possible and if it would not increase bytecode
 644              * beyond preallocated limit. The latter happens only when bytecode
 645              * size for coalesced string with offset p and length 2 exceeds 6
 646              * bytes preallocated for 2 single char nodes, i.e. when
 647              * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
 648              * GetCompactIndexWidth(p) > 4.
 649              * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
 650              * nodes strictly decreases bytecode size, the check has to be
 651              * done only for the first coalescing.
 652              */
 653             if (t->kid &&
 654                 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
 655             {
 656                 while (t->next &&
 657                        t->next->op == REOP_FLAT &&
 658                        (WCHAR*)t->kid + t->u.flat.length ==
 659                        t->next->kid) {
 660                     t->u.flat.length += t->next->u.flat.length;
 661                     t->next = t->next->next;
 662                 }
 663             }
 664             if (t->kid && t->u.flat.length > 1) {
 665                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
 666                 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
 667                 pc = WriteCompactIndex(pc, t->u.flat.length);
 668             } else if (t->u.flat.chr < 256) {
 669                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
 670                 *pc++ = (jsbytecode) t->u.flat.chr;
 671             } else {
 672                 pc[-1] = (state->flags & REG_FOLD)
 673                          ? REOP_UCFLAT1i
 674                          : REOP_UCFLAT1;
 675                 SET_ARG(pc, t->u.flat.chr);
 676                 pc += ARG_LEN;
 677             }
 678             break;
 679
 680           case REOP_LPAREN:
 681             assert(emitStateSP);
 682             pc = WriteCompactIndex(pc, t->u.parenIndex);
 683             emitStateSP->continueNode = t;
 684             emitStateSP->continueOp = REOP_RPAREN;
 685             ++emitStateSP;
 686             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 687             t = t->kid;
 688             op = t->op;
 689             continue;
 690
 691           case REOP_RPAREN:
 692             pc = WriteCompactIndex(pc, t->u.parenIndex);
 693             break;
 694
 695           case REOP_BACKREF:
 696             pc = WriteCompactIndex(pc, t->u.parenIndex);
 697             break;
 698
 699           case REOP_ASSERT:
 700             assert(emitStateSP);
 701             emitStateSP->nextTermFixup = pc;
 702             pc += OFFSET_LEN;
 703             emitStateSP->continueNode = t;
 704             emitStateSP->continueOp = REOP_ASSERTTEST;
 705             ++emitStateSP;
 706             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 707             t = t->kid;
 708             op = t->op;
 709             continue;
 710
 711           case REOP_ASSERTTEST:
 712           case REOP_ASSERTNOTTEST:
 713             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 714                 goto jump_too_big;
 715             break;
 716
 717           case REOP_ASSERT_NOT:
 718             assert(emitStateSP);
 719             emitStateSP->nextTermFixup = pc;
 720             pc += OFFSET_LEN;
 721             emitStateSP->continueNode = t;
 722             emitStateSP->continueOp = REOP_ASSERTNOTTEST;
 723             ++emitStateSP;
 724             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 725             t = t->kid;
 726             op = t->op;
 727             continue;
 728
 729           case REOP_QUANT:
 730             assert(emitStateSP);
 731             if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
 732                 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
 733             } else if (t->u.range.min == 0 && t->u.range.max == 1) {
 734                 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
 735             } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
 736                 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
 737             } else {
 738                 if (!t->u.range.greedy)
 739                     pc[-1] = REOP_MINIMALQUANT;
 740                 pc = WriteCompactIndex(pc, t->u.range.min);
 741                 /*
 742                  * Write max + 1 to avoid using size_t(max) + 1 bytes
 743                  * for (UINT)-1 sentinel.
 744                  */
 745                 pc = WriteCompactIndex(pc, t->u.range.max + 1);
 746             }
 747             emitStateSP->nextTermFixup = pc;
 748             pc += OFFSET_LEN;
 749             emitStateSP->continueNode = t;
 750             emitStateSP->continueOp = REOP_ENDCHILD;
 751             ++emitStateSP;
 752             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 753             t = t->kid;
 754             op = t->op;
 755             continue;
 756
 757           case REOP_ENDCHILD:
 758             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 759                 goto jump_too_big;
 760             break;
 761
 762           case REOP_CLASS:
 763             if (!t->u.ucclass.sense)
 764                 pc[-1] = REOP_NCLASS;
 765             pc = WriteCompactIndex(pc, t->u.ucclass.index);
 766             charSet = &re->classList[t->u.ucclass.index];
 767             charSet->converted = FALSE;
 768             charSet->length = t->u.ucclass.bmsize;
 769             charSet->u.src.startIndex = t->u.ucclass.startIndex;
 770             charSet->u.src.length = t->u.ucclass.kidlen;
 771             charSet->sense = t->u.ucclass.sense;
 772             break;
 773
 774           default:
 775             break;
 776         }
 777
 778         t = t->next;
 779         if (t) {
 780             op = t->op;
 781         } else {
 782             if (emitStateSP == emitStateStack)
 783                 break;
 784             --emitStateSP;
 785             t = emitStateSP->continueNode;
 786             op = (REOp) emitStateSP->continueOp;
 787         }
 788     }
 789
 790   cleanup:
 791     heap_free(emitStateStack);
 792     return pc;
 793
 794   jump_too_big:
 795     ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 796     pc = NULL;
 797     goto cleanup;
 798 }
 799
 800 /*
 801  * Process the op against the two top operands, reducing them to a single
 802  * operand in the penultimate slot. Update progLength and treeDepth.
 803  */
 804 static BOOL
 805 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
 806           INT operandSP)
 807 {
 808     RENode *result;
 809
 810     switch (opData->op) {
 811       case REOP_ALT:
 812         result = NewRENode(state, REOP_ALT);
 813         if (!result)
 814             return FALSE;
 815         result->kid = operandStack[operandSP - 2];
 816         result->u.kid2 = operandStack[operandSP - 1];
 817         operandStack[operandSP - 2] = result;
 818
 819         if (state->treeDepth == TREE_DEPTH_MAX) {
 820             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 821             return FALSE;
 822         }
 823         ++state->treeDepth;
 824
 825         /*
 826          * Look at both alternates to see if there's a FLAT or a CLASS at
 827          * the start of each. If so, use a prerequisite match.
 828          */
 829         if (((RENode *) result->kid)->op == REOP_FLAT &&
 830             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 831             (state->flags & REG_FOLD) == 0) {
 832             result->op = REOP_ALTPREREQ;
 833             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 834             result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
 835             /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
 836                                             JUMP, <end> ... ENDALT */
 837             state->progLength += 13;
 838         }
 839         else
 840         if (((RENode *) result->kid)->op == REOP_CLASS &&
 841             ((RENode *) result->kid)->u.ucclass.index < 256 &&
 842             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 843             (state->flags & REG_FOLD) == 0) {
 844             result->op = REOP_ALTPREREQ2;
 845             result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
 846             result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
 847             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 848                                             JUMP, <end> ... ENDALT */
 849             state->progLength += 13;
 850         }
 851         else
 852         if (((RENode *) result->kid)->op == REOP_FLAT &&
 853             ((RENode *) result->u.kid2)->op == REOP_CLASS &&
 854             ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
 855             (state->flags & REG_FOLD) == 0) {
 856             result->op = REOP_ALTPREREQ2;
 857             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 858             result->u.altprereq.ch2 =
 859                 ((RENode *) result->u.kid2)->u.ucclass.index;
 860             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 861                                           JUMP, <end> ... ENDALT */
 862             state->progLength += 13;
 863         }
 864         else {
 865             /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
 866             state->progLength += 7;
 867         }
 868         break;
 869
 870       case REOP_CONCAT:
 871         result = operandStack[operandSP - 2];
 872         while (result->next)
 873             result = result->next;
 874         result->next = operandStack[operandSP - 1];
 875         break;
 876
 877       case REOP_ASSERT:
 878       case REOP_ASSERT_NOT:
 879       case REOP_LPARENNON:
 880       case REOP_LPAREN:
 881         /* These should have been processed by a close paren. */
 882         ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
 883                                 opData->errPos);
 884         return FALSE;
 885
 886       default:;
 887     }
 888     return TRUE;
 889 }
 890
 891 /*
 892  * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
 893  * it being on the stack, and to propagate errors to its callers.
 894  */
 895 #define JSREG_FIND_PAREN_COUNT  0x8000
 896 #define JSREG_FIND_PAREN_ERROR  0x4000
 897
 898 /*
 899  * Magic return value from FindParenCount and GetDecimalValue, to indicate
 900  * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
 901  * its findMax parameter is non-null.
 902  */
 903 #define OVERFLOW_VALUE          ((UINT)-1)
 904
 905 static UINT
 906 FindParenCount(CompilerState *state)
 907 {
 908     CompilerState temp;
 909     int i;
 910
 911     if (state->flags & JSREG_FIND_PAREN_COUNT)
 912         return OVERFLOW_VALUE;
 913
 914     /*
 915      * Copy state into temp, flag it so we never report an invalid backref,
 916      * and reset its members to parse the entire regexp.  This is obviously
 917      * suboptimal, but GetDecimalValue calls us only if a backref appears to
 918      * refer to a forward parenthetical, which is rare.
 919      */
 920     temp = *state;
 921     temp.flags |= JSREG_FIND_PAREN_COUNT;
 922     temp.cp = temp.cpbegin;
 923     temp.parenCount = 0;
 924     temp.classCount = 0;
 925     temp.progLength = 0;
 926     temp.treeDepth = 0;
 927     temp.classBitmapsMem = 0;
 928     for (i = 0; i < CLASS_CACHE_SIZE; i++)
 929         temp.classCache[i].start = NULL;
 930
 931     if (!ParseRegExp(&temp)) {
 932         state->flags |= JSREG_FIND_PAREN_ERROR;
 933         return OVERFLOW_VALUE;
 934     }
 935     return temp.parenCount;
 936 }
 937
 938 /*
 939  * Extract and return a decimal value at state->cp.  The initial character c
 940  * has already been read.  Return OVERFLOW_VALUE if the result exceeds max.
 941  * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
 942  * state->flags to discover whether an error occurred under findMax.
 943  */
 944 static UINT
 945 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
 946                 CompilerState *state)
 947 {
 948     UINT value = JS7_UNDEC(c);
 949     BOOL overflow = (value > max && (!findMax || value > findMax(state)));
 950
 951     /* The following restriction allows simpler overflow checks. */
 952     assert(max <= ((UINT)-1 - 9) / 10);
 953     while (state->cp < state->cpend) {
 954         c = *state->cp;
 955         if (!JS7_ISDEC(c))
 956             break;
 957         value = 10 * value + JS7_UNDEC(c);
 958         if (!overflow && value > max && (!findMax || value > findMax(state)))
 959             overflow = TRUE;
 960         ++state->cp;
 961     }
 962     return overflow ? OVERFLOW_VALUE : value;
 963 }
 964
 965 /*
 966  * Calculate the total size of the bitmap required for a class expression.
 967  */
 968 static BOOL
 969 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
 970                     const WCHAR *end)
 971 {
 972     UINT max = 0;
 973     BOOL inRange = FALSE;
 974     WCHAR c, rangeStart = 0;
 975     UINT n, digit, nDigits, i;
 976
 977     target->u.ucclass.bmsize = 0;
 978     target->u.ucclass.sense = TRUE;
 979
 980     if (src == end)
 981         return TRUE;
 982
 983     if (*src == '^') {
 984         ++src;
 985         target->u.ucclass.sense = FALSE;
 986     }
 987
 988     while (src != end) {
 989         BOOL canStartRange = TRUE;
 990         UINT localMax = 0;
 991
 992         switch (*src) {
 993           case '\\':
 994             ++src;
 995             c = *src++;
 996             switch (c) {
 997               case 'b':
 998                 localMax = 0x8;
 999                 break;
1000               case 'f':
1001                 localMax = 0xC;
1002                 break;
1003               case 'n':
1004                 localMax = 0xA;
1005                 break;
1006               case 'r':
1007                 localMax = 0xD;
1008                 break;
1009               case 't':
1010                 localMax = 0x9;
1011                 break;
1012               case 'v':
1013                 localMax = 0xB;
1014                 break;
1015               case 'c':
1016                 if (src < end && RE_IS_LETTER(*src)) {
1017                     localMax = (UINT) (*src++) & 0x1F;
1018                 } else {
1019                     --src;
1020                     localMax = '\\';
1021                 }
1022                 break;
1023               case 'x':
1024                 nDigits = 2;
1025                 goto lexHex;
1026               case 'u':
1027                 nDigits = 4;
1028 lexHex:
1029                 n = 0;
1030                 for (i = 0; (i < nDigits) && (src < end); i++) {
1031                     c = *src++;
1032                     if (!isASCIIHexDigit(c, &digit)) {
1033                         /*
1034                          * Back off to accepting the original
1035                          *'\' as a literal.
1036                          */
1037                         src -= i + 1;
1038                         n = '\\';
1039                         break;
1040                     }
1041                     n = (n << 4) | digit;
1042                 }
1043                 localMax = n;
1044                 break;
1045               case 'd':
1046                 canStartRange = FALSE;
1047                 if (inRange) {
1048                     JS_ReportErrorNumber(state->context,
1049                                          js_GetErrorMessage, NULL,
1050                                          JSMSG_BAD_CLASS_RANGE);
1051                     return FALSE;
1052                 }
1053                 localMax = '9';
1054                 break;
1055               case 'D':
1056               case 's':
1057               case 'S':
1058               case 'w':
1059               case 'W':
1060                 canStartRange = FALSE;
1061                 if (inRange) {
1062                     JS_ReportErrorNumber(state->context,
1063                                          js_GetErrorMessage, NULL,
1064                                          JSMSG_BAD_CLASS_RANGE);
1065                     return FALSE;
1066                 }
1067                 max = 65535;
1068
1069                 /*
1070                  * If this is the start of a range, ensure that it's less than
1071                  * the end.
1072                  */
1073                 localMax = 0;
1074                 break;
1075               case '0':
1076               case '1':
1077               case '2':
1078               case '3':
1079               case '4':
1080               case '5':
1081               case '6':
1082               case '7':
1083                 /*
1084                  *  This is a non-ECMA extension - decimal escapes (in this
1085                  *  case, octal!) are supposed to be an error inside class
1086                  *  ranges, but supported here for backwards compatibility.
1087                  *
1088                  */
1089                 n = JS7_UNDEC(c);
1090                 c = *src;
1091                 if ('0' <= c && c <= '7') {
1092                     src++;
1093                     n = 8 * n + JS7_UNDEC(c);
1094                     c = *src;
1095                     if ('0' <= c && c <= '7') {
1096                         src++;
1097                         i = 8 * n + JS7_UNDEC(c);
1098                         if (i <= 0377)
1099                             n = i;
1100                         else
1101                             src--;
1102                     }
1103                 }
1104                 localMax = n;
1105                 break;
1106
1107               default:
1108                 localMax = c;
1109                 break;
1110             }
1111             break;
1112           default:
1113             localMax = *src++;
1114             break;
1115         }
1116
1117         if (inRange) {
1118             /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1119             if (rangeStart > localMax) {
1120                 JS_ReportErrorNumber(state->context,
1121                                      js_GetErrorMessage, NULL,
1122                                      JSMSG_BAD_CLASS_RANGE);
1123                 return FALSE;
1124             }
1125             inRange = FALSE;
1126         } else {
1127             if (canStartRange && src < end - 1) {
1128                 if (*src == '-') {
1129                     ++src;
1130                     inRange = TRUE;
1131                     rangeStart = (WCHAR)localMax;
1132                     continue;
1133                 }
1134             }
1135             if (state->flags & REG_FOLD)
1136                 rangeStart = localMax;   /* one run of the uc/dc loop below */
1137         }
1138
1139         if (state->flags & REG_FOLD) {
1140             WCHAR maxch = localMax;
1141
1142             for (i = rangeStart; i <= localMax; i++) {
1143                 WCHAR uch, dch;
1144
1145                 uch = toupperW(i);
1146                 dch = tolowerW(i);
1147                 if(maxch < uch)
1148                     maxch = uch;
1149                 if(maxch < dch)
1150                     maxch = dch;
1151             }
1152             localMax = maxch;
1153         }
1154
1155         if (localMax > max)
1156             max = localMax;
1157     }
1158     target->u.ucclass.bmsize = max;
1159     return TRUE;
1160 }
1161
1162 static INT
1163 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1164 {
1165     UINT min, max;
1166     WCHAR c;
1167     const WCHAR *errp = state->cp++;
1168
1169     c = *state->cp;
1170     if (JS7_ISDEC(c)) {
1171         ++state->cp;
1172         min = GetDecimalValue(c, 0xFFFF, NULL, state);
1173         c = *state->cp;
1174
1175         if (!ignoreValues && min == OVERFLOW_VALUE)
1176             return JSMSG_MIN_TOO_BIG;
1177
1178         if (c == ',') {
1179             c = *++state->cp;
1180             if (JS7_ISDEC(c)) {
1181                 ++state->cp;
1182                 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1183                 c = *state->cp;
1184                 if (!ignoreValues && max == OVERFLOW_VALUE)
1185                     return JSMSG_MAX_TOO_BIG;
1186                 if (!ignoreValues && min > max)
1187                     return JSMSG_OUT_OF_ORDER;
1188             } else {
1189                 max = (UINT)-1;
1190             }
1191         } else {
1192             max = min;
1193         }
1194         if (c == '}') {
1195             state->result = NewRENode(state, REOP_QUANT);
1196             if (!state->result)
1197                 return JSMSG_OUT_OF_MEMORY;
1198             state->result->u.range.min = min;
1199             state->result->u.range.max = max;
1200             /*
1201              * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1202              * where <max> is written as compact(max+1) to make
1203              * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1204              */
1205             state->progLength += (1 + GetCompactIndexWidth(min)
1206                                   + GetCompactIndexWidth(max + 1)
1207                                   +3);
1208             return 0;
1209         }
1210     }
1211
1212     state->cp = errp;
1213     return -1;
1214 }
1215
1216 static BOOL
1217 ParseQuantifier(CompilerState *state)
1218 {
1219     RENode *term;
1220     term = state->result;
1221     if (state->cp < state->cpend) {
1222         switch (*state->cp) {
1223           case '+':
1224             state->result = NewRENode(state, REOP_QUANT);
1225             if (!state->result)
1226                 return FALSE;
1227             state->result->u.range.min = 1;
1228             state->result->u.range.max = (UINT)-1;
1229             /* <PLUS>, <next> ... <ENDCHILD> */
1230             state->progLength += 4;
1231             goto quantifier;
1232           case '*':
1233             state->result = NewRENode(state, REOP_QUANT);
1234             if (!state->result)
1235                 return FALSE;
1236             state->result->u.range.min = 0;
1237             state->result->u.range.max = (UINT)-1;
1238             /* <STAR>, <next> ... <ENDCHILD> */
1239             state->progLength += 4;
1240             goto quantifier;
1241           case '?':
1242             state->result = NewRENode(state, REOP_QUANT);
1243             if (!state->result)
1244                 return FALSE;
1245             state->result->u.range.min = 0;
1246             state->result->u.range.max = 1;
1247             /* <OPT>, <next> ... <ENDCHILD> */
1248             state->progLength += 4;
1249             goto quantifier;
1250           case '{':       /* balance '}' */
1251           {
1252             INT err;
1253
1254             err = ParseMinMaxQuantifier(state, FALSE);
1255             if (err == 0)
1256                 goto quantifier;
1257             if (err == -1)
1258                 return TRUE;
1259
1260             ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1261             return FALSE;
1262           }
1263           default:;
1264         }
1265     }
1266     return TRUE;
1267
1268 quantifier:
1269     if (state->treeDepth == TREE_DEPTH_MAX) {
1270         ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1271         return FALSE;
1272     }
1273
1274     ++state->treeDepth;
1275     ++state->cp;
1276     state->result->kid = term;
1277     if (state->cp < state->cpend && *state->cp == '?') {
1278         ++state->cp;
1279         state->result->u.range.greedy = FALSE;
1280     } else {
1281         state->result->u.range.greedy = TRUE;
1282     }
1283     return TRUE;
1284 }
1285
1286 /*
1287  *  item:       assertion               An item is either an assertion or
1288  *              quantatom               a quantified atom.
1289  *
1290  *  assertion:  '^'                     Assertions match beginning of string
1291  *                                      (or line if the class static property
1292  *                                      RegExp.multiline is true).
1293  *              '$'                     End of string (or line if the class
1294  *                                      static property RegExp.multiline is
1295  *                                      true).
1296  *              '\b'                    Word boundary (between \w and \W).
1297  *              '\B'                    Word non-boundary.
1298  *
1299  *  quantatom:  atom                    An unquantified atom.
1300  *              quantatom '{' n ',' m '}'
1301  *                                      Atom must occur between n and m times.
1302  *              quantatom '{' n ',' '}' Atom must occur at least n times.
1303  *              quantatom '{' n '}'     Atom must occur exactly n times.
1304  *              quantatom '*'           Zero or more times (same as {0,}).
1305  *              quantatom '+'           One or more times (same as {1,}).
1306  *              quantatom '?'           Zero or one time (same as {0,1}).
1307  *
1308  *              any of which can be optionally followed by '?' for ungreedy
1309  *
1310  *  atom:       '(' regexp ')'          A parenthesized regexp (what matched
1311  *                                      can be addressed using a backreference,
1312  *                                      see '\' n below).
1313  *              '.'                     Matches any char except '\n'.
1314  *              '[' classlist ']'       A character class.
1315  *              '[' '^' classlist ']'   A negated character class.
1316  *              '\f'                    Form Feed.
1317  *              '\n'                    Newline (Line Feed).
1318  *              '\r'                    Carriage Return.
1319  *              '\t'                    Horizontal Tab.
1320  *              '\v'                    Vertical Tab.
1321  *              '\d'                    A digit (same as [0-9]).
1322  *              '\D'                    A non-digit.
1323  *              '\w'                    A word character, [0-9a-z_A-Z].
1324  *              '\W'                    A non-word character.
1325  *              '\s'                    A whitespace character, [ \b\f\n\r\t\v].
1326  *              '\S'                    A non-whitespace character.
1327  *              '\' n                   A backreference to the nth (n decimal
1328  *                                      and positive) parenthesized expression.
1329  *              '\' octal               An octal escape sequence (octal must be
1330  *                                      two or three digits long, unless it is
1331  *                                      0 for the null character).
1332  *              '\x' hex                A hex escape (hex must be two digits).
1333  *              '\u' unicode            A unicode escape (must be four digits).
1334  *              '\c' ctrl               A control character, ctrl is a letter.
1335  *              '\' literalatomchar     Any character except one of the above
1336  *                                      that follow '\' in an atom.
1337  *              otheratomchar           Any character not first among the other
1338  *                                      atom right-hand sides.
1339  */
1340 static BOOL
1341 ParseTerm(CompilerState *state)
1342 {
1343     WCHAR c = *state->cp++;
1344     UINT nDigits;
1345     UINT num, tmp, n, i;
1346     const WCHAR *termStart;
1347
1348     switch (c) {
1349     /* assertions and atoms */
1350       case '^':
1351         state->result = NewRENode(state, REOP_BOL);
1352         if (!state->result)
1353             return FALSE;
1354         state->progLength++;
1355         return TRUE;
1356       case '$':
1357         state->result = NewRENode(state, REOP_EOL);
1358         if (!state->result)
1359             return FALSE;
1360         state->progLength++;
1361         return TRUE;
1362       case '\\':
1363         if (state->cp >= state->cpend) {
1364             /* a trailing '\' is an error */
1365             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1366             return FALSE;
1367         }
1368         c = *state->cp++;
1369         switch (c) {
1370         /* assertion escapes */
1371           case 'b' :
1372             state->result = NewRENode(state, REOP_WBDRY);
1373             if (!state->result)
1374                 return FALSE;
1375             state->progLength++;
1376             return TRUE;
1377           case 'B':
1378             state->result = NewRENode(state, REOP_WNONBDRY);
1379             if (!state->result)
1380                 return FALSE;
1381             state->progLength++;
1382             return TRUE;
1383           /* Decimal escape */
1384           case '0':
1385               /* Give a strict warning. See also the note below. */
1386               WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1387      doOctal:
1388             num = 0;
1389             while (state->cp < state->cpend) {
1390                 c = *state->cp;
1391                 if (c < '0' || '7' < c)
1392                     break;
1393                 state->cp++;
1394                 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1395                 if (tmp > 0377)
1396                     break;
1397                 num = tmp;
1398             }
1399             c = (WCHAR)num;
1400     doFlat:
1401             state->result = NewRENode(state, REOP_FLAT);
1402             if (!state->result)
1403                 return FALSE;
1404             state->result->u.flat.chr = c;
1405             state->result->u.flat.length = 1;
1406             state->progLength += 3;
1407             break;
1408           case '1':
1409           case '2':
1410           case '3':
1411           case '4':
1412           case '5':
1413           case '6':
1414           case '7':
1415           case '8':
1416           case '9':
1417             termStart = state->cp - 1;
1418             num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1419             if (state->flags & JSREG_FIND_PAREN_ERROR)
1420                 return FALSE;
1421             if (num == OVERFLOW_VALUE) {
1422                 /* Give a strict mode warning. */
1423                 WARN("back-reference exceeds number of capturing parentheses\n");
1424
1425                 /*
1426                  * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1427                  * error here. However, for compatibility with IE, we treat the
1428                  * whole backref as flat if the first character in it is not a
1429                  * valid octal character, and as an octal escape otherwise.
1430                  */
1431                 state->cp = termStart;
1432                 if (c >= '8') {
1433                     /* Treat this as flat. termStart - 1 is the \. */
1434                     c = '\\';
1435                     goto asFlat;
1436                 }
1437
1438                 /* Treat this as an octal escape. */
1439                 goto doOctal;
1440             }
1441             assert(1 <= num && num <= 0x10000);
1442             state->result = NewRENode(state, REOP_BACKREF);
1443             if (!state->result)
1444                 return FALSE;
1445             state->result->u.parenIndex = num - 1;
1446             state->progLength
1447                 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1448             break;
1449           /* Control escape */
1450           case 'f':
1451             c = 0xC;
1452             goto doFlat;
1453           case 'n':
1454             c = 0xA;
1455             goto doFlat;
1456           case 'r':
1457             c = 0xD;
1458             goto doFlat;
1459           case 't':
1460             c = 0x9;
1461             goto doFlat;
1462           case 'v':
1463             c = 0xB;
1464             goto doFlat;
1465           /* Control letter */
1466           case 'c':
1467             if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1468                 c = (WCHAR) (*state->cp++ & 0x1F);
1469             } else {
1470                 /* back off to accepting the original '\' as a literal */
1471                 --state->cp;
1472                 c = '\\';
1473             }
1474             goto doFlat;
1475           /* HexEscapeSequence */
1476           case 'x':
1477             nDigits = 2;
1478             goto lexHex;
1479           /* UnicodeEscapeSequence */
1480           case 'u':
1481             nDigits = 4;
1482 lexHex:
1483             n = 0;
1484             for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1485                 UINT digit;
1486                 c = *state->cp++;
1487                 if (!isASCIIHexDigit(c, &digit)) {
1488                     /*
1489                      * Back off to accepting the original 'u' or 'x' as a
1490                      * literal.
1491                      */
1492                     state->cp -= i + 2;
1493                     n = *state->cp++;
1494                     break;
1495                 }
1496                 n = (n << 4) | digit;
1497             }
1498             c = (WCHAR) n;
1499             goto doFlat;
1500           /* Character class escapes */
1501           case 'd':
1502             state->result = NewRENode(state, REOP_DIGIT);
1503 doSimple:
1504             if (!state->result)
1505                 return FALSE;
1506             state->progLength++;
1507             break;
1508           case 'D':
1509             state->result = NewRENode(state, REOP_NONDIGIT);
1510             goto doSimple;
1511           case 's':
1512             state->result = NewRENode(state, REOP_SPACE);
1513             goto doSimple;
1514           case 'S':
1515             state->result = NewRENode(state, REOP_NONSPACE);
1516             goto doSimple;
1517           case 'w':
1518             state->result = NewRENode(state, REOP_ALNUM);
1519             goto doSimple;
1520           case 'W':
1521             state->result = NewRENode(state, REOP_NONALNUM);
1522             goto doSimple;
1523           /* IdentityEscape */
1524           default:
1525             state->result = NewRENode(state, REOP_FLAT);
1526             if (!state->result)
1527                 return FALSE;
1528             state->result->u.flat.chr = c;
1529             state->result->u.flat.length = 1;
1530             state->result->kid = (void *) (state->cp - 1);
1531             state->progLength += 3;
1532             break;
1533         }
1534         break;
1535       case '[':
1536         state->result = NewRENode(state, REOP_CLASS);
1537         if (!state->result)
1538             return FALSE;
1539         termStart = state->cp;
1540         state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1541         for (;;) {
1542             if (state->cp == state->cpend) {
1543                 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1544                                         JSMSG_UNTERM_CLASS, termStart);
1545
1546                 return FALSE;
1547             }
1548             if (*state->cp == '\\') {
1549                 state->cp++;
1550                 if (state->cp != state->cpend)
1551                     state->cp++;
1552                 continue;
1553             }
1554             if (*state->cp == ']') {
1555                 state->result->u.ucclass.kidlen = state->cp - termStart;
1556                 break;
1557             }
1558             state->cp++;
1559         }
1560         for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1561             if (!state->classCache[i].start) {
1562                 state->classCache[i].start = termStart;
1563                 state->classCache[i].length = state->result->u.ucclass.kidlen;
1564                 state->classCache[i].index = state->classCount;
1565                 break;
1566             }
1567             if (state->classCache[i].length ==
1568                 state->result->u.ucclass.kidlen) {
1569                 for (n = 0; ; n++) {
1570                     if (n == state->classCache[i].length) {
1571                         state->result->u.ucclass.index
1572                             = state->classCache[i].index;
1573                         goto claim;
1574                     }
1575                     if (state->classCache[i].start[n] != termStart[n])
1576                         break;
1577                 }
1578             }
1579         }
1580         state->result->u.ucclass.index = state->classCount++;
1581
1582     claim:
1583         /*
1584          * Call CalculateBitmapSize now as we want any errors it finds
1585          * to be reported during the parse phase, not at execution.
1586          */
1587         if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1588             return FALSE;
1589         /*
1590          * Update classBitmapsMem with number of bytes to hold bmsize bits,
1591          * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1592          * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1593          */
1594         n = (state->result->u.ucclass.bmsize >> 3) + 1;
1595         if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1596             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1597             return FALSE;
1598         }
1599         state->classBitmapsMem += n;
1600         /* CLASS, <index> */
1601         state->progLength
1602             += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1603         break;
1604
1605       case '.':
1606         state->result = NewRENode(state, REOP_DOT);
1607         goto doSimple;
1608
1609       case '{':
1610       {
1611         const WCHAR *errp = state->cp--;
1612         INT err;
1613
1614         err = ParseMinMaxQuantifier(state, TRUE);
1615         state->cp = errp;
1616
1617         if (err < 0)
1618             goto asFlat;
1619
1620         /* FALL THROUGH */
1621       }
1622       case '*':
1623       case '+':
1624       case '?':
1625         ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1626                                 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1627         return FALSE;
1628       default:
1629 asFlat:
1630         state->result = NewRENode(state, REOP_FLAT);
1631         if (!state->result)
1632             return FALSE;
1633         state->result->u.flat.chr = c;
1634         state->result->u.flat.length = 1;
1635         state->result->kid = (void *) (state->cp - 1);
1636         state->progLength += 3;
1637         break;
1638     }
1639     return ParseQuantifier(state);
1640 }
1641
1642 /*
1643  * Top-down regular expression grammar, based closely on Perl4.
1644  *
1645  *  regexp:     altern                  A regular expression is one or more
1646  *              altern '|' regexp       alternatives separated by vertical bar.
1647  */
1648 #define INITIAL_STACK_SIZE  128
1649
1650 static BOOL
1651 ParseRegExp(CompilerState *state)
1652 {
1653     size_t parenIndex;
1654     RENode *operand;
1655     REOpData *operatorStack;
1656     RENode **operandStack;
1657     REOp op;
1658     INT i;
1659     BOOL result = FALSE;
1660
1661     INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1662     INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1663
1664     /* Watch out for empty regexp */
1665     if (state->cp == state->cpend) {
1666         state->result = NewRENode(state, REOP_EMPTY);
1667         return (state->result != NULL);
1668     }
1669
1670     operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1671     if (!operatorStack)
1672         return FALSE;
1673
1674     operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1675     if (!operandStack)
1676         goto out;
1677
1678     for (;;) {
1679         parenIndex = state->parenCount;
1680         if (state->cp == state->cpend) {
1681             /*
1682              * If we are at the end of the regexp and we're short one or more
1683              * operands, the regexp must have the form /x|/ or some such, with
1684              * left parentheses making us short more than one operand.
1685              */
1686             if (operatorSP >= operandSP) {
1687                 operand = NewRENode(state, REOP_EMPTY);
1688                 if (!operand)
1689                     goto out;
1690                 goto pushOperand;
1691             }
1692         } else {
1693             switch (*state->cp) {
1694               case '(':
1695                 ++state->cp;
1696                 if (state->cp + 1 < state->cpend &&
1697                     *state->cp == '?' &&
1698                     (state->cp[1] == '=' ||
1699                      state->cp[1] == '!' ||
1700                      state->cp[1] == ':')) {
1701                     switch (state->cp[1]) {
1702                       case '=':
1703                         op = REOP_ASSERT;
1704                         /* ASSERT, <next>, ... ASSERTTEST */
1705                         state->progLength += 4;
1706                         break;
1707                       case '!':
1708                         op = REOP_ASSERT_NOT;
1709                         /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1710                         state->progLength += 4;
1711                         break;
1712                       default:
1713                         op = REOP_LPARENNON;
1714                         break;
1715                     }
1716                     state->cp += 2;
1717                 } else {
1718                     op = REOP_LPAREN;
1719                     /* LPAREN, <index>, ... RPAREN, <index> */
1720                     state->progLength
1721                         += 2 * (1 + GetCompactIndexWidth(parenIndex));
1722                     state->parenCount++;
1723                     if (state->parenCount == 65535) {
1724                         ReportRegExpError(state, JSREPORT_ERROR,
1725                                           JSMSG_TOO_MANY_PARENS);
1726                         goto out;
1727                     }
1728                 }
1729                 goto pushOperator;
1730
1731               case ')':
1732                 /*
1733                  * If there's no stacked open parenthesis, throw syntax error.
1734                  */
1735                 for (i = operatorSP - 1; ; i--) {
1736                     if (i < 0) {
1737                         ReportRegExpError(state, JSREPORT_ERROR,
1738                                           JSMSG_UNMATCHED_RIGHT_PAREN);
1739                         goto out;
1740                     }
1741                     if (operatorStack[i].op == REOP_ASSERT ||
1742                         operatorStack[i].op == REOP_ASSERT_NOT ||
1743                         operatorStack[i].op == REOP_LPARENNON ||
1744                         operatorStack[i].op == REOP_LPAREN) {
1745                         break;
1746                     }
1747                 }
1748                 /* FALL THROUGH */
1749
1750               case '|':
1751                 /* Expected an operand before these, so make an empty one */
1752                 operand = NewRENode(state, REOP_EMPTY);
1753                 if (!operand)
1754                     goto out;
1755                 goto pushOperand;
1756
1757               default:
1758                 if (!ParseTerm(state))
1759                     goto out;
1760                 operand = state->result;
1761 pushOperand:
1762                 if (operandSP == operandStackSize) {
1763                     RENode **tmp;
1764                     operandStackSize += operandStackSize;
1765                     tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1766                     if (!tmp)
1767                         goto out;
1768                     operandStack = tmp;
1769                 }
1770                 operandStack[operandSP++] = operand;
1771                 break;
1772             }
1773         }
1774
1775         /* At the end; process remaining operators. */
1776 restartOperator:
1777         if (state->cp == state->cpend) {
1778             while (operatorSP) {
1779                 --operatorSP;
1780                 if (!ProcessOp(state, &operatorStack[operatorSP],
1781                                operandStack, operandSP))
1782                     goto out;
1783                 --operandSP;
1784             }
1785             assert(operandSP == 1);
1786             state->result = operandStack[0];
1787             result = TRUE;
1788             goto out;
1789         }
1790
1791         switch (*state->cp) {
1792           case '|':
1793             /* Process any stacked 'concat' operators */
1794             ++state->cp;
1795             while (operatorSP &&
1796                    operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1797                 --operatorSP;
1798                 if (!ProcessOp(state, &operatorStack[operatorSP],
1799                                operandStack, operandSP)) {
1800                     goto out;
1801                 }
1802                 --operandSP;
1803             }
1804             op = REOP_ALT;
1805             goto pushOperator;
1806
1807           case ')':
1808             /*
1809              * If there's no stacked open parenthesis, throw syntax error.
1810              */
1811             for (i = operatorSP - 1; ; i--) {
1812                 if (i < 0) {
1813                     ReportRegExpError(state, JSREPORT_ERROR,
1814                                       JSMSG_UNMATCHED_RIGHT_PAREN);
1815                     goto out;
1816                 }
1817                 if (operatorStack[i].op == REOP_ASSERT ||
1818                     operatorStack[i].op == REOP_ASSERT_NOT ||
1819                     operatorStack[i].op == REOP_LPARENNON ||
1820                     operatorStack[i].op == REOP_LPAREN) {
1821                     break;
1822                 }
1823             }
1824             ++state->cp;
1825
1826             /* Process everything on the stack until the open parenthesis. */
1827             for (;;) {
1828                 assert(operatorSP);
1829                 --operatorSP;
1830                 switch (operatorStack[operatorSP].op) {
1831                   case REOP_ASSERT:
1832                   case REOP_ASSERT_NOT:
1833                   case REOP_LPAREN:
1834                     operand = NewRENode(state, operatorStack[operatorSP].op);
1835                     if (!operand)
1836                         goto out;
1837                     operand->u.parenIndex =
1838                         operatorStack[operatorSP].parenIndex;
1839                     assert(operandSP);
1840                     operand->kid = operandStack[operandSP - 1];
1841                     operandStack[operandSP - 1] = operand;
1842                     if (state->treeDepth == TREE_DEPTH_MAX) {
1843                         ReportRegExpError(state, JSREPORT_ERROR,
1844                                           JSMSG_REGEXP_TOO_COMPLEX);
1845                         goto out;
1846                     }
1847                     ++state->treeDepth;
1848                     /* FALL THROUGH */
1849
1850                   case REOP_LPARENNON:
1851                     state->result = operandStack[operandSP - 1];
1852                     if (!ParseQuantifier(state))
1853                         goto out;
1854                     operandStack[operandSP - 1] = state->result;
1855                     goto restartOperator;
1856                   default:
1857                     if (!ProcessOp(state, &operatorStack[operatorSP],
1858                                    operandStack, operandSP))
1859                         goto out;
1860                     --operandSP;
1861                     break;
1862                 }
1863             }
1864             break;
1865
1866           case '{':
1867           {
1868             const WCHAR *errp = state->cp;
1869
1870             if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1871                 /*
1872                  * This didn't even scan correctly as a quantifier, so we should
1873                  * treat it as flat.
1874                  */
1875                 op = REOP_CONCAT;
1876                 goto pushOperator;
1877             }
1878
1879             state->cp = errp;
1880             /* FALL THROUGH */
1881           }
1882
1883           case '+':
1884           case '*':
1885           case '?':
1886             ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1887                                     state->cp);
1888             result = FALSE;
1889             goto out;
1890
1891           default:
1892             /* Anything else is the start of the next term. */
1893             op = REOP_CONCAT;
1894 pushOperator:
1895             if (operatorSP == operatorStackSize) {
1896                 REOpData *tmp;
1897                 operatorStackSize += operatorStackSize;
1898                 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1899                 if (!tmp)
1900                     goto out;
1901                 operatorStack = tmp;
1902             }
1903             operatorStack[operatorSP].op = op;
1904             operatorStack[operatorSP].errPos = state->cp;
1905             operatorStack[operatorSP++].parenIndex = parenIndex;
1906             break;
1907         }
1908     }
1909 out:
1910     heap_free(operatorStack);
1911     heap_free(operandStack);
1912     return result;
1913 }
1914
1915 /*
1916  * Save the current state of the match - the position in the input
1917  * text as well as the position in the bytecode. The state of any
1918  * parent expressions is also saved (preceding state).
1919  * Contents of parenCount parentheses from parenIndex are also saved.
1920  */
1921 static REBackTrackData *
1922 PushBackTrackState(REGlobalData *gData, REOp op,
1923                    jsbytecode *target, match_state_t *x, const WCHAR *cp,
1924                    size_t parenIndex, size_t parenCount)
1925 {
1926     size_t i;
1927     REBackTrackData *result =
1928         (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1929
1930     size_t sz = sizeof(REBackTrackData) +
1931                 gData->stateStackTop * sizeof(REProgState) +
1932                 parenCount * sizeof(RECapture);
1933
1934     ptrdiff_t btsize = gData->backTrackStackSize;
1935     ptrdiff_t btincr = ((char *)result + sz) -
1936                        ((char *)gData->backTrackStack + btsize);
1937
1938     TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1939
1940     JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1941     if (btincr > 0) {
1942         ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1943
1944         JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1945         btincr = ((btincr+btsize-1)/btsize)*btsize;
1946         gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1947         if (!gData->backTrackStack) {
1948             js_ReportOutOfScriptQuota(gData->cx);
1949             gData->ok = FALSE;
1950             return NULL;
1951         }
1952         gData->backTrackStackSize = btsize + btincr;
1953         result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1954     }
1955     gData->backTrackSP = result;
1956     result->sz = gData->cursz;
1957     gData->cursz = sz;
1958
1959     result->backtrack_op = op;
1960     result->backtrack_pc = target;
1961     result->cp = cp;
1962     result->parenCount = parenCount;
1963     result->parenIndex = parenIndex;
1964
1965     result->saveStateStackTop = gData->stateStackTop;
1966     assert(gData->stateStackTop);
1967     memcpy(result + 1, gData->stateStack,
1968            sizeof(REProgState) * result->saveStateStackTop);
1969
1970     if (parenCount != 0) {
1971         memcpy((char *)(result + 1) +
1972                sizeof(REProgState) * result->saveStateStackTop,
1973                &x->parens[parenIndex],
1974                sizeof(RECapture) * parenCount);
1975         for (i = 0; i != parenCount; i++)
1976             x->parens[parenIndex + i].index = -1;
1977     }
1978
1979     return result;
1980 }
1981
1982 static inline match_state_t *
1983 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1984               size_t length)
1985 {
1986     size_t i;
1987     assert(gData->cpend >= x->cp);
1988     if (length > (size_t)(gData->cpend - x->cp))
1989         return NULL;
1990     for (i = 0; i != length; i++) {
1991         if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
1992             return NULL;
1993     }
1994     x->cp += length;
1995     return x;
1996 }
1997
1998 /*
1999  * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2000  * 2. If E is not a character then go to step 6.
2001  * 3. Let ch be E's character.
2002  * 4. Let A be a one-element RECharSet containing the character ch.
2003  * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2004  * 6. E must be an integer. Let n be that integer.
2005  * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2006  * 8. Return an internal Matcher closure that takes two arguments, a State x
2007  *    and a Continuation c, and performs the following:
2008  *     1. Let cap be x's captures internal array.
2009  *     2. Let s be cap[n].
2010  *     3. If s is undefined, then call c(x) and return its result.
2011  *     4. Let e be x's endIndex.
2012  *     5. Let len be s's length.
2013  *     6. Let f be e+len.
2014  *     7. If f>InputLength, return failure.
2015  *     8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2016  *        such that Canonicalize(s[i]) is not the same character as
2017  *        Canonicalize(Input [e+i]), then return failure.
2018  *     9. Let y be the State (f, cap).
2019  *     10. Call c(y) and return its result.
2020  */
2021 static match_state_t *
2022 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2023 {
2024     size_t len, i;
2025     const WCHAR *parenContent;
2026     RECapture *cap = &x->parens[parenIndex];
2027
2028     if (cap->index == -1)
2029         return x;
2030
2031     len = cap->length;
2032     if (x->cp + len > gData->cpend)
2033         return NULL;
2034
2035     parenContent = &gData->cpbegin[cap->index];
2036     if (gData->regexp->flags & REG_FOLD) {
2037         for (i = 0; i < len; i++) {
2038             if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2039                 return NULL;
2040         }
2041     } else {
2042         for (i = 0; i < len; i++) {
2043             if (parenContent[i] != x->cp[i])
2044                 return NULL;
2045         }
2046     }
2047     x->cp += len;
2048     return x;
2049 }
2050
2051 /* Add a single character to the RECharSet */
2052 static void
2053 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2054 {
2055     UINT byteIndex = (UINT)(c >> 3);
2056     assert(c <= cs->length);
2057     cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2058 }
2059
2060
2061 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2062 static void
2063 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2064 {
2065     UINT i;
2066
2067     UINT byteIndex1 = c1 >> 3;
2068     UINT byteIndex2 = c2 >> 3;
2069
2070     assert(c2 <= cs->length && c1 <= c2);
2071
2072     c1 &= 0x7;
2073     c2 &= 0x7;
2074
2075     if (byteIndex1 == byteIndex2) {
2076         cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2077     } else {
2078         cs->u.bits[byteIndex1] |= 0xFF << c1;
2079         for (i = byteIndex1 + 1; i < byteIndex2; i++)
2080             cs->u.bits[i] = 0xFF;
2081         cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2082     }
2083 }
2084
2085 /* Compile the source of the class into a RECharSet */
2086 static BOOL
2087 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2088 {
2089     const WCHAR *src, *end;
2090     BOOL inRange = FALSE;
2091     WCHAR rangeStart = 0;
2092     UINT byteLength, n;
2093     WCHAR c, thisCh;
2094     INT nDigits, i;
2095
2096     assert(!charSet->converted);
2097     /*
2098      * Assert that startIndex and length points to chars inside [] inside
2099      * source string.
2100      */
2101     assert(1 <= charSet->u.src.startIndex);
2102     assert(charSet->u.src.startIndex < gData->regexp->source_len);
2103     assert(charSet->u.src.length <= gData->regexp->source_len
2104             - 1 - charSet->u.src.startIndex);
2105
2106     charSet->converted = TRUE;
2107     src = gData->regexp->source + charSet->u.src.startIndex;
2108
2109     end = src + charSet->u.src.length;
2110
2111     assert(src[-1] == '[' && end[0] == ']');
2112
2113     byteLength = (charSet->length >> 3) + 1;
2114     charSet->u.bits = heap_alloc(byteLength);
2115     if (!charSet->u.bits) {
2116         JS_ReportOutOfMemory(gData->cx);
2117         gData->ok = FALSE;
2118         return FALSE;
2119     }
2120     memset(charSet->u.bits, 0, byteLength);
2121
2122     if (src == end)
2123         return TRUE;
2124
2125     if (*src == '^') {
2126         assert(charSet->sense == FALSE);
2127         ++src;
2128     } else {
2129         assert(charSet->sense == TRUE);
2130     }
2131
2132     while (src != end) {
2133         switch (*src) {
2134           case '\\':
2135             ++src;
2136             c = *src++;
2137             switch (c) {
2138               case 'b':
2139                 thisCh = 0x8;
2140                 break;
2141               case 'f':
2142                 thisCh = 0xC;
2143                 break;
2144               case 'n':
2145                 thisCh = 0xA;
2146                 break;
2147               case 'r':
2148                 thisCh = 0xD;
2149                 break;
2150               case 't':
2151                 thisCh = 0x9;
2152                 break;
2153               case 'v':
2154                 thisCh = 0xB;
2155                 break;
2156               case 'c':
2157                 if (src < end && JS_ISWORD(*src)) {
2158                     thisCh = (WCHAR)(*src++ & 0x1F);
2159                 } else {
2160                     --src;
2161                     thisCh = '\\';
2162                 }
2163                 break;
2164               case 'x':
2165                 nDigits = 2;
2166                 goto lexHex;
2167               case 'u':
2168                 nDigits = 4;
2169             lexHex:
2170                 n = 0;
2171                 for (i = 0; (i < nDigits) && (src < end); i++) {
2172                     UINT digit;
2173                     c = *src++;
2174                     if (!isASCIIHexDigit(c, &digit)) {
2175                         /*
2176                          * Back off to accepting the original '\'
2177                          * as a literal
2178                          */
2179                         src -= i + 1;
2180                         n = '\\';
2181                         break;
2182                     }
2183                     n = (n << 4) | digit;
2184                 }
2185                 thisCh = (WCHAR)n;
2186                 break;
2187               case '0':
2188               case '1':
2189               case '2':
2190               case '3':
2191               case '4':
2192               case '5':
2193               case '6':
2194               case '7':
2195                 /*
2196                  *  This is a non-ECMA extension - decimal escapes (in this
2197                  *  case, octal!) are supposed to be an error inside class
2198                  *  ranges, but supported here for backwards compatibility.
2199                  */
2200                 n = JS7_UNDEC(c);
2201                 c = *src;
2202                 if ('0' <= c && c <= '7') {
2203                     src++;
2204                     n = 8 * n + JS7_UNDEC(c);
2205                     c = *src;
2206                     if ('0' <= c && c <= '7') {
2207                         src++;
2208                         i = 8 * n + JS7_UNDEC(c);
2209                         if (i <= 0377)
2210                             n = i;
2211                         else
2212                             src--;
2213                     }
2214                 }
2215                 thisCh = (WCHAR)n;
2216                 break;
2217
2218               case 'd':
2219                 AddCharacterRangeToCharSet(charSet, '0', '9');
2220                 continue;   /* don't need range processing */
2221               case 'D':
2222                 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2223                 AddCharacterRangeToCharSet(charSet,
2224                                            (WCHAR)('9' + 1),
2225                                            (WCHAR)charSet->length);
2226                 continue;
2227               case 's':
2228                 for (i = (INT)charSet->length; i >= 0; i--)
2229                     if (isspaceW(i))
2230                         AddCharacterToCharSet(charSet, (WCHAR)i);
2231                 continue;
2232               case 'S':
2233                 for (i = (INT)charSet->length; i >= 0; i--)
2234                     if (!isspaceW(i))
2235                         AddCharacterToCharSet(charSet, (WCHAR)i);
2236                 continue;
2237               case 'w':
2238                 for (i = (INT)charSet->length; i >= 0; i--)
2239                     if (JS_ISWORD(i))
2240                         AddCharacterToCharSet(charSet, (WCHAR)i);
2241                 continue;
2242               case 'W':
2243                 for (i = (INT)charSet->length; i >= 0; i--)
2244                     if (!JS_ISWORD(i))
2245                         AddCharacterToCharSet(charSet, (WCHAR)i);
2246                 continue;
2247               default:
2248                 thisCh = c;
2249                 break;
2250
2251             }
2252             break;
2253
2254           default:
2255             thisCh = *src++;
2256             break;
2257
2258         }
2259         if (inRange) {
2260             if (gData->regexp->flags & REG_FOLD) {
2261                 assert(rangeStart <= thisCh);
2262                 for (i = rangeStart; i <= thisCh; i++) {
2263                     WCHAR uch, dch;
2264
2265                     AddCharacterToCharSet(charSet, i);
2266                     uch = toupperW(i);
2267                     dch = tolowerW(i);
2268                     if (i != uch)
2269                         AddCharacterToCharSet(charSet, uch);
2270                     if (i != dch)
2271                         AddCharacterToCharSet(charSet, dch);
2272                 }
2273             } else {
2274                 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2275             }
2276             inRange = FALSE;
2277         } else {
2278             if (gData->regexp->flags & REG_FOLD) {
2279                 AddCharacterToCharSet(charSet, toupperW(thisCh));
2280                 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2281             } else {
2282                 AddCharacterToCharSet(charSet, thisCh);
2283             }
2284             if (src < end - 1) {
2285                 if (*src == '-') {
2286                     ++src;
2287                     inRange = TRUE;
2288                     rangeStart = thisCh;
2289                 }
2290             }
2291         }
2292     }
2293     return TRUE;
2294 }
2295
2296 static BOOL
2297 ReallocStateStack(REGlobalData *gData)
2298 {
2299     size_t limit = gData->stateStackLimit;
2300     size_t sz = sizeof(REProgState) * limit;
2301
2302     gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2303     if (!gData->stateStack) {
2304         js_ReportOutOfScriptQuota(gData->cx);
2305         gData->ok = FALSE;
2306         return FALSE;
2307     }
2308     gData->stateStackLimit = limit + limit;
2309     return TRUE;
2310 }
2311
2312 #define PUSH_STATE_STACK(data)                                                \
2313     do {                                                                      \
2314         ++(data)->stateStackTop;                                              \
2315         if ((data)->stateStackTop == (data)->stateStackLimit &&               \
2316             !ReallocStateStack((data))) {                                     \
2317             return NULL;                                                      \
2318         }                                                                     \
2319     }while(0)
2320
2321 /*
2322  * Apply the current op against the given input to see if it's going to match
2323  * or fail. Return false if we don't get a match, true if we do. If updatecp is
2324  * true, then update the current state's cp. Always update startpc to the next
2325  * op.
2326  */
2327 static inline match_state_t *
2328 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2329             jsbytecode **startpc, BOOL updatecp)
2330 {
2331     match_state_t *result = NULL;
2332     WCHAR matchCh;
2333     size_t parenIndex;
2334     size_t offset, length, index;
2335     jsbytecode *pc = *startpc;  /* pc has already been incremented past op */
2336     const WCHAR *source;
2337     const WCHAR *startcp = x->cp;
2338     WCHAR ch;
2339     RECharSet *charSet;
2340
2341     const char *opname = reop_names[op];
2342     TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2343           (int)gData->stateStackTop * 2, "", opname);
2344
2345     switch (op) {
2346       case REOP_EMPTY:
2347         result = x;
2348         break;
2349       case REOP_BOL:
2350         if (x->cp != gData->cpbegin) {
2351             if (/*!gData->cx->regExpStatics.multiline &&  FIXME !!! */
2352                 !(gData->regexp->flags & REG_MULTILINE)) {
2353                 break;
2354             }
2355             if (!RE_IS_LINE_TERM(x->cp[-1]))
2356                 break;
2357         }
2358         result = x;
2359         break;
2360       case REOP_EOL:
2361         if (x->cp != gData->cpend) {
2362             if (/*!gData->cx->regExpStatics.multiline &&*/
2363                 !(gData->regexp->flags & REG_MULTILINE)) {
2364                 break;
2365             }
2366             if (!RE_IS_LINE_TERM(*x->cp))
2367                 break;
2368         }
2369         result = x;
2370         break;
2371       case REOP_WBDRY:
2372         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2373             !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2374             result = x;
2375         }
2376         break;
2377       case REOP_WNONBDRY:
2378         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2379             (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2380             result = x;
2381         }
2382         break;
2383       case REOP_DOT:
2384         if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2385             result = x;
2386             result->cp++;
2387         }
2388         break;
2389       case REOP_DIGIT:
2390         if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2391             result = x;
2392             result->cp++;
2393         }
2394         break;
2395       case REOP_NONDIGIT:
2396         if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2397             result = x;
2398             result->cp++;
2399         }
2400         break;
2401       case REOP_ALNUM:
2402         if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2403             result = x;
2404             result->cp++;
2405         }
2406         break;
2407       case REOP_NONALNUM:
2408         if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2409             result = x;
2410             result->cp++;
2411         }
2412         break;
2413       case REOP_SPACE:
2414         if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2415             result = x;
2416             result->cp++;
2417         }
2418         break;
2419       case REOP_NONSPACE:
2420         if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2421             result = x;
2422             result->cp++;
2423         }
2424         break;
2425       case REOP_BACKREF:
2426         pc = ReadCompactIndex(pc, &parenIndex);
2427         assert(parenIndex < gData->regexp->parenCount);
2428         result = BackrefMatcher(gData, x, parenIndex);
2429         break;
2430       case REOP_FLAT:
2431         pc = ReadCompactIndex(pc, &offset);
2432         assert(offset < gData->regexp->source_len);
2433         pc = ReadCompactIndex(pc, &length);
2434         assert(1 <= length);
2435         assert(length <= gData->regexp->source_len - offset);
2436         if (length <= (size_t)(gData->cpend - x->cp)) {
2437             source = gData->regexp->source + offset;
2438             TRACE("%s\n", debugstr_wn(source, length));
2439             for (index = 0; index != length; index++) {
2440                 if (source[index] != x->cp[index])
2441                     return NULL;
2442             }
2443             x->cp += length;
2444             result = x;
2445         }
2446         break;
2447       case REOP_FLAT1:
2448         matchCh = *pc++;
2449         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2450         if (x->cp != gData->cpend && *x->cp == matchCh) {
2451             result = x;
2452             result->cp++;
2453         }
2454         break;
2455       case REOP_FLATi:
2456         pc = ReadCompactIndex(pc, &offset);
2457         assert(offset < gData->regexp->source_len);
2458         pc = ReadCompactIndex(pc, &length);
2459         assert(1 <= length);
2460         assert(length <= gData->regexp->source_len - offset);
2461         source = gData->regexp->source;
2462         result = FlatNIMatcher(gData, x, source + offset, length);
2463         break;
2464       case REOP_FLAT1i:
2465         matchCh = *pc++;
2466         if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2467             result = x;
2468             result->cp++;
2469         }
2470         break;
2471       case REOP_UCFLAT1:
2472         matchCh = GET_ARG(pc);
2473         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2474         pc += ARG_LEN;
2475         if (x->cp != gData->cpend && *x->cp == matchCh) {
2476             result = x;
2477             result->cp++;
2478         }
2479         break;
2480       case REOP_UCFLAT1i:
2481         matchCh = GET_ARG(pc);
2482         pc += ARG_LEN;
2483         if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2484             result = x;
2485             result->cp++;
2486         }
2487         break;
2488       case REOP_CLASS:
2489         pc = ReadCompactIndex(pc, &index);
2490         assert(index < gData->regexp->classCount);
2491         if (x->cp != gData->cpend) {
2492             charSet = &gData->regexp->classList[index];
2493             assert(charSet->converted);
2494             ch = *x->cp;
2495             index = ch >> 3;
2496             if (charSet->length != 0 &&
2497                 ch <= charSet->length &&
2498                 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2499                 result = x;
2500                 result->cp++;
2501             }
2502         }
2503         break;
2504       case REOP_NCLASS:
2505         pc = ReadCompactIndex(pc, &index);
2506         assert(index < gData->regexp->classCount);
2507         if (x->cp != gData->cpend) {
2508             charSet = &gData->regexp->classList[index];
2509             assert(charSet->converted);
2510             ch = *x->cp;
2511             index = ch >> 3;
2512             if (charSet->length == 0 ||
2513                 ch > charSet->length ||
2514                 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2515                 result = x;
2516                 result->cp++;
2517             }
2518         }
2519         break;
2520
2521       default:
2522         assert(FALSE);
2523     }
2524     if (result) {
2525         if (!updatecp)
2526             x->cp = startcp;
2527         *startpc = pc;
2528         TRACE(" *\n");
2529         return result;
2530     }
2531     x->cp = startcp;
2532     return NULL;
2533 }
2534
2535 static inline match_state_t *
2536 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2537 {
2538     match_state_t *result = NULL;
2539     REBackTrackData *backTrackData;
2540     jsbytecode *nextpc, *testpc;
2541     REOp nextop;
2542     RECapture *cap;
2543     REProgState *curState;
2544     const WCHAR *startcp;
2545     size_t parenIndex, k;
2546     size_t parenSoFar = 0;
2547
2548     WCHAR matchCh1, matchCh2;
2549     RECharSet *charSet;
2550
2551     BOOL anchor;
2552     jsbytecode *pc = gData->regexp->program;
2553     REOp op = (REOp) *pc++;
2554
2555     /*
2556      * If the first node is a simple match, step the index into the string
2557      * until that match is made, or fail if it can't be found at all.
2558      */
2559     if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2560         anchor = FALSE;
2561         while (x->cp <= gData->cpend) {
2562             nextpc = pc;    /* reset back to start each time */
2563             result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2564             if (result) {
2565                 anchor = TRUE;
2566                 x = result;
2567                 pc = nextpc;    /* accept skip to next opcode */
2568                 op = (REOp) *pc++;
2569                 assert(op < REOP_LIMIT);
2570                 break;
2571             }
2572             gData->skipped++;
2573             x->cp++;
2574         }
2575         if (!anchor)
2576             goto bad;
2577     }
2578
2579     for (;;) {
2580         const char *opname = reop_names[op];
2581         TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2582               (int)gData->stateStackTop * 2, "", opname);
2583
2584         if (REOP_IS_SIMPLE(op)) {
2585             result = SimpleMatch(gData, x, op, &pc, TRUE);
2586         } else {
2587             curState = &gData->stateStack[gData->stateStackTop];
2588             switch (op) {
2589               case REOP_END:
2590                 goto good;
2591               case REOP_ALTPREREQ2:
2592                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2593                 pc += ARG_LEN;
2594                 matchCh2 = GET_ARG(pc);
2595                 pc += ARG_LEN;
2596                 k = GET_ARG(pc);
2597                 pc += ARG_LEN;
2598
2599                 if (x->cp != gData->cpend) {
2600                     if (*x->cp == matchCh2)
2601                         goto doAlt;
2602
2603                     charSet = &gData->regexp->classList[k];
2604                     if (!charSet->converted && !ProcessCharSet(gData, charSet))
2605                         goto bad;
2606                     matchCh1 = *x->cp;
2607                     k = matchCh1 >> 3;
2608                     if ((charSet->length == 0 ||
2609                          matchCh1 > charSet->length ||
2610                          !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2611                         charSet->sense) {
2612                         goto doAlt;
2613                     }
2614                 }
2615                 result = NULL;
2616                 break;
2617
2618               case REOP_ALTPREREQ:
2619                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2620                 pc += ARG_LEN;
2621                 matchCh1 = GET_ARG(pc);
2622                 pc += ARG_LEN;
2623                 matchCh2 = GET_ARG(pc);
2624                 pc += ARG_LEN;
2625                 if (x->cp == gData->cpend ||
2626                     (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2627                     result = NULL;
2628                     break;
2629                 }
2630                 /* else fall through... */
2631
2632               case REOP_ALT:
2633               doAlt:
2634                 nextpc = pc + GET_OFFSET(pc);   /* start of next alternate */
2635                 pc += ARG_LEN;                  /* start of this alternate */
2636                 curState->parenSoFar = parenSoFar;
2637                 PUSH_STATE_STACK(gData);
2638                 op = (REOp) *pc++;
2639                 startcp = x->cp;
2640                 if (REOP_IS_SIMPLE(op)) {
2641                     if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2642                         op = (REOp) *nextpc++;
2643                         pc = nextpc;
2644                         continue;
2645                     }
2646                     result = x;
2647                     op = (REOp) *pc++;
2648                 }
2649                 nextop = (REOp) *nextpc++;
2650                 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2651                     goto bad;
2652                 continue;
2653
2654               /*
2655                * Occurs at (successful) end of REOP_ALT,
2656                */
2657               case REOP_JUMP:
2658                 /*
2659                  * If we have not gotten a result here, it is because of an
2660                  * empty match.  Do the same thing REOP_EMPTY would do.
2661                  */
2662                 if (!result)
2663                     result = x;
2664
2665                 --gData->stateStackTop;
2666                 pc += GET_OFFSET(pc);
2667                 op = (REOp) *pc++;
2668                 continue;
2669
2670               /*
2671                * Occurs at last (successful) end of REOP_ALT,
2672                */
2673               case REOP_ENDALT:
2674                 /*
2675                  * If we have not gotten a result here, it is because of an
2676                  * empty match.  Do the same thing REOP_EMPTY would do.
2677                  */
2678                 if (!result)
2679                     result = x;
2680
2681                 --gData->stateStackTop;
2682                 op = (REOp) *pc++;
2683                 continue;
2684
2685               case REOP_LPAREN:
2686                 pc = ReadCompactIndex(pc, &parenIndex);
2687                 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2688                 assert(parenIndex < gData->regexp->parenCount);
2689                 if (parenIndex + 1 > parenSoFar)
2690                     parenSoFar = parenIndex + 1;
2691                 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2692                 x->parens[parenIndex].length = 0;
2693                 op = (REOp) *pc++;
2694                 continue;
2695
2696               case REOP_RPAREN:
2697               {
2698                 ptrdiff_t delta;
2699
2700                 pc = ReadCompactIndex(pc, &parenIndex);
2701                 assert(parenIndex < gData->regexp->parenCount);
2702                 cap = &x->parens[parenIndex];
2703                 delta = x->cp - (gData->cpbegin + cap->index);
2704                 cap->length = (delta < 0) ? 0 : (size_t) delta;
2705                 op = (REOp) *pc++;
2706
2707                 if (!result)
2708                     result = x;
2709                 continue;
2710               }
2711               case REOP_ASSERT:
2712                 nextpc = pc + GET_OFFSET(pc);  /* start of term after ASSERT */
2713                 pc += ARG_LEN;                 /* start of ASSERT child */
2714                 op = (REOp) *pc++;
2715                 testpc = pc;
2716                 if (REOP_IS_SIMPLE(op) &&
2717                     !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2718                     result = NULL;
2719                     break;
2720                 }
2721                 curState->u.assertion.top =
2722                     (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2723                 curState->u.assertion.sz = gData->cursz;
2724                 curState->index = x->cp - gData->cpbegin;
2725                 curState->parenSoFar = parenSoFar;
2726                 PUSH_STATE_STACK(gData);
2727                 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2728                                         nextpc, x, x->cp, 0, 0)) {
2729                     goto bad;
2730                 }
2731                 continue;
2732
2733               case REOP_ASSERT_NOT:
2734                 nextpc = pc + GET_OFFSET(pc);
2735                 pc += ARG_LEN;
2736                 op = (REOp) *pc++;
2737                 testpc = pc;
2738                 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2739                     SimpleMatch(gData, x, op, &testpc, FALSE) &&
2740                     *testpc == REOP_ASSERTNOTTEST) {
2741                     result = NULL;
2742                     break;
2743                 }
2744                 curState->u.assertion.top
2745                     = (char *)gData->backTrackSP -
2746                       (char *)gData->backTrackStack;
2747                 curState->u.assertion.sz = gData->cursz;
2748                 curState->index = x->cp - gData->cpbegin;
2749                 curState->parenSoFar = parenSoFar;
2750                 PUSH_STATE_STACK(gData);
2751                 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2752                                         nextpc, x, x->cp, 0, 0)) {
2753                     goto bad;
2754                 }
2755                 continue;
2756
2757               case REOP_ASSERTTEST:
2758                 --gData->stateStackTop;
2759                 --curState;
2760                 x->cp = gData->cpbegin + curState->index;
2761                 gData->backTrackSP =
2762                     (REBackTrackData *) ((char *)gData->backTrackStack +
2763                                          curState->u.assertion.top);
2764                 gData->cursz = curState->u.assertion.sz;
2765                 if (result)
2766                     result = x;
2767                 break;
2768
2769               case REOP_ASSERTNOTTEST:
2770                 --gData->stateStackTop;
2771                 --curState;
2772                 x->cp = gData->cpbegin + curState->index;
2773                 gData->backTrackSP =
2774                     (REBackTrackData *) ((char *)gData->backTrackStack +
2775                                          curState->u.assertion.top);
2776                 gData->cursz = curState->u.assertion.sz;
2777                 result = (!result) ? x : NULL;
2778                 break;
2779               case REOP_STAR:
2780                 curState->u.quantifier.min = 0;
2781                 curState->u.quantifier.max = (UINT)-1;
2782                 goto quantcommon;
2783               case REOP_PLUS:
2784                 curState->u.quantifier.min = 1;
2785                 curState->u.quantifier.max = (UINT)-1;
2786                 goto quantcommon;
2787               case REOP_OPT:
2788                 curState->u.quantifier.min = 0;
2789                 curState->u.quantifier.max = 1;
2790                 goto quantcommon;
2791               case REOP_QUANT:
2792                 pc = ReadCompactIndex(pc, &k);
2793                 curState->u.quantifier.min = k;
2794                 pc = ReadCompactIndex(pc, &k);
2795                 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2796                 curState->u.quantifier.max = k - 1;
2797                 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2798               quantcommon:
2799                 if (curState->u.quantifier.max == 0) {
2800                     pc = pc + GET_OFFSET(pc);
2801                     op = (REOp) *pc++;
2802                     result = x;
2803                     continue;
2804                 }
2805                 /* Step over <next> */
2806                 nextpc = pc + ARG_LEN;
2807                 op = (REOp) *nextpc++;
2808                 startcp = x->cp;
2809                 if (REOP_IS_SIMPLE(op)) {
2810                     if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2811                         if (curState->u.quantifier.min == 0)
2812                             result = x;
2813                         else
2814                             result = NULL;
2815                         pc = pc + GET_OFFSET(pc);
2816                         break;
2817                     }
2818                     op = (REOp) *nextpc++;
2819                     result = x;
2820                 }
2821                 curState->index = startcp - gData->cpbegin;
2822                 curState->continue_op = REOP_REPEAT;
2823                 curState->continue_pc = pc;
2824                 curState->parenSoFar = parenSoFar;
2825                 PUSH_STATE_STACK(gData);
2826                 if (curState->u.quantifier.min == 0 &&
2827                     !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2828                                         0, 0)) {
2829                     goto bad;
2830                 }
2831                 pc = nextpc;
2832                 continue;
2833
2834               case REOP_ENDCHILD: /* marks the end of a quantifier child */
2835                 pc = curState[-1].continue_pc;
2836                 op = (REOp) curState[-1].continue_op;
2837
2838                 if (!result)
2839                     result = x;
2840                 continue;
2841
2842               case REOP_REPEAT:
2843                 --curState;
2844                 do {
2845                     --gData->stateStackTop;
2846                     if (!result) {
2847                         /* Failed, see if we have enough children. */
2848                         if (curState->u.quantifier.min == 0)
2849                             goto repeatDone;
2850                         goto break_switch;
2851                     }
2852                     if (curState->u.quantifier.min == 0 &&
2853                         x->cp == gData->cpbegin + curState->index) {
2854                         /* matched an empty string, that'll get us nowhere */
2855                         result = NULL;
2856                         goto break_switch;
2857                     }
2858                     if (curState->u.quantifier.min != 0)
2859                         curState->u.quantifier.min--;
2860                     if (curState->u.quantifier.max != (UINT) -1)
2861                         curState->u.quantifier.max--;
2862                     if (curState->u.quantifier.max == 0)
2863                         goto repeatDone;
2864                     nextpc = pc + ARG_LEN;
2865                     nextop = (REOp) *nextpc;
2866                     startcp = x->cp;
2867                     if (REOP_IS_SIMPLE(nextop)) {
2868                         nextpc++;
2869                         if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2870                             if (curState->u.quantifier.min == 0)
2871                                 goto repeatDone;
2872                             result = NULL;
2873                             goto break_switch;
2874                         }
2875                         result = x;
2876                     }
2877                     curState->index = startcp - gData->cpbegin;
2878                     PUSH_STATE_STACK(gData);
2879                     if (curState->u.quantifier.min == 0 &&
2880                         !PushBackTrackState(gData, REOP_REPEAT,
2881                                             pc, x, startcp,
2882                                             curState->parenSoFar,
2883                                             parenSoFar -
2884                                             curState->parenSoFar)) {
2885                         goto bad;
2886                     }
2887                 } while (*nextpc == REOP_ENDCHILD);
2888                 pc = nextpc;
2889                 op = (REOp) *pc++;
2890                 parenSoFar = curState->parenSoFar;
2891                 continue;
2892
2893               repeatDone:
2894                 result = x;
2895                 pc += GET_OFFSET(pc);
2896                 goto break_switch;
2897
2898               case REOP_MINIMALSTAR:
2899                 curState->u.quantifier.min = 0;
2900                 curState->u.quantifier.max = (UINT)-1;
2901                 goto minimalquantcommon;
2902               case REOP_MINIMALPLUS:
2903                 curState->u.quantifier.min = 1;
2904                 curState->u.quantifier.max = (UINT)-1;
2905                 goto minimalquantcommon;
2906               case REOP_MINIMALOPT:
2907                 curState->u.quantifier.min = 0;
2908                 curState->u.quantifier.max = 1;
2909                 goto minimalquantcommon;
2910               case REOP_MINIMALQUANT:
2911                 pc = ReadCompactIndex(pc, &k);
2912                 curState->u.quantifier.min = k;
2913                 pc = ReadCompactIndex(pc, &k);
2914                 /* See REOP_QUANT comments about k - 1. */
2915                 curState->u.quantifier.max = k - 1;
2916                 assert(curState->u.quantifier.min
2917                           <= curState->u.quantifier.max);
2918               minimalquantcommon:
2919                 curState->index = x->cp - gData->cpbegin;
2920                 curState->parenSoFar = parenSoFar;
2921                 PUSH_STATE_STACK(gData);
2922                 if (curState->u.quantifier.min != 0) {
2923                     curState->continue_op = REOP_MINIMALREPEAT;
2924                     curState->continue_pc = pc;
2925                     /* step over <next> */
2926                     pc += OFFSET_LEN;
2927                     op = (REOp) *pc++;
2928                 } else {
2929                     if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2930                                             pc, x, x->cp, 0, 0)) {
2931                         goto bad;
2932                     }
2933                     --gData->stateStackTop;
2934                     pc = pc + GET_OFFSET(pc);
2935                     op = (REOp) *pc++;
2936                 }
2937                 continue;
2938
2939               case REOP_MINIMALREPEAT:
2940                 --gData->stateStackTop;
2941                 --curState;
2942
2943                 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2944 #define PREPARE_REPEAT()                                                      \
2945     do {                                                                      \
2946         curState->index = x->cp - gData->cpbegin;                          \
2947         curState->continue_op = REOP_MINIMALREPEAT;                           \
2948         curState->continue_pc = pc;                                           \
2949         pc += ARG_LEN;                                                        \
2950         for (k = curState->parenSoFar; k < parenSoFar; k++)                   \
2951             x->parens[k].index = -1;                                          \
2952         PUSH_STATE_STACK(gData);                                              \
2953         op = (REOp) *pc++;                                                    \
2954         assert(op < REOP_LIMIT);                                              \
2955     }while(0)
2956
2957                 if (!result) {
2958                     TRACE(" -\n");
2959                     /*
2960                      * Non-greedy failure - try to consume another child.
2961                      */
2962                     if (curState->u.quantifier.max == (UINT) -1 ||
2963                         curState->u.quantifier.max > 0) {
2964                         PREPARE_REPEAT();
2965                         continue;
2966                     }
2967                     /* Don't need to adjust pc since we're going to pop. */
2968                     break;
2969                 }
2970                 if (curState->u.quantifier.min == 0 &&
2971                     x->cp == gData->cpbegin + curState->index) {
2972                     /* Matched an empty string, that'll get us nowhere. */
2973                     result = NULL;
2974                     break;
2975                 }
2976                 if (curState->u.quantifier.min != 0)
2977                     curState->u.quantifier.min--;
2978                 if (curState->u.quantifier.max != (UINT) -1)
2979                     curState->u.quantifier.max--;
2980                 if (curState->u.quantifier.min != 0) {
2981                     PREPARE_REPEAT();
2982                     continue;
2983                 }
2984                 curState->index = x->cp - gData->cpbegin;
2985                 curState->parenSoFar = parenSoFar;
2986                 PUSH_STATE_STACK(gData);
2987                 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2988                                         pc, x, x->cp,
2989                                         curState->parenSoFar,
2990                                         parenSoFar - curState->parenSoFar)) {
2991                     goto bad;
2992                 }
2993                 --gData->stateStackTop;
2994                 pc = pc + GET_OFFSET(pc);
2995                 op = (REOp) *pc++;
2996                 assert(op < REOP_LIMIT);
2997                 continue;
2998               default:
2999                 assert(FALSE);
3000                 result = NULL;
3001             }
3002           break_switch:;
3003         }
3004
3005         /*
3006          *  If the match failed and there's a backtrack option, take it.
3007          *  Otherwise this is a complete and utter failure.
3008          */
3009         if (!result) {
3010             if (gData->cursz == 0)
3011                 return NULL;
3012
3013             /* Potentially detect explosive regex here. */
3014             gData->backTrackCount++;
3015             if (gData->backTrackLimit &&
3016                 gData->backTrackCount >= gData->backTrackLimit) {
3017                 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3018                                      JSMSG_REGEXP_TOO_COMPLEX);
3019                 gData->ok = FALSE;
3020                 return NULL;
3021             }
3022
3023             backTrackData = gData->backTrackSP;
3024             gData->cursz = backTrackData->sz;
3025             gData->backTrackSP =
3026                 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3027             x->cp = backTrackData->cp;
3028             pc = backTrackData->backtrack_pc;
3029             op = (REOp) backTrackData->backtrack_op;
3030             assert(op < REOP_LIMIT);
3031             gData->stateStackTop = backTrackData->saveStateStackTop;
3032             assert(gData->stateStackTop);
3033
3034             memcpy(gData->stateStack, backTrackData + 1,
3035                    sizeof(REProgState) * backTrackData->saveStateStackTop);
3036             curState = &gData->stateStack[gData->stateStackTop - 1];
3037
3038             if (backTrackData->parenCount) {
3039                 memcpy(&x->parens[backTrackData->parenIndex],
3040                        (char *)(backTrackData + 1) +
3041                        sizeof(REProgState) * backTrackData->saveStateStackTop,
3042                        sizeof(RECapture) * backTrackData->parenCount);
3043                 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3044             } else {
3045                 for (k = curState->parenSoFar; k < parenSoFar; k++)
3046                     x->parens[k].index = -1;
3047                 parenSoFar = curState->parenSoFar;
3048             }
3049
3050             TRACE("\tBT_Pop: %ld,%ld\n",
3051                      (ULONG_PTR)backTrackData->parenIndex,
3052                      (ULONG_PTR)backTrackData->parenCount);
3053             continue;
3054         }
3055         x = result;
3056
3057         /*
3058          *  Continue with the expression.
3059          */
3060         op = (REOp)*pc++;
3061         assert(op < REOP_LIMIT);
3062     }
3063
3064 bad:
3065     TRACE("\n");
3066     return NULL;
3067
3068 good:
3069     TRACE("\n");
3070     return x;
3071 }
3072
3073 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3074 {
3075     match_state_t *result;
3076     const WCHAR *cp = x->cp;
3077     const WCHAR *cp2;
3078     UINT j;
3079
3080     /*
3081      * Have to include the position beyond the last character
3082      * in order to detect end-of-input/line condition.
3083      */
3084     for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3085         gData->skipped = cp2 - cp;
3086         x->cp = cp2;
3087         for (j = 0; j < gData->regexp->parenCount; j++)
3088             x->parens[j].index = -1;
3089         result = ExecuteREBytecode(gData, x);
3090         if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3091             return result;
3092         gData->backTrackSP = gData->backTrackStack;
3093         gData->cursz = 0;
3094         gData->stateStackTop = 0;
3095         cp2 = cp + gData->skipped;
3096     }
3097     return NULL;
3098 }
3099
3100 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3101 {
3102     UINT i;
3103
3104     gData->backTrackStackSize = INITIAL_BACKTRACK;
3105     gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3106     if (!gData->backTrackStack)
3107         goto bad;
3108
3109     gData->backTrackSP = gData->backTrackStack;
3110     gData->cursz = 0;
3111     gData->backTrackCount = 0;
3112     gData->backTrackLimit = 0;
3113
3114     gData->stateStackLimit = INITIAL_STATESTACK;
3115     gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3116     if (!gData->stateStack)
3117         goto bad;
3118
3119     gData->stateStackTop = 0;
3120     gData->cx = cx;
3121     gData->pool = pool;
3122     gData->regexp = re;
3123     gData->ok = TRUE;
3124
3125     for (i = 0; i < re->classCount; i++) {
3126         if (!re->classList[i].converted &&
3127                 !ProcessCharSet(gData, &re->classList[i])) {
3128             return E_FAIL;
3129         }
3130     }
3131
3132     return S_OK;
3133
3134 bad:
3135     js_ReportOutOfScriptQuota(cx);
3136     gData->ok = FALSE;
3137     return E_OUTOFMEMORY;
3138 }
3139
3140 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3141         const WCHAR *str, DWORD str_len, match_state_t *result)
3142 {
3143     match_state_t *res;
3144     REGlobalData gData;
3145     heap_pool_t *mark = heap_pool_mark(pool);
3146     const WCHAR *str_beg = result->cp;
3147     HRESULT hres;
3148
3149     assert(result->cp != NULL);
3150
3151     gData.cpbegin = str;
3152     gData.cpend = str+str_len;
3153     gData.start = result->cp-str;
3154     gData.skipped = 0;
3155     gData.pool = pool;
3156
3157     hres = InitMatch(regexp, cx, pool, &gData);
3158     if(FAILED(hres)) {
3159         WARN("InitMatch failed\n");
3160         heap_pool_clear(mark);
3161         return hres;
3162     }
3163
3164     res = MatchRegExp(&gData, result);
3165     heap_pool_clear(mark);
3166     if(!gData.ok) {
3167         WARN("MatchRegExp failed\n");
3168         return E_FAIL;
3169     }
3170
3171     if(!res) {
3172         result->match_len = 0;
3173         return S_FALSE;
3174     }
3175
3176     result->match_len = (result->cp-str_beg) - gData.skipped;
3177     result->paren_count = regexp->parenCount;
3178     return S_OK;
3179 }
3180
3181 void regexp_destroy(regexp_t *re)
3182 {
3183     if (re->classList) {
3184         UINT i;
3185         for (i = 0; i < re->classCount; i++) {
3186             if (re->classList[i].converted)
3187                 heap_free(re->classList[i].u.bits);
3188             re->classList[i].u.bits = NULL;
3189         }
3190         heap_free(re->classList);
3191     }
3192     heap_free(re);
3193 }
3194
3195 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3196         DWORD str_len, WORD flags, BOOL flat)
3197 {
3198     regexp_t *re;
3199     heap_pool_t *mark;
3200     CompilerState state;
3201     size_t resize;
3202     jsbytecode *endPC;
3203     UINT i;
3204     size_t len;
3205
3206     re = NULL;
3207     mark = heap_pool_mark(pool);
3208     len = str_len;
3209
3210     state.context = cx;
3211     state.pool = pool;
3212     state.cp = str;
3213     if (!state.cp)
3214         goto out;
3215     state.cpbegin = state.cp;
3216     state.cpend = state.cp + len;
3217     state.flags = flags;
3218     state.parenCount = 0;
3219     state.classCount = 0;
3220     state.progLength = 0;
3221     state.treeDepth = 0;
3222     state.classBitmapsMem = 0;
3223     for (i = 0; i < CLASS_CACHE_SIZE; i++)
3224         state.classCache[i].start = NULL;
3225
3226     if (len != 0 && flat) {
3227         state.result = NewRENode(&state, REOP_FLAT);
3228         if (!state.result)
3229             goto out;
3230         state.result->u.flat.chr = *state.cpbegin;
3231         state.result->u.flat.length = len;
3232         state.result->kid = (void *) state.cpbegin;
3233         /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3234         state.progLength += 1 + GetCompactIndexWidth(0)
3235                           + GetCompactIndexWidth(len);
3236     } else {
3237         if (!ParseRegExp(&state))
3238             goto out;
3239     }
3240     resize = offsetof(regexp_t, program) + state.progLength + 1;
3241     re = heap_alloc(resize);
3242     if (!re)
3243         goto out;
3244
3245     assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3246     re->classCount = state.classCount;
3247     if (re->classCount) {
3248         re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3249         if (!re->classList) {
3250             regexp_destroy(re);
3251             re = NULL;
3252             goto out;
3253         }
3254         for (i = 0; i < re->classCount; i++)
3255             re->classList[i].converted = FALSE;
3256     } else {
3257         re->classList = NULL;
3258     }
3259     endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3260     if (!endPC) {
3261         regexp_destroy(re);
3262         re = NULL;
3263         goto out;
3264     }
3265     *endPC++ = REOP_END;
3266     /*
3267      * Check whether size was overestimated and shrink using realloc.
3268      * This is safe since no pointers to newly parsed regexp or its parts
3269      * besides re exist here.
3270      */
3271     if ((size_t)(endPC - re->program) != state.progLength + 1) {
3272         regexp_t *tmp;
3273         assert((size_t)(endPC - re->program) < state.progLength + 1);
3274         resize = offsetof(regexp_t, program) + (endPC - re->program);
3275         tmp = heap_realloc(re, resize);
3276         if (tmp)
3277             re = tmp;
3278     }
3279
3280     re->flags = flags;
3281     re->parenCount = state.parenCount;
3282     re->source = str;
3283     re->source_len = str_len;
3284
3285 out:
3286     heap_pool_clear(mark);
3287     return re;
3288 }