d3dcompiler: Don't allow semantics on void functions.
[wine/multimedia.git] / dlls / jscript / regexp.c
blob996f9f0808daa91743b960cc0826782ec88f8cdd
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
35 #include <math.h>
37 #include "jscript.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 BSTR source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
81 typedef struct {
82 jsdisp_t dispex;
84 JSRegExp *jsregexp;
85 BSTR str;
86 INT last_index;
87 VARIANT last_index_var;
88 } RegExpInstance;
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
99 static const WCHAR leftContextW[] =
100 {'l','e','f','t','C','o','n','t','e','x','t',0};
101 static const WCHAR rightContextW[] =
102 {'r','i','g','h','t','C','o','n','t','e','x','t',0};
104 static const WCHAR undefinedW[] = {'u','n','d','e','f','i','n','e','d',0};
105 static const WCHAR emptyW[] = {0};
107 /* FIXME: Better error handling */
108 #define ReportRegExpError(a,b,c)
109 #define ReportRegExpErrorHelper(a,b,c,d)
110 #define JS_ReportErrorNumber(a,b,c,d)
111 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
112 #define js_ReportOutOfScriptQuota(a)
113 #define JS_ReportOutOfMemory(a)
114 #define JS_COUNT_OPERATION(a,b)
116 #define JSMSG_MIN_TOO_BIG 47
117 #define JSMSG_MAX_TOO_BIG 48
118 #define JSMSG_OUT_OF_ORDER 49
119 #define JSMSG_OUT_OF_MEMORY 137
121 #define LINE_SEPARATOR 0x2028
122 #define PARA_SEPARATOR 0x2029
124 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
125 ((c >= 'a') && (c <= 'z')) )
126 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
127 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
129 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
131 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
132 #define JS7_UNDEC(c) ((c) - '0')
134 typedef enum REOp {
135 REOP_EMPTY,
136 REOP_BOL,
137 REOP_EOL,
138 REOP_WBDRY,
139 REOP_WNONBDRY,
140 REOP_DOT,
141 REOP_DIGIT,
142 REOP_NONDIGIT,
143 REOP_ALNUM,
144 REOP_NONALNUM,
145 REOP_SPACE,
146 REOP_NONSPACE,
147 REOP_BACKREF,
148 REOP_FLAT,
149 REOP_FLAT1,
150 REOP_FLATi,
151 REOP_FLAT1i,
152 REOP_UCFLAT1,
153 REOP_UCFLAT1i,
154 REOP_UCFLAT,
155 REOP_UCFLATi,
156 REOP_CLASS,
157 REOP_NCLASS,
158 REOP_ALT,
159 REOP_QUANT,
160 REOP_STAR,
161 REOP_PLUS,
162 REOP_OPT,
163 REOP_LPAREN,
164 REOP_RPAREN,
165 REOP_JUMP,
166 REOP_DOTSTAR,
167 REOP_LPARENNON,
168 REOP_ASSERT,
169 REOP_ASSERT_NOT,
170 REOP_ASSERTTEST,
171 REOP_ASSERTNOTTEST,
172 REOP_MINIMALSTAR,
173 REOP_MINIMALPLUS,
174 REOP_MINIMALOPT,
175 REOP_MINIMALQUANT,
176 REOP_ENDCHILD,
177 REOP_REPEAT,
178 REOP_MINIMALREPEAT,
179 REOP_ALTPREREQ,
180 REOP_ALTPREREQ2,
181 REOP_ENDALT,
182 REOP_CONCAT,
183 REOP_END,
184 REOP_LIMIT /* META: no operator >= to this */
185 } REOp;
187 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
189 static const char *reop_names[] = {
190 "empty",
191 "bol",
192 "eol",
193 "wbdry",
194 "wnonbdry",
195 "dot",
196 "digit",
197 "nondigit",
198 "alnum",
199 "nonalnum",
200 "space",
201 "nonspace",
202 "backref",
203 "flat",
204 "flat1",
205 "flati",
206 "flat1i",
207 "ucflat1",
208 "ucflat1i",
209 "ucflat",
210 "ucflati",
211 "class",
212 "nclass",
213 "alt",
214 "quant",
215 "star",
216 "plus",
217 "opt",
218 "lparen",
219 "rparen",
220 "jump",
221 "dotstar",
222 "lparennon",
223 "assert",
224 "assert_not",
225 "asserttest",
226 "assertnottest",
227 "minimalstar",
228 "minimalplus",
229 "minimalopt",
230 "minimalquant",
231 "endchild",
232 "repeat",
233 "minimalrepeat",
234 "altprereq",
235 "alrprereq2",
236 "endalt",
237 "concat",
238 "end",
239 NULL
242 typedef struct RECapture {
243 ptrdiff_t index; /* start of contents, -1 for empty */
244 size_t length; /* length of capture */
245 } RECapture;
247 typedef struct REMatchState {
248 const WCHAR *cp;
249 RECapture parens[1]; /* first of 're->parenCount' captures,
250 allocated at end of this struct */
251 } REMatchState;
253 typedef struct REProgState {
254 jsbytecode *continue_pc; /* current continuation data */
255 jsbytecode continue_op;
256 ptrdiff_t index; /* progress in text */
257 size_t parenSoFar; /* highest indexed paren started */
258 union {
259 struct {
260 UINT min; /* current quantifier limits */
261 UINT max;
262 } quantifier;
263 struct {
264 size_t top; /* backtrack stack state */
265 size_t sz;
266 } assertion;
267 } u;
268 } REProgState;
270 typedef struct REBackTrackData {
271 size_t sz; /* size of previous stack entry */
272 jsbytecode *backtrack_pc; /* where to backtrack to */
273 jsbytecode backtrack_op;
274 const WCHAR *cp; /* index in text of match at backtrack */
275 size_t parenIndex; /* start index of saved paren contents */
276 size_t parenCount; /* # of saved paren contents */
277 size_t saveStateStackTop; /* number of parent states */
278 /* saved parent states follow */
279 /* saved paren contents follow */
280 } REBackTrackData;
282 #define INITIAL_STATESTACK 100
283 #define INITIAL_BACKTRACK 8000
285 typedef struct REGlobalData {
286 script_ctx_t *cx;
287 JSRegExp *regexp; /* the RE in execution */
288 BOOL ok; /* runtime error (out_of_memory only?) */
289 size_t start; /* offset to start at */
290 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
291 const WCHAR *cpbegin; /* text base address */
292 const WCHAR *cpend; /* text limit address */
294 REProgState *stateStack; /* stack of state of current parents */
295 size_t stateStackTop;
296 size_t stateStackLimit;
298 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
299 REBackTrackData *backTrackSP;
300 size_t backTrackStackSize;
301 size_t cursz; /* size of current stack entry */
302 size_t backTrackCount; /* how many times we've backtracked */
303 size_t backTrackLimit; /* upper limit on backtrack states */
305 jsheap_t *pool; /* It's faster to use one malloc'd pool
306 than to malloc/free the three items
307 that are allocated from this pool */
308 } REGlobalData;
310 typedef struct RENode RENode;
311 struct RENode {
312 REOp op; /* r.e. op bytecode */
313 RENode *next; /* next in concatenation order */
314 void *kid; /* first operand */
315 union {
316 void *kid2; /* second operand */
317 INT num; /* could be a number */
318 size_t parenIndex; /* or a parenthesis index */
319 struct { /* or a quantifier range */
320 UINT min;
321 UINT max;
322 JSPackedBool greedy;
323 } range;
324 struct { /* or a character class */
325 size_t startIndex;
326 size_t kidlen; /* length of string at kid, in jschars */
327 size_t index; /* index into class list */
328 WORD bmsize; /* bitmap size, based on max char code */
329 JSPackedBool sense;
330 } ucclass;
331 struct { /* or a literal sequence */
332 WCHAR chr; /* of one character */
333 size_t length; /* or many (via the kid) */
334 } flat;
335 struct {
336 RENode *kid2; /* second operand from ALT */
337 WCHAR ch1; /* match char for ALTPREREQ */
338 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
339 } altprereq;
340 } u;
343 #define CLASS_CACHE_SIZE 4
345 typedef struct CompilerState {
346 script_ctx_t *context;
347 const WCHAR *cpbegin;
348 const WCHAR *cpend;
349 const WCHAR *cp;
350 size_t parenCount;
351 size_t classCount; /* number of [] encountered */
352 size_t treeDepth; /* maximum depth of parse tree */
353 size_t progLength; /* estimated bytecode length */
354 RENode *result;
355 size_t classBitmapsMem; /* memory to hold all class bitmaps */
356 struct {
357 const WCHAR *start; /* small cache of class strings */
358 size_t length; /* since they're often the same */
359 size_t index;
360 } classCache[CLASS_CACHE_SIZE];
361 WORD flags;
362 } CompilerState;
364 typedef struct EmitStateStackEntry {
365 jsbytecode *altHead; /* start of REOP_ALT* opcode */
366 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
367 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
368 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
369 RENode *continueNode; /* original REOP_ALT* node being stacked */
370 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
371 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
372 avoid 16-bit unsigned offset overflow */
373 } EmitStateStackEntry;
376 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
377 * the getters and setters take the pc of the offset, not of the opcode before
378 * the offset.
380 #define ARG_LEN 2
381 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
382 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
383 (pc)[1] = (jsbytecode) (arg))
385 #define OFFSET_LEN ARG_LEN
386 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
387 #define GET_OFFSET(pc) GET_ARG(pc)
389 static BOOL ParseRegExp(CompilerState*);
392 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
393 * For sanity, we limit it to 2^24 bytes.
395 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
398 * The maximum memory that can be allocated for class bitmaps.
399 * For sanity, we limit it to 2^24 bytes.
401 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
404 * Functions to get size and write/read bytecode that represent small indexes
405 * compactly.
406 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
407 * indicates that the following byte brings more bits to the index. Otherwise
408 * this is the last byte in the index bytecode representing highest index bits.
410 static size_t
411 GetCompactIndexWidth(size_t index)
413 size_t width;
415 for (width = 1; (index >>= 7) != 0; ++width) { }
416 return width;
419 static inline jsbytecode *
420 WriteCompactIndex(jsbytecode *pc, size_t index)
422 size_t next;
424 while ((next = index >> 7) != 0) {
425 *pc++ = (jsbytecode)(index | 0x80);
426 index = next;
428 *pc++ = (jsbytecode)index;
429 return pc;
432 static inline jsbytecode *
433 ReadCompactIndex(jsbytecode *pc, size_t *result)
435 size_t nextByte;
437 nextByte = *pc++;
438 if ((nextByte & 0x80) == 0) {
440 * Short-circuit the most common case when compact index <= 127.
442 *result = nextByte;
443 } else {
444 size_t shift = 7;
445 *result = 0x7F & nextByte;
446 do {
447 nextByte = *pc++;
448 *result |= (nextByte & 0x7F) << shift;
449 shift += 7;
450 } while ((nextByte & 0x80) != 0);
452 return pc;
455 /* Construct and initialize an RENode, returning NULL for out-of-memory */
456 static RENode *
457 NewRENode(CompilerState *state, REOp op)
459 RENode *ren;
461 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
462 if (!ren) {
463 /* js_ReportOutOfScriptQuota(cx); */
464 return NULL;
466 ren->op = op;
467 ren->next = NULL;
468 ren->kid = NULL;
469 return ren;
473 * Validates and converts hex ascii value.
475 static BOOL
476 isASCIIHexDigit(WCHAR c, UINT *digit)
478 UINT cv = c;
480 if (cv < '0')
481 return FALSE;
482 if (cv <= '9') {
483 *digit = cv - '0';
484 return TRUE;
486 cv |= 0x20;
487 if (cv >= 'a' && cv <= 'f') {
488 *digit = cv - 'a' + 10;
489 return TRUE;
491 return FALSE;
494 typedef struct {
495 REOp op;
496 const WCHAR *errPos;
497 size_t parenIndex;
498 } REOpData;
500 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
501 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
503 static BOOL
504 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
506 ptrdiff_t offset = target - jump;
508 /* Check that target really points forward. */
509 assert(offset >= 2);
510 if ((size_t)offset > OFFSET_MAX)
511 return FALSE;
513 jump[0] = JUMP_OFFSET_HI(offset);
514 jump[1] = JUMP_OFFSET_LO(offset);
515 return TRUE;
519 * Generate bytecode for the tree rooted at t using an explicit stack instead
520 * of recursion.
522 static jsbytecode *
523 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
524 jsbytecode *pc, RENode *t)
526 EmitStateStackEntry *emitStateSP, *emitStateStack;
527 RECharSet *charSet;
528 REOp op;
530 if (treeDepth == 0) {
531 emitStateStack = NULL;
532 } else {
533 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
534 if (!emitStateStack)
535 return NULL;
537 emitStateSP = emitStateStack;
538 op = t->op;
539 assert(op < REOP_LIMIT);
541 for (;;) {
542 *pc++ = op;
543 switch (op) {
544 case REOP_EMPTY:
545 --pc;
546 break;
548 case REOP_ALTPREREQ2:
549 case REOP_ALTPREREQ:
550 assert(emitStateSP);
551 emitStateSP->altHead = pc - 1;
552 emitStateSP->endTermFixup = pc;
553 pc += OFFSET_LEN;
554 SET_ARG(pc, t->u.altprereq.ch1);
555 pc += ARG_LEN;
556 SET_ARG(pc, t->u.altprereq.ch2);
557 pc += ARG_LEN;
559 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
560 pc += OFFSET_LEN;
562 emitStateSP->continueNode = t;
563 emitStateSP->continueOp = REOP_JUMP;
564 emitStateSP->jumpToJumpFlag = FALSE;
565 ++emitStateSP;
566 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
567 t = t->kid;
568 op = t->op;
569 assert(op < REOP_LIMIT);
570 continue;
572 case REOP_JUMP:
573 emitStateSP->nextTermFixup = pc; /* offset to following term */
574 pc += OFFSET_LEN;
575 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
576 goto jump_too_big;
577 emitStateSP->continueOp = REOP_ENDALT;
578 ++emitStateSP;
579 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
580 t = t->u.kid2;
581 op = t->op;
582 assert(op < REOP_LIMIT);
583 continue;
585 case REOP_ENDALT:
587 * If we already patched emitStateSP->nextTermFixup to jump to
588 * a nearer jump, to avoid 16-bit immediate offset overflow, we
589 * are done here.
591 if (emitStateSP->jumpToJumpFlag)
592 break;
595 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
596 * REOP_ENDALT is executed only on successful match of the last
597 * alternate in a group.
599 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
600 goto jump_too_big;
601 if (t->op != REOP_ALT) {
602 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
603 goto jump_too_big;
607 * If the program is bigger than the REOP_JUMP offset range, then
608 * we must check for alternates before this one that are part of
609 * the same group, and fix up their jump offsets to target jumps
610 * close enough to fit in a 16-bit unsigned offset immediate.
612 if ((size_t)(pc - re->program) > OFFSET_MAX &&
613 emitStateSP > emitStateStack) {
614 EmitStateStackEntry *esp, *esp2;
615 jsbytecode *alt, *jump;
616 ptrdiff_t span, header;
618 esp2 = emitStateSP;
619 alt = esp2->altHead;
620 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
621 if (esp->continueOp == REOP_ENDALT &&
622 !esp->jumpToJumpFlag &&
623 esp->nextTermFixup + OFFSET_LEN == alt &&
624 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
625 ? esp->endTermFixup
626 : esp->nextTermFixup)) > OFFSET_MAX) {
627 alt = esp->altHead;
628 jump = esp->nextTermFixup;
631 * The span must be 1 less than the distance from
632 * jump offset to jump offset, so we actually jump
633 * to a REOP_JUMP bytecode, not to its offset!
635 for (;;) {
636 assert(jump < esp2->nextTermFixup);
637 span = esp2->nextTermFixup - jump - 1;
638 if ((size_t)span <= OFFSET_MAX)
639 break;
640 do {
641 if (--esp2 == esp)
642 goto jump_too_big;
643 } while (esp2->continueOp != REOP_ENDALT);
646 jump[0] = JUMP_OFFSET_HI(span);
647 jump[1] = JUMP_OFFSET_LO(span);
649 if (esp->continueNode->op != REOP_ALT) {
651 * We must patch the offset at esp->endTermFixup
652 * as well, for the REOP_ALTPREREQ{,2} opcodes.
653 * If we're unlucky and endTermFixup is more than
654 * OFFSET_MAX bytes from its target, we cheat by
655 * jumping 6 bytes to the jump whose offset is at
656 * esp->nextTermFixup, which has the same target.
658 jump = esp->endTermFixup;
659 header = esp->nextTermFixup - jump;
660 span += header;
661 if ((size_t)span > OFFSET_MAX)
662 span = header;
664 jump[0] = JUMP_OFFSET_HI(span);
665 jump[1] = JUMP_OFFSET_LO(span);
668 esp->jumpToJumpFlag = TRUE;
672 break;
674 case REOP_ALT:
675 assert(emitStateSP);
676 emitStateSP->altHead = pc - 1;
677 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
678 pc += OFFSET_LEN;
679 emitStateSP->continueNode = t;
680 emitStateSP->continueOp = REOP_JUMP;
681 emitStateSP->jumpToJumpFlag = FALSE;
682 ++emitStateSP;
683 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
684 t = t->kid;
685 op = t->op;
686 assert(op < REOP_LIMIT);
687 continue;
689 case REOP_FLAT:
691 * Coalesce FLATs if possible and if it would not increase bytecode
692 * beyond preallocated limit. The latter happens only when bytecode
693 * size for coalesced string with offset p and length 2 exceeds 6
694 * bytes preallocated for 2 single char nodes, i.e. when
695 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
696 * GetCompactIndexWidth(p) > 4.
697 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
698 * nodes strictly decreases bytecode size, the check has to be
699 * done only for the first coalescing.
701 if (t->kid &&
702 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
704 while (t->next &&
705 t->next->op == REOP_FLAT &&
706 (WCHAR*)t->kid + t->u.flat.length ==
707 t->next->kid) {
708 t->u.flat.length += t->next->u.flat.length;
709 t->next = t->next->next;
712 if (t->kid && t->u.flat.length > 1) {
713 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
714 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
715 pc = WriteCompactIndex(pc, t->u.flat.length);
716 } else if (t->u.flat.chr < 256) {
717 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
718 *pc++ = (jsbytecode) t->u.flat.chr;
719 } else {
720 pc[-1] = (state->flags & JSREG_FOLD)
721 ? REOP_UCFLAT1i
722 : REOP_UCFLAT1;
723 SET_ARG(pc, t->u.flat.chr);
724 pc += ARG_LEN;
726 break;
728 case REOP_LPAREN:
729 assert(emitStateSP);
730 pc = WriteCompactIndex(pc, t->u.parenIndex);
731 emitStateSP->continueNode = t;
732 emitStateSP->continueOp = REOP_RPAREN;
733 ++emitStateSP;
734 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
735 t = t->kid;
736 op = t->op;
737 continue;
739 case REOP_RPAREN:
740 pc = WriteCompactIndex(pc, t->u.parenIndex);
741 break;
743 case REOP_BACKREF:
744 pc = WriteCompactIndex(pc, t->u.parenIndex);
745 break;
747 case REOP_ASSERT:
748 assert(emitStateSP);
749 emitStateSP->nextTermFixup = pc;
750 pc += OFFSET_LEN;
751 emitStateSP->continueNode = t;
752 emitStateSP->continueOp = REOP_ASSERTTEST;
753 ++emitStateSP;
754 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
755 t = t->kid;
756 op = t->op;
757 continue;
759 case REOP_ASSERTTEST:
760 case REOP_ASSERTNOTTEST:
761 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
762 goto jump_too_big;
763 break;
765 case REOP_ASSERT_NOT:
766 assert(emitStateSP);
767 emitStateSP->nextTermFixup = pc;
768 pc += OFFSET_LEN;
769 emitStateSP->continueNode = t;
770 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
771 ++emitStateSP;
772 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
773 t = t->kid;
774 op = t->op;
775 continue;
777 case REOP_QUANT:
778 assert(emitStateSP);
779 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
780 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
781 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
782 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
783 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
784 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
785 } else {
786 if (!t->u.range.greedy)
787 pc[-1] = REOP_MINIMALQUANT;
788 pc = WriteCompactIndex(pc, t->u.range.min);
790 * Write max + 1 to avoid using size_t(max) + 1 bytes
791 * for (UINT)-1 sentinel.
793 pc = WriteCompactIndex(pc, t->u.range.max + 1);
795 emitStateSP->nextTermFixup = pc;
796 pc += OFFSET_LEN;
797 emitStateSP->continueNode = t;
798 emitStateSP->continueOp = REOP_ENDCHILD;
799 ++emitStateSP;
800 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
801 t = t->kid;
802 op = t->op;
803 continue;
805 case REOP_ENDCHILD:
806 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
807 goto jump_too_big;
808 break;
810 case REOP_CLASS:
811 if (!t->u.ucclass.sense)
812 pc[-1] = REOP_NCLASS;
813 pc = WriteCompactIndex(pc, t->u.ucclass.index);
814 charSet = &re->classList[t->u.ucclass.index];
815 charSet->converted = FALSE;
816 charSet->length = t->u.ucclass.bmsize;
817 charSet->u.src.startIndex = t->u.ucclass.startIndex;
818 charSet->u.src.length = t->u.ucclass.kidlen;
819 charSet->sense = t->u.ucclass.sense;
820 break;
822 default:
823 break;
826 t = t->next;
827 if (t) {
828 op = t->op;
829 } else {
830 if (emitStateSP == emitStateStack)
831 break;
832 --emitStateSP;
833 t = emitStateSP->continueNode;
834 op = (REOp) emitStateSP->continueOp;
838 cleanup:
839 heap_free(emitStateStack);
840 return pc;
842 jump_too_big:
843 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
844 pc = NULL;
845 goto cleanup;
849 * Process the op against the two top operands, reducing them to a single
850 * operand in the penultimate slot. Update progLength and treeDepth.
852 static BOOL
853 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
854 INT operandSP)
856 RENode *result;
858 switch (opData->op) {
859 case REOP_ALT:
860 result = NewRENode(state, REOP_ALT);
861 if (!result)
862 return FALSE;
863 result->kid = operandStack[operandSP - 2];
864 result->u.kid2 = operandStack[operandSP - 1];
865 operandStack[operandSP - 2] = result;
867 if (state->treeDepth == TREE_DEPTH_MAX) {
868 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
869 return FALSE;
871 ++state->treeDepth;
874 * Look at both alternates to see if there's a FLAT or a CLASS at
875 * the start of each. If so, use a prerequisite match.
877 if (((RENode *) result->kid)->op == REOP_FLAT &&
878 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
879 (state->flags & JSREG_FOLD) == 0) {
880 result->op = REOP_ALTPREREQ;
881 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
882 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
883 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
884 JUMP, <end> ... ENDALT */
885 state->progLength += 13;
887 else
888 if (((RENode *) result->kid)->op == REOP_CLASS &&
889 ((RENode *) result->kid)->u.ucclass.index < 256 &&
890 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
891 (state->flags & JSREG_FOLD) == 0) {
892 result->op = REOP_ALTPREREQ2;
893 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
894 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
895 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
896 JUMP, <end> ... ENDALT */
897 state->progLength += 13;
899 else
900 if (((RENode *) result->kid)->op == REOP_FLAT &&
901 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
902 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
903 (state->flags & JSREG_FOLD) == 0) {
904 result->op = REOP_ALTPREREQ2;
905 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
906 result->u.altprereq.ch2 =
907 ((RENode *) result->u.kid2)->u.ucclass.index;
908 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
909 JUMP, <end> ... ENDALT */
910 state->progLength += 13;
912 else {
913 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
914 state->progLength += 7;
916 break;
918 case REOP_CONCAT:
919 result = operandStack[operandSP - 2];
920 while (result->next)
921 result = result->next;
922 result->next = operandStack[operandSP - 1];
923 break;
925 case REOP_ASSERT:
926 case REOP_ASSERT_NOT:
927 case REOP_LPARENNON:
928 case REOP_LPAREN:
929 /* These should have been processed by a close paren. */
930 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
931 opData->errPos);
932 return FALSE;
934 default:;
936 return TRUE;
940 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
941 * its being on the stack, and to propagate errors to its callers.
943 #define JSREG_FIND_PAREN_COUNT 0x8000
944 #define JSREG_FIND_PAREN_ERROR 0x4000
947 * Magic return value from FindParenCount and GetDecimalValue, to indicate
948 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
949 * its findMax parameter is non-null.
951 #define OVERFLOW_VALUE ((UINT)-1)
953 static UINT
954 FindParenCount(CompilerState *state)
956 CompilerState temp;
957 int i;
959 if (state->flags & JSREG_FIND_PAREN_COUNT)
960 return OVERFLOW_VALUE;
963 * Copy state into temp, flag it so we never report an invalid backref,
964 * and reset its members to parse the entire regexp. This is obviously
965 * suboptimal, but GetDecimalValue calls us only if a backref appears to
966 * refer to a forward parenthetical, which is rare.
968 temp = *state;
969 temp.flags |= JSREG_FIND_PAREN_COUNT;
970 temp.cp = temp.cpbegin;
971 temp.parenCount = 0;
972 temp.classCount = 0;
973 temp.progLength = 0;
974 temp.treeDepth = 0;
975 temp.classBitmapsMem = 0;
976 for (i = 0; i < CLASS_CACHE_SIZE; i++)
977 temp.classCache[i].start = NULL;
979 if (!ParseRegExp(&temp)) {
980 state->flags |= JSREG_FIND_PAREN_ERROR;
981 return OVERFLOW_VALUE;
983 return temp.parenCount;
987 * Extract and return a decimal value at state->cp. The initial character c
988 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
989 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
990 * state->flags to discover whether an error occurred under findMax.
992 static UINT
993 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
994 CompilerState *state)
996 UINT value = JS7_UNDEC(c);
997 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
999 /* The following restriction allows simpler overflow checks. */
1000 assert(max <= ((UINT)-1 - 9) / 10);
1001 while (state->cp < state->cpend) {
1002 c = *state->cp;
1003 if (!JS7_ISDEC(c))
1004 break;
1005 value = 10 * value + JS7_UNDEC(c);
1006 if (!overflow && value > max && (!findMax || value > findMax(state)))
1007 overflow = TRUE;
1008 ++state->cp;
1010 return overflow ? OVERFLOW_VALUE : value;
1014 * Calculate the total size of the bitmap required for a class expression.
1016 static BOOL
1017 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1018 const WCHAR *end)
1020 UINT max = 0;
1021 BOOL inRange = FALSE;
1022 WCHAR c, rangeStart = 0;
1023 UINT n, digit, nDigits, i;
1025 target->u.ucclass.bmsize = 0;
1026 target->u.ucclass.sense = TRUE;
1028 if (src == end)
1029 return TRUE;
1031 if (*src == '^') {
1032 ++src;
1033 target->u.ucclass.sense = FALSE;
1036 while (src != end) {
1037 BOOL canStartRange = TRUE;
1038 UINT localMax = 0;
1040 switch (*src) {
1041 case '\\':
1042 ++src;
1043 c = *src++;
1044 switch (c) {
1045 case 'b':
1046 localMax = 0x8;
1047 break;
1048 case 'f':
1049 localMax = 0xC;
1050 break;
1051 case 'n':
1052 localMax = 0xA;
1053 break;
1054 case 'r':
1055 localMax = 0xD;
1056 break;
1057 case 't':
1058 localMax = 0x9;
1059 break;
1060 case 'v':
1061 localMax = 0xB;
1062 break;
1063 case 'c':
1064 if (src < end && RE_IS_LETTER(*src)) {
1065 localMax = (UINT) (*src++) & 0x1F;
1066 } else {
1067 --src;
1068 localMax = '\\';
1070 break;
1071 case 'x':
1072 nDigits = 2;
1073 goto lexHex;
1074 case 'u':
1075 nDigits = 4;
1076 lexHex:
1077 n = 0;
1078 for (i = 0; (i < nDigits) && (src < end); i++) {
1079 c = *src++;
1080 if (!isASCIIHexDigit(c, &digit)) {
1082 * Back off to accepting the original
1083 *'\' as a literal.
1085 src -= i + 1;
1086 n = '\\';
1087 break;
1089 n = (n << 4) | digit;
1091 localMax = n;
1092 break;
1093 case 'd':
1094 canStartRange = FALSE;
1095 if (inRange) {
1096 JS_ReportErrorNumber(state->context,
1097 js_GetErrorMessage, NULL,
1098 JSMSG_BAD_CLASS_RANGE);
1099 return FALSE;
1101 localMax = '9';
1102 break;
1103 case 'D':
1104 case 's':
1105 case 'S':
1106 case 'w':
1107 case 'W':
1108 canStartRange = FALSE;
1109 if (inRange) {
1110 JS_ReportErrorNumber(state->context,
1111 js_GetErrorMessage, NULL,
1112 JSMSG_BAD_CLASS_RANGE);
1113 return FALSE;
1115 max = 65535;
1118 * If this is the start of a range, ensure that it's less than
1119 * the end.
1121 localMax = 0;
1122 break;
1123 case '0':
1124 case '1':
1125 case '2':
1126 case '3':
1127 case '4':
1128 case '5':
1129 case '6':
1130 case '7':
1132 * This is a non-ECMA extension - decimal escapes (in this
1133 * case, octal!) are supposed to be an error inside class
1134 * ranges, but supported here for backwards compatibility.
1137 n = JS7_UNDEC(c);
1138 c = *src;
1139 if ('0' <= c && c <= '7') {
1140 src++;
1141 n = 8 * n + JS7_UNDEC(c);
1142 c = *src;
1143 if ('0' <= c && c <= '7') {
1144 src++;
1145 i = 8 * n + JS7_UNDEC(c);
1146 if (i <= 0377)
1147 n = i;
1148 else
1149 src--;
1152 localMax = n;
1153 break;
1155 default:
1156 localMax = c;
1157 break;
1159 break;
1160 default:
1161 localMax = *src++;
1162 break;
1165 if (inRange) {
1166 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1167 if (rangeStart > localMax) {
1168 JS_ReportErrorNumber(state->context,
1169 js_GetErrorMessage, NULL,
1170 JSMSG_BAD_CLASS_RANGE);
1171 return FALSE;
1173 inRange = FALSE;
1174 } else {
1175 if (canStartRange && src < end - 1) {
1176 if (*src == '-') {
1177 ++src;
1178 inRange = TRUE;
1179 rangeStart = (WCHAR)localMax;
1180 continue;
1183 if (state->flags & JSREG_FOLD)
1184 rangeStart = localMax; /* one run of the uc/dc loop below */
1187 if (state->flags & JSREG_FOLD) {
1188 WCHAR maxch = localMax;
1190 for (i = rangeStart; i <= localMax; i++) {
1191 WCHAR uch, dch;
1193 uch = toupperW(i);
1194 dch = tolowerW(i);
1195 if(maxch < uch)
1196 maxch = uch;
1197 if(maxch < dch)
1198 maxch = dch;
1200 localMax = maxch;
1203 if (localMax > max)
1204 max = localMax;
1206 target->u.ucclass.bmsize = max;
1207 return TRUE;
1210 static INT
1211 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1213 UINT min, max;
1214 WCHAR c;
1215 const WCHAR *errp = state->cp++;
1217 c = *state->cp;
1218 if (JS7_ISDEC(c)) {
1219 ++state->cp;
1220 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1221 c = *state->cp;
1223 if (!ignoreValues && min == OVERFLOW_VALUE)
1224 return JSMSG_MIN_TOO_BIG;
1226 if (c == ',') {
1227 c = *++state->cp;
1228 if (JS7_ISDEC(c)) {
1229 ++state->cp;
1230 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1231 c = *state->cp;
1232 if (!ignoreValues && max == OVERFLOW_VALUE)
1233 return JSMSG_MAX_TOO_BIG;
1234 if (!ignoreValues && min > max)
1235 return JSMSG_OUT_OF_ORDER;
1236 } else {
1237 max = (UINT)-1;
1239 } else {
1240 max = min;
1242 if (c == '}') {
1243 state->result = NewRENode(state, REOP_QUANT);
1244 if (!state->result)
1245 return JSMSG_OUT_OF_MEMORY;
1246 state->result->u.range.min = min;
1247 state->result->u.range.max = max;
1249 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1250 * where <max> is written as compact(max+1) to make
1251 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1253 state->progLength += (1 + GetCompactIndexWidth(min)
1254 + GetCompactIndexWidth(max + 1)
1255 +3);
1256 return 0;
1260 state->cp = errp;
1261 return -1;
1264 static BOOL
1265 ParseQuantifier(CompilerState *state)
1267 RENode *term;
1268 term = state->result;
1269 if (state->cp < state->cpend) {
1270 switch (*state->cp) {
1271 case '+':
1272 state->result = NewRENode(state, REOP_QUANT);
1273 if (!state->result)
1274 return FALSE;
1275 state->result->u.range.min = 1;
1276 state->result->u.range.max = (UINT)-1;
1277 /* <PLUS>, <next> ... <ENDCHILD> */
1278 state->progLength += 4;
1279 goto quantifier;
1280 case '*':
1281 state->result = NewRENode(state, REOP_QUANT);
1282 if (!state->result)
1283 return FALSE;
1284 state->result->u.range.min = 0;
1285 state->result->u.range.max = (UINT)-1;
1286 /* <STAR>, <next> ... <ENDCHILD> */
1287 state->progLength += 4;
1288 goto quantifier;
1289 case '?':
1290 state->result = NewRENode(state, REOP_QUANT);
1291 if (!state->result)
1292 return FALSE;
1293 state->result->u.range.min = 0;
1294 state->result->u.range.max = 1;
1295 /* <OPT>, <next> ... <ENDCHILD> */
1296 state->progLength += 4;
1297 goto quantifier;
1298 case '{': /* balance '}' */
1300 INT err;
1302 err = ParseMinMaxQuantifier(state, FALSE);
1303 if (err == 0)
1304 goto quantifier;
1305 if (err == -1)
1306 return TRUE;
1308 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1309 return FALSE;
1311 default:;
1314 return TRUE;
1316 quantifier:
1317 if (state->treeDepth == TREE_DEPTH_MAX) {
1318 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1319 return FALSE;
1322 ++state->treeDepth;
1323 ++state->cp;
1324 state->result->kid = term;
1325 if (state->cp < state->cpend && *state->cp == '?') {
1326 ++state->cp;
1327 state->result->u.range.greedy = FALSE;
1328 } else {
1329 state->result->u.range.greedy = TRUE;
1331 return TRUE;
1335 * item: assertion An item is either an assertion or
1336 * quantatom a quantified atom.
1338 * assertion: '^' Assertions match beginning of string
1339 * (or line if the class static property
1340 * RegExp.multiline is true).
1341 * '$' End of string (or line if the class
1342 * static property RegExp.multiline is
1343 * true).
1344 * '\b' Word boundary (between \w and \W).
1345 * '\B' Word non-boundary.
1347 * quantatom: atom An unquantified atom.
1348 * quantatom '{' n ',' m '}'
1349 * Atom must occur between n and m times.
1350 * quantatom '{' n ',' '}' Atom must occur at least n times.
1351 * quantatom '{' n '}' Atom must occur exactly n times.
1352 * quantatom '*' Zero or more times (same as {0,}).
1353 * quantatom '+' One or more times (same as {1,}).
1354 * quantatom '?' Zero or one time (same as {0,1}).
1356 * any of which can be optionally followed by '?' for ungreedy
1358 * atom: '(' regexp ')' A parenthesized regexp (what matched
1359 * can be addressed using a backreference,
1360 * see '\' n below).
1361 * '.' Matches any char except '\n'.
1362 * '[' classlist ']' A character class.
1363 * '[' '^' classlist ']' A negated character class.
1364 * '\f' Form Feed.
1365 * '\n' Newline (Line Feed).
1366 * '\r' Carriage Return.
1367 * '\t' Horizontal Tab.
1368 * '\v' Vertical Tab.
1369 * '\d' A digit (same as [0-9]).
1370 * '\D' A non-digit.
1371 * '\w' A word character, [0-9a-z_A-Z].
1372 * '\W' A non-word character.
1373 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1374 * '\S' A non-whitespace character.
1375 * '\' n A backreference to the nth (n decimal
1376 * and positive) parenthesized expression.
1377 * '\' octal An octal escape sequence (octal must be
1378 * two or three digits long, unless it is
1379 * 0 for the null character).
1380 * '\x' hex A hex escape (hex must be two digits).
1381 * '\u' unicode A unicode escape (must be four digits).
1382 * '\c' ctrl A control character, ctrl is a letter.
1383 * '\' literalatomchar Any character except one of the above
1384 * that follow '\' in an atom.
1385 * otheratomchar Any character not first among the other
1386 * atom right-hand sides.
1388 static BOOL
1389 ParseTerm(CompilerState *state)
1391 WCHAR c = *state->cp++;
1392 UINT nDigits;
1393 UINT num, tmp, n, i;
1394 const WCHAR *termStart;
1396 switch (c) {
1397 /* assertions and atoms */
1398 case '^':
1399 state->result = NewRENode(state, REOP_BOL);
1400 if (!state->result)
1401 return FALSE;
1402 state->progLength++;
1403 return TRUE;
1404 case '$':
1405 state->result = NewRENode(state, REOP_EOL);
1406 if (!state->result)
1407 return FALSE;
1408 state->progLength++;
1409 return TRUE;
1410 case '\\':
1411 if (state->cp >= state->cpend) {
1412 /* a trailing '\' is an error */
1413 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1414 return FALSE;
1416 c = *state->cp++;
1417 switch (c) {
1418 /* assertion escapes */
1419 case 'b' :
1420 state->result = NewRENode(state, REOP_WBDRY);
1421 if (!state->result)
1422 return FALSE;
1423 state->progLength++;
1424 return TRUE;
1425 case 'B':
1426 state->result = NewRENode(state, REOP_WNONBDRY);
1427 if (!state->result)
1428 return FALSE;
1429 state->progLength++;
1430 return TRUE;
1431 /* Decimal escape */
1432 case '0':
1433 /* Give a strict warning. See also the note below. */
1434 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1435 doOctal:
1436 num = 0;
1437 while (state->cp < state->cpend) {
1438 c = *state->cp;
1439 if (c < '0' || '7' < c)
1440 break;
1441 state->cp++;
1442 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1443 if (tmp > 0377)
1444 break;
1445 num = tmp;
1447 c = (WCHAR)num;
1448 doFlat:
1449 state->result = NewRENode(state, REOP_FLAT);
1450 if (!state->result)
1451 return FALSE;
1452 state->result->u.flat.chr = c;
1453 state->result->u.flat.length = 1;
1454 state->progLength += 3;
1455 break;
1456 case '1':
1457 case '2':
1458 case '3':
1459 case '4':
1460 case '5':
1461 case '6':
1462 case '7':
1463 case '8':
1464 case '9':
1465 termStart = state->cp - 1;
1466 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1467 if (state->flags & JSREG_FIND_PAREN_ERROR)
1468 return FALSE;
1469 if (num == OVERFLOW_VALUE) {
1470 /* Give a strict mode warning. */
1471 WARN("back-reference exceeds number of capturing parentheses\n");
1474 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1475 * error here. However, for compatibility with IE, we treat the
1476 * whole backref as flat if the first character in it is not a
1477 * valid octal character, and as an octal escape otherwise.
1479 state->cp = termStart;
1480 if (c >= '8') {
1481 /* Treat this as flat. termStart - 1 is the \. */
1482 c = '\\';
1483 goto asFlat;
1486 /* Treat this as an octal escape. */
1487 goto doOctal;
1489 assert(1 <= num && num <= 0x10000);
1490 state->result = NewRENode(state, REOP_BACKREF);
1491 if (!state->result)
1492 return FALSE;
1493 state->result->u.parenIndex = num - 1;
1494 state->progLength
1495 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1496 break;
1497 /* Control escape */
1498 case 'f':
1499 c = 0xC;
1500 goto doFlat;
1501 case 'n':
1502 c = 0xA;
1503 goto doFlat;
1504 case 'r':
1505 c = 0xD;
1506 goto doFlat;
1507 case 't':
1508 c = 0x9;
1509 goto doFlat;
1510 case 'v':
1511 c = 0xB;
1512 goto doFlat;
1513 /* Control letter */
1514 case 'c':
1515 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1516 c = (WCHAR) (*state->cp++ & 0x1F);
1517 } else {
1518 /* back off to accepting the original '\' as a literal */
1519 --state->cp;
1520 c = '\\';
1522 goto doFlat;
1523 /* HexEscapeSequence */
1524 case 'x':
1525 nDigits = 2;
1526 goto lexHex;
1527 /* UnicodeEscapeSequence */
1528 case 'u':
1529 nDigits = 4;
1530 lexHex:
1531 n = 0;
1532 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1533 UINT digit;
1534 c = *state->cp++;
1535 if (!isASCIIHexDigit(c, &digit)) {
1537 * Back off to accepting the original 'u' or 'x' as a
1538 * literal.
1540 state->cp -= i + 2;
1541 n = *state->cp++;
1542 break;
1544 n = (n << 4) | digit;
1546 c = (WCHAR) n;
1547 goto doFlat;
1548 /* Character class escapes */
1549 case 'd':
1550 state->result = NewRENode(state, REOP_DIGIT);
1551 doSimple:
1552 if (!state->result)
1553 return FALSE;
1554 state->progLength++;
1555 break;
1556 case 'D':
1557 state->result = NewRENode(state, REOP_NONDIGIT);
1558 goto doSimple;
1559 case 's':
1560 state->result = NewRENode(state, REOP_SPACE);
1561 goto doSimple;
1562 case 'S':
1563 state->result = NewRENode(state, REOP_NONSPACE);
1564 goto doSimple;
1565 case 'w':
1566 state->result = NewRENode(state, REOP_ALNUM);
1567 goto doSimple;
1568 case 'W':
1569 state->result = NewRENode(state, REOP_NONALNUM);
1570 goto doSimple;
1571 /* IdentityEscape */
1572 default:
1573 state->result = NewRENode(state, REOP_FLAT);
1574 if (!state->result)
1575 return FALSE;
1576 state->result->u.flat.chr = c;
1577 state->result->u.flat.length = 1;
1578 state->result->kid = (void *) (state->cp - 1);
1579 state->progLength += 3;
1580 break;
1582 break;
1583 case '[':
1584 state->result = NewRENode(state, REOP_CLASS);
1585 if (!state->result)
1586 return FALSE;
1587 termStart = state->cp;
1588 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1589 for (;;) {
1590 if (state->cp == state->cpend) {
1591 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1592 JSMSG_UNTERM_CLASS, termStart);
1594 return FALSE;
1596 if (*state->cp == '\\') {
1597 state->cp++;
1598 if (state->cp != state->cpend)
1599 state->cp++;
1600 continue;
1602 if (*state->cp == ']') {
1603 state->result->u.ucclass.kidlen = state->cp - termStart;
1604 break;
1606 state->cp++;
1608 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1609 if (!state->classCache[i].start) {
1610 state->classCache[i].start = termStart;
1611 state->classCache[i].length = state->result->u.ucclass.kidlen;
1612 state->classCache[i].index = state->classCount;
1613 break;
1615 if (state->classCache[i].length ==
1616 state->result->u.ucclass.kidlen) {
1617 for (n = 0; ; n++) {
1618 if (n == state->classCache[i].length) {
1619 state->result->u.ucclass.index
1620 = state->classCache[i].index;
1621 goto claim;
1623 if (state->classCache[i].start[n] != termStart[n])
1624 break;
1628 state->result->u.ucclass.index = state->classCount++;
1630 claim:
1632 * Call CalculateBitmapSize now as we want any errors it finds
1633 * to be reported during the parse phase, not at execution.
1635 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1636 return FALSE;
1638 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1639 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1640 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1642 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1643 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1644 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1645 return FALSE;
1647 state->classBitmapsMem += n;
1648 /* CLASS, <index> */
1649 state->progLength
1650 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1651 break;
1653 case '.':
1654 state->result = NewRENode(state, REOP_DOT);
1655 goto doSimple;
1657 case '{':
1659 const WCHAR *errp = state->cp--;
1660 INT err;
1662 err = ParseMinMaxQuantifier(state, TRUE);
1663 state->cp = errp;
1665 if (err < 0)
1666 goto asFlat;
1668 /* FALL THROUGH */
1670 case '*':
1671 case '+':
1672 case '?':
1673 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1674 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1675 return FALSE;
1676 default:
1677 asFlat:
1678 state->result = NewRENode(state, REOP_FLAT);
1679 if (!state->result)
1680 return FALSE;
1681 state->result->u.flat.chr = c;
1682 state->result->u.flat.length = 1;
1683 state->result->kid = (void *) (state->cp - 1);
1684 state->progLength += 3;
1685 break;
1687 return ParseQuantifier(state);
1691 * Top-down regular expression grammar, based closely on Perl4.
1693 * regexp: altern A regular expression is one or more
1694 * altern '|' regexp alternatives separated by vertical bar.
1696 #define INITIAL_STACK_SIZE 128
1698 static BOOL
1699 ParseRegExp(CompilerState *state)
1701 size_t parenIndex;
1702 RENode *operand;
1703 REOpData *operatorStack;
1704 RENode **operandStack;
1705 REOp op;
1706 INT i;
1707 BOOL result = FALSE;
1709 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1710 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1712 /* Watch out for empty regexp */
1713 if (state->cp == state->cpend) {
1714 state->result = NewRENode(state, REOP_EMPTY);
1715 return (state->result != NULL);
1718 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1719 if (!operatorStack)
1720 return FALSE;
1722 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1723 if (!operandStack)
1724 goto out;
1726 for (;;) {
1727 parenIndex = state->parenCount;
1728 if (state->cp == state->cpend) {
1730 * If we are at the end of the regexp and we're short one or more
1731 * operands, the regexp must have the form /x|/ or some such, with
1732 * left parentheses making us short more than one operand.
1734 if (operatorSP >= operandSP) {
1735 operand = NewRENode(state, REOP_EMPTY);
1736 if (!operand)
1737 goto out;
1738 goto pushOperand;
1740 } else {
1741 switch (*state->cp) {
1742 case '(':
1743 ++state->cp;
1744 if (state->cp + 1 < state->cpend &&
1745 *state->cp == '?' &&
1746 (state->cp[1] == '=' ||
1747 state->cp[1] == '!' ||
1748 state->cp[1] == ':')) {
1749 switch (state->cp[1]) {
1750 case '=':
1751 op = REOP_ASSERT;
1752 /* ASSERT, <next>, ... ASSERTTEST */
1753 state->progLength += 4;
1754 break;
1755 case '!':
1756 op = REOP_ASSERT_NOT;
1757 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1758 state->progLength += 4;
1759 break;
1760 default:
1761 op = REOP_LPARENNON;
1762 break;
1764 state->cp += 2;
1765 } else {
1766 op = REOP_LPAREN;
1767 /* LPAREN, <index>, ... RPAREN, <index> */
1768 state->progLength
1769 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1770 state->parenCount++;
1771 if (state->parenCount == 65535) {
1772 ReportRegExpError(state, JSREPORT_ERROR,
1773 JSMSG_TOO_MANY_PARENS);
1774 goto out;
1777 goto pushOperator;
1779 case ')':
1781 * If there's no stacked open parenthesis, throw syntax error.
1783 for (i = operatorSP - 1; ; i--) {
1784 if (i < 0) {
1785 ReportRegExpError(state, JSREPORT_ERROR,
1786 JSMSG_UNMATCHED_RIGHT_PAREN);
1787 goto out;
1789 if (operatorStack[i].op == REOP_ASSERT ||
1790 operatorStack[i].op == REOP_ASSERT_NOT ||
1791 operatorStack[i].op == REOP_LPARENNON ||
1792 operatorStack[i].op == REOP_LPAREN) {
1793 break;
1796 /* FALL THROUGH */
1798 case '|':
1799 /* Expected an operand before these, so make an empty one */
1800 operand = NewRENode(state, REOP_EMPTY);
1801 if (!operand)
1802 goto out;
1803 goto pushOperand;
1805 default:
1806 if (!ParseTerm(state))
1807 goto out;
1808 operand = state->result;
1809 pushOperand:
1810 if (operandSP == operandStackSize) {
1811 RENode **tmp;
1812 operandStackSize += operandStackSize;
1813 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1814 if (!tmp)
1815 goto out;
1816 operandStack = tmp;
1818 operandStack[operandSP++] = operand;
1819 break;
1823 /* At the end; process remaining operators. */
1824 restartOperator:
1825 if (state->cp == state->cpend) {
1826 while (operatorSP) {
1827 --operatorSP;
1828 if (!ProcessOp(state, &operatorStack[operatorSP],
1829 operandStack, operandSP))
1830 goto out;
1831 --operandSP;
1833 assert(operandSP == 1);
1834 state->result = operandStack[0];
1835 result = TRUE;
1836 goto out;
1839 switch (*state->cp) {
1840 case '|':
1841 /* Process any stacked 'concat' operators */
1842 ++state->cp;
1843 while (operatorSP &&
1844 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1845 --operatorSP;
1846 if (!ProcessOp(state, &operatorStack[operatorSP],
1847 operandStack, operandSP)) {
1848 goto out;
1850 --operandSP;
1852 op = REOP_ALT;
1853 goto pushOperator;
1855 case ')':
1857 * If there's no stacked open parenthesis, throw syntax error.
1859 for (i = operatorSP - 1; ; i--) {
1860 if (i < 0) {
1861 ReportRegExpError(state, JSREPORT_ERROR,
1862 JSMSG_UNMATCHED_RIGHT_PAREN);
1863 goto out;
1865 if (operatorStack[i].op == REOP_ASSERT ||
1866 operatorStack[i].op == REOP_ASSERT_NOT ||
1867 operatorStack[i].op == REOP_LPARENNON ||
1868 operatorStack[i].op == REOP_LPAREN) {
1869 break;
1872 ++state->cp;
1874 /* Process everything on the stack until the open parenthesis. */
1875 for (;;) {
1876 assert(operatorSP);
1877 --operatorSP;
1878 switch (operatorStack[operatorSP].op) {
1879 case REOP_ASSERT:
1880 case REOP_ASSERT_NOT:
1881 case REOP_LPAREN:
1882 operand = NewRENode(state, operatorStack[operatorSP].op);
1883 if (!operand)
1884 goto out;
1885 operand->u.parenIndex =
1886 operatorStack[operatorSP].parenIndex;
1887 assert(operandSP);
1888 operand->kid = operandStack[operandSP - 1];
1889 operandStack[operandSP - 1] = operand;
1890 if (state->treeDepth == TREE_DEPTH_MAX) {
1891 ReportRegExpError(state, JSREPORT_ERROR,
1892 JSMSG_REGEXP_TOO_COMPLEX);
1893 goto out;
1895 ++state->treeDepth;
1896 /* FALL THROUGH */
1898 case REOP_LPARENNON:
1899 state->result = operandStack[operandSP - 1];
1900 if (!ParseQuantifier(state))
1901 goto out;
1902 operandStack[operandSP - 1] = state->result;
1903 goto restartOperator;
1904 default:
1905 if (!ProcessOp(state, &operatorStack[operatorSP],
1906 operandStack, operandSP))
1907 goto out;
1908 --operandSP;
1909 break;
1912 break;
1914 case '{':
1916 const WCHAR *errp = state->cp;
1918 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1920 * This didn't even scan correctly as a quantifier, so we should
1921 * treat it as flat.
1923 op = REOP_CONCAT;
1924 goto pushOperator;
1927 state->cp = errp;
1928 /* FALL THROUGH */
1931 case '+':
1932 case '*':
1933 case '?':
1934 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1935 state->cp);
1936 result = FALSE;
1937 goto out;
1939 default:
1940 /* Anything else is the start of the next term. */
1941 op = REOP_CONCAT;
1942 pushOperator:
1943 if (operatorSP == operatorStackSize) {
1944 REOpData *tmp;
1945 operatorStackSize += operatorStackSize;
1946 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1947 if (!tmp)
1948 goto out;
1949 operatorStack = tmp;
1951 operatorStack[operatorSP].op = op;
1952 operatorStack[operatorSP].errPos = state->cp;
1953 operatorStack[operatorSP++].parenIndex = parenIndex;
1954 break;
1957 out:
1958 heap_free(operatorStack);
1959 heap_free(operandStack);
1960 return result;
1964 * Save the current state of the match - the position in the input
1965 * text as well as the position in the bytecode. The state of any
1966 * parent expressions is also saved (preceding state).
1967 * Contents of parenCount parentheses from parenIndex are also saved.
1969 static REBackTrackData *
1970 PushBackTrackState(REGlobalData *gData, REOp op,
1971 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1972 size_t parenIndex, size_t parenCount)
1974 size_t i;
1975 REBackTrackData *result =
1976 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1978 size_t sz = sizeof(REBackTrackData) +
1979 gData->stateStackTop * sizeof(REProgState) +
1980 parenCount * sizeof(RECapture);
1982 ptrdiff_t btsize = gData->backTrackStackSize;
1983 ptrdiff_t btincr = ((char *)result + sz) -
1984 ((char *)gData->backTrackStack + btsize);
1986 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1988 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1989 if (btincr > 0) {
1990 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1992 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1993 btincr = ((btincr+btsize-1)/btsize)*btsize;
1994 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1995 if (!gData->backTrackStack) {
1996 js_ReportOutOfScriptQuota(gData->cx);
1997 gData->ok = FALSE;
1998 return NULL;
2000 gData->backTrackStackSize = btsize + btincr;
2001 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2003 gData->backTrackSP = result;
2004 result->sz = gData->cursz;
2005 gData->cursz = sz;
2007 result->backtrack_op = op;
2008 result->backtrack_pc = target;
2009 result->cp = cp;
2010 result->parenCount = parenCount;
2011 result->parenIndex = parenIndex;
2013 result->saveStateStackTop = gData->stateStackTop;
2014 assert(gData->stateStackTop);
2015 memcpy(result + 1, gData->stateStack,
2016 sizeof(REProgState) * result->saveStateStackTop);
2018 if (parenCount != 0) {
2019 memcpy((char *)(result + 1) +
2020 sizeof(REProgState) * result->saveStateStackTop,
2021 &x->parens[parenIndex],
2022 sizeof(RECapture) * parenCount);
2023 for (i = 0; i != parenCount; i++)
2024 x->parens[parenIndex + i].index = -1;
2027 return result;
2030 static inline REMatchState *
2031 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2032 size_t length)
2034 size_t i;
2035 assert(gData->cpend >= x->cp);
2036 if (length > (size_t)(gData->cpend - x->cp))
2037 return NULL;
2038 for (i = 0; i != length; i++) {
2039 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2040 return NULL;
2042 x->cp += length;
2043 return x;
2047 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2048 * 2. If E is not a character then go to step 6.
2049 * 3. Let ch be E's character.
2050 * 4. Let A be a one-element RECharSet containing the character ch.
2051 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2052 * 6. E must be an integer. Let n be that integer.
2053 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2054 * 8. Return an internal Matcher closure that takes two arguments, a State x
2055 * and a Continuation c, and performs the following:
2056 * 1. Let cap be x's captures internal array.
2057 * 2. Let s be cap[n].
2058 * 3. If s is undefined, then call c(x) and return its result.
2059 * 4. Let e be x's endIndex.
2060 * 5. Let len be s's length.
2061 * 6. Let f be e+len.
2062 * 7. If f>InputLength, return failure.
2063 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2064 * such that Canonicalize(s[i]) is not the same character as
2065 * Canonicalize(Input [e+i]), then return failure.
2066 * 9. Let y be the State (f, cap).
2067 * 10. Call c(y) and return its result.
2069 static REMatchState *
2070 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2072 size_t len, i;
2073 const WCHAR *parenContent;
2074 RECapture *cap = &x->parens[parenIndex];
2076 if (cap->index == -1)
2077 return x;
2079 len = cap->length;
2080 if (x->cp + len > gData->cpend)
2081 return NULL;
2083 parenContent = &gData->cpbegin[cap->index];
2084 if (gData->regexp->flags & JSREG_FOLD) {
2085 for (i = 0; i < len; i++) {
2086 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2087 return NULL;
2089 } else {
2090 for (i = 0; i < len; i++) {
2091 if (parenContent[i] != x->cp[i])
2092 return NULL;
2095 x->cp += len;
2096 return x;
2099 /* Add a single character to the RECharSet */
2100 static void
2101 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2103 UINT byteIndex = (UINT)(c >> 3);
2104 assert(c <= cs->length);
2105 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2109 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2110 static void
2111 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2113 UINT i;
2115 UINT byteIndex1 = c1 >> 3;
2116 UINT byteIndex2 = c2 >> 3;
2118 assert(c2 <= cs->length && c1 <= c2);
2120 c1 &= 0x7;
2121 c2 &= 0x7;
2123 if (byteIndex1 == byteIndex2) {
2124 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2125 } else {
2126 cs->u.bits[byteIndex1] |= 0xFF << c1;
2127 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2128 cs->u.bits[i] = 0xFF;
2129 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2133 /* Compile the source of the class into a RECharSet */
2134 static BOOL
2135 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2137 const WCHAR *src, *end;
2138 BOOL inRange = FALSE;
2139 WCHAR rangeStart = 0;
2140 UINT byteLength, n;
2141 WCHAR c, thisCh;
2142 INT nDigits, i;
2144 assert(!charSet->converted);
2146 * Assert that startIndex and length points to chars inside [] inside
2147 * source string.
2149 assert(1 <= charSet->u.src.startIndex);
2150 assert(charSet->u.src.startIndex
2151 < SysStringLen(gData->regexp->source));
2152 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2153 - 1 - charSet->u.src.startIndex);
2155 charSet->converted = TRUE;
2156 src = gData->regexp->source + charSet->u.src.startIndex;
2158 end = src + charSet->u.src.length;
2160 assert(src[-1] == '[' && end[0] == ']');
2162 byteLength = (charSet->length >> 3) + 1;
2163 charSet->u.bits = heap_alloc(byteLength);
2164 if (!charSet->u.bits) {
2165 JS_ReportOutOfMemory(gData->cx);
2166 gData->ok = FALSE;
2167 return FALSE;
2169 memset(charSet->u.bits, 0, byteLength);
2171 if (src == end)
2172 return TRUE;
2174 if (*src == '^') {
2175 assert(charSet->sense == FALSE);
2176 ++src;
2177 } else {
2178 assert(charSet->sense == TRUE);
2181 while (src != end) {
2182 switch (*src) {
2183 case '\\':
2184 ++src;
2185 c = *src++;
2186 switch (c) {
2187 case 'b':
2188 thisCh = 0x8;
2189 break;
2190 case 'f':
2191 thisCh = 0xC;
2192 break;
2193 case 'n':
2194 thisCh = 0xA;
2195 break;
2196 case 'r':
2197 thisCh = 0xD;
2198 break;
2199 case 't':
2200 thisCh = 0x9;
2201 break;
2202 case 'v':
2203 thisCh = 0xB;
2204 break;
2205 case 'c':
2206 if (src < end && JS_ISWORD(*src)) {
2207 thisCh = (WCHAR)(*src++ & 0x1F);
2208 } else {
2209 --src;
2210 thisCh = '\\';
2212 break;
2213 case 'x':
2214 nDigits = 2;
2215 goto lexHex;
2216 case 'u':
2217 nDigits = 4;
2218 lexHex:
2219 n = 0;
2220 for (i = 0; (i < nDigits) && (src < end); i++) {
2221 UINT digit;
2222 c = *src++;
2223 if (!isASCIIHexDigit(c, &digit)) {
2225 * Back off to accepting the original '\'
2226 * as a literal
2228 src -= i + 1;
2229 n = '\\';
2230 break;
2232 n = (n << 4) | digit;
2234 thisCh = (WCHAR)n;
2235 break;
2236 case '0':
2237 case '1':
2238 case '2':
2239 case '3':
2240 case '4':
2241 case '5':
2242 case '6':
2243 case '7':
2245 * This is a non-ECMA extension - decimal escapes (in this
2246 * case, octal!) are supposed to be an error inside class
2247 * ranges, but supported here for backwards compatibility.
2249 n = JS7_UNDEC(c);
2250 c = *src;
2251 if ('0' <= c && c <= '7') {
2252 src++;
2253 n = 8 * n + JS7_UNDEC(c);
2254 c = *src;
2255 if ('0' <= c && c <= '7') {
2256 src++;
2257 i = 8 * n + JS7_UNDEC(c);
2258 if (i <= 0377)
2259 n = i;
2260 else
2261 src--;
2264 thisCh = (WCHAR)n;
2265 break;
2267 case 'd':
2268 AddCharacterRangeToCharSet(charSet, '0', '9');
2269 continue; /* don't need range processing */
2270 case 'D':
2271 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2272 AddCharacterRangeToCharSet(charSet,
2273 (WCHAR)('9' + 1),
2274 (WCHAR)charSet->length);
2275 continue;
2276 case 's':
2277 for (i = (INT)charSet->length; i >= 0; i--)
2278 if (isspaceW(i))
2279 AddCharacterToCharSet(charSet, (WCHAR)i);
2280 continue;
2281 case 'S':
2282 for (i = (INT)charSet->length; i >= 0; i--)
2283 if (!isspaceW(i))
2284 AddCharacterToCharSet(charSet, (WCHAR)i);
2285 continue;
2286 case 'w':
2287 for (i = (INT)charSet->length; i >= 0; i--)
2288 if (JS_ISWORD(i))
2289 AddCharacterToCharSet(charSet, (WCHAR)i);
2290 continue;
2291 case 'W':
2292 for (i = (INT)charSet->length; i >= 0; i--)
2293 if (!JS_ISWORD(i))
2294 AddCharacterToCharSet(charSet, (WCHAR)i);
2295 continue;
2296 default:
2297 thisCh = c;
2298 break;
2301 break;
2303 default:
2304 thisCh = *src++;
2305 break;
2308 if (inRange) {
2309 if (gData->regexp->flags & JSREG_FOLD) {
2310 assert(rangeStart <= thisCh);
2311 for (i = rangeStart; i <= thisCh; i++) {
2312 WCHAR uch, dch;
2314 AddCharacterToCharSet(charSet, i);
2315 uch = toupperW(i);
2316 dch = tolowerW(i);
2317 if (i != uch)
2318 AddCharacterToCharSet(charSet, uch);
2319 if (i != dch)
2320 AddCharacterToCharSet(charSet, dch);
2322 } else {
2323 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2325 inRange = FALSE;
2326 } else {
2327 if (gData->regexp->flags & JSREG_FOLD) {
2328 AddCharacterToCharSet(charSet, toupperW(thisCh));
2329 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2330 } else {
2331 AddCharacterToCharSet(charSet, thisCh);
2333 if (src < end - 1) {
2334 if (*src == '-') {
2335 ++src;
2336 inRange = TRUE;
2337 rangeStart = thisCh;
2342 return TRUE;
2345 static BOOL
2346 ReallocStateStack(REGlobalData *gData)
2348 size_t limit = gData->stateStackLimit;
2349 size_t sz = sizeof(REProgState) * limit;
2351 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2352 if (!gData->stateStack) {
2353 js_ReportOutOfScriptQuota(gData->cx);
2354 gData->ok = FALSE;
2355 return FALSE;
2357 gData->stateStackLimit = limit + limit;
2358 return TRUE;
2361 #define PUSH_STATE_STACK(data) \
2362 do { \
2363 ++(data)->stateStackTop; \
2364 if ((data)->stateStackTop == (data)->stateStackLimit && \
2365 !ReallocStateStack((data))) { \
2366 return NULL; \
2368 }while(0)
2371 * Apply the current op against the given input to see if it's going to match
2372 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2373 * true, then update the current state's cp. Always update startpc to the next
2374 * op.
2376 static inline REMatchState *
2377 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2378 jsbytecode **startpc, BOOL updatecp)
2380 REMatchState *result = NULL;
2381 WCHAR matchCh;
2382 size_t parenIndex;
2383 size_t offset, length, index;
2384 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2385 WCHAR *source;
2386 const WCHAR *startcp = x->cp;
2387 WCHAR ch;
2388 RECharSet *charSet;
2390 const char *opname = reop_names[op];
2391 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2392 (int)gData->stateStackTop * 2, "", opname);
2394 switch (op) {
2395 case REOP_EMPTY:
2396 result = x;
2397 break;
2398 case REOP_BOL:
2399 if (x->cp != gData->cpbegin) {
2400 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2401 !(gData->regexp->flags & JSREG_MULTILINE)) {
2402 break;
2404 if (!RE_IS_LINE_TERM(x->cp[-1]))
2405 break;
2407 result = x;
2408 break;
2409 case REOP_EOL:
2410 if (x->cp != gData->cpend) {
2411 if (/*!gData->cx->regExpStatics.multiline &&*/
2412 !(gData->regexp->flags & JSREG_MULTILINE)) {
2413 break;
2415 if (!RE_IS_LINE_TERM(*x->cp))
2416 break;
2418 result = x;
2419 break;
2420 case REOP_WBDRY:
2421 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2422 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2423 result = x;
2425 break;
2426 case REOP_WNONBDRY:
2427 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2428 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2429 result = x;
2431 break;
2432 case REOP_DOT:
2433 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2434 result = x;
2435 result->cp++;
2437 break;
2438 case REOP_DIGIT:
2439 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2440 result = x;
2441 result->cp++;
2443 break;
2444 case REOP_NONDIGIT:
2445 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2446 result = x;
2447 result->cp++;
2449 break;
2450 case REOP_ALNUM:
2451 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2452 result = x;
2453 result->cp++;
2455 break;
2456 case REOP_NONALNUM:
2457 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2458 result = x;
2459 result->cp++;
2461 break;
2462 case REOP_SPACE:
2463 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2464 result = x;
2465 result->cp++;
2467 break;
2468 case REOP_NONSPACE:
2469 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2470 result = x;
2471 result->cp++;
2473 break;
2474 case REOP_BACKREF:
2475 pc = ReadCompactIndex(pc, &parenIndex);
2476 assert(parenIndex < gData->regexp->parenCount);
2477 result = BackrefMatcher(gData, x, parenIndex);
2478 break;
2479 case REOP_FLAT:
2480 pc = ReadCompactIndex(pc, &offset);
2481 assert(offset < SysStringLen(gData->regexp->source));
2482 pc = ReadCompactIndex(pc, &length);
2483 assert(1 <= length);
2484 assert(length <= SysStringLen(gData->regexp->source) - offset);
2485 if (length <= (size_t)(gData->cpend - x->cp)) {
2486 source = gData->regexp->source + offset;
2487 TRACE("%s\n", debugstr_wn(source, length));
2488 for (index = 0; index != length; index++) {
2489 if (source[index] != x->cp[index])
2490 return NULL;
2492 x->cp += length;
2493 result = x;
2495 break;
2496 case REOP_FLAT1:
2497 matchCh = *pc++;
2498 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2499 if (x->cp != gData->cpend && *x->cp == matchCh) {
2500 result = x;
2501 result->cp++;
2503 break;
2504 case REOP_FLATi:
2505 pc = ReadCompactIndex(pc, &offset);
2506 assert(offset < SysStringLen(gData->regexp->source));
2507 pc = ReadCompactIndex(pc, &length);
2508 assert(1 <= length);
2509 assert(length <= SysStringLen(gData->regexp->source) - offset);
2510 source = gData->regexp->source;
2511 result = FlatNIMatcher(gData, x, source + offset, length);
2512 break;
2513 case REOP_FLAT1i:
2514 matchCh = *pc++;
2515 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2516 result = x;
2517 result->cp++;
2519 break;
2520 case REOP_UCFLAT1:
2521 matchCh = GET_ARG(pc);
2522 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2523 pc += ARG_LEN;
2524 if (x->cp != gData->cpend && *x->cp == matchCh) {
2525 result = x;
2526 result->cp++;
2528 break;
2529 case REOP_UCFLAT1i:
2530 matchCh = GET_ARG(pc);
2531 pc += ARG_LEN;
2532 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2533 result = x;
2534 result->cp++;
2536 break;
2537 case REOP_CLASS:
2538 pc = ReadCompactIndex(pc, &index);
2539 assert(index < gData->regexp->classCount);
2540 if (x->cp != gData->cpend) {
2541 charSet = &gData->regexp->classList[index];
2542 assert(charSet->converted);
2543 ch = *x->cp;
2544 index = ch >> 3;
2545 if (charSet->length != 0 &&
2546 ch <= charSet->length &&
2547 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2548 result = x;
2549 result->cp++;
2552 break;
2553 case REOP_NCLASS:
2554 pc = ReadCompactIndex(pc, &index);
2555 assert(index < gData->regexp->classCount);
2556 if (x->cp != gData->cpend) {
2557 charSet = &gData->regexp->classList[index];
2558 assert(charSet->converted);
2559 ch = *x->cp;
2560 index = ch >> 3;
2561 if (charSet->length == 0 ||
2562 ch > charSet->length ||
2563 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2564 result = x;
2565 result->cp++;
2568 break;
2570 default:
2571 assert(FALSE);
2573 if (result) {
2574 if (!updatecp)
2575 x->cp = startcp;
2576 *startpc = pc;
2577 TRACE(" *\n");
2578 return result;
2580 x->cp = startcp;
2581 return NULL;
2584 static inline REMatchState *
2585 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2587 REMatchState *result = NULL;
2588 REBackTrackData *backTrackData;
2589 jsbytecode *nextpc, *testpc;
2590 REOp nextop;
2591 RECapture *cap;
2592 REProgState *curState;
2593 const WCHAR *startcp;
2594 size_t parenIndex, k;
2595 size_t parenSoFar = 0;
2597 WCHAR matchCh1, matchCh2;
2598 RECharSet *charSet;
2600 BOOL anchor;
2601 jsbytecode *pc = gData->regexp->program;
2602 REOp op = (REOp) *pc++;
2605 * If the first node is a simple match, step the index into the string
2606 * until that match is made, or fail if it can't be found at all.
2608 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2609 anchor = FALSE;
2610 while (x->cp <= gData->cpend) {
2611 nextpc = pc; /* reset back to start each time */
2612 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2613 if (result) {
2614 anchor = TRUE;
2615 x = result;
2616 pc = nextpc; /* accept skip to next opcode */
2617 op = (REOp) *pc++;
2618 assert(op < REOP_LIMIT);
2619 break;
2621 gData->skipped++;
2622 x->cp++;
2624 if (!anchor)
2625 goto bad;
2628 for (;;) {
2629 const char *opname = reop_names[op];
2630 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2631 (int)gData->stateStackTop * 2, "", opname);
2633 if (REOP_IS_SIMPLE(op)) {
2634 result = SimpleMatch(gData, x, op, &pc, TRUE);
2635 } else {
2636 curState = &gData->stateStack[gData->stateStackTop];
2637 switch (op) {
2638 case REOP_END:
2639 goto good;
2640 case REOP_ALTPREREQ2:
2641 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2642 pc += ARG_LEN;
2643 matchCh2 = GET_ARG(pc);
2644 pc += ARG_LEN;
2645 k = GET_ARG(pc);
2646 pc += ARG_LEN;
2648 if (x->cp != gData->cpend) {
2649 if (*x->cp == matchCh2)
2650 goto doAlt;
2652 charSet = &gData->regexp->classList[k];
2653 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2654 goto bad;
2655 matchCh1 = *x->cp;
2656 k = matchCh1 >> 3;
2657 if ((charSet->length == 0 ||
2658 matchCh1 > charSet->length ||
2659 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2660 charSet->sense) {
2661 goto doAlt;
2664 result = NULL;
2665 break;
2667 case REOP_ALTPREREQ:
2668 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2669 pc += ARG_LEN;
2670 matchCh1 = GET_ARG(pc);
2671 pc += ARG_LEN;
2672 matchCh2 = GET_ARG(pc);
2673 pc += ARG_LEN;
2674 if (x->cp == gData->cpend ||
2675 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2676 result = NULL;
2677 break;
2679 /* else fall through... */
2681 case REOP_ALT:
2682 doAlt:
2683 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2684 pc += ARG_LEN; /* start of this alternate */
2685 curState->parenSoFar = parenSoFar;
2686 PUSH_STATE_STACK(gData);
2687 op = (REOp) *pc++;
2688 startcp = x->cp;
2689 if (REOP_IS_SIMPLE(op)) {
2690 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2691 op = (REOp) *nextpc++;
2692 pc = nextpc;
2693 continue;
2695 result = x;
2696 op = (REOp) *pc++;
2698 nextop = (REOp) *nextpc++;
2699 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2700 goto bad;
2701 continue;
2704 * Occurs at (successful) end of REOP_ALT,
2706 case REOP_JUMP:
2708 * If we have not gotten a result here, it is because of an
2709 * empty match. Do the same thing REOP_EMPTY would do.
2711 if (!result)
2712 result = x;
2714 --gData->stateStackTop;
2715 pc += GET_OFFSET(pc);
2716 op = (REOp) *pc++;
2717 continue;
2720 * Occurs at last (successful) end of REOP_ALT,
2722 case REOP_ENDALT:
2724 * If we have not gotten a result here, it is because of an
2725 * empty match. Do the same thing REOP_EMPTY would do.
2727 if (!result)
2728 result = x;
2730 --gData->stateStackTop;
2731 op = (REOp) *pc++;
2732 continue;
2734 case REOP_LPAREN:
2735 pc = ReadCompactIndex(pc, &parenIndex);
2736 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2737 assert(parenIndex < gData->regexp->parenCount);
2738 if (parenIndex + 1 > parenSoFar)
2739 parenSoFar = parenIndex + 1;
2740 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2741 x->parens[parenIndex].length = 0;
2742 op = (REOp) *pc++;
2743 continue;
2745 case REOP_RPAREN:
2747 ptrdiff_t delta;
2749 pc = ReadCompactIndex(pc, &parenIndex);
2750 assert(parenIndex < gData->regexp->parenCount);
2751 cap = &x->parens[parenIndex];
2752 delta = x->cp - (gData->cpbegin + cap->index);
2753 cap->length = (delta < 0) ? 0 : (size_t) delta;
2754 op = (REOp) *pc++;
2756 if (!result)
2757 result = x;
2758 continue;
2760 case REOP_ASSERT:
2761 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2762 pc += ARG_LEN; /* start of ASSERT child */
2763 op = (REOp) *pc++;
2764 testpc = pc;
2765 if (REOP_IS_SIMPLE(op) &&
2766 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2767 result = NULL;
2768 break;
2770 curState->u.assertion.top =
2771 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2772 curState->u.assertion.sz = gData->cursz;
2773 curState->index = x->cp - gData->cpbegin;
2774 curState->parenSoFar = parenSoFar;
2775 PUSH_STATE_STACK(gData);
2776 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2777 nextpc, x, x->cp, 0, 0)) {
2778 goto bad;
2780 continue;
2782 case REOP_ASSERT_NOT:
2783 nextpc = pc + GET_OFFSET(pc);
2784 pc += ARG_LEN;
2785 op = (REOp) *pc++;
2786 testpc = pc;
2787 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2788 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2789 *testpc == REOP_ASSERTNOTTEST) {
2790 result = NULL;
2791 break;
2793 curState->u.assertion.top
2794 = (char *)gData->backTrackSP -
2795 (char *)gData->backTrackStack;
2796 curState->u.assertion.sz = gData->cursz;
2797 curState->index = x->cp - gData->cpbegin;
2798 curState->parenSoFar = parenSoFar;
2799 PUSH_STATE_STACK(gData);
2800 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2801 nextpc, x, x->cp, 0, 0)) {
2802 goto bad;
2804 continue;
2806 case REOP_ASSERTTEST:
2807 --gData->stateStackTop;
2808 --curState;
2809 x->cp = gData->cpbegin + curState->index;
2810 gData->backTrackSP =
2811 (REBackTrackData *) ((char *)gData->backTrackStack +
2812 curState->u.assertion.top);
2813 gData->cursz = curState->u.assertion.sz;
2814 if (result)
2815 result = x;
2816 break;
2818 case REOP_ASSERTNOTTEST:
2819 --gData->stateStackTop;
2820 --curState;
2821 x->cp = gData->cpbegin + curState->index;
2822 gData->backTrackSP =
2823 (REBackTrackData *) ((char *)gData->backTrackStack +
2824 curState->u.assertion.top);
2825 gData->cursz = curState->u.assertion.sz;
2826 result = (!result) ? x : NULL;
2827 break;
2828 case REOP_STAR:
2829 curState->u.quantifier.min = 0;
2830 curState->u.quantifier.max = (UINT)-1;
2831 goto quantcommon;
2832 case REOP_PLUS:
2833 curState->u.quantifier.min = 1;
2834 curState->u.quantifier.max = (UINT)-1;
2835 goto quantcommon;
2836 case REOP_OPT:
2837 curState->u.quantifier.min = 0;
2838 curState->u.quantifier.max = 1;
2839 goto quantcommon;
2840 case REOP_QUANT:
2841 pc = ReadCompactIndex(pc, &k);
2842 curState->u.quantifier.min = k;
2843 pc = ReadCompactIndex(pc, &k);
2844 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2845 curState->u.quantifier.max = k - 1;
2846 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2847 quantcommon:
2848 if (curState->u.quantifier.max == 0) {
2849 pc = pc + GET_OFFSET(pc);
2850 op = (REOp) *pc++;
2851 result = x;
2852 continue;
2854 /* Step over <next> */
2855 nextpc = pc + ARG_LEN;
2856 op = (REOp) *nextpc++;
2857 startcp = x->cp;
2858 if (REOP_IS_SIMPLE(op)) {
2859 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2860 if (curState->u.quantifier.min == 0)
2861 result = x;
2862 else
2863 result = NULL;
2864 pc = pc + GET_OFFSET(pc);
2865 break;
2867 op = (REOp) *nextpc++;
2868 result = x;
2870 curState->index = startcp - gData->cpbegin;
2871 curState->continue_op = REOP_REPEAT;
2872 curState->continue_pc = pc;
2873 curState->parenSoFar = parenSoFar;
2874 PUSH_STATE_STACK(gData);
2875 if (curState->u.quantifier.min == 0 &&
2876 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2877 0, 0)) {
2878 goto bad;
2880 pc = nextpc;
2881 continue;
2883 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2884 pc = curState[-1].continue_pc;
2885 op = (REOp) curState[-1].continue_op;
2887 if (!result)
2888 result = x;
2889 continue;
2891 case REOP_REPEAT:
2892 --curState;
2893 do {
2894 --gData->stateStackTop;
2895 if (!result) {
2896 /* Failed, see if we have enough children. */
2897 if (curState->u.quantifier.min == 0)
2898 goto repeatDone;
2899 goto break_switch;
2901 if (curState->u.quantifier.min == 0 &&
2902 x->cp == gData->cpbegin + curState->index) {
2903 /* matched an empty string, that'll get us nowhere */
2904 result = NULL;
2905 goto break_switch;
2907 if (curState->u.quantifier.min != 0)
2908 curState->u.quantifier.min--;
2909 if (curState->u.quantifier.max != (UINT) -1)
2910 curState->u.quantifier.max--;
2911 if (curState->u.quantifier.max == 0)
2912 goto repeatDone;
2913 nextpc = pc + ARG_LEN;
2914 nextop = (REOp) *nextpc;
2915 startcp = x->cp;
2916 if (REOP_IS_SIMPLE(nextop)) {
2917 nextpc++;
2918 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2919 if (curState->u.quantifier.min == 0)
2920 goto repeatDone;
2921 result = NULL;
2922 goto break_switch;
2924 result = x;
2926 curState->index = startcp - gData->cpbegin;
2927 PUSH_STATE_STACK(gData);
2928 if (curState->u.quantifier.min == 0 &&
2929 !PushBackTrackState(gData, REOP_REPEAT,
2930 pc, x, startcp,
2931 curState->parenSoFar,
2932 parenSoFar -
2933 curState->parenSoFar)) {
2934 goto bad;
2936 } while (*nextpc == REOP_ENDCHILD);
2937 pc = nextpc;
2938 op = (REOp) *pc++;
2939 parenSoFar = curState->parenSoFar;
2940 continue;
2942 repeatDone:
2943 result = x;
2944 pc += GET_OFFSET(pc);
2945 goto break_switch;
2947 case REOP_MINIMALSTAR:
2948 curState->u.quantifier.min = 0;
2949 curState->u.quantifier.max = (UINT)-1;
2950 goto minimalquantcommon;
2951 case REOP_MINIMALPLUS:
2952 curState->u.quantifier.min = 1;
2953 curState->u.quantifier.max = (UINT)-1;
2954 goto minimalquantcommon;
2955 case REOP_MINIMALOPT:
2956 curState->u.quantifier.min = 0;
2957 curState->u.quantifier.max = 1;
2958 goto minimalquantcommon;
2959 case REOP_MINIMALQUANT:
2960 pc = ReadCompactIndex(pc, &k);
2961 curState->u.quantifier.min = k;
2962 pc = ReadCompactIndex(pc, &k);
2963 /* See REOP_QUANT comments about k - 1. */
2964 curState->u.quantifier.max = k - 1;
2965 assert(curState->u.quantifier.min
2966 <= curState->u.quantifier.max);
2967 minimalquantcommon:
2968 curState->index = x->cp - gData->cpbegin;
2969 curState->parenSoFar = parenSoFar;
2970 PUSH_STATE_STACK(gData);
2971 if (curState->u.quantifier.min != 0) {
2972 curState->continue_op = REOP_MINIMALREPEAT;
2973 curState->continue_pc = pc;
2974 /* step over <next> */
2975 pc += OFFSET_LEN;
2976 op = (REOp) *pc++;
2977 } else {
2978 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2979 pc, x, x->cp, 0, 0)) {
2980 goto bad;
2982 --gData->stateStackTop;
2983 pc = pc + GET_OFFSET(pc);
2984 op = (REOp) *pc++;
2986 continue;
2988 case REOP_MINIMALREPEAT:
2989 --gData->stateStackTop;
2990 --curState;
2992 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2993 #define PREPARE_REPEAT() \
2994 do { \
2995 curState->index = x->cp - gData->cpbegin; \
2996 curState->continue_op = REOP_MINIMALREPEAT; \
2997 curState->continue_pc = pc; \
2998 pc += ARG_LEN; \
2999 for (k = curState->parenSoFar; k < parenSoFar; k++) \
3000 x->parens[k].index = -1; \
3001 PUSH_STATE_STACK(gData); \
3002 op = (REOp) *pc++; \
3003 assert(op < REOP_LIMIT); \
3004 }while(0)
3006 if (!result) {
3007 TRACE(" -\n");
3009 * Non-greedy failure - try to consume another child.
3011 if (curState->u.quantifier.max == (UINT) -1 ||
3012 curState->u.quantifier.max > 0) {
3013 PREPARE_REPEAT();
3014 continue;
3016 /* Don't need to adjust pc since we're going to pop. */
3017 break;
3019 if (curState->u.quantifier.min == 0 &&
3020 x->cp == gData->cpbegin + curState->index) {
3021 /* Matched an empty string, that'll get us nowhere. */
3022 result = NULL;
3023 break;
3025 if (curState->u.quantifier.min != 0)
3026 curState->u.quantifier.min--;
3027 if (curState->u.quantifier.max != (UINT) -1)
3028 curState->u.quantifier.max--;
3029 if (curState->u.quantifier.min != 0) {
3030 PREPARE_REPEAT();
3031 continue;
3033 curState->index = x->cp - gData->cpbegin;
3034 curState->parenSoFar = parenSoFar;
3035 PUSH_STATE_STACK(gData);
3036 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3037 pc, x, x->cp,
3038 curState->parenSoFar,
3039 parenSoFar - curState->parenSoFar)) {
3040 goto bad;
3042 --gData->stateStackTop;
3043 pc = pc + GET_OFFSET(pc);
3044 op = (REOp) *pc++;
3045 assert(op < REOP_LIMIT);
3046 continue;
3047 default:
3048 assert(FALSE);
3049 result = NULL;
3051 break_switch:;
3055 * If the match failed and there's a backtrack option, take it.
3056 * Otherwise this is a complete and utter failure.
3058 if (!result) {
3059 if (gData->cursz == 0)
3060 return NULL;
3062 /* Potentially detect explosive regex here. */
3063 gData->backTrackCount++;
3064 if (gData->backTrackLimit &&
3065 gData->backTrackCount >= gData->backTrackLimit) {
3066 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3067 JSMSG_REGEXP_TOO_COMPLEX);
3068 gData->ok = FALSE;
3069 return NULL;
3072 backTrackData = gData->backTrackSP;
3073 gData->cursz = backTrackData->sz;
3074 gData->backTrackSP =
3075 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3076 x->cp = backTrackData->cp;
3077 pc = backTrackData->backtrack_pc;
3078 op = (REOp) backTrackData->backtrack_op;
3079 assert(op < REOP_LIMIT);
3080 gData->stateStackTop = backTrackData->saveStateStackTop;
3081 assert(gData->stateStackTop);
3083 memcpy(gData->stateStack, backTrackData + 1,
3084 sizeof(REProgState) * backTrackData->saveStateStackTop);
3085 curState = &gData->stateStack[gData->stateStackTop - 1];
3087 if (backTrackData->parenCount) {
3088 memcpy(&x->parens[backTrackData->parenIndex],
3089 (char *)(backTrackData + 1) +
3090 sizeof(REProgState) * backTrackData->saveStateStackTop,
3091 sizeof(RECapture) * backTrackData->parenCount);
3092 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3093 } else {
3094 for (k = curState->parenSoFar; k < parenSoFar; k++)
3095 x->parens[k].index = -1;
3096 parenSoFar = curState->parenSoFar;
3099 TRACE("\tBT_Pop: %ld,%ld\n",
3100 (ULONG_PTR)backTrackData->parenIndex,
3101 (ULONG_PTR)backTrackData->parenCount);
3102 continue;
3104 x = result;
3107 * Continue with the expression.
3109 op = (REOp)*pc++;
3110 assert(op < REOP_LIMIT);
3113 bad:
3114 TRACE("\n");
3115 return NULL;
3117 good:
3118 TRACE("\n");
3119 return x;
3122 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3124 REMatchState *result;
3125 const WCHAR *cp = x->cp;
3126 const WCHAR *cp2;
3127 UINT j;
3130 * Have to include the position beyond the last character
3131 * in order to detect end-of-input/line condition.
3133 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3134 gData->skipped = cp2 - cp;
3135 x->cp = cp2;
3136 for (j = 0; j < gData->regexp->parenCount; j++)
3137 x->parens[j].index = -1;
3138 result = ExecuteREBytecode(gData, x);
3139 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3140 return result;
3141 gData->backTrackSP = gData->backTrackStack;
3142 gData->cursz = 0;
3143 gData->stateStackTop = 0;
3144 cp2 = cp + gData->skipped;
3146 return NULL;
3149 #define MIN_BACKTRACK_LIMIT 400000
3151 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3153 REMatchState *result;
3154 UINT i;
3156 gData->backTrackStackSize = INITIAL_BACKTRACK;
3157 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3158 if (!gData->backTrackStack)
3159 goto bad;
3161 gData->backTrackSP = gData->backTrackStack;
3162 gData->cursz = 0;
3163 gData->backTrackCount = 0;
3164 gData->backTrackLimit = 0;
3166 gData->stateStackLimit = INITIAL_STATESTACK;
3167 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3168 if (!gData->stateStack)
3169 goto bad;
3171 gData->stateStackTop = 0;
3172 gData->cx = cx;
3173 gData->regexp = re;
3174 gData->ok = TRUE;
3176 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3177 if (!result)
3178 goto bad;
3180 for (i = 0; i < re->classCount; i++) {
3181 if (!re->classList[i].converted &&
3182 !ProcessCharSet(gData, &re->classList[i])) {
3183 return NULL;
3187 return result;
3189 bad:
3190 js_ReportOutOfScriptQuota(cx);
3191 gData->ok = FALSE;
3192 return NULL;
3195 static void
3196 js_DestroyRegExp(JSRegExp *re)
3198 if (re->classList) {
3199 UINT i;
3200 for (i = 0; i < re->classCount; i++) {
3201 if (re->classList[i].converted)
3202 heap_free(re->classList[i].u.bits);
3203 re->classList[i].u.bits = NULL;
3205 heap_free(re->classList);
3207 heap_free(re);
3210 static JSRegExp *
3211 js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
3213 JSRegExp *re;
3214 jsheap_t *mark;
3215 CompilerState state;
3216 size_t resize;
3217 jsbytecode *endPC;
3218 UINT i;
3219 size_t len;
3221 re = NULL;
3222 mark = jsheap_mark(&cx->tmp_heap);
3223 len = SysStringLen(str);
3225 state.context = cx;
3226 state.cp = str;
3227 if (!state.cp)
3228 goto out;
3229 state.cpbegin = state.cp;
3230 state.cpend = state.cp + len;
3231 state.flags = flags;
3232 state.parenCount = 0;
3233 state.classCount = 0;
3234 state.progLength = 0;
3235 state.treeDepth = 0;
3236 state.classBitmapsMem = 0;
3237 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3238 state.classCache[i].start = NULL;
3240 if (len != 0 && flat) {
3241 state.result = NewRENode(&state, REOP_FLAT);
3242 if (!state.result)
3243 goto out;
3244 state.result->u.flat.chr = *state.cpbegin;
3245 state.result->u.flat.length = len;
3246 state.result->kid = (void *) state.cpbegin;
3247 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3248 state.progLength += 1 + GetCompactIndexWidth(0)
3249 + GetCompactIndexWidth(len);
3250 } else {
3251 if (!ParseRegExp(&state))
3252 goto out;
3254 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3255 re = heap_alloc(resize);
3256 if (!re)
3257 goto out;
3259 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3260 re->classCount = state.classCount;
3261 if (re->classCount) {
3262 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3263 if (!re->classList) {
3264 js_DestroyRegExp(re);
3265 re = NULL;
3266 goto out;
3268 for (i = 0; i < re->classCount; i++)
3269 re->classList[i].converted = FALSE;
3270 } else {
3271 re->classList = NULL;
3273 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3274 if (!endPC) {
3275 js_DestroyRegExp(re);
3276 re = NULL;
3277 goto out;
3279 *endPC++ = REOP_END;
3281 * Check whether size was overestimated and shrink using realloc.
3282 * This is safe since no pointers to newly parsed regexp or its parts
3283 * besides re exist here.
3285 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3286 JSRegExp *tmp;
3287 assert((size_t)(endPC - re->program) < state.progLength + 1);
3288 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3289 tmp = heap_realloc(re, resize);
3290 if (tmp)
3291 re = tmp;
3294 re->flags = flags;
3295 re->parenCount = state.parenCount;
3296 re->source = str;
3298 out:
3299 jsheap_clear(mark);
3300 return re;
3303 static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
3305 return (RegExpInstance*)vdisp->u.jsdisp;
3308 static void set_last_index(RegExpInstance *This, DWORD last_index)
3310 This->last_index = last_index;
3311 VariantClear(&This->last_index_var);
3312 num_set_val(&This->last_index_var, last_index);
3315 static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, DWORD rem_flags,
3316 const WCHAR *str, DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size,
3317 DWORD *parens_cnt, match_result_t *ret)
3319 REMatchState *x, *result;
3320 REGlobalData gData;
3321 DWORD matchlen;
3323 gData.cpbegin = str;
3324 gData.cpend = str + len;
3325 gData.start = *cp-str;
3326 gData.skipped = 0;
3327 gData.pool = &ctx->tmp_heap;
3329 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3330 if(!x) {
3331 WARN("InitMatch failed\n");
3332 return E_FAIL;
3335 x->cp = *cp;
3336 result = MatchRegExp(&gData, x);
3337 if(!gData.ok) {
3338 WARN("MatchRegExp failed\n");
3339 return E_FAIL;
3342 if(!result) {
3343 if(rem_flags & REM_RESET_INDEX)
3344 set_last_index(regexp, 0);
3345 return S_FALSE;
3348 if(parens) {
3349 if(regexp->jsregexp->parenCount > *parens_size) {
3350 match_result_t *new_parens;
3352 if(*parens)
3353 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3354 else
3355 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3356 if(!new_parens)
3357 return E_OUTOFMEMORY;
3359 *parens = new_parens;
3363 /* FIXME: We often already have a copy of input string that we could use to store last match */
3364 if(!(rem_flags & REM_NO_CTX_UPDATE) &&
3365 (!ctx->last_match || len != SysStringLen(ctx->last_match) || strncmpW(ctx->last_match, str, len))) {
3366 BSTR last_match;
3368 last_match = SysAllocStringLen(str, len);
3369 if(!last_match)
3370 return E_OUTOFMEMORY;
3371 SysFreeString(ctx->last_match);
3372 ctx->last_match = last_match;
3375 if(parens) {
3376 DWORD i;
3378 *parens_cnt = regexp->jsregexp->parenCount;
3380 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3381 if(result->parens[i].index == -1) {
3382 (*parens)[i].str = NULL;
3383 (*parens)[i].len = 0;
3384 }else {
3385 (*parens)[i].str = str + result->parens[i].index;
3386 (*parens)[i].len = result->parens[i].length;
3391 matchlen = (result->cp-*cp) - gData.skipped;
3392 *cp = result->cp;
3393 ret->str = result->cp-matchlen;
3394 ret->len = matchlen;
3395 set_last_index(regexp, result->cp-str);
3397 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3398 ctx->last_match_index = ret->str-str;
3399 ctx->last_match_length = matchlen;
3402 return S_OK;
3405 HRESULT regexp_match_next(script_ctx_t *ctx, jsdisp_t *dispex, DWORD rem_flags, const WCHAR *str,
3406 DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt,
3407 match_result_t *ret)
3409 RegExpInstance *regexp = (RegExpInstance*)dispex;
3410 jsheap_t *mark;
3411 HRESULT hres;
3413 if((rem_flags & REM_CHECK_GLOBAL) && !(regexp->jsregexp->flags & JSREG_GLOB))
3414 return S_FALSE;
3416 mark = jsheap_mark(&ctx->tmp_heap);
3418 hres = do_regexp_match_next(ctx, regexp, rem_flags, str, len, cp, parens, parens_size, parens_cnt, ret);
3420 jsheap_clear(mark);
3421 return hres;
3424 HRESULT regexp_match(script_ctx_t *ctx, jsdisp_t *dispex, const WCHAR *str, DWORD len, BOOL gflag,
3425 match_result_t **match_result, DWORD *result_cnt)
3427 RegExpInstance *This = (RegExpInstance*)dispex;
3428 match_result_t *ret = NULL, cres;
3429 const WCHAR *cp = str;
3430 DWORD i=0, ret_size = 0;
3431 jsheap_t *mark;
3432 HRESULT hres;
3434 mark = jsheap_mark(&ctx->tmp_heap);
3436 while(1) {
3437 hres = do_regexp_match_next(ctx, This, 0, str, len, &cp, NULL, NULL, NULL, &cres);
3438 if(hres == S_FALSE) {
3439 hres = S_OK;
3440 break;
3443 if(FAILED(hres))
3444 break;
3446 if(ret_size == i) {
3447 if(ret)
3448 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3449 else
3450 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3451 if(!ret) {
3452 hres = E_OUTOFMEMORY;
3453 break;
3457 ret[i++] = cres;
3459 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3460 hres = S_OK;
3461 break;
3465 jsheap_clear(mark);
3466 if(FAILED(hres)) {
3467 heap_free(ret);
3468 return hres;
3471 *match_result = ret;
3472 *result_cnt = i;
3473 return S_OK;
3476 static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3477 VARIANT *retv, jsexcept_t *ei)
3479 TRACE("\n");
3481 switch(flags) {
3482 case DISPATCH_PROPERTYGET: {
3483 RegExpInstance *This = regexp_from_vdisp(jsthis);
3485 V_VT(retv) = VT_BSTR;
3486 V_BSTR(retv) = SysAllocString(This->str);
3487 if(!V_BSTR(retv))
3488 return E_OUTOFMEMORY;
3489 break;
3491 default:
3492 FIXME("Unimplemented flags %x\n", flags);
3493 return E_NOTIMPL;
3496 return S_OK;
3499 static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3500 VARIANT *retv, jsexcept_t *ei)
3502 FIXME("\n");
3503 return E_NOTIMPL;
3506 static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3507 VARIANT *retv, jsexcept_t *ei)
3509 FIXME("\n");
3510 return E_NOTIMPL;
3513 static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3514 VARIANT *retv, jsexcept_t *ei)
3516 FIXME("\n");
3517 return E_NOTIMPL;
3520 static INT index_from_var(script_ctx_t *ctx, VARIANT *v)
3522 jsexcept_t ei;
3523 double n;
3524 HRESULT hres;
3526 memset(&ei, 0, sizeof(ei));
3527 hres = to_number(ctx, v, &ei, &n);
3528 if(FAILED(hres)) { /* FIXME: Move ignoring exceptions to to_primitive */
3529 VariantClear(&ei.var);
3530 return 0;
3533 n = floor(n);
3534 return is_int32(n) ? n : 0;
3537 static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3538 VARIANT *retv, jsexcept_t *ei)
3540 TRACE("\n");
3542 switch(flags) {
3543 case DISPATCH_PROPERTYGET: {
3544 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3546 V_VT(retv) = VT_EMPTY;
3547 return VariantCopy(retv, &regexp->last_index_var);
3549 case DISPATCH_PROPERTYPUT: {
3550 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3551 HRESULT hres;
3553 hres = VariantCopy(&regexp->last_index_var, argv);
3554 if(FAILED(hres))
3555 return hres;
3557 regexp->last_index = index_from_var(ctx, argv);
3558 break;
3560 default:
3561 FIXME("unimplemented flags: %x\n", flags);
3562 return E_NOTIMPL;
3565 return S_OK;
3568 static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3569 VARIANT *retv, jsexcept_t *ei)
3571 FIXME("\n");
3572 return E_NOTIMPL;
3575 static HRESULT create_match_array(script_ctx_t *ctx, BSTR input, const match_result_t *result,
3576 const match_result_t *parens, DWORD parens_cnt, jsexcept_t *ei, IDispatch **ret)
3578 jsdisp_t *array;
3579 VARIANT var;
3580 int i;
3581 HRESULT hres = S_OK;
3583 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3584 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3585 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3586 static const WCHAR zeroW[] = {'0',0};
3588 hres = create_array(ctx, parens_cnt+1, &array);
3589 if(FAILED(hres))
3590 return hres;
3592 for(i=0; i < parens_cnt; i++) {
3593 V_VT(&var) = VT_BSTR;
3594 V_BSTR(&var) = SysAllocStringLen(parens[i].str, parens[i].len);
3595 if(!V_BSTR(&var)) {
3596 hres = E_OUTOFMEMORY;
3597 break;
3600 hres = jsdisp_propput_idx(array, i+1, &var, ei);
3601 SysFreeString(V_BSTR(&var));
3602 if(FAILED(hres))
3603 break;
3606 while(SUCCEEDED(hres)) {
3607 num_set_int(&var, result->str-input);
3608 hres = jsdisp_propput_name(array, indexW, &var, ei);
3609 if(FAILED(hres))
3610 break;
3612 num_set_int(&var, result->str-input+result->len);
3613 hres = jsdisp_propput_name(array, lastIndexW, &var, ei);
3614 if(FAILED(hres))
3615 break;
3617 V_VT(&var) = VT_BSTR;
3618 V_BSTR(&var) = input;
3619 hres = jsdisp_propput_name(array, inputW, &var, ei);
3620 if(FAILED(hres))
3621 break;
3623 V_BSTR(&var) = SysAllocStringLen(result->str, result->len);
3624 if(!V_BSTR(&var)) {
3625 hres = E_OUTOFMEMORY;
3626 break;
3628 hres = jsdisp_propput_name(array, zeroW, &var, ei);
3629 SysFreeString(V_BSTR(&var));
3630 break;
3633 if(FAILED(hres)) {
3634 jsdisp_release(array);
3635 return hres;
3638 *ret = to_disp(array);
3639 return S_OK;
3642 static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, VARIANT *arg, jsexcept_t *ei, BSTR *input,
3643 match_result_t *match, match_result_t **parens, DWORD *parens_cnt, VARIANT_BOOL *ret)
3645 RegExpInstance *regexp;
3646 DWORD parens_size = 0, last_index = 0, length;
3647 const WCHAR *cp;
3648 BSTR string;
3649 HRESULT hres;
3651 if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
3652 FIXME("Not a RegExp\n");
3653 return E_NOTIMPL;
3656 regexp = regexp_from_vdisp(jsthis);
3658 if(arg) {
3659 hres = to_string(ctx, arg, ei, &string);
3660 if(FAILED(hres))
3661 return hres;
3662 length = SysStringLen(string);
3663 }else {
3664 string = NULL;
3665 length = 0;
3668 if(regexp->jsregexp->flags & JSREG_GLOB) {
3669 if(regexp->last_index < 0) {
3670 SysFreeString(string);
3671 set_last_index(regexp, 0);
3672 *ret = VARIANT_FALSE;
3673 if(input) {
3674 *input = NULL;
3676 return S_OK;
3679 last_index = regexp->last_index;
3682 cp = string + last_index;
3683 hres = regexp_match_next(ctx, &regexp->dispex, REM_RESET_INDEX, string, length, &cp, parens,
3684 parens ? &parens_size : NULL, parens_cnt, match);
3685 if(FAILED(hres)) {
3686 SysFreeString(string);
3687 return hres;
3690 *ret = hres == S_OK ? VARIANT_TRUE : VARIANT_FALSE;
3691 if(input) {
3692 *input = string;
3693 }else {
3694 SysFreeString(string);
3696 return S_OK;
3699 static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3700 VARIANT *retv, jsexcept_t *ei)
3702 match_result_t *parens = NULL, match;
3703 DWORD parens_cnt = 0;
3704 VARIANT_BOOL b;
3705 BSTR string;
3706 HRESULT hres;
3708 TRACE("\n");
3710 hres = run_exec(ctx, jsthis, argc ? argv : NULL, ei, &string, &match, &parens, &parens_cnt, &b);
3711 if(FAILED(hres))
3712 return hres;
3714 if(retv) {
3715 if(b) {
3716 IDispatch *ret;
3718 hres = create_match_array(ctx, string, &match, parens, parens_cnt, ei, &ret);
3719 if(SUCCEEDED(hres)) {
3720 V_VT(retv) = VT_DISPATCH;
3721 V_DISPATCH(retv) = ret;
3723 }else {
3724 V_VT(retv) = VT_NULL;
3728 heap_free(parens);
3729 SysFreeString(string);
3730 return hres;
3733 static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3734 VARIANT *retv, jsexcept_t *ei)
3736 match_result_t match;
3737 VARIANT undef_var;
3738 VARIANT_BOOL b;
3739 HRESULT hres;
3741 TRACE("\n");
3743 if(!argc) {
3744 V_VT(&undef_var) = VT_BSTR;
3745 V_BSTR(&undef_var) = SysAllocString(undefinedW);
3746 if(!V_BSTR(&undef_var))
3747 return E_OUTOFMEMORY;
3750 hres = run_exec(ctx, jsthis, argc ? argv : &undef_var, ei, NULL, &match, NULL, NULL, &b);
3751 if(!argc)
3752 SysFreeString(V_BSTR(&undef_var));
3753 if(FAILED(hres))
3754 return hres;
3756 if(retv) {
3757 V_VT(retv) = VT_BOOL;
3758 V_BOOL(retv) = b;
3760 return S_OK;
3763 static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
3764 VARIANT *retv, jsexcept_t *ei)
3766 TRACE("\n");
3768 switch(flags) {
3769 case INVOKE_FUNC:
3770 return throw_type_error(ctx, ei, JS_E_FUNCTION_EXPECTED, NULL);
3771 default:
3772 FIXME("unimplemented flags %x\n", flags);
3773 return E_NOTIMPL;
3776 return S_OK;
3779 static void RegExp_destructor(jsdisp_t *dispex)
3781 RegExpInstance *This = (RegExpInstance*)dispex;
3783 if(This->jsregexp)
3784 js_DestroyRegExp(This->jsregexp);
3785 VariantClear(&This->last_index_var);
3786 SysFreeString(This->str);
3787 heap_free(This);
3790 static const builtin_prop_t RegExp_props[] = {
3791 {execW, RegExp_exec, PROPF_METHOD|1},
3792 {globalW, RegExp_global, 0},
3793 {ignoreCaseW, RegExp_ignoreCase, 0},
3794 {lastIndexW, RegExp_lastIndex, 0},
3795 {multilineW, RegExp_multiline, 0},
3796 {sourceW, RegExp_source, 0},
3797 {testW, RegExp_test, PROPF_METHOD|1},
3798 {toStringW, RegExp_toString, PROPF_METHOD}
3801 static const builtin_info_t RegExp_info = {
3802 JSCLASS_REGEXP,
3803 {NULL, RegExp_value, 0},
3804 sizeof(RegExp_props)/sizeof(*RegExp_props),
3805 RegExp_props,
3806 RegExp_destructor,
3807 NULL
3810 static HRESULT alloc_regexp(script_ctx_t *ctx, jsdisp_t *object_prototype, RegExpInstance **ret)
3812 RegExpInstance *regexp;
3813 HRESULT hres;
3815 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3816 if(!regexp)
3817 return E_OUTOFMEMORY;
3819 if(object_prototype)
3820 hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
3821 else
3822 hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExp_info, ctx->regexp_constr);
3824 if(FAILED(hres)) {
3825 heap_free(regexp);
3826 return hres;
3829 *ret = regexp;
3830 return S_OK;
3833 HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, jsdisp_t **ret)
3835 RegExpInstance *regexp;
3836 HRESULT hres;
3838 TRACE("%s %x\n", debugstr_wn(exp, len), flags);
3840 hres = alloc_regexp(ctx, NULL, &regexp);
3841 if(FAILED(hres))
3842 return hres;
3844 if(len == -1)
3845 regexp->str = SysAllocString(exp);
3846 else
3847 regexp->str = SysAllocStringLen(exp, len);
3848 if(!regexp->str) {
3849 jsdisp_release(&regexp->dispex);
3850 return E_OUTOFMEMORY;
3853 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3854 if(!regexp->jsregexp) {
3855 WARN("js_NewRegExp failed\n");
3856 jsdisp_release(&regexp->dispex);
3857 return E_FAIL;
3860 num_set_int(&regexp->last_index_var, 0);
3862 *ret = &regexp->dispex;
3863 return S_OK;
3866 HRESULT create_regexp_var(script_ctx_t *ctx, VARIANT *src_arg, VARIANT *flags_arg, jsdisp_t **ret)
3868 const WCHAR *opt = emptyW, *src;
3869 DWORD flags;
3870 HRESULT hres;
3872 if(V_VT(src_arg) == VT_DISPATCH) {
3873 jsdisp_t *obj;
3875 obj = iface_to_jsdisp((IUnknown*)V_DISPATCH(src_arg));
3876 if(obj) {
3877 if(is_class(obj, JSCLASS_REGEXP)) {
3878 RegExpInstance *regexp = (RegExpInstance*)obj;
3880 hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, ret);
3881 jsdisp_release(obj);
3882 return hres;
3885 jsdisp_release(obj);
3889 if(V_VT(src_arg) != VT_BSTR) {
3890 FIXME("flags_arg = %s\n", debugstr_variant(flags_arg));
3891 return E_NOTIMPL;
3894 src = V_BSTR(src_arg);
3896 if(flags_arg) {
3897 if(V_VT(flags_arg) != VT_BSTR) {
3898 FIXME("unimplemented for vt %d\n", V_VT(flags_arg));
3899 return E_NOTIMPL;
3902 opt = V_BSTR(flags_arg);
3905 hres = parse_regexp_flags(opt, strlenW(opt), &flags);
3906 if(FAILED(hres))
3907 return hres;
3909 return create_regexp(ctx, src, -1, flags, ret);
3912 HRESULT regexp_string_match(script_ctx_t *ctx, jsdisp_t *re, BSTR str,
3913 VARIANT *retv, jsexcept_t *ei)
3915 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3916 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3917 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3919 RegExpInstance *regexp = (RegExpInstance*)re;
3920 match_result_t *match_result;
3921 DWORD match_cnt, i, length;
3922 jsdisp_t *array;
3923 VARIANT var;
3924 HRESULT hres;
3926 length = SysStringLen(str);
3928 if(!(regexp->jsregexp->flags & JSREG_GLOB)) {
3929 match_result_t match, *parens = NULL;
3930 DWORD parens_cnt, parens_size = 0;
3931 const WCHAR *cp = str;
3933 hres = regexp_match_next(ctx, &regexp->dispex, 0, str, length, &cp, &parens, &parens_size, &parens_cnt, &match);
3934 if(FAILED(hres))
3935 return hres;
3937 if(retv) {
3938 if(hres == S_OK) {
3939 IDispatch *ret;
3941 hres = create_match_array(ctx, str, &match, parens, parens_cnt, ei, &ret);
3942 if(SUCCEEDED(hres)) {
3943 V_VT(retv) = VT_DISPATCH;
3944 V_DISPATCH(retv) = ret;
3946 }else {
3947 V_VT(retv) = VT_NULL;
3951 heap_free(parens);
3952 return S_OK;
3955 hres = regexp_match(ctx, &regexp->dispex, str, length, FALSE, &match_result, &match_cnt);
3956 if(FAILED(hres))
3957 return hres;
3959 if(!match_cnt) {
3960 TRACE("no match\n");
3962 if(retv)
3963 V_VT(retv) = VT_NULL;
3964 return S_OK;
3967 hres = create_array(ctx, match_cnt, &array);
3968 if(FAILED(hres))
3969 return hres;
3971 V_VT(&var) = VT_BSTR;
3973 for(i=0; i < match_cnt; i++) {
3974 V_BSTR(&var) = SysAllocStringLen(match_result[i].str, match_result[i].len);
3975 if(!V_BSTR(&var)) {
3976 hres = E_OUTOFMEMORY;
3977 break;
3980 hres = jsdisp_propput_idx(array, i, &var, ei);
3981 SysFreeString(V_BSTR(&var));
3982 if(FAILED(hres))
3983 break;
3986 while(SUCCEEDED(hres)) {
3987 num_set_int(&var, match_result[match_cnt-1].str-str);
3988 hres = jsdisp_propput_name(array, indexW, &var, ei);
3989 if(FAILED(hres))
3990 break;
3992 num_set_int(&var, match_result[match_cnt-1].str-str+match_result[match_cnt-1].len);
3993 hres = jsdisp_propput_name(array, lastIndexW, &var, ei);
3994 if(FAILED(hres))
3995 break;
3997 V_VT(&var) = VT_BSTR;
3998 V_BSTR(&var) = str;
3999 hres = jsdisp_propput_name(array, inputW, &var, ei);
4000 break;
4003 heap_free(match_result);
4005 if(SUCCEEDED(hres) && retv)
4006 var_set_jsdisp(retv, array);
4007 else
4008 jsdisp_release(array);
4009 return hres;
4012 static HRESULT RegExpConstr_leftContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4013 unsigned argc, VARIANT *argv, VARIANT *retv, jsexcept_t *ei)
4015 TRACE("\n");
4017 switch(flags) {
4018 case DISPATCH_PROPERTYGET: {
4019 BSTR ret;
4021 ret = SysAllocStringLen(ctx->last_match, ctx->last_match_index);
4022 if(!ret)
4023 return E_OUTOFMEMORY;
4025 V_VT(retv) = VT_BSTR;
4026 V_BSTR(retv) = ret;
4027 break;
4029 case DISPATCH_PROPERTYPUT:
4030 break;
4031 default:
4032 FIXME("unsupported flags\n");
4033 return E_NOTIMPL;
4036 return S_OK;
4039 static HRESULT RegExpConstr_rightContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4040 unsigned argc, VARIANT *argv, VARIANT *retv, jsexcept_t *ei)
4042 TRACE("\n");
4044 switch(flags) {
4045 case DISPATCH_PROPERTYGET: {
4046 BSTR ret;
4048 ret = SysAllocString(ctx->last_match+ctx->last_match_index+ctx->last_match_length);
4049 if(!ret)
4050 return E_OUTOFMEMORY;
4052 V_VT(retv) = VT_BSTR;
4053 V_BSTR(retv) = ret;
4054 break;
4056 case DISPATCH_PROPERTYPUT:
4057 break;
4058 default:
4059 FIXME("unsupported flags\n");
4060 return E_NOTIMPL;
4063 return S_OK;
4066 static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, VARIANT *argv,
4067 VARIANT *retv, jsexcept_t *ei)
4069 TRACE("\n");
4071 switch(flags) {
4072 case DISPATCH_METHOD:
4073 if(argc) {
4074 if(V_VT(argv) == VT_DISPATCH) {
4075 jsdisp_t *jsdisp = iface_to_jsdisp((IUnknown*)V_DISPATCH(argv));
4076 if(jsdisp) {
4077 if(is_class(jsdisp, JSCLASS_REGEXP)) {
4078 if(argc > 1 && V_VT(argv+1) != VT_EMPTY) {
4079 jsdisp_release(jsdisp);
4080 return throw_regexp_error(ctx, ei, JS_E_REGEXP_SYNTAX, NULL);
4083 if(retv)
4084 var_set_jsdisp(retv, jsdisp);
4085 else
4086 jsdisp_release(jsdisp);
4087 return S_OK;
4089 jsdisp_release(jsdisp);
4093 /* fall through */
4094 case DISPATCH_CONSTRUCT: {
4095 jsdisp_t *ret;
4096 HRESULT hres;
4098 if(!argc) {
4099 FIXME("no args\n");
4100 return E_NOTIMPL;
4103 hres = create_regexp_var(ctx, argv, argc > 1 ? argv+1 : NULL, &ret);
4104 if(FAILED(hres))
4105 return hres;
4107 if(retv)
4108 var_set_jsdisp(retv, ret);
4109 else
4110 jsdisp_release(ret);
4111 return S_OK;
4113 default:
4114 FIXME("unimplemented flags: %x\n", flags);
4115 return E_NOTIMPL;
4118 return S_OK;
4121 static const builtin_prop_t RegExpConstr_props[] = {
4122 {leftContextW, RegExpConstr_leftContext, 0},
4123 {rightContextW, RegExpConstr_rightContext, 0}
4126 static const builtin_info_t RegExpConstr_info = {
4127 JSCLASS_FUNCTION,
4128 {NULL, Function_value, 0},
4129 sizeof(RegExpConstr_props)/sizeof(*RegExpConstr_props),
4130 RegExpConstr_props,
4131 NULL,
4132 NULL
4135 HRESULT create_regexp_constr(script_ctx_t *ctx, jsdisp_t *object_prototype, jsdisp_t **ret)
4137 RegExpInstance *regexp;
4138 HRESULT hres;
4140 static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
4142 hres = alloc_regexp(ctx, object_prototype, &regexp);
4143 if(FAILED(hres))
4144 return hres;
4146 hres = create_builtin_constructor(ctx, RegExpConstr_value, RegExpW, &RegExpConstr_info,
4147 PROPF_CONSTR|2, &regexp->dispex, ret);
4149 jsdisp_release(&regexp->dispex);
4150 return hres;
4153 HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
4155 const WCHAR *p;
4156 DWORD flags = 0;
4158 for (p = str; p < str+str_len; p++) {
4159 switch (*p) {
4160 case 'g':
4161 flags |= JSREG_GLOB;
4162 break;
4163 case 'i':
4164 flags |= JSREG_FOLD;
4165 break;
4166 case 'm':
4167 flags |= JSREG_MULTILINE;
4168 break;
4169 case 'y':
4170 flags |= JSREG_STICKY;
4171 break;
4172 default:
4173 WARN("wrong flag %c\n", *p);
4174 return E_FAIL;
4178 *ret = flags;
4179 return S_OK;