push 5b1efc32b5a8acb1d5b5e60584746392dd0c436e
[wine/hacks.git] / dlls / jscript / regexp.c
blobff9807b557647df359416049b413e01c9c80a7f1
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
35 #include <math.h>
37 #include "jscript.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 BSTR source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
81 typedef struct {
82 DispatchEx dispex;
84 JSRegExp *jsregexp;
85 BSTR str;
86 INT last_index;
87 VARIANT last_index_var;
88 } RegExpInstance;
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
99 static const WCHAR emptyW[] = {0};
101 /* FIXME: Better error handling */
102 #define ReportRegExpError(a,b,c)
103 #define ReportRegExpErrorHelper(a,b,c,d)
104 #define JS_ReportErrorNumber(a,b,c,d)
105 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
106 #define js_ReportOutOfScriptQuota(a)
107 #define JS_ReportOutOfMemory(a)
108 #define JS_COUNT_OPERATION(a,b)
110 #define JSMSG_MIN_TOO_BIG 47
111 #define JSMSG_MAX_TOO_BIG 48
112 #define JSMSG_OUT_OF_ORDER 49
113 #define JSMSG_OUT_OF_MEMORY 137
115 #define LINE_SEPARATOR 0x2028
116 #define PARA_SEPARATOR 0x2029
118 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
119 ((c >= 'a') && (c <= 'z')) )
120 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
121 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
123 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
125 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
126 #define JS7_UNDEC(c) ((c) - '0')
128 typedef enum REOp {
129 REOP_EMPTY,
130 REOP_BOL,
131 REOP_EOL,
132 REOP_WBDRY,
133 REOP_WNONBDRY,
134 REOP_DOT,
135 REOP_DIGIT,
136 REOP_NONDIGIT,
137 REOP_ALNUM,
138 REOP_NONALNUM,
139 REOP_SPACE,
140 REOP_NONSPACE,
141 REOP_BACKREF,
142 REOP_FLAT,
143 REOP_FLAT1,
144 REOP_FLATi,
145 REOP_FLAT1i,
146 REOP_UCFLAT1,
147 REOP_UCFLAT1i,
148 REOP_UCFLAT,
149 REOP_UCFLATi,
150 REOP_CLASS,
151 REOP_NCLASS,
152 REOP_ALT,
153 REOP_QUANT,
154 REOP_STAR,
155 REOP_PLUS,
156 REOP_OPT,
157 REOP_LPAREN,
158 REOP_RPAREN,
159 REOP_JUMP,
160 REOP_DOTSTAR,
161 REOP_LPARENNON,
162 REOP_ASSERT,
163 REOP_ASSERT_NOT,
164 REOP_ASSERTTEST,
165 REOP_ASSERTNOTTEST,
166 REOP_MINIMALSTAR,
167 REOP_MINIMALPLUS,
168 REOP_MINIMALOPT,
169 REOP_MINIMALQUANT,
170 REOP_ENDCHILD,
171 REOP_REPEAT,
172 REOP_MINIMALREPEAT,
173 REOP_ALTPREREQ,
174 REOP_ALTPREREQ2,
175 REOP_ENDALT,
176 REOP_CONCAT,
177 REOP_END,
178 REOP_LIMIT /* META: no operator >= to this */
179 } REOp;
181 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
183 static const char *reop_names[] = {
184 "empty",
185 "bol",
186 "eol",
187 "wbdry",
188 "wnonbdry",
189 "dot",
190 "digit",
191 "nondigit",
192 "alnum",
193 "nonalnum",
194 "space",
195 "nonspace",
196 "backref",
197 "flat",
198 "flat1",
199 "flati",
200 "flat1i",
201 "ucflat1",
202 "ucflat1i",
203 "ucflat",
204 "ucflati",
205 "class",
206 "nclass",
207 "alt",
208 "quant",
209 "star",
210 "plus",
211 "opt",
212 "lparen",
213 "rparen",
214 "jump",
215 "dotstar",
216 "lparennon",
217 "assert",
218 "assert_not",
219 "asserttest",
220 "assertnottest",
221 "minimalstar",
222 "minimalplus",
223 "minimalopt",
224 "minimalquant",
225 "endchild",
226 "repeat",
227 "minimalrepeat",
228 "altprereq",
229 "alrprereq2",
230 "endalt",
231 "concat",
232 "end",
233 NULL
236 typedef struct RECapture {
237 ptrdiff_t index; /* start of contents, -1 for empty */
238 size_t length; /* length of capture */
239 } RECapture;
241 typedef struct REMatchState {
242 const WCHAR *cp;
243 RECapture parens[1]; /* first of 're->parenCount' captures,
244 allocated at end of this struct */
245 } REMatchState;
247 typedef struct REProgState {
248 jsbytecode *continue_pc; /* current continuation data */
249 jsbytecode continue_op;
250 ptrdiff_t index; /* progress in text */
251 size_t parenSoFar; /* highest indexed paren started */
252 union {
253 struct {
254 UINT min; /* current quantifier limits */
255 UINT max;
256 } quantifier;
257 struct {
258 size_t top; /* backtrack stack state */
259 size_t sz;
260 } assertion;
261 } u;
262 } REProgState;
264 typedef struct REBackTrackData {
265 size_t sz; /* size of previous stack entry */
266 jsbytecode *backtrack_pc; /* where to backtrack to */
267 jsbytecode backtrack_op;
268 const WCHAR *cp; /* index in text of match at backtrack */
269 size_t parenIndex; /* start index of saved paren contents */
270 size_t parenCount; /* # of saved paren contents */
271 size_t saveStateStackTop; /* number of parent states */
272 /* saved parent states follow */
273 /* saved paren contents follow */
274 } REBackTrackData;
276 #define INITIAL_STATESTACK 100
277 #define INITIAL_BACKTRACK 8000
279 typedef struct REGlobalData {
280 script_ctx_t *cx;
281 JSRegExp *regexp; /* the RE in execution */
282 BOOL ok; /* runtime error (out_of_memory only?) */
283 size_t start; /* offset to start at */
284 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
285 const WCHAR *cpbegin; /* text base address */
286 const WCHAR *cpend; /* text limit address */
288 REProgState *stateStack; /* stack of state of current parents */
289 size_t stateStackTop;
290 size_t stateStackLimit;
292 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
293 REBackTrackData *backTrackSP;
294 size_t backTrackStackSize;
295 size_t cursz; /* size of current stack entry */
296 size_t backTrackCount; /* how many times we've backtracked */
297 size_t backTrackLimit; /* upper limit on backtrack states */
299 jsheap_t *pool; /* It's faster to use one malloc'd pool
300 than to malloc/free the three items
301 that are allocated from this pool */
302 } REGlobalData;
304 typedef struct RENode RENode;
305 struct RENode {
306 REOp op; /* r.e. op bytecode */
307 RENode *next; /* next in concatenation order */
308 void *kid; /* first operand */
309 union {
310 void *kid2; /* second operand */
311 INT num; /* could be a number */
312 size_t parenIndex; /* or a parenthesis index */
313 struct { /* or a quantifier range */
314 UINT min;
315 UINT max;
316 JSPackedBool greedy;
317 } range;
318 struct { /* or a character class */
319 size_t startIndex;
320 size_t kidlen; /* length of string at kid, in jschars */
321 size_t index; /* index into class list */
322 WORD bmsize; /* bitmap size, based on max char code */
323 JSPackedBool sense;
324 } ucclass;
325 struct { /* or a literal sequence */
326 WCHAR chr; /* of one character */
327 size_t length; /* or many (via the kid) */
328 } flat;
329 struct {
330 RENode *kid2; /* second operand from ALT */
331 WCHAR ch1; /* match char for ALTPREREQ */
332 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
333 } altprereq;
334 } u;
337 #define CLASS_CACHE_SIZE 4
339 typedef struct CompilerState {
340 script_ctx_t *context;
341 const WCHAR *cpbegin;
342 const WCHAR *cpend;
343 const WCHAR *cp;
344 size_t parenCount;
345 size_t classCount; /* number of [] encountered */
346 size_t treeDepth; /* maximum depth of parse tree */
347 size_t progLength; /* estimated bytecode length */
348 RENode *result;
349 size_t classBitmapsMem; /* memory to hold all class bitmaps */
350 struct {
351 const WCHAR *start; /* small cache of class strings */
352 size_t length; /* since they're often the same */
353 size_t index;
354 } classCache[CLASS_CACHE_SIZE];
355 WORD flags;
356 } CompilerState;
358 typedef struct EmitStateStackEntry {
359 jsbytecode *altHead; /* start of REOP_ALT* opcode */
360 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
361 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
362 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
363 RENode *continueNode; /* original REOP_ALT* node being stacked */
364 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
365 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
366 avoid 16-bit unsigned offset overflow */
367 } EmitStateStackEntry;
370 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
371 * the getters and setters take the pc of the offset, not of the opcode before
372 * the offset.
374 #define ARG_LEN 2
375 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
376 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
377 (pc)[1] = (jsbytecode) (arg))
379 #define OFFSET_LEN ARG_LEN
380 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
381 #define GET_OFFSET(pc) GET_ARG(pc)
383 static BOOL ParseRegExp(CompilerState*);
386 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
387 * For sanity, we limit it to 2^24 bytes.
389 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
392 * The maximum memory that can be allocated for class bitmaps.
393 * For sanity, we limit it to 2^24 bytes.
395 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
398 * Functions to get size and write/read bytecode that represent small indexes
399 * compactly.
400 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
401 * indicates that the following byte brings more bits to the index. Otherwise
402 * this is the last byte in the index bytecode representing highest index bits.
404 static size_t
405 GetCompactIndexWidth(size_t index)
407 size_t width;
409 for (width = 1; (index >>= 7) != 0; ++width) { }
410 return width;
413 static inline jsbytecode *
414 WriteCompactIndex(jsbytecode *pc, size_t index)
416 size_t next;
418 while ((next = index >> 7) != 0) {
419 *pc++ = (jsbytecode)(index | 0x80);
420 index = next;
422 *pc++ = (jsbytecode)index;
423 return pc;
426 static inline jsbytecode *
427 ReadCompactIndex(jsbytecode *pc, size_t *result)
429 size_t nextByte;
431 nextByte = *pc++;
432 if ((nextByte & 0x80) == 0) {
434 * Short-circuit the most common case when compact index <= 127.
436 *result = nextByte;
437 } else {
438 size_t shift = 7;
439 *result = 0x7F & nextByte;
440 do {
441 nextByte = *pc++;
442 *result |= (nextByte & 0x7F) << shift;
443 shift += 7;
444 } while ((nextByte & 0x80) != 0);
446 return pc;
449 /* Construct and initialize an RENode, returning NULL for out-of-memory */
450 static RENode *
451 NewRENode(CompilerState *state, REOp op)
453 RENode *ren;
455 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
456 if (!ren) {
457 /* js_ReportOutOfScriptQuota(cx); */
458 return NULL;
460 ren->op = op;
461 ren->next = NULL;
462 ren->kid = NULL;
463 return ren;
467 * Validates and converts hex ascii value.
469 static BOOL
470 isASCIIHexDigit(WCHAR c, UINT *digit)
472 UINT cv = c;
474 if (cv < '0')
475 return FALSE;
476 if (cv <= '9') {
477 *digit = cv - '0';
478 return TRUE;
480 cv |= 0x20;
481 if (cv >= 'a' && cv <= 'f') {
482 *digit = cv - 'a' + 10;
483 return TRUE;
485 return FALSE;
488 typedef struct {
489 REOp op;
490 const WCHAR *errPos;
491 size_t parenIndex;
492 } REOpData;
494 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
495 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
497 static BOOL
498 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
500 ptrdiff_t offset = target - jump;
502 /* Check that target really points forward. */
503 assert(offset >= 2);
504 if ((size_t)offset > OFFSET_MAX)
505 return FALSE;
507 jump[0] = JUMP_OFFSET_HI(offset);
508 jump[1] = JUMP_OFFSET_LO(offset);
509 return TRUE;
513 * Generate bytecode for the tree rooted at t using an explicit stack instead
514 * of recursion.
516 static jsbytecode *
517 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
518 jsbytecode *pc, RENode *t)
520 EmitStateStackEntry *emitStateSP, *emitStateStack;
521 RECharSet *charSet;
522 REOp op;
524 if (treeDepth == 0) {
525 emitStateStack = NULL;
526 } else {
527 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
528 if (!emitStateStack)
529 return NULL;
531 emitStateSP = emitStateStack;
532 op = t->op;
533 assert(op < REOP_LIMIT);
535 for (;;) {
536 *pc++ = op;
537 switch (op) {
538 case REOP_EMPTY:
539 --pc;
540 break;
542 case REOP_ALTPREREQ2:
543 case REOP_ALTPREREQ:
544 assert(emitStateSP);
545 emitStateSP->altHead = pc - 1;
546 emitStateSP->endTermFixup = pc;
547 pc += OFFSET_LEN;
548 SET_ARG(pc, t->u.altprereq.ch1);
549 pc += ARG_LEN;
550 SET_ARG(pc, t->u.altprereq.ch2);
551 pc += ARG_LEN;
553 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
554 pc += OFFSET_LEN;
556 emitStateSP->continueNode = t;
557 emitStateSP->continueOp = REOP_JUMP;
558 emitStateSP->jumpToJumpFlag = FALSE;
559 ++emitStateSP;
560 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
561 t = t->kid;
562 op = t->op;
563 assert(op < REOP_LIMIT);
564 continue;
566 case REOP_JUMP:
567 emitStateSP->nextTermFixup = pc; /* offset to following term */
568 pc += OFFSET_LEN;
569 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
570 goto jump_too_big;
571 emitStateSP->continueOp = REOP_ENDALT;
572 ++emitStateSP;
573 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
574 t = t->u.kid2;
575 op = t->op;
576 assert(op < REOP_LIMIT);
577 continue;
579 case REOP_ENDALT:
581 * If we already patched emitStateSP->nextTermFixup to jump to
582 * a nearer jump, to avoid 16-bit immediate offset overflow, we
583 * are done here.
585 if (emitStateSP->jumpToJumpFlag)
586 break;
589 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
590 * REOP_ENDALT is executed only on successful match of the last
591 * alternate in a group.
593 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
594 goto jump_too_big;
595 if (t->op != REOP_ALT) {
596 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
597 goto jump_too_big;
601 * If the program is bigger than the REOP_JUMP offset range, then
602 * we must check for alternates before this one that are part of
603 * the same group, and fix up their jump offsets to target jumps
604 * close enough to fit in a 16-bit unsigned offset immediate.
606 if ((size_t)(pc - re->program) > OFFSET_MAX &&
607 emitStateSP > emitStateStack) {
608 EmitStateStackEntry *esp, *esp2;
609 jsbytecode *alt, *jump;
610 ptrdiff_t span, header;
612 esp2 = emitStateSP;
613 alt = esp2->altHead;
614 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
615 if (esp->continueOp == REOP_ENDALT &&
616 !esp->jumpToJumpFlag &&
617 esp->nextTermFixup + OFFSET_LEN == alt &&
618 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
619 ? esp->endTermFixup
620 : esp->nextTermFixup)) > OFFSET_MAX) {
621 alt = esp->altHead;
622 jump = esp->nextTermFixup;
625 * The span must be 1 less than the distance from
626 * jump offset to jump offset, so we actually jump
627 * to a REOP_JUMP bytecode, not to its offset!
629 for (;;) {
630 assert(jump < esp2->nextTermFixup);
631 span = esp2->nextTermFixup - jump - 1;
632 if ((size_t)span <= OFFSET_MAX)
633 break;
634 do {
635 if (--esp2 == esp)
636 goto jump_too_big;
637 } while (esp2->continueOp != REOP_ENDALT);
640 jump[0] = JUMP_OFFSET_HI(span);
641 jump[1] = JUMP_OFFSET_LO(span);
643 if (esp->continueNode->op != REOP_ALT) {
645 * We must patch the offset at esp->endTermFixup
646 * as well, for the REOP_ALTPREREQ{,2} opcodes.
647 * If we're unlucky and endTermFixup is more than
648 * OFFSET_MAX bytes from its target, we cheat by
649 * jumping 6 bytes to the jump whose offset is at
650 * esp->nextTermFixup, which has the same target.
652 jump = esp->endTermFixup;
653 header = esp->nextTermFixup - jump;
654 span += header;
655 if ((size_t)span > OFFSET_MAX)
656 span = header;
658 jump[0] = JUMP_OFFSET_HI(span);
659 jump[1] = JUMP_OFFSET_LO(span);
662 esp->jumpToJumpFlag = TRUE;
666 break;
668 case REOP_ALT:
669 assert(emitStateSP);
670 emitStateSP->altHead = pc - 1;
671 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
672 pc += OFFSET_LEN;
673 emitStateSP->continueNode = t;
674 emitStateSP->continueOp = REOP_JUMP;
675 emitStateSP->jumpToJumpFlag = FALSE;
676 ++emitStateSP;
677 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
678 t = t->kid;
679 op = t->op;
680 assert(op < REOP_LIMIT);
681 continue;
683 case REOP_FLAT:
685 * Coalesce FLATs if possible and if it would not increase bytecode
686 * beyond preallocated limit. The latter happens only when bytecode
687 * size for coalesced string with offset p and length 2 exceeds 6
688 * bytes preallocated for 2 single char nodes, i.e. when
689 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
690 * GetCompactIndexWidth(p) > 4.
691 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
692 * nodes strictly decreases bytecode size, the check has to be
693 * done only for the first coalescing.
695 if (t->kid &&
696 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
698 while (t->next &&
699 t->next->op == REOP_FLAT &&
700 (WCHAR*)t->kid + t->u.flat.length ==
701 t->next->kid) {
702 t->u.flat.length += t->next->u.flat.length;
703 t->next = t->next->next;
706 if (t->kid && t->u.flat.length > 1) {
707 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
708 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
709 pc = WriteCompactIndex(pc, t->u.flat.length);
710 } else if (t->u.flat.chr < 256) {
711 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
712 *pc++ = (jsbytecode) t->u.flat.chr;
713 } else {
714 pc[-1] = (state->flags & JSREG_FOLD)
715 ? REOP_UCFLAT1i
716 : REOP_UCFLAT1;
717 SET_ARG(pc, t->u.flat.chr);
718 pc += ARG_LEN;
720 break;
722 case REOP_LPAREN:
723 assert(emitStateSP);
724 pc = WriteCompactIndex(pc, t->u.parenIndex);
725 emitStateSP->continueNode = t;
726 emitStateSP->continueOp = REOP_RPAREN;
727 ++emitStateSP;
728 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
729 t = t->kid;
730 op = t->op;
731 continue;
733 case REOP_RPAREN:
734 pc = WriteCompactIndex(pc, t->u.parenIndex);
735 break;
737 case REOP_BACKREF:
738 pc = WriteCompactIndex(pc, t->u.parenIndex);
739 break;
741 case REOP_ASSERT:
742 assert(emitStateSP);
743 emitStateSP->nextTermFixup = pc;
744 pc += OFFSET_LEN;
745 emitStateSP->continueNode = t;
746 emitStateSP->continueOp = REOP_ASSERTTEST;
747 ++emitStateSP;
748 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
749 t = t->kid;
750 op = t->op;
751 continue;
753 case REOP_ASSERTTEST:
754 case REOP_ASSERTNOTTEST:
755 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
756 goto jump_too_big;
757 break;
759 case REOP_ASSERT_NOT:
760 assert(emitStateSP);
761 emitStateSP->nextTermFixup = pc;
762 pc += OFFSET_LEN;
763 emitStateSP->continueNode = t;
764 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
765 ++emitStateSP;
766 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
767 t = t->kid;
768 op = t->op;
769 continue;
771 case REOP_QUANT:
772 assert(emitStateSP);
773 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
774 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
775 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
776 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
777 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
778 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
779 } else {
780 if (!t->u.range.greedy)
781 pc[-1] = REOP_MINIMALQUANT;
782 pc = WriteCompactIndex(pc, t->u.range.min);
784 * Write max + 1 to avoid using size_t(max) + 1 bytes
785 * for (UINT)-1 sentinel.
787 pc = WriteCompactIndex(pc, t->u.range.max + 1);
789 emitStateSP->nextTermFixup = pc;
790 pc += OFFSET_LEN;
791 emitStateSP->continueNode = t;
792 emitStateSP->continueOp = REOP_ENDCHILD;
793 ++emitStateSP;
794 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
795 t = t->kid;
796 op = t->op;
797 continue;
799 case REOP_ENDCHILD:
800 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
801 goto jump_too_big;
802 break;
804 case REOP_CLASS:
805 if (!t->u.ucclass.sense)
806 pc[-1] = REOP_NCLASS;
807 pc = WriteCompactIndex(pc, t->u.ucclass.index);
808 charSet = &re->classList[t->u.ucclass.index];
809 charSet->converted = FALSE;
810 charSet->length = t->u.ucclass.bmsize;
811 charSet->u.src.startIndex = t->u.ucclass.startIndex;
812 charSet->u.src.length = t->u.ucclass.kidlen;
813 charSet->sense = t->u.ucclass.sense;
814 break;
816 default:
817 break;
820 t = t->next;
821 if (t) {
822 op = t->op;
823 } else {
824 if (emitStateSP == emitStateStack)
825 break;
826 --emitStateSP;
827 t = emitStateSP->continueNode;
828 op = (REOp) emitStateSP->continueOp;
832 cleanup:
833 heap_free(emitStateStack);
834 return pc;
836 jump_too_big:
837 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
838 pc = NULL;
839 goto cleanup;
843 * Process the op against the two top operands, reducing them to a single
844 * operand in the penultimate slot. Update progLength and treeDepth.
846 static BOOL
847 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
848 INT operandSP)
850 RENode *result;
852 switch (opData->op) {
853 case REOP_ALT:
854 result = NewRENode(state, REOP_ALT);
855 if (!result)
856 return FALSE;
857 result->kid = operandStack[operandSP - 2];
858 result->u.kid2 = operandStack[operandSP - 1];
859 operandStack[operandSP - 2] = result;
861 if (state->treeDepth == TREE_DEPTH_MAX) {
862 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
863 return FALSE;
865 ++state->treeDepth;
868 * Look at both alternates to see if there's a FLAT or a CLASS at
869 * the start of each. If so, use a prerequisite match.
871 if (((RENode *) result->kid)->op == REOP_FLAT &&
872 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
873 (state->flags & JSREG_FOLD) == 0) {
874 result->op = REOP_ALTPREREQ;
875 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
876 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
877 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
878 JUMP, <end> ... ENDALT */
879 state->progLength += 13;
881 else
882 if (((RENode *) result->kid)->op == REOP_CLASS &&
883 ((RENode *) result->kid)->u.ucclass.index < 256 &&
884 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
885 (state->flags & JSREG_FOLD) == 0) {
886 result->op = REOP_ALTPREREQ2;
887 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
888 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
889 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
890 JUMP, <end> ... ENDALT */
891 state->progLength += 13;
893 else
894 if (((RENode *) result->kid)->op == REOP_FLAT &&
895 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
896 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
897 (state->flags & JSREG_FOLD) == 0) {
898 result->op = REOP_ALTPREREQ2;
899 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
900 result->u.altprereq.ch2 =
901 ((RENode *) result->u.kid2)->u.ucclass.index;
902 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
903 JUMP, <end> ... ENDALT */
904 state->progLength += 13;
906 else {
907 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
908 state->progLength += 7;
910 break;
912 case REOP_CONCAT:
913 result = operandStack[operandSP - 2];
914 while (result->next)
915 result = result->next;
916 result->next = operandStack[operandSP - 1];
917 break;
919 case REOP_ASSERT:
920 case REOP_ASSERT_NOT:
921 case REOP_LPARENNON:
922 case REOP_LPAREN:
923 /* These should have been processed by a close paren. */
924 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
925 opData->errPos);
926 return FALSE;
928 default:;
930 return TRUE;
934 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
935 * its being on the stack, and to propagate errors to its callers.
937 #define JSREG_FIND_PAREN_COUNT 0x8000
938 #define JSREG_FIND_PAREN_ERROR 0x4000
941 * Magic return value from FindParenCount and GetDecimalValue, to indicate
942 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
943 * its findMax parameter is non-null.
945 #define OVERFLOW_VALUE ((UINT)-1)
947 static UINT
948 FindParenCount(CompilerState *state)
950 CompilerState temp;
951 int i;
953 if (state->flags & JSREG_FIND_PAREN_COUNT)
954 return OVERFLOW_VALUE;
957 * Copy state into temp, flag it so we never report an invalid backref,
958 * and reset its members to parse the entire regexp. This is obviously
959 * suboptimal, but GetDecimalValue calls us only if a backref appears to
960 * refer to a forward parenthetical, which is rare.
962 temp = *state;
963 temp.flags |= JSREG_FIND_PAREN_COUNT;
964 temp.cp = temp.cpbegin;
965 temp.parenCount = 0;
966 temp.classCount = 0;
967 temp.progLength = 0;
968 temp.treeDepth = 0;
969 temp.classBitmapsMem = 0;
970 for (i = 0; i < CLASS_CACHE_SIZE; i++)
971 temp.classCache[i].start = NULL;
973 if (!ParseRegExp(&temp)) {
974 state->flags |= JSREG_FIND_PAREN_ERROR;
975 return OVERFLOW_VALUE;
977 return temp.parenCount;
981 * Extract and return a decimal value at state->cp. The initial character c
982 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
983 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
984 * state->flags to discover whether an error occurred under findMax.
986 static UINT
987 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
988 CompilerState *state)
990 UINT value = JS7_UNDEC(c);
991 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
993 /* The following restriction allows simpler overflow checks. */
994 assert(max <= ((UINT)-1 - 9) / 10);
995 while (state->cp < state->cpend) {
996 c = *state->cp;
997 if (!JS7_ISDEC(c))
998 break;
999 value = 10 * value + JS7_UNDEC(c);
1000 if (!overflow && value > max && (!findMax || value > findMax(state)))
1001 overflow = TRUE;
1002 ++state->cp;
1004 return overflow ? OVERFLOW_VALUE : value;
1008 * Calculate the total size of the bitmap required for a class expression.
1010 static BOOL
1011 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1012 const WCHAR *end)
1014 UINT max = 0;
1015 BOOL inRange = FALSE;
1016 WCHAR c, rangeStart = 0;
1017 UINT n, digit, nDigits, i;
1019 target->u.ucclass.bmsize = 0;
1020 target->u.ucclass.sense = TRUE;
1022 if (src == end)
1023 return TRUE;
1025 if (*src == '^') {
1026 ++src;
1027 target->u.ucclass.sense = FALSE;
1030 while (src != end) {
1031 BOOL canStartRange = TRUE;
1032 UINT localMax = 0;
1034 switch (*src) {
1035 case '\\':
1036 ++src;
1037 c = *src++;
1038 switch (c) {
1039 case 'b':
1040 localMax = 0x8;
1041 break;
1042 case 'f':
1043 localMax = 0xC;
1044 break;
1045 case 'n':
1046 localMax = 0xA;
1047 break;
1048 case 'r':
1049 localMax = 0xD;
1050 break;
1051 case 't':
1052 localMax = 0x9;
1053 break;
1054 case 'v':
1055 localMax = 0xB;
1056 break;
1057 case 'c':
1058 if (src < end && RE_IS_LETTER(*src)) {
1059 localMax = (UINT) (*src++) & 0x1F;
1060 } else {
1061 --src;
1062 localMax = '\\';
1064 break;
1065 case 'x':
1066 nDigits = 2;
1067 goto lexHex;
1068 case 'u':
1069 nDigits = 4;
1070 lexHex:
1071 n = 0;
1072 for (i = 0; (i < nDigits) && (src < end); i++) {
1073 c = *src++;
1074 if (!isASCIIHexDigit(c, &digit)) {
1076 * Back off to accepting the original
1077 *'\' as a literal.
1079 src -= i + 1;
1080 n = '\\';
1081 break;
1083 n = (n << 4) | digit;
1085 localMax = n;
1086 break;
1087 case 'd':
1088 canStartRange = FALSE;
1089 if (inRange) {
1090 JS_ReportErrorNumber(state->context,
1091 js_GetErrorMessage, NULL,
1092 JSMSG_BAD_CLASS_RANGE);
1093 return FALSE;
1095 localMax = '9';
1096 break;
1097 case 'D':
1098 case 's':
1099 case 'S':
1100 case 'w':
1101 case 'W':
1102 canStartRange = FALSE;
1103 if (inRange) {
1104 JS_ReportErrorNumber(state->context,
1105 js_GetErrorMessage, NULL,
1106 JSMSG_BAD_CLASS_RANGE);
1107 return FALSE;
1109 max = 65535;
1112 * If this is the start of a range, ensure that it's less than
1113 * the end.
1115 localMax = 0;
1116 break;
1117 case '0':
1118 case '1':
1119 case '2':
1120 case '3':
1121 case '4':
1122 case '5':
1123 case '6':
1124 case '7':
1126 * This is a non-ECMA extension - decimal escapes (in this
1127 * case, octal!) are supposed to be an error inside class
1128 * ranges, but supported here for backwards compatibility.
1131 n = JS7_UNDEC(c);
1132 c = *src;
1133 if ('0' <= c && c <= '7') {
1134 src++;
1135 n = 8 * n + JS7_UNDEC(c);
1136 c = *src;
1137 if ('0' <= c && c <= '7') {
1138 src++;
1139 i = 8 * n + JS7_UNDEC(c);
1140 if (i <= 0377)
1141 n = i;
1142 else
1143 src--;
1146 localMax = n;
1147 break;
1149 default:
1150 localMax = c;
1151 break;
1153 break;
1154 default:
1155 localMax = *src++;
1156 break;
1159 if (inRange) {
1160 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1161 if (rangeStart > localMax) {
1162 JS_ReportErrorNumber(state->context,
1163 js_GetErrorMessage, NULL,
1164 JSMSG_BAD_CLASS_RANGE);
1165 return FALSE;
1167 inRange = FALSE;
1168 } else {
1169 if (canStartRange && src < end - 1) {
1170 if (*src == '-') {
1171 ++src;
1172 inRange = TRUE;
1173 rangeStart = (WCHAR)localMax;
1174 continue;
1177 if (state->flags & JSREG_FOLD)
1178 rangeStart = localMax; /* one run of the uc/dc loop below */
1181 if (state->flags & JSREG_FOLD) {
1182 WCHAR maxch = localMax;
1184 for (i = rangeStart; i <= localMax; i++) {
1185 WCHAR uch, dch;
1187 uch = toupperW(i);
1188 dch = tolowerW(i);
1189 if(maxch < uch)
1190 maxch = uch;
1191 if(maxch < dch)
1192 maxch = dch;
1194 localMax = maxch;
1197 if (localMax > max)
1198 max = localMax;
1200 target->u.ucclass.bmsize = max;
1201 return TRUE;
1204 static INT
1205 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1207 UINT min, max;
1208 WCHAR c;
1209 const WCHAR *errp = state->cp++;
1211 c = *state->cp;
1212 if (JS7_ISDEC(c)) {
1213 ++state->cp;
1214 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1215 c = *state->cp;
1217 if (!ignoreValues && min == OVERFLOW_VALUE)
1218 return JSMSG_MIN_TOO_BIG;
1220 if (c == ',') {
1221 c = *++state->cp;
1222 if (JS7_ISDEC(c)) {
1223 ++state->cp;
1224 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1225 c = *state->cp;
1226 if (!ignoreValues && max == OVERFLOW_VALUE)
1227 return JSMSG_MAX_TOO_BIG;
1228 if (!ignoreValues && min > max)
1229 return JSMSG_OUT_OF_ORDER;
1230 } else {
1231 max = (UINT)-1;
1233 } else {
1234 max = min;
1236 if (c == '}') {
1237 state->result = NewRENode(state, REOP_QUANT);
1238 if (!state->result)
1239 return JSMSG_OUT_OF_MEMORY;
1240 state->result->u.range.min = min;
1241 state->result->u.range.max = max;
1243 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1244 * where <max> is written as compact(max+1) to make
1245 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1247 state->progLength += (1 + GetCompactIndexWidth(min)
1248 + GetCompactIndexWidth(max + 1)
1249 +3);
1250 return 0;
1254 state->cp = errp;
1255 return -1;
1258 static BOOL
1259 ParseQuantifier(CompilerState *state)
1261 RENode *term;
1262 term = state->result;
1263 if (state->cp < state->cpend) {
1264 switch (*state->cp) {
1265 case '+':
1266 state->result = NewRENode(state, REOP_QUANT);
1267 if (!state->result)
1268 return FALSE;
1269 state->result->u.range.min = 1;
1270 state->result->u.range.max = (UINT)-1;
1271 /* <PLUS>, <next> ... <ENDCHILD> */
1272 state->progLength += 4;
1273 goto quantifier;
1274 case '*':
1275 state->result = NewRENode(state, REOP_QUANT);
1276 if (!state->result)
1277 return FALSE;
1278 state->result->u.range.min = 0;
1279 state->result->u.range.max = (UINT)-1;
1280 /* <STAR>, <next> ... <ENDCHILD> */
1281 state->progLength += 4;
1282 goto quantifier;
1283 case '?':
1284 state->result = NewRENode(state, REOP_QUANT);
1285 if (!state->result)
1286 return FALSE;
1287 state->result->u.range.min = 0;
1288 state->result->u.range.max = 1;
1289 /* <OPT>, <next> ... <ENDCHILD> */
1290 state->progLength += 4;
1291 goto quantifier;
1292 case '{': /* balance '}' */
1294 INT err;
1296 err = ParseMinMaxQuantifier(state, FALSE);
1297 if (err == 0)
1298 goto quantifier;
1299 if (err == -1)
1300 return TRUE;
1302 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1303 return FALSE;
1305 default:;
1308 return TRUE;
1310 quantifier:
1311 if (state->treeDepth == TREE_DEPTH_MAX) {
1312 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1313 return FALSE;
1316 ++state->treeDepth;
1317 ++state->cp;
1318 state->result->kid = term;
1319 if (state->cp < state->cpend && *state->cp == '?') {
1320 ++state->cp;
1321 state->result->u.range.greedy = FALSE;
1322 } else {
1323 state->result->u.range.greedy = TRUE;
1325 return TRUE;
1329 * item: assertion An item is either an assertion or
1330 * quantatom a quantified atom.
1332 * assertion: '^' Assertions match beginning of string
1333 * (or line if the class static property
1334 * RegExp.multiline is true).
1335 * '$' End of string (or line if the class
1336 * static property RegExp.multiline is
1337 * true).
1338 * '\b' Word boundary (between \w and \W).
1339 * '\B' Word non-boundary.
1341 * quantatom: atom An unquantified atom.
1342 * quantatom '{' n ',' m '}'
1343 * Atom must occur between n and m times.
1344 * quantatom '{' n ',' '}' Atom must occur at least n times.
1345 * quantatom '{' n '}' Atom must occur exactly n times.
1346 * quantatom '*' Zero or more times (same as {0,}).
1347 * quantatom '+' One or more times (same as {1,}).
1348 * quantatom '?' Zero or one time (same as {0,1}).
1350 * any of which can be optionally followed by '?' for ungreedy
1352 * atom: '(' regexp ')' A parenthesized regexp (what matched
1353 * can be addressed using a backreference,
1354 * see '\' n below).
1355 * '.' Matches any char except '\n'.
1356 * '[' classlist ']' A character class.
1357 * '[' '^' classlist ']' A negated character class.
1358 * '\f' Form Feed.
1359 * '\n' Newline (Line Feed).
1360 * '\r' Carriage Return.
1361 * '\t' Horizontal Tab.
1362 * '\v' Vertical Tab.
1363 * '\d' A digit (same as [0-9]).
1364 * '\D' A non-digit.
1365 * '\w' A word character, [0-9a-z_A-Z].
1366 * '\W' A non-word character.
1367 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1368 * '\S' A non-whitespace character.
1369 * '\' n A backreference to the nth (n decimal
1370 * and positive) parenthesized expression.
1371 * '\' octal An octal escape sequence (octal must be
1372 * two or three digits long, unless it is
1373 * 0 for the null character).
1374 * '\x' hex A hex escape (hex must be two digits).
1375 * '\u' unicode A unicode escape (must be four digits).
1376 * '\c' ctrl A control character, ctrl is a letter.
1377 * '\' literalatomchar Any character except one of the above
1378 * that follow '\' in an atom.
1379 * otheratomchar Any character not first among the other
1380 * atom right-hand sides.
1382 static BOOL
1383 ParseTerm(CompilerState *state)
1385 WCHAR c = *state->cp++;
1386 UINT nDigits;
1387 UINT num, tmp, n, i;
1388 const WCHAR *termStart;
1390 switch (c) {
1391 /* assertions and atoms */
1392 case '^':
1393 state->result = NewRENode(state, REOP_BOL);
1394 if (!state->result)
1395 return FALSE;
1396 state->progLength++;
1397 return TRUE;
1398 case '$':
1399 state->result = NewRENode(state, REOP_EOL);
1400 if (!state->result)
1401 return FALSE;
1402 state->progLength++;
1403 return TRUE;
1404 case '\\':
1405 if (state->cp >= state->cpend) {
1406 /* a trailing '\' is an error */
1407 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1408 return FALSE;
1410 c = *state->cp++;
1411 switch (c) {
1412 /* assertion escapes */
1413 case 'b' :
1414 state->result = NewRENode(state, REOP_WBDRY);
1415 if (!state->result)
1416 return FALSE;
1417 state->progLength++;
1418 return TRUE;
1419 case 'B':
1420 state->result = NewRENode(state, REOP_WNONBDRY);
1421 if (!state->result)
1422 return FALSE;
1423 state->progLength++;
1424 return TRUE;
1425 /* Decimal escape */
1426 case '0':
1427 /* Give a strict warning. See also the note below. */
1428 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1429 doOctal:
1430 num = 0;
1431 while (state->cp < state->cpend) {
1432 c = *state->cp;
1433 if (c < '0' || '7' < c)
1434 break;
1435 state->cp++;
1436 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1437 if (tmp > 0377)
1438 break;
1439 num = tmp;
1441 c = (WCHAR)num;
1442 doFlat:
1443 state->result = NewRENode(state, REOP_FLAT);
1444 if (!state->result)
1445 return FALSE;
1446 state->result->u.flat.chr = c;
1447 state->result->u.flat.length = 1;
1448 state->progLength += 3;
1449 break;
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9':
1459 termStart = state->cp - 1;
1460 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1461 if (state->flags & JSREG_FIND_PAREN_ERROR)
1462 return FALSE;
1463 if (num == OVERFLOW_VALUE) {
1464 /* Give a strict mode warning. */
1465 WARN("back-reference exceeds number of capturing parentheses\n");
1468 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1469 * error here. However, for compatibility with IE, we treat the
1470 * whole backref as flat if the first character in it is not a
1471 * valid octal character, and as an octal escape otherwise.
1473 state->cp = termStart;
1474 if (c >= '8') {
1475 /* Treat this as flat. termStart - 1 is the \. */
1476 c = '\\';
1477 goto asFlat;
1480 /* Treat this as an octal escape. */
1481 goto doOctal;
1483 assert(1 <= num && num <= 0x10000);
1484 state->result = NewRENode(state, REOP_BACKREF);
1485 if (!state->result)
1486 return FALSE;
1487 state->result->u.parenIndex = num - 1;
1488 state->progLength
1489 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1490 break;
1491 /* Control escape */
1492 case 'f':
1493 c = 0xC;
1494 goto doFlat;
1495 case 'n':
1496 c = 0xA;
1497 goto doFlat;
1498 case 'r':
1499 c = 0xD;
1500 goto doFlat;
1501 case 't':
1502 c = 0x9;
1503 goto doFlat;
1504 case 'v':
1505 c = 0xB;
1506 goto doFlat;
1507 /* Control letter */
1508 case 'c':
1509 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1510 c = (WCHAR) (*state->cp++ & 0x1F);
1511 } else {
1512 /* back off to accepting the original '\' as a literal */
1513 --state->cp;
1514 c = '\\';
1516 goto doFlat;
1517 /* HexEscapeSequence */
1518 case 'x':
1519 nDigits = 2;
1520 goto lexHex;
1521 /* UnicodeEscapeSequence */
1522 case 'u':
1523 nDigits = 4;
1524 lexHex:
1525 n = 0;
1526 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1527 UINT digit;
1528 c = *state->cp++;
1529 if (!isASCIIHexDigit(c, &digit)) {
1531 * Back off to accepting the original 'u' or 'x' as a
1532 * literal.
1534 state->cp -= i + 2;
1535 n = *state->cp++;
1536 break;
1538 n = (n << 4) | digit;
1540 c = (WCHAR) n;
1541 goto doFlat;
1542 /* Character class escapes */
1543 case 'd':
1544 state->result = NewRENode(state, REOP_DIGIT);
1545 doSimple:
1546 if (!state->result)
1547 return FALSE;
1548 state->progLength++;
1549 break;
1550 case 'D':
1551 state->result = NewRENode(state, REOP_NONDIGIT);
1552 goto doSimple;
1553 case 's':
1554 state->result = NewRENode(state, REOP_SPACE);
1555 goto doSimple;
1556 case 'S':
1557 state->result = NewRENode(state, REOP_NONSPACE);
1558 goto doSimple;
1559 case 'w':
1560 state->result = NewRENode(state, REOP_ALNUM);
1561 goto doSimple;
1562 case 'W':
1563 state->result = NewRENode(state, REOP_NONALNUM);
1564 goto doSimple;
1565 /* IdentityEscape */
1566 default:
1567 state->result = NewRENode(state, REOP_FLAT);
1568 if (!state->result)
1569 return FALSE;
1570 state->result->u.flat.chr = c;
1571 state->result->u.flat.length = 1;
1572 state->result->kid = (void *) (state->cp - 1);
1573 state->progLength += 3;
1574 break;
1576 break;
1577 case '[':
1578 state->result = NewRENode(state, REOP_CLASS);
1579 if (!state->result)
1580 return FALSE;
1581 termStart = state->cp;
1582 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1583 for (;;) {
1584 if (state->cp == state->cpend) {
1585 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1586 JSMSG_UNTERM_CLASS, termStart);
1588 return FALSE;
1590 if (*state->cp == '\\') {
1591 state->cp++;
1592 if (state->cp != state->cpend)
1593 state->cp++;
1594 continue;
1596 if (*state->cp == ']') {
1597 state->result->u.ucclass.kidlen = state->cp - termStart;
1598 break;
1600 state->cp++;
1602 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1603 if (!state->classCache[i].start) {
1604 state->classCache[i].start = termStart;
1605 state->classCache[i].length = state->result->u.ucclass.kidlen;
1606 state->classCache[i].index = state->classCount;
1607 break;
1609 if (state->classCache[i].length ==
1610 state->result->u.ucclass.kidlen) {
1611 for (n = 0; ; n++) {
1612 if (n == state->classCache[i].length) {
1613 state->result->u.ucclass.index
1614 = state->classCache[i].index;
1615 goto claim;
1617 if (state->classCache[i].start[n] != termStart[n])
1618 break;
1622 state->result->u.ucclass.index = state->classCount++;
1624 claim:
1626 * Call CalculateBitmapSize now as we want any errors it finds
1627 * to be reported during the parse phase, not at execution.
1629 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1630 return FALSE;
1632 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1633 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1634 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1636 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1637 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1638 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1639 return FALSE;
1641 state->classBitmapsMem += n;
1642 /* CLASS, <index> */
1643 state->progLength
1644 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1645 break;
1647 case '.':
1648 state->result = NewRENode(state, REOP_DOT);
1649 goto doSimple;
1651 case '{':
1653 const WCHAR *errp = state->cp--;
1654 INT err;
1656 err = ParseMinMaxQuantifier(state, TRUE);
1657 state->cp = errp;
1659 if (err < 0)
1660 goto asFlat;
1662 /* FALL THROUGH */
1664 case '*':
1665 case '+':
1666 case '?':
1667 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1668 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1669 return FALSE;
1670 default:
1671 asFlat:
1672 state->result = NewRENode(state, REOP_FLAT);
1673 if (!state->result)
1674 return FALSE;
1675 state->result->u.flat.chr = c;
1676 state->result->u.flat.length = 1;
1677 state->result->kid = (void *) (state->cp - 1);
1678 state->progLength += 3;
1679 break;
1681 return ParseQuantifier(state);
1685 * Top-down regular expression grammar, based closely on Perl4.
1687 * regexp: altern A regular expression is one or more
1688 * altern '|' regexp alternatives separated by vertical bar.
1690 #define INITIAL_STACK_SIZE 128
1692 static BOOL
1693 ParseRegExp(CompilerState *state)
1695 size_t parenIndex;
1696 RENode *operand;
1697 REOpData *operatorStack;
1698 RENode **operandStack;
1699 REOp op;
1700 INT i;
1701 BOOL result = FALSE;
1703 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1704 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1706 /* Watch out for empty regexp */
1707 if (state->cp == state->cpend) {
1708 state->result = NewRENode(state, REOP_EMPTY);
1709 return (state->result != NULL);
1712 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1713 if (!operatorStack)
1714 return FALSE;
1716 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1717 if (!operandStack)
1718 goto out;
1720 for (;;) {
1721 parenIndex = state->parenCount;
1722 if (state->cp == state->cpend) {
1724 * If we are at the end of the regexp and we're short one or more
1725 * operands, the regexp must have the form /x|/ or some such, with
1726 * left parentheses making us short more than one operand.
1728 if (operatorSP >= operandSP) {
1729 operand = NewRENode(state, REOP_EMPTY);
1730 if (!operand)
1731 goto out;
1732 goto pushOperand;
1734 } else {
1735 switch (*state->cp) {
1736 case '(':
1737 ++state->cp;
1738 if (state->cp + 1 < state->cpend &&
1739 *state->cp == '?' &&
1740 (state->cp[1] == '=' ||
1741 state->cp[1] == '!' ||
1742 state->cp[1] == ':')) {
1743 switch (state->cp[1]) {
1744 case '=':
1745 op = REOP_ASSERT;
1746 /* ASSERT, <next>, ... ASSERTTEST */
1747 state->progLength += 4;
1748 break;
1749 case '!':
1750 op = REOP_ASSERT_NOT;
1751 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1752 state->progLength += 4;
1753 break;
1754 default:
1755 op = REOP_LPARENNON;
1756 break;
1758 state->cp += 2;
1759 } else {
1760 op = REOP_LPAREN;
1761 /* LPAREN, <index>, ... RPAREN, <index> */
1762 state->progLength
1763 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1764 state->parenCount++;
1765 if (state->parenCount == 65535) {
1766 ReportRegExpError(state, JSREPORT_ERROR,
1767 JSMSG_TOO_MANY_PARENS);
1768 goto out;
1771 goto pushOperator;
1773 case ')':
1775 * If there's no stacked open parenthesis, throw syntax error.
1777 for (i = operatorSP - 1; ; i--) {
1778 if (i < 0) {
1779 ReportRegExpError(state, JSREPORT_ERROR,
1780 JSMSG_UNMATCHED_RIGHT_PAREN);
1781 goto out;
1783 if (operatorStack[i].op == REOP_ASSERT ||
1784 operatorStack[i].op == REOP_ASSERT_NOT ||
1785 operatorStack[i].op == REOP_LPARENNON ||
1786 operatorStack[i].op == REOP_LPAREN) {
1787 break;
1790 /* FALL THROUGH */
1792 case '|':
1793 /* Expected an operand before these, so make an empty one */
1794 operand = NewRENode(state, REOP_EMPTY);
1795 if (!operand)
1796 goto out;
1797 goto pushOperand;
1799 default:
1800 if (!ParseTerm(state))
1801 goto out;
1802 operand = state->result;
1803 pushOperand:
1804 if (operandSP == operandStackSize) {
1805 RENode **tmp;
1806 operandStackSize += operandStackSize;
1807 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1808 if (!tmp)
1809 goto out;
1810 operandStack = tmp;
1812 operandStack[operandSP++] = operand;
1813 break;
1817 /* At the end; process remaining operators. */
1818 restartOperator:
1819 if (state->cp == state->cpend) {
1820 while (operatorSP) {
1821 --operatorSP;
1822 if (!ProcessOp(state, &operatorStack[operatorSP],
1823 operandStack, operandSP))
1824 goto out;
1825 --operandSP;
1827 assert(operandSP == 1);
1828 state->result = operandStack[0];
1829 result = TRUE;
1830 goto out;
1833 switch (*state->cp) {
1834 case '|':
1835 /* Process any stacked 'concat' operators */
1836 ++state->cp;
1837 while (operatorSP &&
1838 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1839 --operatorSP;
1840 if (!ProcessOp(state, &operatorStack[operatorSP],
1841 operandStack, operandSP)) {
1842 goto out;
1844 --operandSP;
1846 op = REOP_ALT;
1847 goto pushOperator;
1849 case ')':
1851 * If there's no stacked open parenthesis, throw syntax error.
1853 for (i = operatorSP - 1; ; i--) {
1854 if (i < 0) {
1855 ReportRegExpError(state, JSREPORT_ERROR,
1856 JSMSG_UNMATCHED_RIGHT_PAREN);
1857 goto out;
1859 if (operatorStack[i].op == REOP_ASSERT ||
1860 operatorStack[i].op == REOP_ASSERT_NOT ||
1861 operatorStack[i].op == REOP_LPARENNON ||
1862 operatorStack[i].op == REOP_LPAREN) {
1863 break;
1866 ++state->cp;
1868 /* Process everything on the stack until the open parenthesis. */
1869 for (;;) {
1870 assert(operatorSP);
1871 --operatorSP;
1872 switch (operatorStack[operatorSP].op) {
1873 case REOP_ASSERT:
1874 case REOP_ASSERT_NOT:
1875 case REOP_LPAREN:
1876 operand = NewRENode(state, operatorStack[operatorSP].op);
1877 if (!operand)
1878 goto out;
1879 operand->u.parenIndex =
1880 operatorStack[operatorSP].parenIndex;
1881 assert(operandSP);
1882 operand->kid = operandStack[operandSP - 1];
1883 operandStack[operandSP - 1] = operand;
1884 if (state->treeDepth == TREE_DEPTH_MAX) {
1885 ReportRegExpError(state, JSREPORT_ERROR,
1886 JSMSG_REGEXP_TOO_COMPLEX);
1887 goto out;
1889 ++state->treeDepth;
1890 /* FALL THROUGH */
1892 case REOP_LPARENNON:
1893 state->result = operandStack[operandSP - 1];
1894 if (!ParseQuantifier(state))
1895 goto out;
1896 operandStack[operandSP - 1] = state->result;
1897 goto restartOperator;
1898 default:
1899 if (!ProcessOp(state, &operatorStack[operatorSP],
1900 operandStack, operandSP))
1901 goto out;
1902 --operandSP;
1903 break;
1906 break;
1908 case '{':
1910 const WCHAR *errp = state->cp;
1912 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1914 * This didn't even scan correctly as a quantifier, so we should
1915 * treat it as flat.
1917 op = REOP_CONCAT;
1918 goto pushOperator;
1921 state->cp = errp;
1922 /* FALL THROUGH */
1925 case '+':
1926 case '*':
1927 case '?':
1928 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1929 state->cp);
1930 result = FALSE;
1931 goto out;
1933 default:
1934 /* Anything else is the start of the next term. */
1935 op = REOP_CONCAT;
1936 pushOperator:
1937 if (operatorSP == operatorStackSize) {
1938 REOpData *tmp;
1939 operatorStackSize += operatorStackSize;
1940 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1941 if (!tmp)
1942 goto out;
1943 operatorStack = tmp;
1945 operatorStack[operatorSP].op = op;
1946 operatorStack[operatorSP].errPos = state->cp;
1947 operatorStack[operatorSP++].parenIndex = parenIndex;
1948 break;
1951 out:
1952 heap_free(operatorStack);
1953 heap_free(operandStack);
1954 return result;
1958 * Save the current state of the match - the position in the input
1959 * text as well as the position in the bytecode. The state of any
1960 * parent expressions is also saved (preceding state).
1961 * Contents of parenCount parentheses from parenIndex are also saved.
1963 static REBackTrackData *
1964 PushBackTrackState(REGlobalData *gData, REOp op,
1965 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1966 size_t parenIndex, size_t parenCount)
1968 size_t i;
1969 REBackTrackData *result =
1970 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1972 size_t sz = sizeof(REBackTrackData) +
1973 gData->stateStackTop * sizeof(REProgState) +
1974 parenCount * sizeof(RECapture);
1976 ptrdiff_t btsize = gData->backTrackStackSize;
1977 ptrdiff_t btincr = ((char *)result + sz) -
1978 ((char *)gData->backTrackStack + btsize);
1980 TRACE("\tBT_Push: %lu,%lu\n", (unsigned long) parenIndex, (unsigned long) parenCount);
1982 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1983 if (btincr > 0) {
1984 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1986 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1987 btincr = ((btincr+btsize-1)/btsize)*btsize;
1988 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1989 if (!gData->backTrackStack) {
1990 js_ReportOutOfScriptQuota(gData->cx);
1991 gData->ok = FALSE;
1992 return NULL;
1994 gData->backTrackStackSize = btsize + btincr;
1995 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1997 gData->backTrackSP = result;
1998 result->sz = gData->cursz;
1999 gData->cursz = sz;
2001 result->backtrack_op = op;
2002 result->backtrack_pc = target;
2003 result->cp = cp;
2004 result->parenCount = parenCount;
2005 result->parenIndex = parenIndex;
2007 result->saveStateStackTop = gData->stateStackTop;
2008 assert(gData->stateStackTop);
2009 memcpy(result + 1, gData->stateStack,
2010 sizeof(REProgState) * result->saveStateStackTop);
2012 if (parenCount != 0) {
2013 memcpy((char *)(result + 1) +
2014 sizeof(REProgState) * result->saveStateStackTop,
2015 &x->parens[parenIndex],
2016 sizeof(RECapture) * parenCount);
2017 for (i = 0; i != parenCount; i++)
2018 x->parens[parenIndex + i].index = -1;
2021 return result;
2024 static inline REMatchState *
2025 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2026 size_t length)
2028 size_t i;
2029 assert(gData->cpend >= x->cp);
2030 if (length > (size_t)(gData->cpend - x->cp))
2031 return NULL;
2032 for (i = 0; i != length; i++) {
2033 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2034 return NULL;
2036 x->cp += length;
2037 return x;
2041 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2042 * 2. If E is not a character then go to step 6.
2043 * 3. Let ch be E's character.
2044 * 4. Let A be a one-element RECharSet containing the character ch.
2045 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2046 * 6. E must be an integer. Let n be that integer.
2047 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2048 * 8. Return an internal Matcher closure that takes two arguments, a State x
2049 * and a Continuation c, and performs the following:
2050 * 1. Let cap be x's captures internal array.
2051 * 2. Let s be cap[n].
2052 * 3. If s is undefined, then call c(x) and return its result.
2053 * 4. Let e be x's endIndex.
2054 * 5. Let len be s's length.
2055 * 6. Let f be e+len.
2056 * 7. If f>InputLength, return failure.
2057 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2058 * such that Canonicalize(s[i]) is not the same character as
2059 * Canonicalize(Input [e+i]), then return failure.
2060 * 9. Let y be the State (f, cap).
2061 * 10. Call c(y) and return its result.
2063 static REMatchState *
2064 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2066 size_t len, i;
2067 const WCHAR *parenContent;
2068 RECapture *cap = &x->parens[parenIndex];
2070 if (cap->index == -1)
2071 return x;
2073 len = cap->length;
2074 if (x->cp + len > gData->cpend)
2075 return NULL;
2077 parenContent = &gData->cpbegin[cap->index];
2078 if (gData->regexp->flags & JSREG_FOLD) {
2079 for (i = 0; i < len; i++) {
2080 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2081 return NULL;
2083 } else {
2084 for (i = 0; i < len; i++) {
2085 if (parenContent[i] != x->cp[i])
2086 return NULL;
2089 x->cp += len;
2090 return x;
2093 /* Add a single character to the RECharSet */
2094 static void
2095 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2097 UINT byteIndex = (UINT)(c >> 3);
2098 assert(c <= cs->length);
2099 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2103 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2104 static void
2105 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2107 UINT i;
2109 UINT byteIndex1 = c1 >> 3;
2110 UINT byteIndex2 = c2 >> 3;
2112 assert(c2 <= cs->length && c1 <= c2);
2114 c1 &= 0x7;
2115 c2 &= 0x7;
2117 if (byteIndex1 == byteIndex2) {
2118 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2119 } else {
2120 cs->u.bits[byteIndex1] |= 0xFF << c1;
2121 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2122 cs->u.bits[i] = 0xFF;
2123 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2127 /* Compile the source of the class into a RECharSet */
2128 static BOOL
2129 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2131 const WCHAR *src, *end;
2132 BOOL inRange = FALSE;
2133 WCHAR rangeStart = 0;
2134 UINT byteLength, n;
2135 WCHAR c, thisCh;
2136 INT nDigits, i;
2138 assert(!charSet->converted);
2140 * Assert that startIndex and length points to chars inside [] inside
2141 * source string.
2143 assert(1 <= charSet->u.src.startIndex);
2144 assert(charSet->u.src.startIndex
2145 < SysStringLen(gData->regexp->source));
2146 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2147 - 1 - charSet->u.src.startIndex);
2149 charSet->converted = TRUE;
2150 src = gData->regexp->source + charSet->u.src.startIndex;
2152 end = src + charSet->u.src.length;
2154 assert(src[-1] == '[' && end[0] == ']');
2156 byteLength = (charSet->length >> 3) + 1;
2157 charSet->u.bits = heap_alloc(byteLength);
2158 if (!charSet->u.bits) {
2159 JS_ReportOutOfMemory(gData->cx);
2160 gData->ok = FALSE;
2161 return FALSE;
2163 memset(charSet->u.bits, 0, byteLength);
2165 if (src == end)
2166 return TRUE;
2168 if (*src == '^') {
2169 assert(charSet->sense == FALSE);
2170 ++src;
2171 } else {
2172 assert(charSet->sense == TRUE);
2175 while (src != end) {
2176 switch (*src) {
2177 case '\\':
2178 ++src;
2179 c = *src++;
2180 switch (c) {
2181 case 'b':
2182 thisCh = 0x8;
2183 break;
2184 case 'f':
2185 thisCh = 0xC;
2186 break;
2187 case 'n':
2188 thisCh = 0xA;
2189 break;
2190 case 'r':
2191 thisCh = 0xD;
2192 break;
2193 case 't':
2194 thisCh = 0x9;
2195 break;
2196 case 'v':
2197 thisCh = 0xB;
2198 break;
2199 case 'c':
2200 if (src < end && JS_ISWORD(*src)) {
2201 thisCh = (WCHAR)(*src++ & 0x1F);
2202 } else {
2203 --src;
2204 thisCh = '\\';
2206 break;
2207 case 'x':
2208 nDigits = 2;
2209 goto lexHex;
2210 case 'u':
2211 nDigits = 4;
2212 lexHex:
2213 n = 0;
2214 for (i = 0; (i < nDigits) && (src < end); i++) {
2215 UINT digit;
2216 c = *src++;
2217 if (!isASCIIHexDigit(c, &digit)) {
2219 * Back off to accepting the original '\'
2220 * as a literal
2222 src -= i + 1;
2223 n = '\\';
2224 break;
2226 n = (n << 4) | digit;
2228 thisCh = (WCHAR)n;
2229 break;
2230 case '0':
2231 case '1':
2232 case '2':
2233 case '3':
2234 case '4':
2235 case '5':
2236 case '6':
2237 case '7':
2239 * This is a non-ECMA extension - decimal escapes (in this
2240 * case, octal!) are supposed to be an error inside class
2241 * ranges, but supported here for backwards compatibility.
2243 n = JS7_UNDEC(c);
2244 c = *src;
2245 if ('0' <= c && c <= '7') {
2246 src++;
2247 n = 8 * n + JS7_UNDEC(c);
2248 c = *src;
2249 if ('0' <= c && c <= '7') {
2250 src++;
2251 i = 8 * n + JS7_UNDEC(c);
2252 if (i <= 0377)
2253 n = i;
2254 else
2255 src--;
2258 thisCh = (WCHAR)n;
2259 break;
2261 case 'd':
2262 AddCharacterRangeToCharSet(charSet, '0', '9');
2263 continue; /* don't need range processing */
2264 case 'D':
2265 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2266 AddCharacterRangeToCharSet(charSet,
2267 (WCHAR)('9' + 1),
2268 (WCHAR)charSet->length);
2269 continue;
2270 case 's':
2271 for (i = (INT)charSet->length; i >= 0; i--)
2272 if (isspaceW(i))
2273 AddCharacterToCharSet(charSet, (WCHAR)i);
2274 continue;
2275 case 'S':
2276 for (i = (INT)charSet->length; i >= 0; i--)
2277 if (!isspaceW(i))
2278 AddCharacterToCharSet(charSet, (WCHAR)i);
2279 continue;
2280 case 'w':
2281 for (i = (INT)charSet->length; i >= 0; i--)
2282 if (JS_ISWORD(i))
2283 AddCharacterToCharSet(charSet, (WCHAR)i);
2284 continue;
2285 case 'W':
2286 for (i = (INT)charSet->length; i >= 0; i--)
2287 if (!JS_ISWORD(i))
2288 AddCharacterToCharSet(charSet, (WCHAR)i);
2289 continue;
2290 default:
2291 thisCh = c;
2292 break;
2295 break;
2297 default:
2298 thisCh = *src++;
2299 break;
2302 if (inRange) {
2303 if (gData->regexp->flags & JSREG_FOLD) {
2304 int i;
2306 assert(rangeStart <= thisCh);
2307 for (i = rangeStart; i <= thisCh; i++) {
2308 WCHAR uch, dch;
2310 AddCharacterToCharSet(charSet, i);
2311 uch = toupperW(i);
2312 dch = tolowerW(i);
2313 if (i != uch)
2314 AddCharacterToCharSet(charSet, uch);
2315 if (i != dch)
2316 AddCharacterToCharSet(charSet, dch);
2318 } else {
2319 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2321 inRange = FALSE;
2322 } else {
2323 if (gData->regexp->flags & JSREG_FOLD) {
2324 AddCharacterToCharSet(charSet, toupperW(thisCh));
2325 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2326 } else {
2327 AddCharacterToCharSet(charSet, thisCh);
2329 if (src < end - 1) {
2330 if (*src == '-') {
2331 ++src;
2332 inRange = TRUE;
2333 rangeStart = thisCh;
2338 return TRUE;
2341 static BOOL
2342 ReallocStateStack(REGlobalData *gData)
2344 size_t limit = gData->stateStackLimit;
2345 size_t sz = sizeof(REProgState) * limit;
2347 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2348 if (!gData->stateStack) {
2349 js_ReportOutOfScriptQuota(gData->cx);
2350 gData->ok = FALSE;
2351 return FALSE;
2353 gData->stateStackLimit = limit + limit;
2354 return TRUE;
2357 #define PUSH_STATE_STACK(data) \
2358 do { \
2359 ++(data)->stateStackTop; \
2360 if ((data)->stateStackTop == (data)->stateStackLimit && \
2361 !ReallocStateStack((data))) { \
2362 return NULL; \
2364 }while(0)
2367 * Apply the current op against the given input to see if it's going to match
2368 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2369 * true, then update the current state's cp. Always update startpc to the next
2370 * op.
2372 static inline REMatchState *
2373 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2374 jsbytecode **startpc, BOOL updatecp)
2376 REMatchState *result = NULL;
2377 WCHAR matchCh;
2378 size_t parenIndex;
2379 size_t offset, length, index;
2380 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2381 WCHAR *source;
2382 const WCHAR *startcp = x->cp;
2383 WCHAR ch;
2384 RECharSet *charSet;
2386 const char *opname = reop_names[op];
2387 TRACE("\n%06d: %*s%s\n", pc - gData->regexp->program,
2388 (int)gData->stateStackTop * 2, "", opname);
2390 switch (op) {
2391 case REOP_EMPTY:
2392 result = x;
2393 break;
2394 case REOP_BOL:
2395 if (x->cp != gData->cpbegin) {
2396 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2397 !(gData->regexp->flags & JSREG_MULTILINE)) {
2398 break;
2400 if (!RE_IS_LINE_TERM(x->cp[-1]))
2401 break;
2403 result = x;
2404 break;
2405 case REOP_EOL:
2406 if (x->cp != gData->cpend) {
2407 if (/*!gData->cx->regExpStatics.multiline &&*/
2408 !(gData->regexp->flags & JSREG_MULTILINE)) {
2409 break;
2411 if (!RE_IS_LINE_TERM(*x->cp))
2412 break;
2414 result = x;
2415 break;
2416 case REOP_WBDRY:
2417 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2418 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2419 result = x;
2421 break;
2422 case REOP_WNONBDRY:
2423 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2424 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2425 result = x;
2427 break;
2428 case REOP_DOT:
2429 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2430 result = x;
2431 result->cp++;
2433 break;
2434 case REOP_DIGIT:
2435 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2436 result = x;
2437 result->cp++;
2439 break;
2440 case REOP_NONDIGIT:
2441 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2442 result = x;
2443 result->cp++;
2445 break;
2446 case REOP_ALNUM:
2447 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2448 result = x;
2449 result->cp++;
2451 break;
2452 case REOP_NONALNUM:
2453 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2454 result = x;
2455 result->cp++;
2457 break;
2458 case REOP_SPACE:
2459 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2460 result = x;
2461 result->cp++;
2463 break;
2464 case REOP_NONSPACE:
2465 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2466 result = x;
2467 result->cp++;
2469 break;
2470 case REOP_BACKREF:
2471 pc = ReadCompactIndex(pc, &parenIndex);
2472 assert(parenIndex < gData->regexp->parenCount);
2473 result = BackrefMatcher(gData, x, parenIndex);
2474 break;
2475 case REOP_FLAT:
2476 pc = ReadCompactIndex(pc, &offset);
2477 assert(offset < SysStringLen(gData->regexp->source));
2478 pc = ReadCompactIndex(pc, &length);
2479 assert(1 <= length);
2480 assert(length <= SysStringLen(gData->regexp->source) - offset);
2481 if (length <= (size_t)(gData->cpend - x->cp)) {
2482 source = gData->regexp->source + offset;
2483 TRACE("%s\n", debugstr_wn(source, length));
2484 for (index = 0; index != length; index++) {
2485 if (source[index] != x->cp[index])
2486 return NULL;
2488 x->cp += length;
2489 result = x;
2491 break;
2492 case REOP_FLAT1:
2493 matchCh = *pc++;
2494 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2495 if (x->cp != gData->cpend && *x->cp == matchCh) {
2496 result = x;
2497 result->cp++;
2499 break;
2500 case REOP_FLATi:
2501 pc = ReadCompactIndex(pc, &offset);
2502 assert(offset < SysStringLen(gData->regexp->source));
2503 pc = ReadCompactIndex(pc, &length);
2504 assert(1 <= length);
2505 assert(length <= SysStringLen(gData->regexp->source) - offset);
2506 source = gData->regexp->source;
2507 result = FlatNIMatcher(gData, x, source + offset, length);
2508 break;
2509 case REOP_FLAT1i:
2510 matchCh = *pc++;
2511 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2512 result = x;
2513 result->cp++;
2515 break;
2516 case REOP_UCFLAT1:
2517 matchCh = GET_ARG(pc);
2518 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2519 pc += ARG_LEN;
2520 if (x->cp != gData->cpend && *x->cp == matchCh) {
2521 result = x;
2522 result->cp++;
2524 break;
2525 case REOP_UCFLAT1i:
2526 matchCh = GET_ARG(pc);
2527 pc += ARG_LEN;
2528 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2529 result = x;
2530 result->cp++;
2532 break;
2533 case REOP_CLASS:
2534 pc = ReadCompactIndex(pc, &index);
2535 assert(index < gData->regexp->classCount);
2536 if (x->cp != gData->cpend) {
2537 charSet = &gData->regexp->classList[index];
2538 assert(charSet->converted);
2539 ch = *x->cp;
2540 index = ch >> 3;
2541 if (charSet->length != 0 &&
2542 ch <= charSet->length &&
2543 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2544 result = x;
2545 result->cp++;
2548 break;
2549 case REOP_NCLASS:
2550 pc = ReadCompactIndex(pc, &index);
2551 assert(index < gData->regexp->classCount);
2552 if (x->cp != gData->cpend) {
2553 charSet = &gData->regexp->classList[index];
2554 assert(charSet->converted);
2555 ch = *x->cp;
2556 index = ch >> 3;
2557 if (charSet->length == 0 ||
2558 ch > charSet->length ||
2559 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2560 result = x;
2561 result->cp++;
2564 break;
2566 default:
2567 assert(FALSE);
2569 if (result) {
2570 if (!updatecp)
2571 x->cp = startcp;
2572 *startpc = pc;
2573 TRACE(" *\n");
2574 return result;
2576 x->cp = startcp;
2577 return NULL;
2580 static inline REMatchState *
2581 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2583 REMatchState *result = NULL;
2584 REBackTrackData *backTrackData;
2585 jsbytecode *nextpc, *testpc;
2586 REOp nextop;
2587 RECapture *cap;
2588 REProgState *curState;
2589 const WCHAR *startcp;
2590 size_t parenIndex, k;
2591 size_t parenSoFar = 0;
2593 WCHAR matchCh1, matchCh2;
2594 RECharSet *charSet;
2596 BOOL anchor;
2597 jsbytecode *pc = gData->regexp->program;
2598 REOp op = (REOp) *pc++;
2601 * If the first node is a simple match, step the index into the string
2602 * until that match is made, or fail if it can't be found at all.
2604 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2605 anchor = FALSE;
2606 while (x->cp <= gData->cpend) {
2607 nextpc = pc; /* reset back to start each time */
2608 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2609 if (result) {
2610 anchor = TRUE;
2611 x = result;
2612 pc = nextpc; /* accept skip to next opcode */
2613 op = (REOp) *pc++;
2614 assert(op < REOP_LIMIT);
2615 break;
2617 gData->skipped++;
2618 x->cp++;
2620 if (!anchor)
2621 goto bad;
2624 for (;;) {
2625 const char *opname = reop_names[op];
2626 TRACE("\n%06d: %*s%s\n", pc - gData->regexp->program,
2627 (int)gData->stateStackTop * 2, "", opname);
2629 if (REOP_IS_SIMPLE(op)) {
2630 result = SimpleMatch(gData, x, op, &pc, TRUE);
2631 } else {
2632 curState = &gData->stateStack[gData->stateStackTop];
2633 switch (op) {
2634 case REOP_END:
2635 goto good;
2636 case REOP_ALTPREREQ2:
2637 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2638 pc += ARG_LEN;
2639 matchCh2 = GET_ARG(pc);
2640 pc += ARG_LEN;
2641 k = GET_ARG(pc);
2642 pc += ARG_LEN;
2644 if (x->cp != gData->cpend) {
2645 if (*x->cp == matchCh2)
2646 goto doAlt;
2648 charSet = &gData->regexp->classList[k];
2649 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2650 goto bad;
2651 matchCh1 = *x->cp;
2652 k = matchCh1 >> 3;
2653 if ((charSet->length == 0 ||
2654 matchCh1 > charSet->length ||
2655 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2656 charSet->sense) {
2657 goto doAlt;
2660 result = NULL;
2661 break;
2663 case REOP_ALTPREREQ:
2664 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2665 pc += ARG_LEN;
2666 matchCh1 = GET_ARG(pc);
2667 pc += ARG_LEN;
2668 matchCh2 = GET_ARG(pc);
2669 pc += ARG_LEN;
2670 if (x->cp == gData->cpend ||
2671 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2672 result = NULL;
2673 break;
2675 /* else false thru... */
2677 case REOP_ALT:
2678 doAlt:
2679 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2680 pc += ARG_LEN; /* start of this alternate */
2681 curState->parenSoFar = parenSoFar;
2682 PUSH_STATE_STACK(gData);
2683 op = (REOp) *pc++;
2684 startcp = x->cp;
2685 if (REOP_IS_SIMPLE(op)) {
2686 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2687 op = (REOp) *nextpc++;
2688 pc = nextpc;
2689 continue;
2691 result = x;
2692 op = (REOp) *pc++;
2694 nextop = (REOp) *nextpc++;
2695 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2696 goto bad;
2697 continue;
2700 * Occurs at (successful) end of REOP_ALT,
2702 case REOP_JUMP:
2704 * If we have not gotten a result here, it is because of an
2705 * empty match. Do the same thing REOP_EMPTY would do.
2707 if (!result)
2708 result = x;
2710 --gData->stateStackTop;
2711 pc += GET_OFFSET(pc);
2712 op = (REOp) *pc++;
2713 continue;
2716 * Occurs at last (successful) end of REOP_ALT,
2718 case REOP_ENDALT:
2720 * If we have not gotten a result here, it is because of an
2721 * empty match. Do the same thing REOP_EMPTY would do.
2723 if (!result)
2724 result = x;
2726 --gData->stateStackTop;
2727 op = (REOp) *pc++;
2728 continue;
2730 case REOP_LPAREN:
2731 pc = ReadCompactIndex(pc, &parenIndex);
2732 TRACE("[ %lu ]\n", (unsigned long) parenIndex);
2733 assert(parenIndex < gData->regexp->parenCount);
2734 if (parenIndex + 1 > parenSoFar)
2735 parenSoFar = parenIndex + 1;
2736 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2737 x->parens[parenIndex].length = 0;
2738 op = (REOp) *pc++;
2739 continue;
2741 case REOP_RPAREN:
2743 ptrdiff_t delta;
2745 pc = ReadCompactIndex(pc, &parenIndex);
2746 assert(parenIndex < gData->regexp->parenCount);
2747 cap = &x->parens[parenIndex];
2748 delta = x->cp - (gData->cpbegin + cap->index);
2749 cap->length = (delta < 0) ? 0 : (size_t) delta;
2750 op = (REOp) *pc++;
2752 if (!result)
2753 result = x;
2754 continue;
2756 case REOP_ASSERT:
2757 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2758 pc += ARG_LEN; /* start of ASSERT child */
2759 op = (REOp) *pc++;
2760 testpc = pc;
2761 if (REOP_IS_SIMPLE(op) &&
2762 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2763 result = NULL;
2764 break;
2766 curState->u.assertion.top =
2767 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2768 curState->u.assertion.sz = gData->cursz;
2769 curState->index = x->cp - gData->cpbegin;
2770 curState->parenSoFar = parenSoFar;
2771 PUSH_STATE_STACK(gData);
2772 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2773 nextpc, x, x->cp, 0, 0)) {
2774 goto bad;
2776 continue;
2778 case REOP_ASSERT_NOT:
2779 nextpc = pc + GET_OFFSET(pc);
2780 pc += ARG_LEN;
2781 op = (REOp) *pc++;
2782 testpc = pc;
2783 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2784 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2785 *testpc == REOP_ASSERTNOTTEST) {
2786 result = NULL;
2787 break;
2789 curState->u.assertion.top
2790 = (char *)gData->backTrackSP -
2791 (char *)gData->backTrackStack;
2792 curState->u.assertion.sz = gData->cursz;
2793 curState->index = x->cp - gData->cpbegin;
2794 curState->parenSoFar = parenSoFar;
2795 PUSH_STATE_STACK(gData);
2796 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2797 nextpc, x, x->cp, 0, 0)) {
2798 goto bad;
2800 continue;
2802 case REOP_ASSERTTEST:
2803 --gData->stateStackTop;
2804 --curState;
2805 x->cp = gData->cpbegin + curState->index;
2806 gData->backTrackSP =
2807 (REBackTrackData *) ((char *)gData->backTrackStack +
2808 curState->u.assertion.top);
2809 gData->cursz = curState->u.assertion.sz;
2810 if (result)
2811 result = x;
2812 break;
2814 case REOP_ASSERTNOTTEST:
2815 --gData->stateStackTop;
2816 --curState;
2817 x->cp = gData->cpbegin + curState->index;
2818 gData->backTrackSP =
2819 (REBackTrackData *) ((char *)gData->backTrackStack +
2820 curState->u.assertion.top);
2821 gData->cursz = curState->u.assertion.sz;
2822 result = (!result) ? x : NULL;
2823 break;
2824 case REOP_STAR:
2825 curState->u.quantifier.min = 0;
2826 curState->u.quantifier.max = (UINT)-1;
2827 goto quantcommon;
2828 case REOP_PLUS:
2829 curState->u.quantifier.min = 1;
2830 curState->u.quantifier.max = (UINT)-1;
2831 goto quantcommon;
2832 case REOP_OPT:
2833 curState->u.quantifier.min = 0;
2834 curState->u.quantifier.max = 1;
2835 goto quantcommon;
2836 case REOP_QUANT:
2837 pc = ReadCompactIndex(pc, &k);
2838 curState->u.quantifier.min = k;
2839 pc = ReadCompactIndex(pc, &k);
2840 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2841 curState->u.quantifier.max = k - 1;
2842 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2843 quantcommon:
2844 if (curState->u.quantifier.max == 0) {
2845 pc = pc + GET_OFFSET(pc);
2846 op = (REOp) *pc++;
2847 result = x;
2848 continue;
2850 /* Step over <next> */
2851 nextpc = pc + ARG_LEN;
2852 op = (REOp) *nextpc++;
2853 startcp = x->cp;
2854 if (REOP_IS_SIMPLE(op)) {
2855 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2856 if (curState->u.quantifier.min == 0)
2857 result = x;
2858 else
2859 result = NULL;
2860 pc = pc + GET_OFFSET(pc);
2861 break;
2863 op = (REOp) *nextpc++;
2864 result = x;
2866 curState->index = startcp - gData->cpbegin;
2867 curState->continue_op = REOP_REPEAT;
2868 curState->continue_pc = pc;
2869 curState->parenSoFar = parenSoFar;
2870 PUSH_STATE_STACK(gData);
2871 if (curState->u.quantifier.min == 0 &&
2872 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2873 0, 0)) {
2874 goto bad;
2876 pc = nextpc;
2877 continue;
2879 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2880 pc = curState[-1].continue_pc;
2881 op = (REOp) curState[-1].continue_op;
2883 if (!result)
2884 result = x;
2885 continue;
2887 case REOP_REPEAT:
2888 --curState;
2889 do {
2890 --gData->stateStackTop;
2891 if (!result) {
2892 /* Failed, see if we have enough children. */
2893 if (curState->u.quantifier.min == 0)
2894 goto repeatDone;
2895 goto break_switch;
2897 if (curState->u.quantifier.min == 0 &&
2898 x->cp == gData->cpbegin + curState->index) {
2899 /* matched an empty string, that'll get us nowhere */
2900 result = NULL;
2901 goto break_switch;
2903 if (curState->u.quantifier.min != 0)
2904 curState->u.quantifier.min--;
2905 if (curState->u.quantifier.max != (UINT) -1)
2906 curState->u.quantifier.max--;
2907 if (curState->u.quantifier.max == 0)
2908 goto repeatDone;
2909 nextpc = pc + ARG_LEN;
2910 nextop = (REOp) *nextpc;
2911 startcp = x->cp;
2912 if (REOP_IS_SIMPLE(nextop)) {
2913 nextpc++;
2914 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2915 if (curState->u.quantifier.min == 0)
2916 goto repeatDone;
2917 result = NULL;
2918 goto break_switch;
2920 result = x;
2922 curState->index = startcp - gData->cpbegin;
2923 PUSH_STATE_STACK(gData);
2924 if (curState->u.quantifier.min == 0 &&
2925 !PushBackTrackState(gData, REOP_REPEAT,
2926 pc, x, startcp,
2927 curState->parenSoFar,
2928 parenSoFar -
2929 curState->parenSoFar)) {
2930 goto bad;
2932 } while (*nextpc == REOP_ENDCHILD);
2933 pc = nextpc;
2934 op = (REOp) *pc++;
2935 parenSoFar = curState->parenSoFar;
2936 continue;
2938 repeatDone:
2939 result = x;
2940 pc += GET_OFFSET(pc);
2941 goto break_switch;
2943 case REOP_MINIMALSTAR:
2944 curState->u.quantifier.min = 0;
2945 curState->u.quantifier.max = (UINT)-1;
2946 goto minimalquantcommon;
2947 case REOP_MINIMALPLUS:
2948 curState->u.quantifier.min = 1;
2949 curState->u.quantifier.max = (UINT)-1;
2950 goto minimalquantcommon;
2951 case REOP_MINIMALOPT:
2952 curState->u.quantifier.min = 0;
2953 curState->u.quantifier.max = 1;
2954 goto minimalquantcommon;
2955 case REOP_MINIMALQUANT:
2956 pc = ReadCompactIndex(pc, &k);
2957 curState->u.quantifier.min = k;
2958 pc = ReadCompactIndex(pc, &k);
2959 /* See REOP_QUANT comments about k - 1. */
2960 curState->u.quantifier.max = k - 1;
2961 assert(curState->u.quantifier.min
2962 <= curState->u.quantifier.max);
2963 minimalquantcommon:
2964 curState->index = x->cp - gData->cpbegin;
2965 curState->parenSoFar = parenSoFar;
2966 PUSH_STATE_STACK(gData);
2967 if (curState->u.quantifier.min != 0) {
2968 curState->continue_op = REOP_MINIMALREPEAT;
2969 curState->continue_pc = pc;
2970 /* step over <next> */
2971 pc += OFFSET_LEN;
2972 op = (REOp) *pc++;
2973 } else {
2974 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2975 pc, x, x->cp, 0, 0)) {
2976 goto bad;
2978 --gData->stateStackTop;
2979 pc = pc + GET_OFFSET(pc);
2980 op = (REOp) *pc++;
2982 continue;
2984 case REOP_MINIMALREPEAT:
2985 --gData->stateStackTop;
2986 --curState;
2988 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2989 #define PREPARE_REPEAT() \
2990 do { \
2991 curState->index = x->cp - gData->cpbegin; \
2992 curState->continue_op = REOP_MINIMALREPEAT; \
2993 curState->continue_pc = pc; \
2994 pc += ARG_LEN; \
2995 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2996 x->parens[k].index = -1; \
2997 PUSH_STATE_STACK(gData); \
2998 op = (REOp) *pc++; \
2999 assert(op < REOP_LIMIT); \
3000 }while(0)
3002 if (!result) {
3003 TRACE(" -\n");
3005 * Non-greedy failure - try to consume another child.
3007 if (curState->u.quantifier.max == (UINT) -1 ||
3008 curState->u.quantifier.max > 0) {
3009 PREPARE_REPEAT();
3010 continue;
3012 /* Don't need to adjust pc since we're going to pop. */
3013 break;
3015 if (curState->u.quantifier.min == 0 &&
3016 x->cp == gData->cpbegin + curState->index) {
3017 /* Matched an empty string, that'll get us nowhere. */
3018 result = NULL;
3019 break;
3021 if (curState->u.quantifier.min != 0)
3022 curState->u.quantifier.min--;
3023 if (curState->u.quantifier.max != (UINT) -1)
3024 curState->u.quantifier.max--;
3025 if (curState->u.quantifier.min != 0) {
3026 PREPARE_REPEAT();
3027 continue;
3029 curState->index = x->cp - gData->cpbegin;
3030 curState->parenSoFar = parenSoFar;
3031 PUSH_STATE_STACK(gData);
3032 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3033 pc, x, x->cp,
3034 curState->parenSoFar,
3035 parenSoFar - curState->parenSoFar)) {
3036 goto bad;
3038 --gData->stateStackTop;
3039 pc = pc + GET_OFFSET(pc);
3040 op = (REOp) *pc++;
3041 assert(op < REOP_LIMIT);
3042 continue;
3043 default:
3044 assert(FALSE);
3045 result = NULL;
3047 break_switch:;
3051 * If the match failed and there's a backtrack option, take it.
3052 * Otherwise this is a complete and utter failure.
3054 if (!result) {
3055 if (gData->cursz == 0)
3056 return NULL;
3058 /* Potentially detect explosive regex here. */
3059 gData->backTrackCount++;
3060 if (gData->backTrackLimit &&
3061 gData->backTrackCount >= gData->backTrackLimit) {
3062 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3063 JSMSG_REGEXP_TOO_COMPLEX);
3064 gData->ok = FALSE;
3065 return NULL;
3068 backTrackData = gData->backTrackSP;
3069 gData->cursz = backTrackData->sz;
3070 gData->backTrackSP =
3071 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3072 x->cp = backTrackData->cp;
3073 pc = backTrackData->backtrack_pc;
3074 op = (REOp) backTrackData->backtrack_op;
3075 assert(op < REOP_LIMIT);
3076 gData->stateStackTop = backTrackData->saveStateStackTop;
3077 assert(gData->stateStackTop);
3079 memcpy(gData->stateStack, backTrackData + 1,
3080 sizeof(REProgState) * backTrackData->saveStateStackTop);
3081 curState = &gData->stateStack[gData->stateStackTop - 1];
3083 if (backTrackData->parenCount) {
3084 memcpy(&x->parens[backTrackData->parenIndex],
3085 (char *)(backTrackData + 1) +
3086 sizeof(REProgState) * backTrackData->saveStateStackTop,
3087 sizeof(RECapture) * backTrackData->parenCount);
3088 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3089 } else {
3090 for (k = curState->parenSoFar; k < parenSoFar; k++)
3091 x->parens[k].index = -1;
3092 parenSoFar = curState->parenSoFar;
3095 TRACE("\tBT_Pop: %ld,%ld\n",
3096 (unsigned long) backTrackData->parenIndex,
3097 (unsigned long) backTrackData->parenCount);
3098 continue;
3100 x = result;
3103 * Continue with the expression.
3105 op = (REOp)*pc++;
3106 assert(op < REOP_LIMIT);
3109 bad:
3110 TRACE("\n");
3111 return NULL;
3113 good:
3114 TRACE("\n");
3115 return x;
3118 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3120 REMatchState *result;
3121 const WCHAR *cp = x->cp;
3122 const WCHAR *cp2;
3123 UINT j;
3126 * Have to include the position beyond the last character
3127 * in order to detect end-of-input/line condition.
3129 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3130 gData->skipped = cp2 - cp;
3131 x->cp = cp2;
3132 for (j = 0; j < gData->regexp->parenCount; j++)
3133 x->parens[j].index = -1;
3134 result = ExecuteREBytecode(gData, x);
3135 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3136 return result;
3137 gData->backTrackSP = gData->backTrackStack;
3138 gData->cursz = 0;
3139 gData->stateStackTop = 0;
3140 cp2 = cp + gData->skipped;
3142 return NULL;
3145 #define MIN_BACKTRACK_LIMIT 400000
3147 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3149 REMatchState *result;
3150 UINT i;
3152 gData->backTrackStackSize = INITIAL_BACKTRACK;
3153 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3154 if (!gData->backTrackStack)
3155 goto bad;
3157 gData->backTrackSP = gData->backTrackStack;
3158 gData->cursz = 0;
3159 gData->backTrackCount = 0;
3160 gData->backTrackLimit = 0;
3162 gData->stateStackLimit = INITIAL_STATESTACK;
3163 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3164 if (!gData->stateStack)
3165 goto bad;
3167 gData->stateStackTop = 0;
3168 gData->cx = cx;
3169 gData->regexp = re;
3170 gData->ok = TRUE;
3172 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3173 if (!result)
3174 goto bad;
3176 for (i = 0; i < re->classCount; i++) {
3177 if (!re->classList[i].converted &&
3178 !ProcessCharSet(gData, &re->classList[i])) {
3179 return NULL;
3183 return result;
3185 bad:
3186 js_ReportOutOfScriptQuota(cx);
3187 gData->ok = FALSE;
3188 return NULL;
3191 static void
3192 js_DestroyRegExp(JSRegExp *re)
3194 if (re->classList) {
3195 UINT i;
3196 for (i = 0; i < re->classCount; i++) {
3197 if (re->classList[i].converted)
3198 heap_free(re->classList[i].u.bits);
3199 re->classList[i].u.bits = NULL;
3201 heap_free(re->classList);
3203 heap_free(re);
3206 static JSRegExp *
3207 js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
3209 JSRegExp *re;
3210 jsheap_t *mark;
3211 CompilerState state;
3212 size_t resize;
3213 jsbytecode *endPC;
3214 UINT i;
3215 size_t len;
3217 re = NULL;
3218 mark = jsheap_mark(&cx->tmp_heap);
3219 len = SysStringLen(str);
3221 state.context = cx;
3222 state.cp = str;
3223 if (!state.cp)
3224 goto out;
3225 state.cpbegin = state.cp;
3226 state.cpend = state.cp + len;
3227 state.flags = flags;
3228 state.parenCount = 0;
3229 state.classCount = 0;
3230 state.progLength = 0;
3231 state.treeDepth = 0;
3232 state.classBitmapsMem = 0;
3233 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3234 state.classCache[i].start = NULL;
3236 if (len != 0 && flat) {
3237 state.result = NewRENode(&state, REOP_FLAT);
3238 if (!state.result)
3239 goto out;
3240 state.result->u.flat.chr = *state.cpbegin;
3241 state.result->u.flat.length = len;
3242 state.result->kid = (void *) state.cpbegin;
3243 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3244 state.progLength += 1 + GetCompactIndexWidth(0)
3245 + GetCompactIndexWidth(len);
3246 } else {
3247 if (!ParseRegExp(&state))
3248 goto out;
3250 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3251 re = heap_alloc(resize);
3252 if (!re)
3253 goto out;
3255 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3256 re->classCount = state.classCount;
3257 if (re->classCount) {
3258 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3259 if (!re->classList) {
3260 js_DestroyRegExp(re);
3261 re = NULL;
3262 goto out;
3264 for (i = 0; i < re->classCount; i++)
3265 re->classList[i].converted = FALSE;
3266 } else {
3267 re->classList = NULL;
3269 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3270 if (!endPC) {
3271 js_DestroyRegExp(re);
3272 re = NULL;
3273 goto out;
3275 *endPC++ = REOP_END;
3277 * Check whether size was overestimated and shrink using realloc.
3278 * This is safe since no pointers to newly parsed regexp or its parts
3279 * besides re exist here.
3281 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3282 JSRegExp *tmp;
3283 assert((size_t)(endPC - re->program) < state.progLength + 1);
3284 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3285 tmp = heap_realloc(re, resize);
3286 if (tmp)
3287 re = tmp;
3290 re->flags = flags;
3291 re->parenCount = state.parenCount;
3292 re->source = str;
3294 out:
3295 jsheap_clear(mark);
3296 return re;
3299 static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
3301 return (RegExpInstance*)vdisp->u.jsdisp;
3304 static void set_last_index(RegExpInstance *This, DWORD last_index)
3306 This->last_index = last_index;
3307 VariantClear(&This->last_index_var);
3308 num_set_val(&This->last_index_var, last_index);
3311 static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, const WCHAR *str, DWORD len,
3312 const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt, match_result_t *ret)
3314 REMatchState *x, *result;
3315 REGlobalData gData;
3316 DWORD matchlen;
3318 gData.cpbegin = *cp;
3319 gData.cpend = str + len;
3320 gData.start = *cp-str;
3321 gData.skipped = 0;
3322 gData.pool = &ctx->tmp_heap;
3324 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3325 if(!x) {
3326 WARN("InitMatch failed\n");
3327 return E_FAIL;
3330 x->cp = *cp;
3331 result = MatchRegExp(&gData, x);
3332 if(!gData.ok) {
3333 WARN("MatchRegExp failed\n");
3334 return E_FAIL;
3337 if(!result)
3338 return S_FALSE;
3340 if(parens) {
3341 DWORD i;
3343 if(regexp->jsregexp->parenCount > *parens_size) {
3344 match_result_t *new_parens;
3346 if(*parens)
3347 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3348 else
3349 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3350 if(!new_parens)
3351 return E_OUTOFMEMORY;
3353 *parens = new_parens;
3356 *parens_cnt = regexp->jsregexp->parenCount;
3358 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3359 if(result->parens[i].index == -1) {
3360 (*parens)[i].str = NULL;
3361 (*parens)[i].len = 0;
3362 }else {
3363 (*parens)[i].str = *cp + result->parens[i].index;
3364 (*parens)[i].len = result->parens[i].length;
3369 matchlen = (result->cp-*cp) - gData.skipped;
3370 *cp = result->cp;
3371 ret->str = result->cp-matchlen;
3372 ret->len = matchlen;
3373 set_last_index(regexp, result->cp-str);
3375 return S_OK;
3378 HRESULT regexp_match_next(script_ctx_t *ctx, DispatchEx *dispex, BOOL gcheck, const WCHAR *str, DWORD len,
3379 const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt, match_result_t *ret)
3381 RegExpInstance *regexp = (RegExpInstance*)dispex;
3382 jsheap_t *mark;
3383 HRESULT hres;
3385 if(gcheck && !(regexp->jsregexp->flags & JSREG_GLOB))
3386 return S_FALSE;
3388 mark = jsheap_mark(&ctx->tmp_heap);
3390 hres = do_regexp_match_next(ctx, regexp, str, len, cp, parens, parens_size, parens_cnt, ret);
3392 jsheap_clear(mark);
3393 return hres;
3396 HRESULT regexp_match(script_ctx_t *ctx, DispatchEx *dispex, const WCHAR *str, DWORD len, BOOL gflag,
3397 match_result_t **match_result, DWORD *result_cnt)
3399 RegExpInstance *This = (RegExpInstance*)dispex;
3400 match_result_t *ret = NULL, cres;
3401 const WCHAR *cp = str;
3402 DWORD i=0, ret_size = 0;
3403 jsheap_t *mark;
3404 HRESULT hres;
3406 mark = jsheap_mark(&ctx->tmp_heap);
3408 while(1) {
3409 hres = do_regexp_match_next(ctx, This, str, len, &cp, NULL, NULL, NULL, &cres);
3410 if(hres == S_FALSE) {
3411 hres = S_OK;
3412 break;
3415 if(FAILED(hres))
3416 break;
3418 if(ret_size == i) {
3419 if(ret)
3420 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3421 else
3422 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3423 if(!ret) {
3424 hres = E_OUTOFMEMORY;
3425 break;
3429 ret[i++] = cres;
3431 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3432 hres = S_OK;
3433 break;
3437 jsheap_clear(mark);
3438 if(FAILED(hres)) {
3439 heap_free(ret);
3440 return hres;
3443 *match_result = ret;
3444 *result_cnt = i;
3445 return S_OK;
3448 static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3449 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3451 TRACE("\n");
3453 switch(flags) {
3454 case DISPATCH_PROPERTYGET: {
3455 RegExpInstance *This = regexp_from_vdisp(jsthis);
3457 V_VT(retv) = VT_BSTR;
3458 V_BSTR(retv) = SysAllocString(This->str);
3459 if(!V_BSTR(retv))
3460 return E_OUTOFMEMORY;
3461 break;
3463 default:
3464 FIXME("Unimplemnted flags %x\n", flags);
3465 return E_NOTIMPL;
3468 return S_OK;
3471 static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3472 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3474 FIXME("\n");
3475 return E_NOTIMPL;
3478 static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3479 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3481 FIXME("\n");
3482 return E_NOTIMPL;
3485 static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3486 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3488 FIXME("\n");
3489 return E_NOTIMPL;
3492 static INT index_from_var(script_ctx_t *ctx, VARIANT *v)
3494 jsexcept_t ei;
3495 VARIANT num;
3496 HRESULT hres;
3498 memset(&ei, 0, sizeof(ei));
3499 hres = to_number(ctx, v, &ei, &num);
3500 if(FAILED(hres)) { /* FIXME: Move ignoring exceptions to to_promitive */
3501 VariantClear(&ei.var);
3502 return 0;
3505 if(V_VT(&num) == VT_R8) {
3506 DOUBLE d = floor(V_R8(&num));
3507 return (DOUBLE)(INT)d == d ? d : 0;
3510 return V_I4(&num);
3513 static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3514 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3516 TRACE("\n");
3518 switch(flags) {
3519 case DISPATCH_PROPERTYGET: {
3520 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3522 V_VT(retv) = VT_EMPTY;
3523 return VariantCopy(retv, &regexp->last_index_var);
3525 case DISPATCH_PROPERTYPUT: {
3526 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3527 VARIANT *arg;
3528 HRESULT hres;
3530 arg = get_arg(dp,0);
3531 hres = VariantCopy(&regexp->last_index_var, arg);
3532 if(FAILED(hres))
3533 return hres;
3535 regexp->last_index = index_from_var(ctx, arg);
3536 break;
3538 default:
3539 FIXME("unimplemented flags: %x\n", flags);
3540 return E_NOTIMPL;
3543 return S_OK;
3546 static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3547 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3549 FIXME("\n");
3550 return E_NOTIMPL;
3553 static HRESULT create_match_array(script_ctx_t *ctx, BSTR input, const match_result_t *result,
3554 const match_result_t *parens, DWORD parens_cnt, jsexcept_t *ei, IDispatch **ret)
3556 DispatchEx *array;
3557 VARIANT var;
3558 int i;
3559 HRESULT hres = S_OK;
3561 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3562 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3563 static const WCHAR zeroW[] = {'0',0};
3565 hres = create_array(ctx, parens_cnt+1, &array);
3566 if(FAILED(hres))
3567 return hres;
3569 for(i=0; i < parens_cnt; i++) {
3570 V_VT(&var) = VT_BSTR;
3571 V_BSTR(&var) = SysAllocStringLen(parens[i].str, parens[i].len);
3572 if(!V_BSTR(&var)) {
3573 hres = E_OUTOFMEMORY;
3574 break;
3577 hres = jsdisp_propput_idx(array, i+1, &var, ei, NULL/*FIXME*/);
3578 SysFreeString(V_BSTR(&var));
3579 if(FAILED(hres))
3580 break;
3583 while(SUCCEEDED(hres)) {
3584 V_VT(&var) = VT_I4;
3585 V_I4(&var) = result->str-input;
3586 hres = jsdisp_propput_name(array, indexW, &var, ei, NULL/*FIXME*/);
3587 if(FAILED(hres))
3588 break;
3590 V_VT(&var) = VT_BSTR;
3591 V_BSTR(&var) = input;
3592 hres = jsdisp_propput_name(array, inputW, &var, ei, NULL/*FIXME*/);
3593 if(FAILED(hres))
3594 break;
3596 V_BSTR(&var) = SysAllocStringLen(result->str, result->len);
3597 if(!V_BSTR(&var)) {
3598 hres = E_OUTOFMEMORY;
3599 break;
3601 hres = jsdisp_propput_name(array, zeroW, &var, ei, NULL/*FIXME*/);
3602 SysFreeString(V_BSTR(&var));
3603 break;
3606 if(FAILED(hres)) {
3607 jsdisp_release(array);
3608 return hres;
3611 *ret = (IDispatch*)_IDispatchEx_(array);
3612 return S_OK;
3615 static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, VARIANT *arg, jsexcept_t *ei, BSTR *input,
3616 match_result_t *match, match_result_t **parens, DWORD *parens_cnt, VARIANT_BOOL *ret)
3618 RegExpInstance *regexp;
3619 DWORD parens_size = 0, last_index = 0, length;
3620 const WCHAR *cp;
3621 BSTR string;
3622 HRESULT hres;
3624 if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
3625 FIXME("Not a RegExp\n");
3626 return E_NOTIMPL;
3629 regexp = regexp_from_vdisp(jsthis);
3631 if(arg) {
3632 hres = to_string(ctx, arg, ei, &string);
3633 if(FAILED(hres))
3634 return hres;
3635 }else {
3636 string = SysAllocStringLen(NULL, 0);
3637 if(!string)
3638 return E_OUTOFMEMORY;
3641 if(regexp->last_index < 0) {
3642 SysFreeString(string);
3643 set_last_index(regexp, 0);
3644 *ret = VARIANT_FALSE;
3645 return S_OK;
3648 length = SysStringLen(string);
3649 if(regexp->jsregexp->flags & JSREG_GLOB)
3650 last_index = regexp->last_index;
3652 cp = string + last_index;
3653 hres = regexp_match_next(ctx, &regexp->dispex, FALSE, string, length, &cp, parens, parens ? &parens_size : NULL,
3654 parens_cnt, match);
3655 if(FAILED(hres)) {
3656 SysFreeString(string);
3657 return hres;
3660 if(hres == S_OK) {
3661 *ret = VARIANT_TRUE;
3662 }else {
3663 set_last_index(regexp, 0);
3664 *ret = VARIANT_FALSE;
3667 if(input)
3668 *input = string;
3669 return S_OK;
3672 static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3673 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3675 match_result_t *parens = NULL, match;
3676 DWORD parens_cnt = 0;
3677 VARIANT_BOOL b;
3678 BSTR string;
3679 HRESULT hres;
3681 TRACE("\n");
3683 hres = run_exec(ctx, jsthis, arg_cnt(dp) ? get_arg(dp,0) : NULL, ei, &string, &match, &parens, &parens_cnt, &b);
3684 if(FAILED(hres))
3685 return hres;
3687 if(retv) {
3688 if(b) {
3689 IDispatch *ret;
3691 hres = create_match_array(ctx, string, &match, parens, parens_cnt, ei, &ret);
3692 if(SUCCEEDED(hres)) {
3693 V_VT(retv) = VT_DISPATCH;
3694 V_DISPATCH(retv) = ret;
3696 }else {
3697 V_VT(retv) = VT_NULL;
3701 heap_free(parens);
3702 SysFreeString(string);
3703 return hres;
3706 static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3707 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3709 match_result_t match;
3710 VARIANT_BOOL b;
3711 HRESULT hres;
3713 TRACE("\n");
3715 hres = run_exec(ctx, jsthis, arg_cnt(dp) ? get_arg(dp,0) : NULL, ei, NULL, &match, NULL, NULL, &b);
3716 if(FAILED(hres))
3717 return hres;
3719 if(retv) {
3720 V_VT(retv) = VT_BOOL;
3721 V_BOOL(retv) = b;
3723 return S_OK;
3726 static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3727 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3729 TRACE("\n");
3731 switch(flags) {
3732 case INVOKE_FUNC:
3733 return throw_type_error(ctx, ei, IDS_NOT_FUNC, NULL);
3734 default:
3735 FIXME("unimplemented flags %x\n", flags);
3736 return E_NOTIMPL;
3739 return S_OK;
3742 static void RegExp_destructor(DispatchEx *dispex)
3744 RegExpInstance *This = (RegExpInstance*)dispex;
3746 if(This->jsregexp)
3747 js_DestroyRegExp(This->jsregexp);
3748 VariantClear(&This->last_index_var);
3749 SysFreeString(This->str);
3750 heap_free(This);
3753 static const builtin_prop_t RegExp_props[] = {
3754 {execW, RegExp_exec, PROPF_METHOD|1},
3755 {globalW, RegExp_global, 0},
3756 {ignoreCaseW, RegExp_ignoreCase, 0},
3757 {lastIndexW, RegExp_lastIndex, 0},
3758 {multilineW, RegExp_multiline, 0},
3759 {sourceW, RegExp_source, 0},
3760 {testW, RegExp_test, PROPF_METHOD|1},
3761 {toStringW, RegExp_toString, PROPF_METHOD}
3764 static const builtin_info_t RegExp_info = {
3765 JSCLASS_REGEXP,
3766 {NULL, RegExp_value, 0},
3767 sizeof(RegExp_props)/sizeof(*RegExp_props),
3768 RegExp_props,
3769 RegExp_destructor,
3770 NULL
3773 static HRESULT alloc_regexp(script_ctx_t *ctx, DispatchEx *object_prototype, RegExpInstance **ret)
3775 RegExpInstance *regexp;
3776 HRESULT hres;
3778 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3779 if(!regexp)
3780 return E_OUTOFMEMORY;
3782 if(object_prototype)
3783 hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
3784 else
3785 hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExp_info, ctx->regexp_constr);
3787 if(FAILED(hres)) {
3788 heap_free(regexp);
3789 return hres;
3792 *ret = regexp;
3793 return S_OK;
3796 HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, DispatchEx **ret)
3798 RegExpInstance *regexp;
3799 HRESULT hres;
3801 TRACE("%s %x\n", debugstr_w(exp), flags);
3803 hres = alloc_regexp(ctx, NULL, &regexp);
3804 if(FAILED(hres))
3805 return hres;
3807 if(len == -1)
3808 regexp->str = SysAllocString(exp);
3809 else
3810 regexp->str = SysAllocStringLen(exp, len);
3811 if(!regexp->str) {
3812 jsdisp_release(&regexp->dispex);
3813 return E_OUTOFMEMORY;
3816 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3817 if(!regexp->jsregexp) {
3818 WARN("js_NewRegExp failed\n");
3819 jsdisp_release(&regexp->dispex);
3820 return E_FAIL;
3823 V_VT(&regexp->last_index_var) = VT_I4;
3824 V_I4(&regexp->last_index_var) = 0;
3826 *ret = &regexp->dispex;
3827 return S_OK;
3830 static HRESULT regexp_constructor(script_ctx_t *ctx, DISPPARAMS *dp, VARIANT *retv)
3832 const WCHAR *opt = emptyW, *src;
3833 DispatchEx *ret;
3834 VARIANT *arg;
3835 DWORD flags;
3836 HRESULT hres;
3838 if(!arg_cnt(dp)) {
3839 FIXME("no args\n");
3840 return E_NOTIMPL;
3843 arg = get_arg(dp,0);
3844 if(V_VT(arg) == VT_DISPATCH) {
3845 DispatchEx *obj;
3847 obj = iface_to_jsdisp((IUnknown*)V_DISPATCH(arg));
3848 if(obj) {
3849 if(is_class(obj, JSCLASS_REGEXP)) {
3850 RegExpInstance *regexp = (RegExpInstance*)obj;
3852 hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, &ret);
3853 jsdisp_release(obj);
3854 if(FAILED(hres))
3855 return hres;
3857 V_VT(retv) = VT_DISPATCH;
3858 V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(ret);
3859 return S_OK;
3862 jsdisp_release(obj);
3866 if(V_VT(arg) != VT_BSTR) {
3867 FIXME("vt arg0 = %d\n", V_VT(arg));
3868 return E_NOTIMPL;
3871 src = V_BSTR(arg);
3873 if(arg_cnt(dp) >= 2) {
3874 arg = get_arg(dp,1);
3875 if(V_VT(arg) != VT_BSTR) {
3876 FIXME("unimplemented for vt %d\n", V_VT(arg));
3877 return E_NOTIMPL;
3880 opt = V_BSTR(arg);
3883 hres = parse_regexp_flags(opt, strlenW(opt), &flags);
3884 if(FAILED(hres))
3885 return hres;
3887 hres = create_regexp(ctx, src, -1, flags, &ret);
3888 if(FAILED(hres))
3889 return hres;
3891 if(retv) {
3892 V_VT(retv) = VT_DISPATCH;
3893 V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(ret);
3894 }else {
3895 jsdisp_release(ret);
3897 return S_OK;
3900 static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3901 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3903 TRACE("\n");
3905 switch(flags) {
3906 case DISPATCH_METHOD:
3907 if(arg_cnt(dp)) {
3908 VARIANT *arg = get_arg(dp,0);
3909 if(V_VT(arg) == VT_DISPATCH) {
3910 DispatchEx *jsdisp = iface_to_jsdisp((IUnknown*)V_DISPATCH(arg));
3911 if(jsdisp) {
3912 if(is_class(jsdisp, JSCLASS_REGEXP)) {
3913 if(arg_cnt(dp) > 1 && V_VT(get_arg(dp,1)) != VT_EMPTY) {
3914 jsdisp_release(jsdisp);
3915 return throw_regexp_error(ctx, ei, IDS_REGEXP_SYNTAX_ERROR, NULL);
3918 if(retv) {
3919 V_VT(retv) = VT_DISPATCH;
3920 V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(jsdisp);
3921 }else {
3922 jsdisp_release(jsdisp);
3924 return S_OK;
3926 jsdisp_release(jsdisp);
3930 /* fall through */
3931 case DISPATCH_CONSTRUCT:
3932 return regexp_constructor(ctx, dp, retv);
3933 default:
3934 FIXME("unimplemented flags: %x\n", flags);
3935 return E_NOTIMPL;
3938 return S_OK;
3941 HRESULT create_regexp_constr(script_ctx_t *ctx, DispatchEx *object_prototype, DispatchEx **ret)
3943 RegExpInstance *regexp;
3944 HRESULT hres;
3946 static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
3948 hres = alloc_regexp(ctx, object_prototype, &regexp);
3949 if(FAILED(hres))
3950 return hres;
3952 hres = create_builtin_function(ctx, RegExpConstr_value, RegExpW, NULL,
3953 PROPF_CONSTR|2, &regexp->dispex, ret);
3955 jsdisp_release(&regexp->dispex);
3956 return hres;
3959 HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
3961 const WCHAR *p;
3962 DWORD flags = 0;
3964 for (p = str; p < str+str_len; p++) {
3965 switch (*p) {
3966 case 'g':
3967 flags |= JSREG_GLOB;
3968 break;
3969 case 'i':
3970 flags |= JSREG_FOLD;
3971 break;
3972 case 'm':
3973 flags |= JSREG_MULTILINE;
3974 break;
3975 case 'y':
3976 flags |= JSREG_STICKY;
3977 break;
3978 default:
3979 WARN("wrong flag %c\n", *p);
3980 return E_FAIL;
3984 *ret = flags;
3985 return S_OK;