usp10: Properly handle invalid arguments to ScriptBreak.
[wine/multimedia.git] / dlls / jscript / regexp.c
blob5106604f9a10787b4cb12d7f44db557f87f7d4a2
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
35 #include <math.h>
37 #include "jscript.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 BSTR source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
81 typedef struct {
82 jsdisp_t dispex;
84 JSRegExp *jsregexp;
85 BSTR str;
86 INT last_index;
87 VARIANT last_index_var;
88 } RegExpInstance;
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
99 static const WCHAR leftContextW[] =
100 {'l','e','f','t','C','o','n','t','e','x','t',0};
101 static const WCHAR rightContextW[] =
102 {'r','i','g','h','t','C','o','n','t','e','x','t',0};
104 static const WCHAR undefinedW[] = {'u','n','d','e','f','i','n','e','d',0};
105 static const WCHAR emptyW[] = {0};
107 /* FIXME: Better error handling */
108 #define ReportRegExpError(a,b,c)
109 #define ReportRegExpErrorHelper(a,b,c,d)
110 #define JS_ReportErrorNumber(a,b,c,d)
111 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
112 #define js_ReportOutOfScriptQuota(a)
113 #define JS_ReportOutOfMemory(a)
114 #define JS_COUNT_OPERATION(a,b)
116 #define JSMSG_MIN_TOO_BIG 47
117 #define JSMSG_MAX_TOO_BIG 48
118 #define JSMSG_OUT_OF_ORDER 49
119 #define JSMSG_OUT_OF_MEMORY 137
121 #define LINE_SEPARATOR 0x2028
122 #define PARA_SEPARATOR 0x2029
124 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
125 ((c >= 'a') && (c <= 'z')) )
126 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
127 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
129 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
131 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
132 #define JS7_UNDEC(c) ((c) - '0')
134 typedef enum REOp {
135 REOP_EMPTY,
136 REOP_BOL,
137 REOP_EOL,
138 REOP_WBDRY,
139 REOP_WNONBDRY,
140 REOP_DOT,
141 REOP_DIGIT,
142 REOP_NONDIGIT,
143 REOP_ALNUM,
144 REOP_NONALNUM,
145 REOP_SPACE,
146 REOP_NONSPACE,
147 REOP_BACKREF,
148 REOP_FLAT,
149 REOP_FLAT1,
150 REOP_FLATi,
151 REOP_FLAT1i,
152 REOP_UCFLAT1,
153 REOP_UCFLAT1i,
154 REOP_UCFLAT,
155 REOP_UCFLATi,
156 REOP_CLASS,
157 REOP_NCLASS,
158 REOP_ALT,
159 REOP_QUANT,
160 REOP_STAR,
161 REOP_PLUS,
162 REOP_OPT,
163 REOP_LPAREN,
164 REOP_RPAREN,
165 REOP_JUMP,
166 REOP_DOTSTAR,
167 REOP_LPARENNON,
168 REOP_ASSERT,
169 REOP_ASSERT_NOT,
170 REOP_ASSERTTEST,
171 REOP_ASSERTNOTTEST,
172 REOP_MINIMALSTAR,
173 REOP_MINIMALPLUS,
174 REOP_MINIMALOPT,
175 REOP_MINIMALQUANT,
176 REOP_ENDCHILD,
177 REOP_REPEAT,
178 REOP_MINIMALREPEAT,
179 REOP_ALTPREREQ,
180 REOP_ALTPREREQ2,
181 REOP_ENDALT,
182 REOP_CONCAT,
183 REOP_END,
184 REOP_LIMIT /* META: no operator >= to this */
185 } REOp;
187 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
189 static const char *reop_names[] = {
190 "empty",
191 "bol",
192 "eol",
193 "wbdry",
194 "wnonbdry",
195 "dot",
196 "digit",
197 "nondigit",
198 "alnum",
199 "nonalnum",
200 "space",
201 "nonspace",
202 "backref",
203 "flat",
204 "flat1",
205 "flati",
206 "flat1i",
207 "ucflat1",
208 "ucflat1i",
209 "ucflat",
210 "ucflati",
211 "class",
212 "nclass",
213 "alt",
214 "quant",
215 "star",
216 "plus",
217 "opt",
218 "lparen",
219 "rparen",
220 "jump",
221 "dotstar",
222 "lparennon",
223 "assert",
224 "assert_not",
225 "asserttest",
226 "assertnottest",
227 "minimalstar",
228 "minimalplus",
229 "minimalopt",
230 "minimalquant",
231 "endchild",
232 "repeat",
233 "minimalrepeat",
234 "altprereq",
235 "alrprereq2",
236 "endalt",
237 "concat",
238 "end",
239 NULL
242 typedef struct RECapture {
243 ptrdiff_t index; /* start of contents, -1 for empty */
244 size_t length; /* length of capture */
245 } RECapture;
247 typedef struct REMatchState {
248 const WCHAR *cp;
249 RECapture parens[1]; /* first of 're->parenCount' captures,
250 allocated at end of this struct */
251 } REMatchState;
253 typedef struct REProgState {
254 jsbytecode *continue_pc; /* current continuation data */
255 jsbytecode continue_op;
256 ptrdiff_t index; /* progress in text */
257 size_t parenSoFar; /* highest indexed paren started */
258 union {
259 struct {
260 UINT min; /* current quantifier limits */
261 UINT max;
262 } quantifier;
263 struct {
264 size_t top; /* backtrack stack state */
265 size_t sz;
266 } assertion;
267 } u;
268 } REProgState;
270 typedef struct REBackTrackData {
271 size_t sz; /* size of previous stack entry */
272 jsbytecode *backtrack_pc; /* where to backtrack to */
273 jsbytecode backtrack_op;
274 const WCHAR *cp; /* index in text of match at backtrack */
275 size_t parenIndex; /* start index of saved paren contents */
276 size_t parenCount; /* # of saved paren contents */
277 size_t saveStateStackTop; /* number of parent states */
278 /* saved parent states follow */
279 /* saved paren contents follow */
280 } REBackTrackData;
282 #define INITIAL_STATESTACK 100
283 #define INITIAL_BACKTRACK 8000
285 typedef struct REGlobalData {
286 script_ctx_t *cx;
287 JSRegExp *regexp; /* the RE in execution */
288 BOOL ok; /* runtime error (out_of_memory only?) */
289 size_t start; /* offset to start at */
290 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
291 const WCHAR *cpbegin; /* text base address */
292 const WCHAR *cpend; /* text limit address */
294 REProgState *stateStack; /* stack of state of current parents */
295 size_t stateStackTop;
296 size_t stateStackLimit;
298 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
299 REBackTrackData *backTrackSP;
300 size_t backTrackStackSize;
301 size_t cursz; /* size of current stack entry */
302 size_t backTrackCount; /* how many times we've backtracked */
303 size_t backTrackLimit; /* upper limit on backtrack states */
305 jsheap_t *pool; /* It's faster to use one malloc'd pool
306 than to malloc/free the three items
307 that are allocated from this pool */
308 } REGlobalData;
310 typedef struct RENode RENode;
311 struct RENode {
312 REOp op; /* r.e. op bytecode */
313 RENode *next; /* next in concatenation order */
314 void *kid; /* first operand */
315 union {
316 void *kid2; /* second operand */
317 INT num; /* could be a number */
318 size_t parenIndex; /* or a parenthesis index */
319 struct { /* or a quantifier range */
320 UINT min;
321 UINT max;
322 JSPackedBool greedy;
323 } range;
324 struct { /* or a character class */
325 size_t startIndex;
326 size_t kidlen; /* length of string at kid, in jschars */
327 size_t index; /* index into class list */
328 WORD bmsize; /* bitmap size, based on max char code */
329 JSPackedBool sense;
330 } ucclass;
331 struct { /* or a literal sequence */
332 WCHAR chr; /* of one character */
333 size_t length; /* or many (via the kid) */
334 } flat;
335 struct {
336 RENode *kid2; /* second operand from ALT */
337 WCHAR ch1; /* match char for ALTPREREQ */
338 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
339 } altprereq;
340 } u;
343 #define CLASS_CACHE_SIZE 4
345 typedef struct CompilerState {
346 script_ctx_t *context;
347 const WCHAR *cpbegin;
348 const WCHAR *cpend;
349 const WCHAR *cp;
350 size_t parenCount;
351 size_t classCount; /* number of [] encountered */
352 size_t treeDepth; /* maximum depth of parse tree */
353 size_t progLength; /* estimated bytecode length */
354 RENode *result;
355 size_t classBitmapsMem; /* memory to hold all class bitmaps */
356 struct {
357 const WCHAR *start; /* small cache of class strings */
358 size_t length; /* since they're often the same */
359 size_t index;
360 } classCache[CLASS_CACHE_SIZE];
361 WORD flags;
362 } CompilerState;
364 typedef struct EmitStateStackEntry {
365 jsbytecode *altHead; /* start of REOP_ALT* opcode */
366 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
367 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
368 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
369 RENode *continueNode; /* original REOP_ALT* node being stacked */
370 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
371 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
372 avoid 16-bit unsigned offset overflow */
373 } EmitStateStackEntry;
376 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
377 * the getters and setters take the pc of the offset, not of the opcode before
378 * the offset.
380 #define ARG_LEN 2
381 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
382 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
383 (pc)[1] = (jsbytecode) (arg))
385 #define OFFSET_LEN ARG_LEN
386 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
387 #define GET_OFFSET(pc) GET_ARG(pc)
389 static BOOL ParseRegExp(CompilerState*);
392 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
393 * For sanity, we limit it to 2^24 bytes.
395 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
398 * The maximum memory that can be allocated for class bitmaps.
399 * For sanity, we limit it to 2^24 bytes.
401 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
404 * Functions to get size and write/read bytecode that represent small indexes
405 * compactly.
406 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
407 * indicates that the following byte brings more bits to the index. Otherwise
408 * this is the last byte in the index bytecode representing highest index bits.
410 static size_t
411 GetCompactIndexWidth(size_t index)
413 size_t width;
415 for (width = 1; (index >>= 7) != 0; ++width) { }
416 return width;
419 static inline jsbytecode *
420 WriteCompactIndex(jsbytecode *pc, size_t index)
422 size_t next;
424 while ((next = index >> 7) != 0) {
425 *pc++ = (jsbytecode)(index | 0x80);
426 index = next;
428 *pc++ = (jsbytecode)index;
429 return pc;
432 static inline jsbytecode *
433 ReadCompactIndex(jsbytecode *pc, size_t *result)
435 size_t nextByte;
437 nextByte = *pc++;
438 if ((nextByte & 0x80) == 0) {
440 * Short-circuit the most common case when compact index <= 127.
442 *result = nextByte;
443 } else {
444 size_t shift = 7;
445 *result = 0x7F & nextByte;
446 do {
447 nextByte = *pc++;
448 *result |= (nextByte & 0x7F) << shift;
449 shift += 7;
450 } while ((nextByte & 0x80) != 0);
452 return pc;
455 /* Construct and initialize an RENode, returning NULL for out-of-memory */
456 static RENode *
457 NewRENode(CompilerState *state, REOp op)
459 RENode *ren;
461 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
462 if (!ren) {
463 /* js_ReportOutOfScriptQuota(cx); */
464 return NULL;
466 ren->op = op;
467 ren->next = NULL;
468 ren->kid = NULL;
469 return ren;
473 * Validates and converts hex ascii value.
475 static BOOL
476 isASCIIHexDigit(WCHAR c, UINT *digit)
478 UINT cv = c;
480 if (cv < '0')
481 return FALSE;
482 if (cv <= '9') {
483 *digit = cv - '0';
484 return TRUE;
486 cv |= 0x20;
487 if (cv >= 'a' && cv <= 'f') {
488 *digit = cv - 'a' + 10;
489 return TRUE;
491 return FALSE;
494 typedef struct {
495 REOp op;
496 const WCHAR *errPos;
497 size_t parenIndex;
498 } REOpData;
500 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
501 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
503 static BOOL
504 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
506 ptrdiff_t offset = target - jump;
508 /* Check that target really points forward. */
509 assert(offset >= 2);
510 if ((size_t)offset > OFFSET_MAX)
511 return FALSE;
513 jump[0] = JUMP_OFFSET_HI(offset);
514 jump[1] = JUMP_OFFSET_LO(offset);
515 return TRUE;
519 * Generate bytecode for the tree rooted at t using an explicit stack instead
520 * of recursion.
522 static jsbytecode *
523 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
524 jsbytecode *pc, RENode *t)
526 EmitStateStackEntry *emitStateSP, *emitStateStack;
527 RECharSet *charSet;
528 REOp op;
530 if (treeDepth == 0) {
531 emitStateStack = NULL;
532 } else {
533 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
534 if (!emitStateStack)
535 return NULL;
537 emitStateSP = emitStateStack;
538 op = t->op;
539 assert(op < REOP_LIMIT);
541 for (;;) {
542 *pc++ = op;
543 switch (op) {
544 case REOP_EMPTY:
545 --pc;
546 break;
548 case REOP_ALTPREREQ2:
549 case REOP_ALTPREREQ:
550 assert(emitStateSP);
551 emitStateSP->altHead = pc - 1;
552 emitStateSP->endTermFixup = pc;
553 pc += OFFSET_LEN;
554 SET_ARG(pc, t->u.altprereq.ch1);
555 pc += ARG_LEN;
556 SET_ARG(pc, t->u.altprereq.ch2);
557 pc += ARG_LEN;
559 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
560 pc += OFFSET_LEN;
562 emitStateSP->continueNode = t;
563 emitStateSP->continueOp = REOP_JUMP;
564 emitStateSP->jumpToJumpFlag = FALSE;
565 ++emitStateSP;
566 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
567 t = t->kid;
568 op = t->op;
569 assert(op < REOP_LIMIT);
570 continue;
572 case REOP_JUMP:
573 emitStateSP->nextTermFixup = pc; /* offset to following term */
574 pc += OFFSET_LEN;
575 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
576 goto jump_too_big;
577 emitStateSP->continueOp = REOP_ENDALT;
578 ++emitStateSP;
579 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
580 t = t->u.kid2;
581 op = t->op;
582 assert(op < REOP_LIMIT);
583 continue;
585 case REOP_ENDALT:
587 * If we already patched emitStateSP->nextTermFixup to jump to
588 * a nearer jump, to avoid 16-bit immediate offset overflow, we
589 * are done here.
591 if (emitStateSP->jumpToJumpFlag)
592 break;
595 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
596 * REOP_ENDALT is executed only on successful match of the last
597 * alternate in a group.
599 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
600 goto jump_too_big;
601 if (t->op != REOP_ALT) {
602 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
603 goto jump_too_big;
607 * If the program is bigger than the REOP_JUMP offset range, then
608 * we must check for alternates before this one that are part of
609 * the same group, and fix up their jump offsets to target jumps
610 * close enough to fit in a 16-bit unsigned offset immediate.
612 if ((size_t)(pc - re->program) > OFFSET_MAX &&
613 emitStateSP > emitStateStack) {
614 EmitStateStackEntry *esp, *esp2;
615 jsbytecode *alt, *jump;
616 ptrdiff_t span, header;
618 esp2 = emitStateSP;
619 alt = esp2->altHead;
620 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
621 if (esp->continueOp == REOP_ENDALT &&
622 !esp->jumpToJumpFlag &&
623 esp->nextTermFixup + OFFSET_LEN == alt &&
624 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
625 ? esp->endTermFixup
626 : esp->nextTermFixup)) > OFFSET_MAX) {
627 alt = esp->altHead;
628 jump = esp->nextTermFixup;
631 * The span must be 1 less than the distance from
632 * jump offset to jump offset, so we actually jump
633 * to a REOP_JUMP bytecode, not to its offset!
635 for (;;) {
636 assert(jump < esp2->nextTermFixup);
637 span = esp2->nextTermFixup - jump - 1;
638 if ((size_t)span <= OFFSET_MAX)
639 break;
640 do {
641 if (--esp2 == esp)
642 goto jump_too_big;
643 } while (esp2->continueOp != REOP_ENDALT);
646 jump[0] = JUMP_OFFSET_HI(span);
647 jump[1] = JUMP_OFFSET_LO(span);
649 if (esp->continueNode->op != REOP_ALT) {
651 * We must patch the offset at esp->endTermFixup
652 * as well, for the REOP_ALTPREREQ{,2} opcodes.
653 * If we're unlucky and endTermFixup is more than
654 * OFFSET_MAX bytes from its target, we cheat by
655 * jumping 6 bytes to the jump whose offset is at
656 * esp->nextTermFixup, which has the same target.
658 jump = esp->endTermFixup;
659 header = esp->nextTermFixup - jump;
660 span += header;
661 if ((size_t)span > OFFSET_MAX)
662 span = header;
664 jump[0] = JUMP_OFFSET_HI(span);
665 jump[1] = JUMP_OFFSET_LO(span);
668 esp->jumpToJumpFlag = TRUE;
672 break;
674 case REOP_ALT:
675 assert(emitStateSP);
676 emitStateSP->altHead = pc - 1;
677 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
678 pc += OFFSET_LEN;
679 emitStateSP->continueNode = t;
680 emitStateSP->continueOp = REOP_JUMP;
681 emitStateSP->jumpToJumpFlag = FALSE;
682 ++emitStateSP;
683 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
684 t = t->kid;
685 op = t->op;
686 assert(op < REOP_LIMIT);
687 continue;
689 case REOP_FLAT:
691 * Coalesce FLATs if possible and if it would not increase bytecode
692 * beyond preallocated limit. The latter happens only when bytecode
693 * size for coalesced string with offset p and length 2 exceeds 6
694 * bytes preallocated for 2 single char nodes, i.e. when
695 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
696 * GetCompactIndexWidth(p) > 4.
697 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
698 * nodes strictly decreases bytecode size, the check has to be
699 * done only for the first coalescing.
701 if (t->kid &&
702 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
704 while (t->next &&
705 t->next->op == REOP_FLAT &&
706 (WCHAR*)t->kid + t->u.flat.length ==
707 t->next->kid) {
708 t->u.flat.length += t->next->u.flat.length;
709 t->next = t->next->next;
712 if (t->kid && t->u.flat.length > 1) {
713 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
714 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
715 pc = WriteCompactIndex(pc, t->u.flat.length);
716 } else if (t->u.flat.chr < 256) {
717 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
718 *pc++ = (jsbytecode) t->u.flat.chr;
719 } else {
720 pc[-1] = (state->flags & JSREG_FOLD)
721 ? REOP_UCFLAT1i
722 : REOP_UCFLAT1;
723 SET_ARG(pc, t->u.flat.chr);
724 pc += ARG_LEN;
726 break;
728 case REOP_LPAREN:
729 assert(emitStateSP);
730 pc = WriteCompactIndex(pc, t->u.parenIndex);
731 emitStateSP->continueNode = t;
732 emitStateSP->continueOp = REOP_RPAREN;
733 ++emitStateSP;
734 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
735 t = t->kid;
736 op = t->op;
737 continue;
739 case REOP_RPAREN:
740 pc = WriteCompactIndex(pc, t->u.parenIndex);
741 break;
743 case REOP_BACKREF:
744 pc = WriteCompactIndex(pc, t->u.parenIndex);
745 break;
747 case REOP_ASSERT:
748 assert(emitStateSP);
749 emitStateSP->nextTermFixup = pc;
750 pc += OFFSET_LEN;
751 emitStateSP->continueNode = t;
752 emitStateSP->continueOp = REOP_ASSERTTEST;
753 ++emitStateSP;
754 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
755 t = t->kid;
756 op = t->op;
757 continue;
759 case REOP_ASSERTTEST:
760 case REOP_ASSERTNOTTEST:
761 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
762 goto jump_too_big;
763 break;
765 case REOP_ASSERT_NOT:
766 assert(emitStateSP);
767 emitStateSP->nextTermFixup = pc;
768 pc += OFFSET_LEN;
769 emitStateSP->continueNode = t;
770 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
771 ++emitStateSP;
772 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
773 t = t->kid;
774 op = t->op;
775 continue;
777 case REOP_QUANT:
778 assert(emitStateSP);
779 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
780 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
781 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
782 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
783 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
784 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
785 } else {
786 if (!t->u.range.greedy)
787 pc[-1] = REOP_MINIMALQUANT;
788 pc = WriteCompactIndex(pc, t->u.range.min);
790 * Write max + 1 to avoid using size_t(max) + 1 bytes
791 * for (UINT)-1 sentinel.
793 pc = WriteCompactIndex(pc, t->u.range.max + 1);
795 emitStateSP->nextTermFixup = pc;
796 pc += OFFSET_LEN;
797 emitStateSP->continueNode = t;
798 emitStateSP->continueOp = REOP_ENDCHILD;
799 ++emitStateSP;
800 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
801 t = t->kid;
802 op = t->op;
803 continue;
805 case REOP_ENDCHILD:
806 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
807 goto jump_too_big;
808 break;
810 case REOP_CLASS:
811 if (!t->u.ucclass.sense)
812 pc[-1] = REOP_NCLASS;
813 pc = WriteCompactIndex(pc, t->u.ucclass.index);
814 charSet = &re->classList[t->u.ucclass.index];
815 charSet->converted = FALSE;
816 charSet->length = t->u.ucclass.bmsize;
817 charSet->u.src.startIndex = t->u.ucclass.startIndex;
818 charSet->u.src.length = t->u.ucclass.kidlen;
819 charSet->sense = t->u.ucclass.sense;
820 break;
822 default:
823 break;
826 t = t->next;
827 if (t) {
828 op = t->op;
829 } else {
830 if (emitStateSP == emitStateStack)
831 break;
832 --emitStateSP;
833 t = emitStateSP->continueNode;
834 op = (REOp) emitStateSP->continueOp;
838 cleanup:
839 heap_free(emitStateStack);
840 return pc;
842 jump_too_big:
843 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
844 pc = NULL;
845 goto cleanup;
849 * Process the op against the two top operands, reducing them to a single
850 * operand in the penultimate slot. Update progLength and treeDepth.
852 static BOOL
853 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
854 INT operandSP)
856 RENode *result;
858 switch (opData->op) {
859 case REOP_ALT:
860 result = NewRENode(state, REOP_ALT);
861 if (!result)
862 return FALSE;
863 result->kid = operandStack[operandSP - 2];
864 result->u.kid2 = operandStack[operandSP - 1];
865 operandStack[operandSP - 2] = result;
867 if (state->treeDepth == TREE_DEPTH_MAX) {
868 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
869 return FALSE;
871 ++state->treeDepth;
874 * Look at both alternates to see if there's a FLAT or a CLASS at
875 * the start of each. If so, use a prerequisite match.
877 if (((RENode *) result->kid)->op == REOP_FLAT &&
878 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
879 (state->flags & JSREG_FOLD) == 0) {
880 result->op = REOP_ALTPREREQ;
881 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
882 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
883 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
884 JUMP, <end> ... ENDALT */
885 state->progLength += 13;
887 else
888 if (((RENode *) result->kid)->op == REOP_CLASS &&
889 ((RENode *) result->kid)->u.ucclass.index < 256 &&
890 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
891 (state->flags & JSREG_FOLD) == 0) {
892 result->op = REOP_ALTPREREQ2;
893 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
894 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
895 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
896 JUMP, <end> ... ENDALT */
897 state->progLength += 13;
899 else
900 if (((RENode *) result->kid)->op == REOP_FLAT &&
901 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
902 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
903 (state->flags & JSREG_FOLD) == 0) {
904 result->op = REOP_ALTPREREQ2;
905 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
906 result->u.altprereq.ch2 =
907 ((RENode *) result->u.kid2)->u.ucclass.index;
908 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
909 JUMP, <end> ... ENDALT */
910 state->progLength += 13;
912 else {
913 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
914 state->progLength += 7;
916 break;
918 case REOP_CONCAT:
919 result = operandStack[operandSP - 2];
920 while (result->next)
921 result = result->next;
922 result->next = operandStack[operandSP - 1];
923 break;
925 case REOP_ASSERT:
926 case REOP_ASSERT_NOT:
927 case REOP_LPARENNON:
928 case REOP_LPAREN:
929 /* These should have been processed by a close paren. */
930 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
931 opData->errPos);
932 return FALSE;
934 default:;
936 return TRUE;
940 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
941 * its being on the stack, and to propagate errors to its callers.
943 #define JSREG_FIND_PAREN_COUNT 0x8000
944 #define JSREG_FIND_PAREN_ERROR 0x4000
947 * Magic return value from FindParenCount and GetDecimalValue, to indicate
948 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
949 * its findMax parameter is non-null.
951 #define OVERFLOW_VALUE ((UINT)-1)
953 static UINT
954 FindParenCount(CompilerState *state)
956 CompilerState temp;
957 int i;
959 if (state->flags & JSREG_FIND_PAREN_COUNT)
960 return OVERFLOW_VALUE;
963 * Copy state into temp, flag it so we never report an invalid backref,
964 * and reset its members to parse the entire regexp. This is obviously
965 * suboptimal, but GetDecimalValue calls us only if a backref appears to
966 * refer to a forward parenthetical, which is rare.
968 temp = *state;
969 temp.flags |= JSREG_FIND_PAREN_COUNT;
970 temp.cp = temp.cpbegin;
971 temp.parenCount = 0;
972 temp.classCount = 0;
973 temp.progLength = 0;
974 temp.treeDepth = 0;
975 temp.classBitmapsMem = 0;
976 for (i = 0; i < CLASS_CACHE_SIZE; i++)
977 temp.classCache[i].start = NULL;
979 if (!ParseRegExp(&temp)) {
980 state->flags |= JSREG_FIND_PAREN_ERROR;
981 return OVERFLOW_VALUE;
983 return temp.parenCount;
987 * Extract and return a decimal value at state->cp. The initial character c
988 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
989 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
990 * state->flags to discover whether an error occurred under findMax.
992 static UINT
993 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
994 CompilerState *state)
996 UINT value = JS7_UNDEC(c);
997 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
999 /* The following restriction allows simpler overflow checks. */
1000 assert(max <= ((UINT)-1 - 9) / 10);
1001 while (state->cp < state->cpend) {
1002 c = *state->cp;
1003 if (!JS7_ISDEC(c))
1004 break;
1005 value = 10 * value + JS7_UNDEC(c);
1006 if (!overflow && value > max && (!findMax || value > findMax(state)))
1007 overflow = TRUE;
1008 ++state->cp;
1010 return overflow ? OVERFLOW_VALUE : value;
1014 * Calculate the total size of the bitmap required for a class expression.
1016 static BOOL
1017 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1018 const WCHAR *end)
1020 UINT max = 0;
1021 BOOL inRange = FALSE;
1022 WCHAR c, rangeStart = 0;
1023 UINT n, digit, nDigits, i;
1025 target->u.ucclass.bmsize = 0;
1026 target->u.ucclass.sense = TRUE;
1028 if (src == end)
1029 return TRUE;
1031 if (*src == '^') {
1032 ++src;
1033 target->u.ucclass.sense = FALSE;
1036 while (src != end) {
1037 BOOL canStartRange = TRUE;
1038 UINT localMax = 0;
1040 switch (*src) {
1041 case '\\':
1042 ++src;
1043 c = *src++;
1044 switch (c) {
1045 case 'b':
1046 localMax = 0x8;
1047 break;
1048 case 'f':
1049 localMax = 0xC;
1050 break;
1051 case 'n':
1052 localMax = 0xA;
1053 break;
1054 case 'r':
1055 localMax = 0xD;
1056 break;
1057 case 't':
1058 localMax = 0x9;
1059 break;
1060 case 'v':
1061 localMax = 0xB;
1062 break;
1063 case 'c':
1064 if (src < end && RE_IS_LETTER(*src)) {
1065 localMax = (UINT) (*src++) & 0x1F;
1066 } else {
1067 --src;
1068 localMax = '\\';
1070 break;
1071 case 'x':
1072 nDigits = 2;
1073 goto lexHex;
1074 case 'u':
1075 nDigits = 4;
1076 lexHex:
1077 n = 0;
1078 for (i = 0; (i < nDigits) && (src < end); i++) {
1079 c = *src++;
1080 if (!isASCIIHexDigit(c, &digit)) {
1082 * Back off to accepting the original
1083 *'\' as a literal.
1085 src -= i + 1;
1086 n = '\\';
1087 break;
1089 n = (n << 4) | digit;
1091 localMax = n;
1092 break;
1093 case 'd':
1094 canStartRange = FALSE;
1095 if (inRange) {
1096 JS_ReportErrorNumber(state->context,
1097 js_GetErrorMessage, NULL,
1098 JSMSG_BAD_CLASS_RANGE);
1099 return FALSE;
1101 localMax = '9';
1102 break;
1103 case 'D':
1104 case 's':
1105 case 'S':
1106 case 'w':
1107 case 'W':
1108 canStartRange = FALSE;
1109 if (inRange) {
1110 JS_ReportErrorNumber(state->context,
1111 js_GetErrorMessage, NULL,
1112 JSMSG_BAD_CLASS_RANGE);
1113 return FALSE;
1115 max = 65535;
1118 * If this is the start of a range, ensure that it's less than
1119 * the end.
1121 localMax = 0;
1122 break;
1123 case '0':
1124 case '1':
1125 case '2':
1126 case '3':
1127 case '4':
1128 case '5':
1129 case '6':
1130 case '7':
1132 * This is a non-ECMA extension - decimal escapes (in this
1133 * case, octal!) are supposed to be an error inside class
1134 * ranges, but supported here for backwards compatibility.
1137 n = JS7_UNDEC(c);
1138 c = *src;
1139 if ('0' <= c && c <= '7') {
1140 src++;
1141 n = 8 * n + JS7_UNDEC(c);
1142 c = *src;
1143 if ('0' <= c && c <= '7') {
1144 src++;
1145 i = 8 * n + JS7_UNDEC(c);
1146 if (i <= 0377)
1147 n = i;
1148 else
1149 src--;
1152 localMax = n;
1153 break;
1155 default:
1156 localMax = c;
1157 break;
1159 break;
1160 default:
1161 localMax = *src++;
1162 break;
1165 if (inRange) {
1166 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1167 if (rangeStart > localMax) {
1168 JS_ReportErrorNumber(state->context,
1169 js_GetErrorMessage, NULL,
1170 JSMSG_BAD_CLASS_RANGE);
1171 return FALSE;
1173 inRange = FALSE;
1174 } else {
1175 if (canStartRange && src < end - 1) {
1176 if (*src == '-') {
1177 ++src;
1178 inRange = TRUE;
1179 rangeStart = (WCHAR)localMax;
1180 continue;
1183 if (state->flags & JSREG_FOLD)
1184 rangeStart = localMax; /* one run of the uc/dc loop below */
1187 if (state->flags & JSREG_FOLD) {
1188 WCHAR maxch = localMax;
1190 for (i = rangeStart; i <= localMax; i++) {
1191 WCHAR uch, dch;
1193 uch = toupperW(i);
1194 dch = tolowerW(i);
1195 if(maxch < uch)
1196 maxch = uch;
1197 if(maxch < dch)
1198 maxch = dch;
1200 localMax = maxch;
1203 if (localMax > max)
1204 max = localMax;
1206 target->u.ucclass.bmsize = max;
1207 return TRUE;
1210 static INT
1211 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1213 UINT min, max;
1214 WCHAR c;
1215 const WCHAR *errp = state->cp++;
1217 c = *state->cp;
1218 if (JS7_ISDEC(c)) {
1219 ++state->cp;
1220 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1221 c = *state->cp;
1223 if (!ignoreValues && min == OVERFLOW_VALUE)
1224 return JSMSG_MIN_TOO_BIG;
1226 if (c == ',') {
1227 c = *++state->cp;
1228 if (JS7_ISDEC(c)) {
1229 ++state->cp;
1230 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1231 c = *state->cp;
1232 if (!ignoreValues && max == OVERFLOW_VALUE)
1233 return JSMSG_MAX_TOO_BIG;
1234 if (!ignoreValues && min > max)
1235 return JSMSG_OUT_OF_ORDER;
1236 } else {
1237 max = (UINT)-1;
1239 } else {
1240 max = min;
1242 if (c == '}') {
1243 state->result = NewRENode(state, REOP_QUANT);
1244 if (!state->result)
1245 return JSMSG_OUT_OF_MEMORY;
1246 state->result->u.range.min = min;
1247 state->result->u.range.max = max;
1249 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1250 * where <max> is written as compact(max+1) to make
1251 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1253 state->progLength += (1 + GetCompactIndexWidth(min)
1254 + GetCompactIndexWidth(max + 1)
1255 +3);
1256 return 0;
1260 state->cp = errp;
1261 return -1;
1264 static BOOL
1265 ParseQuantifier(CompilerState *state)
1267 RENode *term;
1268 term = state->result;
1269 if (state->cp < state->cpend) {
1270 switch (*state->cp) {
1271 case '+':
1272 state->result = NewRENode(state, REOP_QUANT);
1273 if (!state->result)
1274 return FALSE;
1275 state->result->u.range.min = 1;
1276 state->result->u.range.max = (UINT)-1;
1277 /* <PLUS>, <next> ... <ENDCHILD> */
1278 state->progLength += 4;
1279 goto quantifier;
1280 case '*':
1281 state->result = NewRENode(state, REOP_QUANT);
1282 if (!state->result)
1283 return FALSE;
1284 state->result->u.range.min = 0;
1285 state->result->u.range.max = (UINT)-1;
1286 /* <STAR>, <next> ... <ENDCHILD> */
1287 state->progLength += 4;
1288 goto quantifier;
1289 case '?':
1290 state->result = NewRENode(state, REOP_QUANT);
1291 if (!state->result)
1292 return FALSE;
1293 state->result->u.range.min = 0;
1294 state->result->u.range.max = 1;
1295 /* <OPT>, <next> ... <ENDCHILD> */
1296 state->progLength += 4;
1297 goto quantifier;
1298 case '{': /* balance '}' */
1300 INT err;
1302 err = ParseMinMaxQuantifier(state, FALSE);
1303 if (err == 0)
1304 goto quantifier;
1305 if (err == -1)
1306 return TRUE;
1308 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1309 return FALSE;
1311 default:;
1314 return TRUE;
1316 quantifier:
1317 if (state->treeDepth == TREE_DEPTH_MAX) {
1318 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1319 return FALSE;
1322 ++state->treeDepth;
1323 ++state->cp;
1324 state->result->kid = term;
1325 if (state->cp < state->cpend && *state->cp == '?') {
1326 ++state->cp;
1327 state->result->u.range.greedy = FALSE;
1328 } else {
1329 state->result->u.range.greedy = TRUE;
1331 return TRUE;
1335 * item: assertion An item is either an assertion or
1336 * quantatom a quantified atom.
1338 * assertion: '^' Assertions match beginning of string
1339 * (or line if the class static property
1340 * RegExp.multiline is true).
1341 * '$' End of string (or line if the class
1342 * static property RegExp.multiline is
1343 * true).
1344 * '\b' Word boundary (between \w and \W).
1345 * '\B' Word non-boundary.
1347 * quantatom: atom An unquantified atom.
1348 * quantatom '{' n ',' m '}'
1349 * Atom must occur between n and m times.
1350 * quantatom '{' n ',' '}' Atom must occur at least n times.
1351 * quantatom '{' n '}' Atom must occur exactly n times.
1352 * quantatom '*' Zero or more times (same as {0,}).
1353 * quantatom '+' One or more times (same as {1,}).
1354 * quantatom '?' Zero or one time (same as {0,1}).
1356 * any of which can be optionally followed by '?' for ungreedy
1358 * atom: '(' regexp ')' A parenthesized regexp (what matched
1359 * can be addressed using a backreference,
1360 * see '\' n below).
1361 * '.' Matches any char except '\n'.
1362 * '[' classlist ']' A character class.
1363 * '[' '^' classlist ']' A negated character class.
1364 * '\f' Form Feed.
1365 * '\n' Newline (Line Feed).
1366 * '\r' Carriage Return.
1367 * '\t' Horizontal Tab.
1368 * '\v' Vertical Tab.
1369 * '\d' A digit (same as [0-9]).
1370 * '\D' A non-digit.
1371 * '\w' A word character, [0-9a-z_A-Z].
1372 * '\W' A non-word character.
1373 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1374 * '\S' A non-whitespace character.
1375 * '\' n A backreference to the nth (n decimal
1376 * and positive) parenthesized expression.
1377 * '\' octal An octal escape sequence (octal must be
1378 * two or three digits long, unless it is
1379 * 0 for the null character).
1380 * '\x' hex A hex escape (hex must be two digits).
1381 * '\u' unicode A unicode escape (must be four digits).
1382 * '\c' ctrl A control character, ctrl is a letter.
1383 * '\' literalatomchar Any character except one of the above
1384 * that follow '\' in an atom.
1385 * otheratomchar Any character not first among the other
1386 * atom right-hand sides.
1388 static BOOL
1389 ParseTerm(CompilerState *state)
1391 WCHAR c = *state->cp++;
1392 UINT nDigits;
1393 UINT num, tmp, n, i;
1394 const WCHAR *termStart;
1396 switch (c) {
1397 /* assertions and atoms */
1398 case '^':
1399 state->result = NewRENode(state, REOP_BOL);
1400 if (!state->result)
1401 return FALSE;
1402 state->progLength++;
1403 return TRUE;
1404 case '$':
1405 state->result = NewRENode(state, REOP_EOL);
1406 if (!state->result)
1407 return FALSE;
1408 state->progLength++;
1409 return TRUE;
1410 case '\\':
1411 if (state->cp >= state->cpend) {
1412 /* a trailing '\' is an error */
1413 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1414 return FALSE;
1416 c = *state->cp++;
1417 switch (c) {
1418 /* assertion escapes */
1419 case 'b' :
1420 state->result = NewRENode(state, REOP_WBDRY);
1421 if (!state->result)
1422 return FALSE;
1423 state->progLength++;
1424 return TRUE;
1425 case 'B':
1426 state->result = NewRENode(state, REOP_WNONBDRY);
1427 if (!state->result)
1428 return FALSE;
1429 state->progLength++;
1430 return TRUE;
1431 /* Decimal escape */
1432 case '0':
1433 /* Give a strict warning. See also the note below. */
1434 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1435 doOctal:
1436 num = 0;
1437 while (state->cp < state->cpend) {
1438 c = *state->cp;
1439 if (c < '0' || '7' < c)
1440 break;
1441 state->cp++;
1442 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1443 if (tmp > 0377)
1444 break;
1445 num = tmp;
1447 c = (WCHAR)num;
1448 doFlat:
1449 state->result = NewRENode(state, REOP_FLAT);
1450 if (!state->result)
1451 return FALSE;
1452 state->result->u.flat.chr = c;
1453 state->result->u.flat.length = 1;
1454 state->progLength += 3;
1455 break;
1456 case '1':
1457 case '2':
1458 case '3':
1459 case '4':
1460 case '5':
1461 case '6':
1462 case '7':
1463 case '8':
1464 case '9':
1465 termStart = state->cp - 1;
1466 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1467 if (state->flags & JSREG_FIND_PAREN_ERROR)
1468 return FALSE;
1469 if (num == OVERFLOW_VALUE) {
1470 /* Give a strict mode warning. */
1471 WARN("back-reference exceeds number of capturing parentheses\n");
1474 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1475 * error here. However, for compatibility with IE, we treat the
1476 * whole backref as flat if the first character in it is not a
1477 * valid octal character, and as an octal escape otherwise.
1479 state->cp = termStart;
1480 if (c >= '8') {
1481 /* Treat this as flat. termStart - 1 is the \. */
1482 c = '\\';
1483 goto asFlat;
1486 /* Treat this as an octal escape. */
1487 goto doOctal;
1489 assert(1 <= num && num <= 0x10000);
1490 state->result = NewRENode(state, REOP_BACKREF);
1491 if (!state->result)
1492 return FALSE;
1493 state->result->u.parenIndex = num - 1;
1494 state->progLength
1495 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1496 break;
1497 /* Control escape */
1498 case 'f':
1499 c = 0xC;
1500 goto doFlat;
1501 case 'n':
1502 c = 0xA;
1503 goto doFlat;
1504 case 'r':
1505 c = 0xD;
1506 goto doFlat;
1507 case 't':
1508 c = 0x9;
1509 goto doFlat;
1510 case 'v':
1511 c = 0xB;
1512 goto doFlat;
1513 /* Control letter */
1514 case 'c':
1515 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1516 c = (WCHAR) (*state->cp++ & 0x1F);
1517 } else {
1518 /* back off to accepting the original '\' as a literal */
1519 --state->cp;
1520 c = '\\';
1522 goto doFlat;
1523 /* HexEscapeSequence */
1524 case 'x':
1525 nDigits = 2;
1526 goto lexHex;
1527 /* UnicodeEscapeSequence */
1528 case 'u':
1529 nDigits = 4;
1530 lexHex:
1531 n = 0;
1532 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1533 UINT digit;
1534 c = *state->cp++;
1535 if (!isASCIIHexDigit(c, &digit)) {
1537 * Back off to accepting the original 'u' or 'x' as a
1538 * literal.
1540 state->cp -= i + 2;
1541 n = *state->cp++;
1542 break;
1544 n = (n << 4) | digit;
1546 c = (WCHAR) n;
1547 goto doFlat;
1548 /* Character class escapes */
1549 case 'd':
1550 state->result = NewRENode(state, REOP_DIGIT);
1551 doSimple:
1552 if (!state->result)
1553 return FALSE;
1554 state->progLength++;
1555 break;
1556 case 'D':
1557 state->result = NewRENode(state, REOP_NONDIGIT);
1558 goto doSimple;
1559 case 's':
1560 state->result = NewRENode(state, REOP_SPACE);
1561 goto doSimple;
1562 case 'S':
1563 state->result = NewRENode(state, REOP_NONSPACE);
1564 goto doSimple;
1565 case 'w':
1566 state->result = NewRENode(state, REOP_ALNUM);
1567 goto doSimple;
1568 case 'W':
1569 state->result = NewRENode(state, REOP_NONALNUM);
1570 goto doSimple;
1571 /* IdentityEscape */
1572 default:
1573 state->result = NewRENode(state, REOP_FLAT);
1574 if (!state->result)
1575 return FALSE;
1576 state->result->u.flat.chr = c;
1577 state->result->u.flat.length = 1;
1578 state->result->kid = (void *) (state->cp - 1);
1579 state->progLength += 3;
1580 break;
1582 break;
1583 case '[':
1584 state->result = NewRENode(state, REOP_CLASS);
1585 if (!state->result)
1586 return FALSE;
1587 termStart = state->cp;
1588 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1589 for (;;) {
1590 if (state->cp == state->cpend) {
1591 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1592 JSMSG_UNTERM_CLASS, termStart);
1594 return FALSE;
1596 if (*state->cp == '\\') {
1597 state->cp++;
1598 if (state->cp != state->cpend)
1599 state->cp++;
1600 continue;
1602 if (*state->cp == ']') {
1603 state->result->u.ucclass.kidlen = state->cp - termStart;
1604 break;
1606 state->cp++;
1608 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1609 if (!state->classCache[i].start) {
1610 state->classCache[i].start = termStart;
1611 state->classCache[i].length = state->result->u.ucclass.kidlen;
1612 state->classCache[i].index = state->classCount;
1613 break;
1615 if (state->classCache[i].length ==
1616 state->result->u.ucclass.kidlen) {
1617 for (n = 0; ; n++) {
1618 if (n == state->classCache[i].length) {
1619 state->result->u.ucclass.index
1620 = state->classCache[i].index;
1621 goto claim;
1623 if (state->classCache[i].start[n] != termStart[n])
1624 break;
1628 state->result->u.ucclass.index = state->classCount++;
1630 claim:
1632 * Call CalculateBitmapSize now as we want any errors it finds
1633 * to be reported during the parse phase, not at execution.
1635 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1636 return FALSE;
1638 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1639 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1640 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1642 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1643 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1644 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1645 return FALSE;
1647 state->classBitmapsMem += n;
1648 /* CLASS, <index> */
1649 state->progLength
1650 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1651 break;
1653 case '.':
1654 state->result = NewRENode(state, REOP_DOT);
1655 goto doSimple;
1657 case '{':
1659 const WCHAR *errp = state->cp--;
1660 INT err;
1662 err = ParseMinMaxQuantifier(state, TRUE);
1663 state->cp = errp;
1665 if (err < 0)
1666 goto asFlat;
1668 /* FALL THROUGH */
1670 case '*':
1671 case '+':
1672 case '?':
1673 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1674 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1675 return FALSE;
1676 default:
1677 asFlat:
1678 state->result = NewRENode(state, REOP_FLAT);
1679 if (!state->result)
1680 return FALSE;
1681 state->result->u.flat.chr = c;
1682 state->result->u.flat.length = 1;
1683 state->result->kid = (void *) (state->cp - 1);
1684 state->progLength += 3;
1685 break;
1687 return ParseQuantifier(state);
1691 * Top-down regular expression grammar, based closely on Perl4.
1693 * regexp: altern A regular expression is one or more
1694 * altern '|' regexp alternatives separated by vertical bar.
1696 #define INITIAL_STACK_SIZE 128
1698 static BOOL
1699 ParseRegExp(CompilerState *state)
1701 size_t parenIndex;
1702 RENode *operand;
1703 REOpData *operatorStack;
1704 RENode **operandStack;
1705 REOp op;
1706 INT i;
1707 BOOL result = FALSE;
1709 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1710 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1712 /* Watch out for empty regexp */
1713 if (state->cp == state->cpend) {
1714 state->result = NewRENode(state, REOP_EMPTY);
1715 return (state->result != NULL);
1718 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1719 if (!operatorStack)
1720 return FALSE;
1722 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1723 if (!operandStack)
1724 goto out;
1726 for (;;) {
1727 parenIndex = state->parenCount;
1728 if (state->cp == state->cpend) {
1730 * If we are at the end of the regexp and we're short one or more
1731 * operands, the regexp must have the form /x|/ or some such, with
1732 * left parentheses making us short more than one operand.
1734 if (operatorSP >= operandSP) {
1735 operand = NewRENode(state, REOP_EMPTY);
1736 if (!operand)
1737 goto out;
1738 goto pushOperand;
1740 } else {
1741 switch (*state->cp) {
1742 case '(':
1743 ++state->cp;
1744 if (state->cp + 1 < state->cpend &&
1745 *state->cp == '?' &&
1746 (state->cp[1] == '=' ||
1747 state->cp[1] == '!' ||
1748 state->cp[1] == ':')) {
1749 switch (state->cp[1]) {
1750 case '=':
1751 op = REOP_ASSERT;
1752 /* ASSERT, <next>, ... ASSERTTEST */
1753 state->progLength += 4;
1754 break;
1755 case '!':
1756 op = REOP_ASSERT_NOT;
1757 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1758 state->progLength += 4;
1759 break;
1760 default:
1761 op = REOP_LPARENNON;
1762 break;
1764 state->cp += 2;
1765 } else {
1766 op = REOP_LPAREN;
1767 /* LPAREN, <index>, ... RPAREN, <index> */
1768 state->progLength
1769 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1770 state->parenCount++;
1771 if (state->parenCount == 65535) {
1772 ReportRegExpError(state, JSREPORT_ERROR,
1773 JSMSG_TOO_MANY_PARENS);
1774 goto out;
1777 goto pushOperator;
1779 case ')':
1781 * If there's no stacked open parenthesis, throw syntax error.
1783 for (i = operatorSP - 1; ; i--) {
1784 if (i < 0) {
1785 ReportRegExpError(state, JSREPORT_ERROR,
1786 JSMSG_UNMATCHED_RIGHT_PAREN);
1787 goto out;
1789 if (operatorStack[i].op == REOP_ASSERT ||
1790 operatorStack[i].op == REOP_ASSERT_NOT ||
1791 operatorStack[i].op == REOP_LPARENNON ||
1792 operatorStack[i].op == REOP_LPAREN) {
1793 break;
1796 /* FALL THROUGH */
1798 case '|':
1799 /* Expected an operand before these, so make an empty one */
1800 operand = NewRENode(state, REOP_EMPTY);
1801 if (!operand)
1802 goto out;
1803 goto pushOperand;
1805 default:
1806 if (!ParseTerm(state))
1807 goto out;
1808 operand = state->result;
1809 pushOperand:
1810 if (operandSP == operandStackSize) {
1811 RENode **tmp;
1812 operandStackSize += operandStackSize;
1813 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1814 if (!tmp)
1815 goto out;
1816 operandStack = tmp;
1818 operandStack[operandSP++] = operand;
1819 break;
1823 /* At the end; process remaining operators. */
1824 restartOperator:
1825 if (state->cp == state->cpend) {
1826 while (operatorSP) {
1827 --operatorSP;
1828 if (!ProcessOp(state, &operatorStack[operatorSP],
1829 operandStack, operandSP))
1830 goto out;
1831 --operandSP;
1833 assert(operandSP == 1);
1834 state->result = operandStack[0];
1835 result = TRUE;
1836 goto out;
1839 switch (*state->cp) {
1840 case '|':
1841 /* Process any stacked 'concat' operators */
1842 ++state->cp;
1843 while (operatorSP &&
1844 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1845 --operatorSP;
1846 if (!ProcessOp(state, &operatorStack[operatorSP],
1847 operandStack, operandSP)) {
1848 goto out;
1850 --operandSP;
1852 op = REOP_ALT;
1853 goto pushOperator;
1855 case ')':
1857 * If there's no stacked open parenthesis, throw syntax error.
1859 for (i = operatorSP - 1; ; i--) {
1860 if (i < 0) {
1861 ReportRegExpError(state, JSREPORT_ERROR,
1862 JSMSG_UNMATCHED_RIGHT_PAREN);
1863 goto out;
1865 if (operatorStack[i].op == REOP_ASSERT ||
1866 operatorStack[i].op == REOP_ASSERT_NOT ||
1867 operatorStack[i].op == REOP_LPARENNON ||
1868 operatorStack[i].op == REOP_LPAREN) {
1869 break;
1872 ++state->cp;
1874 /* Process everything on the stack until the open parenthesis. */
1875 for (;;) {
1876 assert(operatorSP);
1877 --operatorSP;
1878 switch (operatorStack[operatorSP].op) {
1879 case REOP_ASSERT:
1880 case REOP_ASSERT_NOT:
1881 case REOP_LPAREN:
1882 operand = NewRENode(state, operatorStack[operatorSP].op);
1883 if (!operand)
1884 goto out;
1885 operand->u.parenIndex =
1886 operatorStack[operatorSP].parenIndex;
1887 assert(operandSP);
1888 operand->kid = operandStack[operandSP - 1];
1889 operandStack[operandSP - 1] = operand;
1890 if (state->treeDepth == TREE_DEPTH_MAX) {
1891 ReportRegExpError(state, JSREPORT_ERROR,
1892 JSMSG_REGEXP_TOO_COMPLEX);
1893 goto out;
1895 ++state->treeDepth;
1896 /* FALL THROUGH */
1898 case REOP_LPARENNON:
1899 state->result = operandStack[operandSP - 1];
1900 if (!ParseQuantifier(state))
1901 goto out;
1902 operandStack[operandSP - 1] = state->result;
1903 goto restartOperator;
1904 default:
1905 if (!ProcessOp(state, &operatorStack[operatorSP],
1906 operandStack, operandSP))
1907 goto out;
1908 --operandSP;
1909 break;
1912 break;
1914 case '{':
1916 const WCHAR *errp = state->cp;
1918 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1920 * This didn't even scan correctly as a quantifier, so we should
1921 * treat it as flat.
1923 op = REOP_CONCAT;
1924 goto pushOperator;
1927 state->cp = errp;
1928 /* FALL THROUGH */
1931 case '+':
1932 case '*':
1933 case '?':
1934 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1935 state->cp);
1936 result = FALSE;
1937 goto out;
1939 default:
1940 /* Anything else is the start of the next term. */
1941 op = REOP_CONCAT;
1942 pushOperator:
1943 if (operatorSP == operatorStackSize) {
1944 REOpData *tmp;
1945 operatorStackSize += operatorStackSize;
1946 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1947 if (!tmp)
1948 goto out;
1949 operatorStack = tmp;
1951 operatorStack[operatorSP].op = op;
1952 operatorStack[operatorSP].errPos = state->cp;
1953 operatorStack[operatorSP++].parenIndex = parenIndex;
1954 break;
1957 out:
1958 heap_free(operatorStack);
1959 heap_free(operandStack);
1960 return result;
1964 * Save the current state of the match - the position in the input
1965 * text as well as the position in the bytecode. The state of any
1966 * parent expressions is also saved (preceding state).
1967 * Contents of parenCount parentheses from parenIndex are also saved.
1969 static REBackTrackData *
1970 PushBackTrackState(REGlobalData *gData, REOp op,
1971 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1972 size_t parenIndex, size_t parenCount)
1974 size_t i;
1975 REBackTrackData *result =
1976 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1978 size_t sz = sizeof(REBackTrackData) +
1979 gData->stateStackTop * sizeof(REProgState) +
1980 parenCount * sizeof(RECapture);
1982 ptrdiff_t btsize = gData->backTrackStackSize;
1983 ptrdiff_t btincr = ((char *)result + sz) -
1984 ((char *)gData->backTrackStack + btsize);
1986 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1988 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1989 if (btincr > 0) {
1990 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1992 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1993 btincr = ((btincr+btsize-1)/btsize)*btsize;
1994 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1995 if (!gData->backTrackStack) {
1996 js_ReportOutOfScriptQuota(gData->cx);
1997 gData->ok = FALSE;
1998 return NULL;
2000 gData->backTrackStackSize = btsize + btincr;
2001 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2003 gData->backTrackSP = result;
2004 result->sz = gData->cursz;
2005 gData->cursz = sz;
2007 result->backtrack_op = op;
2008 result->backtrack_pc = target;
2009 result->cp = cp;
2010 result->parenCount = parenCount;
2011 result->parenIndex = parenIndex;
2013 result->saveStateStackTop = gData->stateStackTop;
2014 assert(gData->stateStackTop);
2015 memcpy(result + 1, gData->stateStack,
2016 sizeof(REProgState) * result->saveStateStackTop);
2018 if (parenCount != 0) {
2019 memcpy((char *)(result + 1) +
2020 sizeof(REProgState) * result->saveStateStackTop,
2021 &x->parens[parenIndex],
2022 sizeof(RECapture) * parenCount);
2023 for (i = 0; i != parenCount; i++)
2024 x->parens[parenIndex + i].index = -1;
2027 return result;
2030 static inline REMatchState *
2031 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2032 size_t length)
2034 size_t i;
2035 assert(gData->cpend >= x->cp);
2036 if (length > (size_t)(gData->cpend - x->cp))
2037 return NULL;
2038 for (i = 0; i != length; i++) {
2039 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2040 return NULL;
2042 x->cp += length;
2043 return x;
2047 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2048 * 2. If E is not a character then go to step 6.
2049 * 3. Let ch be E's character.
2050 * 4. Let A be a one-element RECharSet containing the character ch.
2051 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2052 * 6. E must be an integer. Let n be that integer.
2053 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2054 * 8. Return an internal Matcher closure that takes two arguments, a State x
2055 * and a Continuation c, and performs the following:
2056 * 1. Let cap be x's captures internal array.
2057 * 2. Let s be cap[n].
2058 * 3. If s is undefined, then call c(x) and return its result.
2059 * 4. Let e be x's endIndex.
2060 * 5. Let len be s's length.
2061 * 6. Let f be e+len.
2062 * 7. If f>InputLength, return failure.
2063 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2064 * such that Canonicalize(s[i]) is not the same character as
2065 * Canonicalize(Input [e+i]), then return failure.
2066 * 9. Let y be the State (f, cap).
2067 * 10. Call c(y) and return its result.
2069 static REMatchState *
2070 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2072 size_t len, i;
2073 const WCHAR *parenContent;
2074 RECapture *cap = &x->parens[parenIndex];
2076 if (cap->index == -1)
2077 return x;
2079 len = cap->length;
2080 if (x->cp + len > gData->cpend)
2081 return NULL;
2083 parenContent = &gData->cpbegin[cap->index];
2084 if (gData->regexp->flags & JSREG_FOLD) {
2085 for (i = 0; i < len; i++) {
2086 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2087 return NULL;
2089 } else {
2090 for (i = 0; i < len; i++) {
2091 if (parenContent[i] != x->cp[i])
2092 return NULL;
2095 x->cp += len;
2096 return x;
2099 /* Add a single character to the RECharSet */
2100 static void
2101 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2103 UINT byteIndex = (UINT)(c >> 3);
2104 assert(c <= cs->length);
2105 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2109 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2110 static void
2111 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2113 UINT i;
2115 UINT byteIndex1 = c1 >> 3;
2116 UINT byteIndex2 = c2 >> 3;
2118 assert(c2 <= cs->length && c1 <= c2);
2120 c1 &= 0x7;
2121 c2 &= 0x7;
2123 if (byteIndex1 == byteIndex2) {
2124 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2125 } else {
2126 cs->u.bits[byteIndex1] |= 0xFF << c1;
2127 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2128 cs->u.bits[i] = 0xFF;
2129 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2133 /* Compile the source of the class into a RECharSet */
2134 static BOOL
2135 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2137 const WCHAR *src, *end;
2138 BOOL inRange = FALSE;
2139 WCHAR rangeStart = 0;
2140 UINT byteLength, n;
2141 WCHAR c, thisCh;
2142 INT nDigits, i;
2144 assert(!charSet->converted);
2146 * Assert that startIndex and length points to chars inside [] inside
2147 * source string.
2149 assert(1 <= charSet->u.src.startIndex);
2150 assert(charSet->u.src.startIndex
2151 < SysStringLen(gData->regexp->source));
2152 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2153 - 1 - charSet->u.src.startIndex);
2155 charSet->converted = TRUE;
2156 src = gData->regexp->source + charSet->u.src.startIndex;
2158 end = src + charSet->u.src.length;
2160 assert(src[-1] == '[' && end[0] == ']');
2162 byteLength = (charSet->length >> 3) + 1;
2163 charSet->u.bits = heap_alloc(byteLength);
2164 if (!charSet->u.bits) {
2165 JS_ReportOutOfMemory(gData->cx);
2166 gData->ok = FALSE;
2167 return FALSE;
2169 memset(charSet->u.bits, 0, byteLength);
2171 if (src == end)
2172 return TRUE;
2174 if (*src == '^') {
2175 assert(charSet->sense == FALSE);
2176 ++src;
2177 } else {
2178 assert(charSet->sense == TRUE);
2181 while (src != end) {
2182 switch (*src) {
2183 case '\\':
2184 ++src;
2185 c = *src++;
2186 switch (c) {
2187 case 'b':
2188 thisCh = 0x8;
2189 break;
2190 case 'f':
2191 thisCh = 0xC;
2192 break;
2193 case 'n':
2194 thisCh = 0xA;
2195 break;
2196 case 'r':
2197 thisCh = 0xD;
2198 break;
2199 case 't':
2200 thisCh = 0x9;
2201 break;
2202 case 'v':
2203 thisCh = 0xB;
2204 break;
2205 case 'c':
2206 if (src < end && JS_ISWORD(*src)) {
2207 thisCh = (WCHAR)(*src++ & 0x1F);
2208 } else {
2209 --src;
2210 thisCh = '\\';
2212 break;
2213 case 'x':
2214 nDigits = 2;
2215 goto lexHex;
2216 case 'u':
2217 nDigits = 4;
2218 lexHex:
2219 n = 0;
2220 for (i = 0; (i < nDigits) && (src < end); i++) {
2221 UINT digit;
2222 c = *src++;
2223 if (!isASCIIHexDigit(c, &digit)) {
2225 * Back off to accepting the original '\'
2226 * as a literal
2228 src -= i + 1;
2229 n = '\\';
2230 break;
2232 n = (n << 4) | digit;
2234 thisCh = (WCHAR)n;
2235 break;
2236 case '0':
2237 case '1':
2238 case '2':
2239 case '3':
2240 case '4':
2241 case '5':
2242 case '6':
2243 case '7':
2245 * This is a non-ECMA extension - decimal escapes (in this
2246 * case, octal!) are supposed to be an error inside class
2247 * ranges, but supported here for backwards compatibility.
2249 n = JS7_UNDEC(c);
2250 c = *src;
2251 if ('0' <= c && c <= '7') {
2252 src++;
2253 n = 8 * n + JS7_UNDEC(c);
2254 c = *src;
2255 if ('0' <= c && c <= '7') {
2256 src++;
2257 i = 8 * n + JS7_UNDEC(c);
2258 if (i <= 0377)
2259 n = i;
2260 else
2261 src--;
2264 thisCh = (WCHAR)n;
2265 break;
2267 case 'd':
2268 AddCharacterRangeToCharSet(charSet, '0', '9');
2269 continue; /* don't need range processing */
2270 case 'D':
2271 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2272 AddCharacterRangeToCharSet(charSet,
2273 (WCHAR)('9' + 1),
2274 (WCHAR)charSet->length);
2275 continue;
2276 case 's':
2277 for (i = (INT)charSet->length; i >= 0; i--)
2278 if (isspaceW(i))
2279 AddCharacterToCharSet(charSet, (WCHAR)i);
2280 continue;
2281 case 'S':
2282 for (i = (INT)charSet->length; i >= 0; i--)
2283 if (!isspaceW(i))
2284 AddCharacterToCharSet(charSet, (WCHAR)i);
2285 continue;
2286 case 'w':
2287 for (i = (INT)charSet->length; i >= 0; i--)
2288 if (JS_ISWORD(i))
2289 AddCharacterToCharSet(charSet, (WCHAR)i);
2290 continue;
2291 case 'W':
2292 for (i = (INT)charSet->length; i >= 0; i--)
2293 if (!JS_ISWORD(i))
2294 AddCharacterToCharSet(charSet, (WCHAR)i);
2295 continue;
2296 default:
2297 thisCh = c;
2298 break;
2301 break;
2303 default:
2304 thisCh = *src++;
2305 break;
2308 if (inRange) {
2309 if (gData->regexp->flags & JSREG_FOLD) {
2310 int i;
2312 assert(rangeStart <= thisCh);
2313 for (i = rangeStart; i <= thisCh; i++) {
2314 WCHAR uch, dch;
2316 AddCharacterToCharSet(charSet, i);
2317 uch = toupperW(i);
2318 dch = tolowerW(i);
2319 if (i != uch)
2320 AddCharacterToCharSet(charSet, uch);
2321 if (i != dch)
2322 AddCharacterToCharSet(charSet, dch);
2324 } else {
2325 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2327 inRange = FALSE;
2328 } else {
2329 if (gData->regexp->flags & JSREG_FOLD) {
2330 AddCharacterToCharSet(charSet, toupperW(thisCh));
2331 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2332 } else {
2333 AddCharacterToCharSet(charSet, thisCh);
2335 if (src < end - 1) {
2336 if (*src == '-') {
2337 ++src;
2338 inRange = TRUE;
2339 rangeStart = thisCh;
2344 return TRUE;
2347 static BOOL
2348 ReallocStateStack(REGlobalData *gData)
2350 size_t limit = gData->stateStackLimit;
2351 size_t sz = sizeof(REProgState) * limit;
2353 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2354 if (!gData->stateStack) {
2355 js_ReportOutOfScriptQuota(gData->cx);
2356 gData->ok = FALSE;
2357 return FALSE;
2359 gData->stateStackLimit = limit + limit;
2360 return TRUE;
2363 #define PUSH_STATE_STACK(data) \
2364 do { \
2365 ++(data)->stateStackTop; \
2366 if ((data)->stateStackTop == (data)->stateStackLimit && \
2367 !ReallocStateStack((data))) { \
2368 return NULL; \
2370 }while(0)
2373 * Apply the current op against the given input to see if it's going to match
2374 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2375 * true, then update the current state's cp. Always update startpc to the next
2376 * op.
2378 static inline REMatchState *
2379 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2380 jsbytecode **startpc, BOOL updatecp)
2382 REMatchState *result = NULL;
2383 WCHAR matchCh;
2384 size_t parenIndex;
2385 size_t offset, length, index;
2386 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2387 WCHAR *source;
2388 const WCHAR *startcp = x->cp;
2389 WCHAR ch;
2390 RECharSet *charSet;
2392 const char *opname = reop_names[op];
2393 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2394 (int)gData->stateStackTop * 2, "", opname);
2396 switch (op) {
2397 case REOP_EMPTY:
2398 result = x;
2399 break;
2400 case REOP_BOL:
2401 if (x->cp != gData->cpbegin) {
2402 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2403 !(gData->regexp->flags & JSREG_MULTILINE)) {
2404 break;
2406 if (!RE_IS_LINE_TERM(x->cp[-1]))
2407 break;
2409 result = x;
2410 break;
2411 case REOP_EOL:
2412 if (x->cp != gData->cpend) {
2413 if (/*!gData->cx->regExpStatics.multiline &&*/
2414 !(gData->regexp->flags & JSREG_MULTILINE)) {
2415 break;
2417 if (!RE_IS_LINE_TERM(*x->cp))
2418 break;
2420 result = x;
2421 break;
2422 case REOP_WBDRY:
2423 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2424 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2425 result = x;
2427 break;
2428 case REOP_WNONBDRY:
2429 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2430 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2431 result = x;
2433 break;
2434 case REOP_DOT:
2435 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2436 result = x;
2437 result->cp++;
2439 break;
2440 case REOP_DIGIT:
2441 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2442 result = x;
2443 result->cp++;
2445 break;
2446 case REOP_NONDIGIT:
2447 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2448 result = x;
2449 result->cp++;
2451 break;
2452 case REOP_ALNUM:
2453 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2454 result = x;
2455 result->cp++;
2457 break;
2458 case REOP_NONALNUM:
2459 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2460 result = x;
2461 result->cp++;
2463 break;
2464 case REOP_SPACE:
2465 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2466 result = x;
2467 result->cp++;
2469 break;
2470 case REOP_NONSPACE:
2471 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2472 result = x;
2473 result->cp++;
2475 break;
2476 case REOP_BACKREF:
2477 pc = ReadCompactIndex(pc, &parenIndex);
2478 assert(parenIndex < gData->regexp->parenCount);
2479 result = BackrefMatcher(gData, x, parenIndex);
2480 break;
2481 case REOP_FLAT:
2482 pc = ReadCompactIndex(pc, &offset);
2483 assert(offset < SysStringLen(gData->regexp->source));
2484 pc = ReadCompactIndex(pc, &length);
2485 assert(1 <= length);
2486 assert(length <= SysStringLen(gData->regexp->source) - offset);
2487 if (length <= (size_t)(gData->cpend - x->cp)) {
2488 source = gData->regexp->source + offset;
2489 TRACE("%s\n", debugstr_wn(source, length));
2490 for (index = 0; index != length; index++) {
2491 if (source[index] != x->cp[index])
2492 return NULL;
2494 x->cp += length;
2495 result = x;
2497 break;
2498 case REOP_FLAT1:
2499 matchCh = *pc++;
2500 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2501 if (x->cp != gData->cpend && *x->cp == matchCh) {
2502 result = x;
2503 result->cp++;
2505 break;
2506 case REOP_FLATi:
2507 pc = ReadCompactIndex(pc, &offset);
2508 assert(offset < SysStringLen(gData->regexp->source));
2509 pc = ReadCompactIndex(pc, &length);
2510 assert(1 <= length);
2511 assert(length <= SysStringLen(gData->regexp->source) - offset);
2512 source = gData->regexp->source;
2513 result = FlatNIMatcher(gData, x, source + offset, length);
2514 break;
2515 case REOP_FLAT1i:
2516 matchCh = *pc++;
2517 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2518 result = x;
2519 result->cp++;
2521 break;
2522 case REOP_UCFLAT1:
2523 matchCh = GET_ARG(pc);
2524 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2525 pc += ARG_LEN;
2526 if (x->cp != gData->cpend && *x->cp == matchCh) {
2527 result = x;
2528 result->cp++;
2530 break;
2531 case REOP_UCFLAT1i:
2532 matchCh = GET_ARG(pc);
2533 pc += ARG_LEN;
2534 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2535 result = x;
2536 result->cp++;
2538 break;
2539 case REOP_CLASS:
2540 pc = ReadCompactIndex(pc, &index);
2541 assert(index < gData->regexp->classCount);
2542 if (x->cp != gData->cpend) {
2543 charSet = &gData->regexp->classList[index];
2544 assert(charSet->converted);
2545 ch = *x->cp;
2546 index = ch >> 3;
2547 if (charSet->length != 0 &&
2548 ch <= charSet->length &&
2549 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2550 result = x;
2551 result->cp++;
2554 break;
2555 case REOP_NCLASS:
2556 pc = ReadCompactIndex(pc, &index);
2557 assert(index < gData->regexp->classCount);
2558 if (x->cp != gData->cpend) {
2559 charSet = &gData->regexp->classList[index];
2560 assert(charSet->converted);
2561 ch = *x->cp;
2562 index = ch >> 3;
2563 if (charSet->length == 0 ||
2564 ch > charSet->length ||
2565 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2566 result = x;
2567 result->cp++;
2570 break;
2572 default:
2573 assert(FALSE);
2575 if (result) {
2576 if (!updatecp)
2577 x->cp = startcp;
2578 *startpc = pc;
2579 TRACE(" *\n");
2580 return result;
2582 x->cp = startcp;
2583 return NULL;
2586 static inline REMatchState *
2587 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2589 REMatchState *result = NULL;
2590 REBackTrackData *backTrackData;
2591 jsbytecode *nextpc, *testpc;
2592 REOp nextop;
2593 RECapture *cap;
2594 REProgState *curState;
2595 const WCHAR *startcp;
2596 size_t parenIndex, k;
2597 size_t parenSoFar = 0;
2599 WCHAR matchCh1, matchCh2;
2600 RECharSet *charSet;
2602 BOOL anchor;
2603 jsbytecode *pc = gData->regexp->program;
2604 REOp op = (REOp) *pc++;
2607 * If the first node is a simple match, step the index into the string
2608 * until that match is made, or fail if it can't be found at all.
2610 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2611 anchor = FALSE;
2612 while (x->cp <= gData->cpend) {
2613 nextpc = pc; /* reset back to start each time */
2614 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2615 if (result) {
2616 anchor = TRUE;
2617 x = result;
2618 pc = nextpc; /* accept skip to next opcode */
2619 op = (REOp) *pc++;
2620 assert(op < REOP_LIMIT);
2621 break;
2623 gData->skipped++;
2624 x->cp++;
2626 if (!anchor)
2627 goto bad;
2630 for (;;) {
2631 const char *opname = reop_names[op];
2632 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2633 (int)gData->stateStackTop * 2, "", opname);
2635 if (REOP_IS_SIMPLE(op)) {
2636 result = SimpleMatch(gData, x, op, &pc, TRUE);
2637 } else {
2638 curState = &gData->stateStack[gData->stateStackTop];
2639 switch (op) {
2640 case REOP_END:
2641 goto good;
2642 case REOP_ALTPREREQ2:
2643 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2644 pc += ARG_LEN;
2645 matchCh2 = GET_ARG(pc);
2646 pc += ARG_LEN;
2647 k = GET_ARG(pc);
2648 pc += ARG_LEN;
2650 if (x->cp != gData->cpend) {
2651 if (*x->cp == matchCh2)
2652 goto doAlt;
2654 charSet = &gData->regexp->classList[k];
2655 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2656 goto bad;
2657 matchCh1 = *x->cp;
2658 k = matchCh1 >> 3;
2659 if ((charSet->length == 0 ||
2660 matchCh1 > charSet->length ||
2661 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2662 charSet->sense) {
2663 goto doAlt;
2666 result = NULL;
2667 break;
2669 case REOP_ALTPREREQ:
2670 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2671 pc += ARG_LEN;
2672 matchCh1 = GET_ARG(pc);
2673 pc += ARG_LEN;
2674 matchCh2 = GET_ARG(pc);
2675 pc += ARG_LEN;
2676 if (x->cp == gData->cpend ||
2677 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2678 result = NULL;
2679 break;
2681 /* else false thru... */
2683 case REOP_ALT:
2684 doAlt:
2685 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2686 pc += ARG_LEN; /* start of this alternate */
2687 curState->parenSoFar = parenSoFar;
2688 PUSH_STATE_STACK(gData);
2689 op = (REOp) *pc++;
2690 startcp = x->cp;
2691 if (REOP_IS_SIMPLE(op)) {
2692 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2693 op = (REOp) *nextpc++;
2694 pc = nextpc;
2695 continue;
2697 result = x;
2698 op = (REOp) *pc++;
2700 nextop = (REOp) *nextpc++;
2701 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2702 goto bad;
2703 continue;
2706 * Occurs at (successful) end of REOP_ALT,
2708 case REOP_JUMP:
2710 * If we have not gotten a result here, it is because of an
2711 * empty match. Do the same thing REOP_EMPTY would do.
2713 if (!result)
2714 result = x;
2716 --gData->stateStackTop;
2717 pc += GET_OFFSET(pc);
2718 op = (REOp) *pc++;
2719 continue;
2722 * Occurs at last (successful) end of REOP_ALT,
2724 case REOP_ENDALT:
2726 * If we have not gotten a result here, it is because of an
2727 * empty match. Do the same thing REOP_EMPTY would do.
2729 if (!result)
2730 result = x;
2732 --gData->stateStackTop;
2733 op = (REOp) *pc++;
2734 continue;
2736 case REOP_LPAREN:
2737 pc = ReadCompactIndex(pc, &parenIndex);
2738 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2739 assert(parenIndex < gData->regexp->parenCount);
2740 if (parenIndex + 1 > parenSoFar)
2741 parenSoFar = parenIndex + 1;
2742 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2743 x->parens[parenIndex].length = 0;
2744 op = (REOp) *pc++;
2745 continue;
2747 case REOP_RPAREN:
2749 ptrdiff_t delta;
2751 pc = ReadCompactIndex(pc, &parenIndex);
2752 assert(parenIndex < gData->regexp->parenCount);
2753 cap = &x->parens[parenIndex];
2754 delta = x->cp - (gData->cpbegin + cap->index);
2755 cap->length = (delta < 0) ? 0 : (size_t) delta;
2756 op = (REOp) *pc++;
2758 if (!result)
2759 result = x;
2760 continue;
2762 case REOP_ASSERT:
2763 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2764 pc += ARG_LEN; /* start of ASSERT child */
2765 op = (REOp) *pc++;
2766 testpc = pc;
2767 if (REOP_IS_SIMPLE(op) &&
2768 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2769 result = NULL;
2770 break;
2772 curState->u.assertion.top =
2773 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2774 curState->u.assertion.sz = gData->cursz;
2775 curState->index = x->cp - gData->cpbegin;
2776 curState->parenSoFar = parenSoFar;
2777 PUSH_STATE_STACK(gData);
2778 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2779 nextpc, x, x->cp, 0, 0)) {
2780 goto bad;
2782 continue;
2784 case REOP_ASSERT_NOT:
2785 nextpc = pc + GET_OFFSET(pc);
2786 pc += ARG_LEN;
2787 op = (REOp) *pc++;
2788 testpc = pc;
2789 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2790 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2791 *testpc == REOP_ASSERTNOTTEST) {
2792 result = NULL;
2793 break;
2795 curState->u.assertion.top
2796 = (char *)gData->backTrackSP -
2797 (char *)gData->backTrackStack;
2798 curState->u.assertion.sz = gData->cursz;
2799 curState->index = x->cp - gData->cpbegin;
2800 curState->parenSoFar = parenSoFar;
2801 PUSH_STATE_STACK(gData);
2802 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2803 nextpc, x, x->cp, 0, 0)) {
2804 goto bad;
2806 continue;
2808 case REOP_ASSERTTEST:
2809 --gData->stateStackTop;
2810 --curState;
2811 x->cp = gData->cpbegin + curState->index;
2812 gData->backTrackSP =
2813 (REBackTrackData *) ((char *)gData->backTrackStack +
2814 curState->u.assertion.top);
2815 gData->cursz = curState->u.assertion.sz;
2816 if (result)
2817 result = x;
2818 break;
2820 case REOP_ASSERTNOTTEST:
2821 --gData->stateStackTop;
2822 --curState;
2823 x->cp = gData->cpbegin + curState->index;
2824 gData->backTrackSP =
2825 (REBackTrackData *) ((char *)gData->backTrackStack +
2826 curState->u.assertion.top);
2827 gData->cursz = curState->u.assertion.sz;
2828 result = (!result) ? x : NULL;
2829 break;
2830 case REOP_STAR:
2831 curState->u.quantifier.min = 0;
2832 curState->u.quantifier.max = (UINT)-1;
2833 goto quantcommon;
2834 case REOP_PLUS:
2835 curState->u.quantifier.min = 1;
2836 curState->u.quantifier.max = (UINT)-1;
2837 goto quantcommon;
2838 case REOP_OPT:
2839 curState->u.quantifier.min = 0;
2840 curState->u.quantifier.max = 1;
2841 goto quantcommon;
2842 case REOP_QUANT:
2843 pc = ReadCompactIndex(pc, &k);
2844 curState->u.quantifier.min = k;
2845 pc = ReadCompactIndex(pc, &k);
2846 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2847 curState->u.quantifier.max = k - 1;
2848 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2849 quantcommon:
2850 if (curState->u.quantifier.max == 0) {
2851 pc = pc + GET_OFFSET(pc);
2852 op = (REOp) *pc++;
2853 result = x;
2854 continue;
2856 /* Step over <next> */
2857 nextpc = pc + ARG_LEN;
2858 op = (REOp) *nextpc++;
2859 startcp = x->cp;
2860 if (REOP_IS_SIMPLE(op)) {
2861 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2862 if (curState->u.quantifier.min == 0)
2863 result = x;
2864 else
2865 result = NULL;
2866 pc = pc + GET_OFFSET(pc);
2867 break;
2869 op = (REOp) *nextpc++;
2870 result = x;
2872 curState->index = startcp - gData->cpbegin;
2873 curState->continue_op = REOP_REPEAT;
2874 curState->continue_pc = pc;
2875 curState->parenSoFar = parenSoFar;
2876 PUSH_STATE_STACK(gData);
2877 if (curState->u.quantifier.min == 0 &&
2878 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2879 0, 0)) {
2880 goto bad;
2882 pc = nextpc;
2883 continue;
2885 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2886 pc = curState[-1].continue_pc;
2887 op = (REOp) curState[-1].continue_op;
2889 if (!result)
2890 result = x;
2891 continue;
2893 case REOP_REPEAT:
2894 --curState;
2895 do {
2896 --gData->stateStackTop;
2897 if (!result) {
2898 /* Failed, see if we have enough children. */
2899 if (curState->u.quantifier.min == 0)
2900 goto repeatDone;
2901 goto break_switch;
2903 if (curState->u.quantifier.min == 0 &&
2904 x->cp == gData->cpbegin + curState->index) {
2905 /* matched an empty string, that'll get us nowhere */
2906 result = NULL;
2907 goto break_switch;
2909 if (curState->u.quantifier.min != 0)
2910 curState->u.quantifier.min--;
2911 if (curState->u.quantifier.max != (UINT) -1)
2912 curState->u.quantifier.max--;
2913 if (curState->u.quantifier.max == 0)
2914 goto repeatDone;
2915 nextpc = pc + ARG_LEN;
2916 nextop = (REOp) *nextpc;
2917 startcp = x->cp;
2918 if (REOP_IS_SIMPLE(nextop)) {
2919 nextpc++;
2920 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2921 if (curState->u.quantifier.min == 0)
2922 goto repeatDone;
2923 result = NULL;
2924 goto break_switch;
2926 result = x;
2928 curState->index = startcp - gData->cpbegin;
2929 PUSH_STATE_STACK(gData);
2930 if (curState->u.quantifier.min == 0 &&
2931 !PushBackTrackState(gData, REOP_REPEAT,
2932 pc, x, startcp,
2933 curState->parenSoFar,
2934 parenSoFar -
2935 curState->parenSoFar)) {
2936 goto bad;
2938 } while (*nextpc == REOP_ENDCHILD);
2939 pc = nextpc;
2940 op = (REOp) *pc++;
2941 parenSoFar = curState->parenSoFar;
2942 continue;
2944 repeatDone:
2945 result = x;
2946 pc += GET_OFFSET(pc);
2947 goto break_switch;
2949 case REOP_MINIMALSTAR:
2950 curState->u.quantifier.min = 0;
2951 curState->u.quantifier.max = (UINT)-1;
2952 goto minimalquantcommon;
2953 case REOP_MINIMALPLUS:
2954 curState->u.quantifier.min = 1;
2955 curState->u.quantifier.max = (UINT)-1;
2956 goto minimalquantcommon;
2957 case REOP_MINIMALOPT:
2958 curState->u.quantifier.min = 0;
2959 curState->u.quantifier.max = 1;
2960 goto minimalquantcommon;
2961 case REOP_MINIMALQUANT:
2962 pc = ReadCompactIndex(pc, &k);
2963 curState->u.quantifier.min = k;
2964 pc = ReadCompactIndex(pc, &k);
2965 /* See REOP_QUANT comments about k - 1. */
2966 curState->u.quantifier.max = k - 1;
2967 assert(curState->u.quantifier.min
2968 <= curState->u.quantifier.max);
2969 minimalquantcommon:
2970 curState->index = x->cp - gData->cpbegin;
2971 curState->parenSoFar = parenSoFar;
2972 PUSH_STATE_STACK(gData);
2973 if (curState->u.quantifier.min != 0) {
2974 curState->continue_op = REOP_MINIMALREPEAT;
2975 curState->continue_pc = pc;
2976 /* step over <next> */
2977 pc += OFFSET_LEN;
2978 op = (REOp) *pc++;
2979 } else {
2980 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2981 pc, x, x->cp, 0, 0)) {
2982 goto bad;
2984 --gData->stateStackTop;
2985 pc = pc + GET_OFFSET(pc);
2986 op = (REOp) *pc++;
2988 continue;
2990 case REOP_MINIMALREPEAT:
2991 --gData->stateStackTop;
2992 --curState;
2994 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2995 #define PREPARE_REPEAT() \
2996 do { \
2997 curState->index = x->cp - gData->cpbegin; \
2998 curState->continue_op = REOP_MINIMALREPEAT; \
2999 curState->continue_pc = pc; \
3000 pc += ARG_LEN; \
3001 for (k = curState->parenSoFar; k < parenSoFar; k++) \
3002 x->parens[k].index = -1; \
3003 PUSH_STATE_STACK(gData); \
3004 op = (REOp) *pc++; \
3005 assert(op < REOP_LIMIT); \
3006 }while(0)
3008 if (!result) {
3009 TRACE(" -\n");
3011 * Non-greedy failure - try to consume another child.
3013 if (curState->u.quantifier.max == (UINT) -1 ||
3014 curState->u.quantifier.max > 0) {
3015 PREPARE_REPEAT();
3016 continue;
3018 /* Don't need to adjust pc since we're going to pop. */
3019 break;
3021 if (curState->u.quantifier.min == 0 &&
3022 x->cp == gData->cpbegin + curState->index) {
3023 /* Matched an empty string, that'll get us nowhere. */
3024 result = NULL;
3025 break;
3027 if (curState->u.quantifier.min != 0)
3028 curState->u.quantifier.min--;
3029 if (curState->u.quantifier.max != (UINT) -1)
3030 curState->u.quantifier.max--;
3031 if (curState->u.quantifier.min != 0) {
3032 PREPARE_REPEAT();
3033 continue;
3035 curState->index = x->cp - gData->cpbegin;
3036 curState->parenSoFar = parenSoFar;
3037 PUSH_STATE_STACK(gData);
3038 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3039 pc, x, x->cp,
3040 curState->parenSoFar,
3041 parenSoFar - curState->parenSoFar)) {
3042 goto bad;
3044 --gData->stateStackTop;
3045 pc = pc + GET_OFFSET(pc);
3046 op = (REOp) *pc++;
3047 assert(op < REOP_LIMIT);
3048 continue;
3049 default:
3050 assert(FALSE);
3051 result = NULL;
3053 break_switch:;
3057 * If the match failed and there's a backtrack option, take it.
3058 * Otherwise this is a complete and utter failure.
3060 if (!result) {
3061 if (gData->cursz == 0)
3062 return NULL;
3064 /* Potentially detect explosive regex here. */
3065 gData->backTrackCount++;
3066 if (gData->backTrackLimit &&
3067 gData->backTrackCount >= gData->backTrackLimit) {
3068 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3069 JSMSG_REGEXP_TOO_COMPLEX);
3070 gData->ok = FALSE;
3071 return NULL;
3074 backTrackData = gData->backTrackSP;
3075 gData->cursz = backTrackData->sz;
3076 gData->backTrackSP =
3077 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3078 x->cp = backTrackData->cp;
3079 pc = backTrackData->backtrack_pc;
3080 op = (REOp) backTrackData->backtrack_op;
3081 assert(op < REOP_LIMIT);
3082 gData->stateStackTop = backTrackData->saveStateStackTop;
3083 assert(gData->stateStackTop);
3085 memcpy(gData->stateStack, backTrackData + 1,
3086 sizeof(REProgState) * backTrackData->saveStateStackTop);
3087 curState = &gData->stateStack[gData->stateStackTop - 1];
3089 if (backTrackData->parenCount) {
3090 memcpy(&x->parens[backTrackData->parenIndex],
3091 (char *)(backTrackData + 1) +
3092 sizeof(REProgState) * backTrackData->saveStateStackTop,
3093 sizeof(RECapture) * backTrackData->parenCount);
3094 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3095 } else {
3096 for (k = curState->parenSoFar; k < parenSoFar; k++)
3097 x->parens[k].index = -1;
3098 parenSoFar = curState->parenSoFar;
3101 TRACE("\tBT_Pop: %ld,%ld\n",
3102 (ULONG_PTR)backTrackData->parenIndex,
3103 (ULONG_PTR)backTrackData->parenCount);
3104 continue;
3106 x = result;
3109 * Continue with the expression.
3111 op = (REOp)*pc++;
3112 assert(op < REOP_LIMIT);
3115 bad:
3116 TRACE("\n");
3117 return NULL;
3119 good:
3120 TRACE("\n");
3121 return x;
3124 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3126 REMatchState *result;
3127 const WCHAR *cp = x->cp;
3128 const WCHAR *cp2;
3129 UINT j;
3132 * Have to include the position beyond the last character
3133 * in order to detect end-of-input/line condition.
3135 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3136 gData->skipped = cp2 - cp;
3137 x->cp = cp2;
3138 for (j = 0; j < gData->regexp->parenCount; j++)
3139 x->parens[j].index = -1;
3140 result = ExecuteREBytecode(gData, x);
3141 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3142 return result;
3143 gData->backTrackSP = gData->backTrackStack;
3144 gData->cursz = 0;
3145 gData->stateStackTop = 0;
3146 cp2 = cp + gData->skipped;
3148 return NULL;
3151 #define MIN_BACKTRACK_LIMIT 400000
3153 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3155 REMatchState *result;
3156 UINT i;
3158 gData->backTrackStackSize = INITIAL_BACKTRACK;
3159 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3160 if (!gData->backTrackStack)
3161 goto bad;
3163 gData->backTrackSP = gData->backTrackStack;
3164 gData->cursz = 0;
3165 gData->backTrackCount = 0;
3166 gData->backTrackLimit = 0;
3168 gData->stateStackLimit = INITIAL_STATESTACK;
3169 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3170 if (!gData->stateStack)
3171 goto bad;
3173 gData->stateStackTop = 0;
3174 gData->cx = cx;
3175 gData->regexp = re;
3176 gData->ok = TRUE;
3178 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3179 if (!result)
3180 goto bad;
3182 for (i = 0; i < re->classCount; i++) {
3183 if (!re->classList[i].converted &&
3184 !ProcessCharSet(gData, &re->classList[i])) {
3185 return NULL;
3189 return result;
3191 bad:
3192 js_ReportOutOfScriptQuota(cx);
3193 gData->ok = FALSE;
3194 return NULL;
3197 static void
3198 js_DestroyRegExp(JSRegExp *re)
3200 if (re->classList) {
3201 UINT i;
3202 for (i = 0; i < re->classCount; i++) {
3203 if (re->classList[i].converted)
3204 heap_free(re->classList[i].u.bits);
3205 re->classList[i].u.bits = NULL;
3207 heap_free(re->classList);
3209 heap_free(re);
3212 static JSRegExp *
3213 js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
3215 JSRegExp *re;
3216 jsheap_t *mark;
3217 CompilerState state;
3218 size_t resize;
3219 jsbytecode *endPC;
3220 UINT i;
3221 size_t len;
3223 re = NULL;
3224 mark = jsheap_mark(&cx->tmp_heap);
3225 len = SysStringLen(str);
3227 state.context = cx;
3228 state.cp = str;
3229 if (!state.cp)
3230 goto out;
3231 state.cpbegin = state.cp;
3232 state.cpend = state.cp + len;
3233 state.flags = flags;
3234 state.parenCount = 0;
3235 state.classCount = 0;
3236 state.progLength = 0;
3237 state.treeDepth = 0;
3238 state.classBitmapsMem = 0;
3239 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3240 state.classCache[i].start = NULL;
3242 if (len != 0 && flat) {
3243 state.result = NewRENode(&state, REOP_FLAT);
3244 if (!state.result)
3245 goto out;
3246 state.result->u.flat.chr = *state.cpbegin;
3247 state.result->u.flat.length = len;
3248 state.result->kid = (void *) state.cpbegin;
3249 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3250 state.progLength += 1 + GetCompactIndexWidth(0)
3251 + GetCompactIndexWidth(len);
3252 } else {
3253 if (!ParseRegExp(&state))
3254 goto out;
3256 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3257 re = heap_alloc(resize);
3258 if (!re)
3259 goto out;
3261 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3262 re->classCount = state.classCount;
3263 if (re->classCount) {
3264 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3265 if (!re->classList) {
3266 js_DestroyRegExp(re);
3267 re = NULL;
3268 goto out;
3270 for (i = 0; i < re->classCount; i++)
3271 re->classList[i].converted = FALSE;
3272 } else {
3273 re->classList = NULL;
3275 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3276 if (!endPC) {
3277 js_DestroyRegExp(re);
3278 re = NULL;
3279 goto out;
3281 *endPC++ = REOP_END;
3283 * Check whether size was overestimated and shrink using realloc.
3284 * This is safe since no pointers to newly parsed regexp or its parts
3285 * besides re exist here.
3287 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3288 JSRegExp *tmp;
3289 assert((size_t)(endPC - re->program) < state.progLength + 1);
3290 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3291 tmp = heap_realloc(re, resize);
3292 if (tmp)
3293 re = tmp;
3296 re->flags = flags;
3297 re->parenCount = state.parenCount;
3298 re->source = str;
3300 out:
3301 jsheap_clear(mark);
3302 return re;
3305 static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
3307 return (RegExpInstance*)vdisp->u.jsdisp;
3310 static void set_last_index(RegExpInstance *This, DWORD last_index)
3312 This->last_index = last_index;
3313 VariantClear(&This->last_index_var);
3314 num_set_val(&This->last_index_var, last_index);
3317 static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, DWORD rem_flags,
3318 const WCHAR *str, DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size,
3319 DWORD *parens_cnt, match_result_t *ret)
3321 REMatchState *x, *result;
3322 REGlobalData gData;
3323 DWORD matchlen;
3325 gData.cpbegin = *cp;
3326 gData.cpend = str + len;
3327 gData.start = *cp-str;
3328 gData.skipped = 0;
3329 gData.pool = &ctx->tmp_heap;
3331 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3332 if(!x) {
3333 WARN("InitMatch failed\n");
3334 return E_FAIL;
3337 x->cp = *cp;
3338 result = MatchRegExp(&gData, x);
3339 if(!gData.ok) {
3340 WARN("MatchRegExp failed\n");
3341 return E_FAIL;
3344 if(!result) {
3345 if(rem_flags & REM_RESET_INDEX)
3346 set_last_index(regexp, 0);
3347 return S_FALSE;
3350 if(parens) {
3351 if(regexp->jsregexp->parenCount > *parens_size) {
3352 match_result_t *new_parens;
3354 if(*parens)
3355 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3356 else
3357 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3358 if(!new_parens)
3359 return E_OUTOFMEMORY;
3361 *parens = new_parens;
3365 /* FIXME: We often already have a copy of input string that we could use to store last match */
3366 if(!(rem_flags & REM_NO_CTX_UPDATE) &&
3367 (!ctx->last_match || len != SysStringLen(ctx->last_match) || strncmpW(ctx->last_match, str, len))) {
3368 BSTR last_match;
3370 last_match = SysAllocStringLen(str, len);
3371 if(!last_match)
3372 return E_OUTOFMEMORY;
3373 SysFreeString(ctx->last_match);
3374 ctx->last_match = last_match;
3377 if(parens) {
3378 DWORD i;
3380 *parens_cnt = regexp->jsregexp->parenCount;
3382 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3383 if(result->parens[i].index == -1) {
3384 (*parens)[i].str = NULL;
3385 (*parens)[i].len = 0;
3386 }else {
3387 (*parens)[i].str = *cp + result->parens[i].index;
3388 (*parens)[i].len = result->parens[i].length;
3393 matchlen = (result->cp-*cp) - gData.skipped;
3394 *cp = result->cp;
3395 ret->str = result->cp-matchlen;
3396 ret->len = matchlen;
3397 set_last_index(regexp, result->cp-str);
3399 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3400 ctx->last_match_index = ret->str-str;
3401 ctx->last_match_length = matchlen;
3404 return S_OK;
3407 HRESULT regexp_match_next(script_ctx_t *ctx, jsdisp_t *dispex, DWORD rem_flags, const WCHAR *str,
3408 DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt,
3409 match_result_t *ret)
3411 RegExpInstance *regexp = (RegExpInstance*)dispex;
3412 jsheap_t *mark;
3413 HRESULT hres;
3415 if((rem_flags & REM_CHECK_GLOBAL) && !(regexp->jsregexp->flags & JSREG_GLOB))
3416 return S_FALSE;
3418 mark = jsheap_mark(&ctx->tmp_heap);
3420 hres = do_regexp_match_next(ctx, regexp, rem_flags, str, len, cp, parens, parens_size, parens_cnt, ret);
3422 jsheap_clear(mark);
3423 return hres;
3426 HRESULT regexp_match(script_ctx_t *ctx, jsdisp_t *dispex, const WCHAR *str, DWORD len, BOOL gflag,
3427 match_result_t **match_result, DWORD *result_cnt)
3429 RegExpInstance *This = (RegExpInstance*)dispex;
3430 match_result_t *ret = NULL, cres;
3431 const WCHAR *cp = str;
3432 DWORD i=0, ret_size = 0;
3433 jsheap_t *mark;
3434 HRESULT hres;
3436 mark = jsheap_mark(&ctx->tmp_heap);
3438 while(1) {
3439 hres = do_regexp_match_next(ctx, This, 0, str, len, &cp, NULL, NULL, NULL, &cres);
3440 if(hres == S_FALSE) {
3441 hres = S_OK;
3442 break;
3445 if(FAILED(hres))
3446 break;
3448 if(ret_size == i) {
3449 if(ret)
3450 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3451 else
3452 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3453 if(!ret) {
3454 hres = E_OUTOFMEMORY;
3455 break;
3459 ret[i++] = cres;
3461 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3462 hres = S_OK;
3463 break;
3467 jsheap_clear(mark);
3468 if(FAILED(hres)) {
3469 heap_free(ret);
3470 return hres;
3473 *match_result = ret;
3474 *result_cnt = i;
3475 return S_OK;
3478 static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3479 VARIANT *retv, jsexcept_t *ei)
3481 TRACE("\n");
3483 switch(flags) {
3484 case DISPATCH_PROPERTYGET: {
3485 RegExpInstance *This = regexp_from_vdisp(jsthis);
3487 V_VT(retv) = VT_BSTR;
3488 V_BSTR(retv) = SysAllocString(This->str);
3489 if(!V_BSTR(retv))
3490 return E_OUTOFMEMORY;
3491 break;
3493 default:
3494 FIXME("Unimplemnted flags %x\n", flags);
3495 return E_NOTIMPL;
3498 return S_OK;
3501 static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3502 VARIANT *retv, jsexcept_t *ei)
3504 FIXME("\n");
3505 return E_NOTIMPL;
3508 static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3509 VARIANT *retv, jsexcept_t *ei)
3511 FIXME("\n");
3512 return E_NOTIMPL;
3515 static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3516 VARIANT *retv, jsexcept_t *ei)
3518 FIXME("\n");
3519 return E_NOTIMPL;
3522 static INT index_from_var(script_ctx_t *ctx, VARIANT *v)
3524 jsexcept_t ei;
3525 VARIANT num;
3526 HRESULT hres;
3528 memset(&ei, 0, sizeof(ei));
3529 hres = to_number(ctx, v, &ei, &num);
3530 if(FAILED(hres)) { /* FIXME: Move ignoring exceptions to to_primitive */
3531 VariantClear(&ei.var);
3532 return 0;
3535 if(V_VT(&num) == VT_R8) {
3536 DOUBLE d = floor(V_R8(&num));
3537 return (DOUBLE)(INT)d == d ? d : 0;
3540 return V_I4(&num);
3543 static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3544 VARIANT *retv, jsexcept_t *ei)
3546 TRACE("\n");
3548 switch(flags) {
3549 case DISPATCH_PROPERTYGET: {
3550 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3552 V_VT(retv) = VT_EMPTY;
3553 return VariantCopy(retv, &regexp->last_index_var);
3555 case DISPATCH_PROPERTYPUT: {
3556 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3557 VARIANT *arg;
3558 HRESULT hres;
3560 arg = get_arg(dp,0);
3561 hres = VariantCopy(&regexp->last_index_var, arg);
3562 if(FAILED(hres))
3563 return hres;
3565 regexp->last_index = index_from_var(ctx, arg);
3566 break;
3568 default:
3569 FIXME("unimplemented flags: %x\n", flags);
3570 return E_NOTIMPL;
3573 return S_OK;
3576 static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3577 VARIANT *retv, jsexcept_t *ei)
3579 FIXME("\n");
3580 return E_NOTIMPL;
3583 static HRESULT create_match_array(script_ctx_t *ctx, BSTR input, const match_result_t *result,
3584 const match_result_t *parens, DWORD parens_cnt, jsexcept_t *ei, IDispatch **ret)
3586 jsdisp_t *array;
3587 VARIANT var;
3588 int i;
3589 HRESULT hres = S_OK;
3591 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3592 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3593 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3594 static const WCHAR zeroW[] = {'0',0};
3596 hres = create_array(ctx, parens_cnt+1, &array);
3597 if(FAILED(hres))
3598 return hres;
3600 for(i=0; i < parens_cnt; i++) {
3601 V_VT(&var) = VT_BSTR;
3602 V_BSTR(&var) = SysAllocStringLen(parens[i].str, parens[i].len);
3603 if(!V_BSTR(&var)) {
3604 hres = E_OUTOFMEMORY;
3605 break;
3608 hres = jsdisp_propput_idx(array, i+1, &var, ei);
3609 SysFreeString(V_BSTR(&var));
3610 if(FAILED(hres))
3611 break;
3614 while(SUCCEEDED(hres)) {
3615 V_VT(&var) = VT_I4;
3616 V_I4(&var) = result->str-input;
3617 hres = jsdisp_propput_name(array, indexW, &var, ei);
3618 if(FAILED(hres))
3619 break;
3621 V_I4(&var) = result->str-input+result->len;
3622 hres = jsdisp_propput_name(array, lastIndexW, &var, ei);
3623 if(FAILED(hres))
3624 break;
3626 V_VT(&var) = VT_BSTR;
3627 V_BSTR(&var) = input;
3628 hres = jsdisp_propput_name(array, inputW, &var, ei);
3629 if(FAILED(hres))
3630 break;
3632 V_BSTR(&var) = SysAllocStringLen(result->str, result->len);
3633 if(!V_BSTR(&var)) {
3634 hres = E_OUTOFMEMORY;
3635 break;
3637 hres = jsdisp_propput_name(array, zeroW, &var, ei);
3638 SysFreeString(V_BSTR(&var));
3639 break;
3642 if(FAILED(hres)) {
3643 jsdisp_release(array);
3644 return hres;
3647 *ret = to_disp(array);
3648 return S_OK;
3651 static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, VARIANT *arg, jsexcept_t *ei, BSTR *input,
3652 match_result_t *match, match_result_t **parens, DWORD *parens_cnt, VARIANT_BOOL *ret)
3654 RegExpInstance *regexp;
3655 DWORD parens_size = 0, last_index = 0, length;
3656 const WCHAR *cp;
3657 BSTR string;
3658 HRESULT hres;
3660 if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
3661 FIXME("Not a RegExp\n");
3662 return E_NOTIMPL;
3665 regexp = regexp_from_vdisp(jsthis);
3667 if(arg) {
3668 hres = to_string(ctx, arg, ei, &string);
3669 if(FAILED(hres))
3670 return hres;
3671 length = SysStringLen(string);
3672 }else {
3673 string = NULL;
3674 length = 0;
3677 if(regexp->jsregexp->flags & JSREG_GLOB) {
3678 if(regexp->last_index < 0) {
3679 SysFreeString(string);
3680 set_last_index(regexp, 0);
3681 *ret = VARIANT_FALSE;
3682 if(input) {
3683 *input = NULL;
3685 return S_OK;
3688 last_index = regexp->last_index;
3691 cp = string + last_index;
3692 hres = regexp_match_next(ctx, &regexp->dispex, REM_RESET_INDEX, string, length, &cp, parens,
3693 parens ? &parens_size : NULL, parens_cnt, match);
3694 if(FAILED(hres)) {
3695 SysFreeString(string);
3696 return hres;
3699 *ret = hres == S_OK ? VARIANT_TRUE : VARIANT_FALSE;
3700 if(input) {
3701 *input = string;
3702 }else {
3703 SysFreeString(string);
3705 return S_OK;
3708 static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3709 VARIANT *retv, jsexcept_t *ei)
3711 match_result_t *parens = NULL, match;
3712 DWORD parens_cnt = 0;
3713 VARIANT_BOOL b;
3714 BSTR string;
3715 HRESULT hres;
3717 TRACE("\n");
3719 hres = run_exec(ctx, jsthis, arg_cnt(dp) ? get_arg(dp,0) : NULL, ei, &string, &match, &parens, &parens_cnt, &b);
3720 if(FAILED(hres))
3721 return hres;
3723 if(retv) {
3724 if(b) {
3725 IDispatch *ret;
3727 hres = create_match_array(ctx, string, &match, parens, parens_cnt, ei, &ret);
3728 if(SUCCEEDED(hres)) {
3729 V_VT(retv) = VT_DISPATCH;
3730 V_DISPATCH(retv) = ret;
3732 }else {
3733 V_VT(retv) = VT_NULL;
3737 heap_free(parens);
3738 SysFreeString(string);
3739 return hres;
3742 static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3743 VARIANT *retv, jsexcept_t *ei)
3745 match_result_t match;
3746 VARIANT undef_var;
3747 VARIANT_BOOL b;
3748 DWORD argc;
3749 HRESULT hres;
3751 TRACE("\n");
3753 argc = arg_cnt(dp);
3754 if(!argc) {
3755 V_VT(&undef_var) = VT_BSTR;
3756 V_BSTR(&undef_var) = SysAllocString(undefinedW);
3757 if(!V_BSTR(&undef_var))
3758 return E_OUTOFMEMORY;
3761 hres = run_exec(ctx, jsthis, argc ? get_arg(dp,0) : &undef_var, ei, NULL, &match, NULL, NULL, &b);
3762 if(!argc)
3763 SysFreeString(V_BSTR(&undef_var));
3764 if(FAILED(hres))
3765 return hres;
3767 if(retv) {
3768 V_VT(retv) = VT_BOOL;
3769 V_BOOL(retv) = b;
3771 return S_OK;
3774 static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
3775 VARIANT *retv, jsexcept_t *ei)
3777 TRACE("\n");
3779 switch(flags) {
3780 case INVOKE_FUNC:
3781 return throw_type_error(ctx, ei, JS_E_FUNCTION_EXPECTED, NULL);
3782 default:
3783 FIXME("unimplemented flags %x\n", flags);
3784 return E_NOTIMPL;
3787 return S_OK;
3790 static void RegExp_destructor(jsdisp_t *dispex)
3792 RegExpInstance *This = (RegExpInstance*)dispex;
3794 if(This->jsregexp)
3795 js_DestroyRegExp(This->jsregexp);
3796 VariantClear(&This->last_index_var);
3797 SysFreeString(This->str);
3798 heap_free(This);
3801 static const builtin_prop_t RegExp_props[] = {
3802 {execW, RegExp_exec, PROPF_METHOD|1},
3803 {globalW, RegExp_global, 0},
3804 {ignoreCaseW, RegExp_ignoreCase, 0},
3805 {lastIndexW, RegExp_lastIndex, 0},
3806 {multilineW, RegExp_multiline, 0},
3807 {sourceW, RegExp_source, 0},
3808 {testW, RegExp_test, PROPF_METHOD|1},
3809 {toStringW, RegExp_toString, PROPF_METHOD}
3812 static const builtin_info_t RegExp_info = {
3813 JSCLASS_REGEXP,
3814 {NULL, RegExp_value, 0},
3815 sizeof(RegExp_props)/sizeof(*RegExp_props),
3816 RegExp_props,
3817 RegExp_destructor,
3818 NULL
3821 static HRESULT alloc_regexp(script_ctx_t *ctx, jsdisp_t *object_prototype, RegExpInstance **ret)
3823 RegExpInstance *regexp;
3824 HRESULT hres;
3826 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3827 if(!regexp)
3828 return E_OUTOFMEMORY;
3830 if(object_prototype)
3831 hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
3832 else
3833 hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExp_info, ctx->regexp_constr);
3835 if(FAILED(hres)) {
3836 heap_free(regexp);
3837 return hres;
3840 *ret = regexp;
3841 return S_OK;
3844 HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, jsdisp_t **ret)
3846 RegExpInstance *regexp;
3847 HRESULT hres;
3849 TRACE("%s %x\n", debugstr_wn(exp, len), flags);
3851 hres = alloc_regexp(ctx, NULL, &regexp);
3852 if(FAILED(hres))
3853 return hres;
3855 if(len == -1)
3856 regexp->str = SysAllocString(exp);
3857 else
3858 regexp->str = SysAllocStringLen(exp, len);
3859 if(!regexp->str) {
3860 jsdisp_release(&regexp->dispex);
3861 return E_OUTOFMEMORY;
3864 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3865 if(!regexp->jsregexp) {
3866 WARN("js_NewRegExp failed\n");
3867 jsdisp_release(&regexp->dispex);
3868 return E_FAIL;
3871 V_VT(&regexp->last_index_var) = VT_I4;
3872 V_I4(&regexp->last_index_var) = 0;
3874 *ret = &regexp->dispex;
3875 return S_OK;
3878 HRESULT create_regexp_var(script_ctx_t *ctx, VARIANT *src_arg, VARIANT *flags_arg, jsdisp_t **ret)
3880 const WCHAR *opt = emptyW, *src;
3881 DWORD flags;
3882 HRESULT hres;
3884 if(V_VT(src_arg) == VT_DISPATCH) {
3885 jsdisp_t *obj;
3887 obj = iface_to_jsdisp((IUnknown*)V_DISPATCH(src_arg));
3888 if(obj) {
3889 if(is_class(obj, JSCLASS_REGEXP)) {
3890 RegExpInstance *regexp = (RegExpInstance*)obj;
3892 hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, ret);
3893 jsdisp_release(obj);
3894 return hres;
3897 jsdisp_release(obj);
3901 if(V_VT(src_arg) != VT_BSTR) {
3902 FIXME("flags_arg = %s\n", debugstr_variant(flags_arg));
3903 return E_NOTIMPL;
3906 src = V_BSTR(src_arg);
3908 if(flags_arg) {
3909 if(V_VT(flags_arg) != VT_BSTR) {
3910 FIXME("unimplemented for vt %d\n", V_VT(flags_arg));
3911 return E_NOTIMPL;
3914 opt = V_BSTR(flags_arg);
3917 hres = parse_regexp_flags(opt, strlenW(opt), &flags);
3918 if(FAILED(hres))
3919 return hres;
3921 return create_regexp(ctx, src, -1, flags, ret);
3924 HRESULT regexp_string_match(script_ctx_t *ctx, jsdisp_t *re, BSTR str,
3925 VARIANT *retv, jsexcept_t *ei)
3927 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3928 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3929 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3931 RegExpInstance *regexp = (RegExpInstance*)re;
3932 match_result_t *match_result;
3933 DWORD match_cnt, i, length;
3934 jsdisp_t *array;
3935 VARIANT var;
3936 HRESULT hres;
3938 length = SysStringLen(str);
3940 if(!(regexp->jsregexp->flags & JSREG_GLOB)) {
3941 match_result_t match, *parens = NULL;
3942 DWORD parens_cnt, parens_size = 0;
3943 const WCHAR *cp = str;
3945 hres = regexp_match_next(ctx, &regexp->dispex, 0, str, length, &cp, &parens, &parens_size, &parens_cnt, &match);
3946 if(FAILED(hres))
3947 return hres;
3949 if(retv) {
3950 if(hres == S_OK) {
3951 IDispatch *ret;
3953 hres = create_match_array(ctx, str, &match, parens, parens_cnt, ei, &ret);
3954 if(SUCCEEDED(hres)) {
3955 V_VT(retv) = VT_DISPATCH;
3956 V_DISPATCH(retv) = ret;
3958 }else {
3959 V_VT(retv) = VT_NULL;
3963 heap_free(parens);
3964 return S_OK;
3967 hres = regexp_match(ctx, &regexp->dispex, str, length, FALSE, &match_result, &match_cnt);
3968 if(FAILED(hres))
3969 return hres;
3971 if(!match_cnt) {
3972 TRACE("no match\n");
3974 if(retv)
3975 V_VT(retv) = VT_NULL;
3976 return S_OK;
3979 hres = create_array(ctx, match_cnt, &array);
3980 if(FAILED(hres))
3981 return hres;
3983 V_VT(&var) = VT_BSTR;
3985 for(i=0; i < match_cnt; i++) {
3986 V_BSTR(&var) = SysAllocStringLen(match_result[i].str, match_result[i].len);
3987 if(!V_BSTR(&var)) {
3988 hres = E_OUTOFMEMORY;
3989 break;
3992 hres = jsdisp_propput_idx(array, i, &var, ei);
3993 SysFreeString(V_BSTR(&var));
3994 if(FAILED(hres))
3995 break;
3998 while(SUCCEEDED(hres)) {
3999 V_VT(&var) = VT_I4;
4000 V_I4(&var) = match_result[match_cnt-1].str-str;
4001 hres = jsdisp_propput_name(array, indexW, &var, ei);
4002 if(FAILED(hres))
4003 break;
4005 V_I4(&var) = match_result[match_cnt-1].str-str+match_result[match_cnt-1].len;
4006 hres = jsdisp_propput_name(array, lastIndexW, &var, ei);
4007 if(FAILED(hres))
4008 break;
4010 V_VT(&var) = VT_BSTR;
4011 V_BSTR(&var) = str;
4012 hres = jsdisp_propput_name(array, inputW, &var, ei);
4013 break;
4016 heap_free(match_result);
4018 if(SUCCEEDED(hres) && retv)
4019 var_set_jsdisp(retv, array);
4020 else
4021 jsdisp_release(array);
4022 return hres;
4025 static HRESULT RegExpConstr_leftContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4026 DISPPARAMS *dp, VARIANT *retv, jsexcept_t *ei)
4028 TRACE("\n");
4030 switch(flags) {
4031 case DISPATCH_PROPERTYGET: {
4032 BSTR ret;
4034 ret = SysAllocStringLen(ctx->last_match, ctx->last_match_index);
4035 if(!ret)
4036 return E_OUTOFMEMORY;
4038 V_VT(retv) = VT_BSTR;
4039 V_BSTR(retv) = ret;
4040 break;
4042 case DISPATCH_PROPERTYPUT:
4043 break;
4044 default:
4045 FIXME("unsupported flags\n");
4046 return E_NOTIMPL;
4049 return S_OK;
4052 static HRESULT RegExpConstr_rightContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4053 DISPPARAMS *dp, VARIANT *retv, jsexcept_t *ei)
4055 TRACE("\n");
4057 switch(flags) {
4058 case DISPATCH_PROPERTYGET: {
4059 BSTR ret;
4061 ret = SysAllocString(ctx->last_match+ctx->last_match_index+ctx->last_match_length);
4062 if(!ret)
4063 return E_OUTOFMEMORY;
4065 V_VT(retv) = VT_BSTR;
4066 V_BSTR(retv) = ret;
4067 break;
4069 case DISPATCH_PROPERTYPUT:
4070 break;
4071 default:
4072 FIXME("unsupported flags\n");
4073 return E_NOTIMPL;
4076 return S_OK;
4079 static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
4080 VARIANT *retv, jsexcept_t *ei)
4082 TRACE("\n");
4084 switch(flags) {
4085 case DISPATCH_METHOD:
4086 if(arg_cnt(dp)) {
4087 VARIANT *arg = get_arg(dp,0);
4088 if(V_VT(arg) == VT_DISPATCH) {
4089 jsdisp_t *jsdisp = iface_to_jsdisp((IUnknown*)V_DISPATCH(arg));
4090 if(jsdisp) {
4091 if(is_class(jsdisp, JSCLASS_REGEXP)) {
4092 if(arg_cnt(dp) > 1 && V_VT(get_arg(dp,1)) != VT_EMPTY) {
4093 jsdisp_release(jsdisp);
4094 return throw_regexp_error(ctx, ei, JS_E_REGEXP_SYNTAX, NULL);
4097 if(retv)
4098 var_set_jsdisp(retv, jsdisp);
4099 else
4100 jsdisp_release(jsdisp);
4101 return S_OK;
4103 jsdisp_release(jsdisp);
4107 /* fall through */
4108 case DISPATCH_CONSTRUCT: {
4109 jsdisp_t *ret;
4110 HRESULT hres;
4112 if(!arg_cnt(dp)) {
4113 FIXME("no args\n");
4114 return E_NOTIMPL;
4117 hres = create_regexp_var(ctx, get_arg(dp,0), arg_cnt(dp) > 1 ? get_arg(dp,1) : NULL, &ret);
4118 if(FAILED(hres))
4119 return hres;
4121 if(retv)
4122 var_set_jsdisp(retv, ret);
4123 else
4124 jsdisp_release(ret);
4125 return S_OK;
4127 default:
4128 FIXME("unimplemented flags: %x\n", flags);
4129 return E_NOTIMPL;
4132 return S_OK;
4135 static const builtin_prop_t RegExpConstr_props[] = {
4136 {leftContextW, RegExpConstr_leftContext, 0},
4137 {rightContextW, RegExpConstr_rightContext, 0}
4140 static const builtin_info_t RegExpConstr_info = {
4141 JSCLASS_FUNCTION,
4142 {NULL, Function_value, 0},
4143 sizeof(RegExpConstr_props)/sizeof(*RegExpConstr_props),
4144 RegExpConstr_props,
4145 NULL,
4146 NULL
4149 HRESULT create_regexp_constr(script_ctx_t *ctx, jsdisp_t *object_prototype, jsdisp_t **ret)
4151 RegExpInstance *regexp;
4152 HRESULT hres;
4154 static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
4156 hres = alloc_regexp(ctx, object_prototype, &regexp);
4157 if(FAILED(hres))
4158 return hres;
4160 hres = create_builtin_function(ctx, RegExpConstr_value, RegExpW, &RegExpConstr_info,
4161 PROPF_CONSTR|2, &regexp->dispex, ret);
4163 jsdisp_release(&regexp->dispex);
4164 return hres;
4167 HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
4169 const WCHAR *p;
4170 DWORD flags = 0;
4172 for (p = str; p < str+str_len; p++) {
4173 switch (*p) {
4174 case 'g':
4175 flags |= JSREG_GLOB;
4176 break;
4177 case 'i':
4178 flags |= JSREG_FOLD;
4179 break;
4180 case 'm':
4181 flags |= JSREG_MULTILINE;
4182 break;
4183 case 'y':
4184 flags |= JSREG_STICKY;
4185 break;
4186 default:
4187 WARN("wrong flag %c\n", *p);
4188 return E_FAIL;
4192 *ret = flags;
4193 return S_OK;