jscript: Use jsstr_t for passing strings to regexp matching functions.
[wine.git] / dlls / jscript / regexp.c
blob4d29390fc8f7f4d561e212b24361f1b32257eafb
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
35 #include <math.h>
37 #include "jscript.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 jsstr_t *source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
81 typedef struct {
82 jsdisp_t dispex;
84 JSRegExp *jsregexp;
85 jsstr_t *str;
86 INT last_index;
87 jsval_t last_index_val;
88 } RegExpInstance;
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
99 static const WCHAR leftContextW[] =
100 {'l','e','f','t','C','o','n','t','e','x','t',0};
101 static const WCHAR rightContextW[] =
102 {'r','i','g','h','t','C','o','n','t','e','x','t',0};
104 static const WCHAR idx1W[] = {'$','1',0};
105 static const WCHAR idx2W[] = {'$','2',0};
106 static const WCHAR idx3W[] = {'$','3',0};
107 static const WCHAR idx4W[] = {'$','4',0};
108 static const WCHAR idx5W[] = {'$','5',0};
109 static const WCHAR idx6W[] = {'$','6',0};
110 static const WCHAR idx7W[] = {'$','7',0};
111 static const WCHAR idx8W[] = {'$','8',0};
112 static const WCHAR idx9W[] = {'$','9',0};
114 static const WCHAR undefinedW[] = {'u','n','d','e','f','i','n','e','d',0};
115 static const WCHAR emptyW[] = {0};
117 /* FIXME: Better error handling */
118 #define ReportRegExpError(a,b,c)
119 #define ReportRegExpErrorHelper(a,b,c,d)
120 #define JS_ReportErrorNumber(a,b,c,d)
121 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
122 #define js_ReportOutOfScriptQuota(a)
123 #define JS_ReportOutOfMemory(a)
124 #define JS_COUNT_OPERATION(a,b)
126 #define JSMSG_MIN_TOO_BIG 47
127 #define JSMSG_MAX_TOO_BIG 48
128 #define JSMSG_OUT_OF_ORDER 49
129 #define JSMSG_OUT_OF_MEMORY 137
131 #define LINE_SEPARATOR 0x2028
132 #define PARA_SEPARATOR 0x2029
134 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
135 ((c >= 'a') && (c <= 'z')) )
136 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
137 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
139 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
141 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
142 #define JS7_UNDEC(c) ((c) - '0')
144 typedef enum REOp {
145 REOP_EMPTY,
146 REOP_BOL,
147 REOP_EOL,
148 REOP_WBDRY,
149 REOP_WNONBDRY,
150 REOP_DOT,
151 REOP_DIGIT,
152 REOP_NONDIGIT,
153 REOP_ALNUM,
154 REOP_NONALNUM,
155 REOP_SPACE,
156 REOP_NONSPACE,
157 REOP_BACKREF,
158 REOP_FLAT,
159 REOP_FLAT1,
160 REOP_FLATi,
161 REOP_FLAT1i,
162 REOP_UCFLAT1,
163 REOP_UCFLAT1i,
164 REOP_UCFLAT,
165 REOP_UCFLATi,
166 REOP_CLASS,
167 REOP_NCLASS,
168 REOP_ALT,
169 REOP_QUANT,
170 REOP_STAR,
171 REOP_PLUS,
172 REOP_OPT,
173 REOP_LPAREN,
174 REOP_RPAREN,
175 REOP_JUMP,
176 REOP_DOTSTAR,
177 REOP_LPARENNON,
178 REOP_ASSERT,
179 REOP_ASSERT_NOT,
180 REOP_ASSERTTEST,
181 REOP_ASSERTNOTTEST,
182 REOP_MINIMALSTAR,
183 REOP_MINIMALPLUS,
184 REOP_MINIMALOPT,
185 REOP_MINIMALQUANT,
186 REOP_ENDCHILD,
187 REOP_REPEAT,
188 REOP_MINIMALREPEAT,
189 REOP_ALTPREREQ,
190 REOP_ALTPREREQ2,
191 REOP_ENDALT,
192 REOP_CONCAT,
193 REOP_END,
194 REOP_LIMIT /* META: no operator >= to this */
195 } REOp;
197 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
199 static const char *reop_names[] = {
200 "empty",
201 "bol",
202 "eol",
203 "wbdry",
204 "wnonbdry",
205 "dot",
206 "digit",
207 "nondigit",
208 "alnum",
209 "nonalnum",
210 "space",
211 "nonspace",
212 "backref",
213 "flat",
214 "flat1",
215 "flati",
216 "flat1i",
217 "ucflat1",
218 "ucflat1i",
219 "ucflat",
220 "ucflati",
221 "class",
222 "nclass",
223 "alt",
224 "quant",
225 "star",
226 "plus",
227 "opt",
228 "lparen",
229 "rparen",
230 "jump",
231 "dotstar",
232 "lparennon",
233 "assert",
234 "assert_not",
235 "asserttest",
236 "assertnottest",
237 "minimalstar",
238 "minimalplus",
239 "minimalopt",
240 "minimalquant",
241 "endchild",
242 "repeat",
243 "minimalrepeat",
244 "altprereq",
245 "alrprereq2",
246 "endalt",
247 "concat",
248 "end",
249 NULL
252 typedef struct RECapture {
253 ptrdiff_t index; /* start of contents, -1 for empty */
254 size_t length; /* length of capture */
255 } RECapture;
257 typedef struct REMatchState {
258 const WCHAR *cp;
259 RECapture parens[1]; /* first of 're->parenCount' captures,
260 allocated at end of this struct */
261 } REMatchState;
263 typedef struct REProgState {
264 jsbytecode *continue_pc; /* current continuation data */
265 jsbytecode continue_op;
266 ptrdiff_t index; /* progress in text */
267 size_t parenSoFar; /* highest indexed paren started */
268 union {
269 struct {
270 UINT min; /* current quantifier limits */
271 UINT max;
272 } quantifier;
273 struct {
274 size_t top; /* backtrack stack state */
275 size_t sz;
276 } assertion;
277 } u;
278 } REProgState;
280 typedef struct REBackTrackData {
281 size_t sz; /* size of previous stack entry */
282 jsbytecode *backtrack_pc; /* where to backtrack to */
283 jsbytecode backtrack_op;
284 const WCHAR *cp; /* index in text of match at backtrack */
285 size_t parenIndex; /* start index of saved paren contents */
286 size_t parenCount; /* # of saved paren contents */
287 size_t saveStateStackTop; /* number of parent states */
288 /* saved parent states follow */
289 /* saved paren contents follow */
290 } REBackTrackData;
292 #define INITIAL_STATESTACK 100
293 #define INITIAL_BACKTRACK 8000
295 typedef struct REGlobalData {
296 script_ctx_t *cx;
297 JSRegExp *regexp; /* the RE in execution */
298 BOOL ok; /* runtime error (out_of_memory only?) */
299 size_t start; /* offset to start at */
300 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
301 const WCHAR *cpbegin; /* text base address */
302 const WCHAR *cpend; /* text limit address */
304 REProgState *stateStack; /* stack of state of current parents */
305 size_t stateStackTop;
306 size_t stateStackLimit;
308 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
309 REBackTrackData *backTrackSP;
310 size_t backTrackStackSize;
311 size_t cursz; /* size of current stack entry */
312 size_t backTrackCount; /* how many times we've backtracked */
313 size_t backTrackLimit; /* upper limit on backtrack states */
315 jsheap_t *pool; /* It's faster to use one malloc'd pool
316 than to malloc/free the three items
317 that are allocated from this pool */
318 } REGlobalData;
320 typedef struct RENode RENode;
321 struct RENode {
322 REOp op; /* r.e. op bytecode */
323 RENode *next; /* next in concatenation order */
324 void *kid; /* first operand */
325 union {
326 void *kid2; /* second operand */
327 INT num; /* could be a number */
328 size_t parenIndex; /* or a parenthesis index */
329 struct { /* or a quantifier range */
330 UINT min;
331 UINT max;
332 JSPackedBool greedy;
333 } range;
334 struct { /* or a character class */
335 size_t startIndex;
336 size_t kidlen; /* length of string at kid, in jschars */
337 size_t index; /* index into class list */
338 WORD bmsize; /* bitmap size, based on max char code */
339 JSPackedBool sense;
340 } ucclass;
341 struct { /* or a literal sequence */
342 WCHAR chr; /* of one character */
343 size_t length; /* or many (via the kid) */
344 } flat;
345 struct {
346 RENode *kid2; /* second operand from ALT */
347 WCHAR ch1; /* match char for ALTPREREQ */
348 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
349 } altprereq;
350 } u;
353 #define CLASS_CACHE_SIZE 4
355 typedef struct CompilerState {
356 script_ctx_t *context;
357 const WCHAR *cpbegin;
358 const WCHAR *cpend;
359 const WCHAR *cp;
360 size_t parenCount;
361 size_t classCount; /* number of [] encountered */
362 size_t treeDepth; /* maximum depth of parse tree */
363 size_t progLength; /* estimated bytecode length */
364 RENode *result;
365 size_t classBitmapsMem; /* memory to hold all class bitmaps */
366 struct {
367 const WCHAR *start; /* small cache of class strings */
368 size_t length; /* since they're often the same */
369 size_t index;
370 } classCache[CLASS_CACHE_SIZE];
371 WORD flags;
372 } CompilerState;
374 typedef struct EmitStateStackEntry {
375 jsbytecode *altHead; /* start of REOP_ALT* opcode */
376 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
377 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
378 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
379 RENode *continueNode; /* original REOP_ALT* node being stacked */
380 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
381 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
382 avoid 16-bit unsigned offset overflow */
383 } EmitStateStackEntry;
386 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
387 * the getters and setters take the pc of the offset, not of the opcode before
388 * the offset.
390 #define ARG_LEN 2
391 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
392 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
393 (pc)[1] = (jsbytecode) (arg))
395 #define OFFSET_LEN ARG_LEN
396 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
397 #define GET_OFFSET(pc) GET_ARG(pc)
399 static BOOL ParseRegExp(CompilerState*);
402 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
403 * For sanity, we limit it to 2^24 bytes.
405 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
408 * The maximum memory that can be allocated for class bitmaps.
409 * For sanity, we limit it to 2^24 bytes.
411 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
414 * Functions to get size and write/read bytecode that represent small indexes
415 * compactly.
416 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
417 * indicates that the following byte brings more bits to the index. Otherwise
418 * this is the last byte in the index bytecode representing highest index bits.
420 static size_t
421 GetCompactIndexWidth(size_t index)
423 size_t width;
425 for (width = 1; (index >>= 7) != 0; ++width) { }
426 return width;
429 static inline jsbytecode *
430 WriteCompactIndex(jsbytecode *pc, size_t index)
432 size_t next;
434 while ((next = index >> 7) != 0) {
435 *pc++ = (jsbytecode)(index | 0x80);
436 index = next;
438 *pc++ = (jsbytecode)index;
439 return pc;
442 static inline jsbytecode *
443 ReadCompactIndex(jsbytecode *pc, size_t *result)
445 size_t nextByte;
447 nextByte = *pc++;
448 if ((nextByte & 0x80) == 0) {
450 * Short-circuit the most common case when compact index <= 127.
452 *result = nextByte;
453 } else {
454 size_t shift = 7;
455 *result = 0x7F & nextByte;
456 do {
457 nextByte = *pc++;
458 *result |= (nextByte & 0x7F) << shift;
459 shift += 7;
460 } while ((nextByte & 0x80) != 0);
462 return pc;
465 /* Construct and initialize an RENode, returning NULL for out-of-memory */
466 static RENode *
467 NewRENode(CompilerState *state, REOp op)
469 RENode *ren;
471 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
472 if (!ren) {
473 /* js_ReportOutOfScriptQuota(cx); */
474 return NULL;
476 ren->op = op;
477 ren->next = NULL;
478 ren->kid = NULL;
479 return ren;
483 * Validates and converts hex ascii value.
485 static BOOL
486 isASCIIHexDigit(WCHAR c, UINT *digit)
488 UINT cv = c;
490 if (cv < '0')
491 return FALSE;
492 if (cv <= '9') {
493 *digit = cv - '0';
494 return TRUE;
496 cv |= 0x20;
497 if (cv >= 'a' && cv <= 'f') {
498 *digit = cv - 'a' + 10;
499 return TRUE;
501 return FALSE;
504 typedef struct {
505 REOp op;
506 const WCHAR *errPos;
507 size_t parenIndex;
508 } REOpData;
510 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
511 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
513 static BOOL
514 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
516 ptrdiff_t offset = target - jump;
518 /* Check that target really points forward. */
519 assert(offset >= 2);
520 if ((size_t)offset > OFFSET_MAX)
521 return FALSE;
523 jump[0] = JUMP_OFFSET_HI(offset);
524 jump[1] = JUMP_OFFSET_LO(offset);
525 return TRUE;
529 * Generate bytecode for the tree rooted at t using an explicit stack instead
530 * of recursion.
532 static jsbytecode *
533 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
534 jsbytecode *pc, RENode *t)
536 EmitStateStackEntry *emitStateSP, *emitStateStack;
537 RECharSet *charSet;
538 REOp op;
540 if (treeDepth == 0) {
541 emitStateStack = NULL;
542 } else {
543 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
544 if (!emitStateStack)
545 return NULL;
547 emitStateSP = emitStateStack;
548 op = t->op;
549 assert(op < REOP_LIMIT);
551 for (;;) {
552 *pc++ = op;
553 switch (op) {
554 case REOP_EMPTY:
555 --pc;
556 break;
558 case REOP_ALTPREREQ2:
559 case REOP_ALTPREREQ:
560 assert(emitStateSP);
561 emitStateSP->altHead = pc - 1;
562 emitStateSP->endTermFixup = pc;
563 pc += OFFSET_LEN;
564 SET_ARG(pc, t->u.altprereq.ch1);
565 pc += ARG_LEN;
566 SET_ARG(pc, t->u.altprereq.ch2);
567 pc += ARG_LEN;
569 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
570 pc += OFFSET_LEN;
572 emitStateSP->continueNode = t;
573 emitStateSP->continueOp = REOP_JUMP;
574 emitStateSP->jumpToJumpFlag = FALSE;
575 ++emitStateSP;
576 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
577 t = t->kid;
578 op = t->op;
579 assert(op < REOP_LIMIT);
580 continue;
582 case REOP_JUMP:
583 emitStateSP->nextTermFixup = pc; /* offset to following term */
584 pc += OFFSET_LEN;
585 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
586 goto jump_too_big;
587 emitStateSP->continueOp = REOP_ENDALT;
588 ++emitStateSP;
589 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
590 t = t->u.kid2;
591 op = t->op;
592 assert(op < REOP_LIMIT);
593 continue;
595 case REOP_ENDALT:
597 * If we already patched emitStateSP->nextTermFixup to jump to
598 * a nearer jump, to avoid 16-bit immediate offset overflow, we
599 * are done here.
601 if (emitStateSP->jumpToJumpFlag)
602 break;
605 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
606 * REOP_ENDALT is executed only on successful match of the last
607 * alternate in a group.
609 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
610 goto jump_too_big;
611 if (t->op != REOP_ALT) {
612 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
613 goto jump_too_big;
617 * If the program is bigger than the REOP_JUMP offset range, then
618 * we must check for alternates before this one that are part of
619 * the same group, and fix up their jump offsets to target jumps
620 * close enough to fit in a 16-bit unsigned offset immediate.
622 if ((size_t)(pc - re->program) > OFFSET_MAX &&
623 emitStateSP > emitStateStack) {
624 EmitStateStackEntry *esp, *esp2;
625 jsbytecode *alt, *jump;
626 ptrdiff_t span, header;
628 esp2 = emitStateSP;
629 alt = esp2->altHead;
630 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
631 if (esp->continueOp == REOP_ENDALT &&
632 !esp->jumpToJumpFlag &&
633 esp->nextTermFixup + OFFSET_LEN == alt &&
634 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
635 ? esp->endTermFixup
636 : esp->nextTermFixup)) > OFFSET_MAX) {
637 alt = esp->altHead;
638 jump = esp->nextTermFixup;
641 * The span must be 1 less than the distance from
642 * jump offset to jump offset, so we actually jump
643 * to a REOP_JUMP bytecode, not to its offset!
645 for (;;) {
646 assert(jump < esp2->nextTermFixup);
647 span = esp2->nextTermFixup - jump - 1;
648 if ((size_t)span <= OFFSET_MAX)
649 break;
650 do {
651 if (--esp2 == esp)
652 goto jump_too_big;
653 } while (esp2->continueOp != REOP_ENDALT);
656 jump[0] = JUMP_OFFSET_HI(span);
657 jump[1] = JUMP_OFFSET_LO(span);
659 if (esp->continueNode->op != REOP_ALT) {
661 * We must patch the offset at esp->endTermFixup
662 * as well, for the REOP_ALTPREREQ{,2} opcodes.
663 * If we're unlucky and endTermFixup is more than
664 * OFFSET_MAX bytes from its target, we cheat by
665 * jumping 6 bytes to the jump whose offset is at
666 * esp->nextTermFixup, which has the same target.
668 jump = esp->endTermFixup;
669 header = esp->nextTermFixup - jump;
670 span += header;
671 if ((size_t)span > OFFSET_MAX)
672 span = header;
674 jump[0] = JUMP_OFFSET_HI(span);
675 jump[1] = JUMP_OFFSET_LO(span);
678 esp->jumpToJumpFlag = TRUE;
682 break;
684 case REOP_ALT:
685 assert(emitStateSP);
686 emitStateSP->altHead = pc - 1;
687 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
688 pc += OFFSET_LEN;
689 emitStateSP->continueNode = t;
690 emitStateSP->continueOp = REOP_JUMP;
691 emitStateSP->jumpToJumpFlag = FALSE;
692 ++emitStateSP;
693 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
694 t = t->kid;
695 op = t->op;
696 assert(op < REOP_LIMIT);
697 continue;
699 case REOP_FLAT:
701 * Coalesce FLATs if possible and if it would not increase bytecode
702 * beyond preallocated limit. The latter happens only when bytecode
703 * size for coalesced string with offset p and length 2 exceeds 6
704 * bytes preallocated for 2 single char nodes, i.e. when
705 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
706 * GetCompactIndexWidth(p) > 4.
707 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
708 * nodes strictly decreases bytecode size, the check has to be
709 * done only for the first coalescing.
711 if (t->kid &&
712 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
714 while (t->next &&
715 t->next->op == REOP_FLAT &&
716 (WCHAR*)t->kid + t->u.flat.length ==
717 t->next->kid) {
718 t->u.flat.length += t->next->u.flat.length;
719 t->next = t->next->next;
722 if (t->kid && t->u.flat.length > 1) {
723 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
724 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
725 pc = WriteCompactIndex(pc, t->u.flat.length);
726 } else if (t->u.flat.chr < 256) {
727 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
728 *pc++ = (jsbytecode) t->u.flat.chr;
729 } else {
730 pc[-1] = (state->flags & JSREG_FOLD)
731 ? REOP_UCFLAT1i
732 : REOP_UCFLAT1;
733 SET_ARG(pc, t->u.flat.chr);
734 pc += ARG_LEN;
736 break;
738 case REOP_LPAREN:
739 assert(emitStateSP);
740 pc = WriteCompactIndex(pc, t->u.parenIndex);
741 emitStateSP->continueNode = t;
742 emitStateSP->continueOp = REOP_RPAREN;
743 ++emitStateSP;
744 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
745 t = t->kid;
746 op = t->op;
747 continue;
749 case REOP_RPAREN:
750 pc = WriteCompactIndex(pc, t->u.parenIndex);
751 break;
753 case REOP_BACKREF:
754 pc = WriteCompactIndex(pc, t->u.parenIndex);
755 break;
757 case REOP_ASSERT:
758 assert(emitStateSP);
759 emitStateSP->nextTermFixup = pc;
760 pc += OFFSET_LEN;
761 emitStateSP->continueNode = t;
762 emitStateSP->continueOp = REOP_ASSERTTEST;
763 ++emitStateSP;
764 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
765 t = t->kid;
766 op = t->op;
767 continue;
769 case REOP_ASSERTTEST:
770 case REOP_ASSERTNOTTEST:
771 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
772 goto jump_too_big;
773 break;
775 case REOP_ASSERT_NOT:
776 assert(emitStateSP);
777 emitStateSP->nextTermFixup = pc;
778 pc += OFFSET_LEN;
779 emitStateSP->continueNode = t;
780 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
781 ++emitStateSP;
782 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
783 t = t->kid;
784 op = t->op;
785 continue;
787 case REOP_QUANT:
788 assert(emitStateSP);
789 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
790 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
791 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
792 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
793 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
794 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
795 } else {
796 if (!t->u.range.greedy)
797 pc[-1] = REOP_MINIMALQUANT;
798 pc = WriteCompactIndex(pc, t->u.range.min);
800 * Write max + 1 to avoid using size_t(max) + 1 bytes
801 * for (UINT)-1 sentinel.
803 pc = WriteCompactIndex(pc, t->u.range.max + 1);
805 emitStateSP->nextTermFixup = pc;
806 pc += OFFSET_LEN;
807 emitStateSP->continueNode = t;
808 emitStateSP->continueOp = REOP_ENDCHILD;
809 ++emitStateSP;
810 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
811 t = t->kid;
812 op = t->op;
813 continue;
815 case REOP_ENDCHILD:
816 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
817 goto jump_too_big;
818 break;
820 case REOP_CLASS:
821 if (!t->u.ucclass.sense)
822 pc[-1] = REOP_NCLASS;
823 pc = WriteCompactIndex(pc, t->u.ucclass.index);
824 charSet = &re->classList[t->u.ucclass.index];
825 charSet->converted = FALSE;
826 charSet->length = t->u.ucclass.bmsize;
827 charSet->u.src.startIndex = t->u.ucclass.startIndex;
828 charSet->u.src.length = t->u.ucclass.kidlen;
829 charSet->sense = t->u.ucclass.sense;
830 break;
832 default:
833 break;
836 t = t->next;
837 if (t) {
838 op = t->op;
839 } else {
840 if (emitStateSP == emitStateStack)
841 break;
842 --emitStateSP;
843 t = emitStateSP->continueNode;
844 op = (REOp) emitStateSP->continueOp;
848 cleanup:
849 heap_free(emitStateStack);
850 return pc;
852 jump_too_big:
853 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
854 pc = NULL;
855 goto cleanup;
859 * Process the op against the two top operands, reducing them to a single
860 * operand in the penultimate slot. Update progLength and treeDepth.
862 static BOOL
863 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
864 INT operandSP)
866 RENode *result;
868 switch (opData->op) {
869 case REOP_ALT:
870 result = NewRENode(state, REOP_ALT);
871 if (!result)
872 return FALSE;
873 result->kid = operandStack[operandSP - 2];
874 result->u.kid2 = operandStack[operandSP - 1];
875 operandStack[operandSP - 2] = result;
877 if (state->treeDepth == TREE_DEPTH_MAX) {
878 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
879 return FALSE;
881 ++state->treeDepth;
884 * Look at both alternates to see if there's a FLAT or a CLASS at
885 * the start of each. If so, use a prerequisite match.
887 if (((RENode *) result->kid)->op == REOP_FLAT &&
888 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
889 (state->flags & JSREG_FOLD) == 0) {
890 result->op = REOP_ALTPREREQ;
891 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
892 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
893 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
894 JUMP, <end> ... ENDALT */
895 state->progLength += 13;
897 else
898 if (((RENode *) result->kid)->op == REOP_CLASS &&
899 ((RENode *) result->kid)->u.ucclass.index < 256 &&
900 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
901 (state->flags & JSREG_FOLD) == 0) {
902 result->op = REOP_ALTPREREQ2;
903 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
904 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
905 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
906 JUMP, <end> ... ENDALT */
907 state->progLength += 13;
909 else
910 if (((RENode *) result->kid)->op == REOP_FLAT &&
911 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
912 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
913 (state->flags & JSREG_FOLD) == 0) {
914 result->op = REOP_ALTPREREQ2;
915 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
916 result->u.altprereq.ch2 =
917 ((RENode *) result->u.kid2)->u.ucclass.index;
918 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
919 JUMP, <end> ... ENDALT */
920 state->progLength += 13;
922 else {
923 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
924 state->progLength += 7;
926 break;
928 case REOP_CONCAT:
929 result = operandStack[operandSP - 2];
930 while (result->next)
931 result = result->next;
932 result->next = operandStack[operandSP - 1];
933 break;
935 case REOP_ASSERT:
936 case REOP_ASSERT_NOT:
937 case REOP_LPARENNON:
938 case REOP_LPAREN:
939 /* These should have been processed by a close paren. */
940 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
941 opData->errPos);
942 return FALSE;
944 default:;
946 return TRUE;
950 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
951 * its being on the stack, and to propagate errors to its callers.
953 #define JSREG_FIND_PAREN_COUNT 0x8000
954 #define JSREG_FIND_PAREN_ERROR 0x4000
957 * Magic return value from FindParenCount and GetDecimalValue, to indicate
958 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
959 * its findMax parameter is non-null.
961 #define OVERFLOW_VALUE ((UINT)-1)
963 static UINT
964 FindParenCount(CompilerState *state)
966 CompilerState temp;
967 int i;
969 if (state->flags & JSREG_FIND_PAREN_COUNT)
970 return OVERFLOW_VALUE;
973 * Copy state into temp, flag it so we never report an invalid backref,
974 * and reset its members to parse the entire regexp. This is obviously
975 * suboptimal, but GetDecimalValue calls us only if a backref appears to
976 * refer to a forward parenthetical, which is rare.
978 temp = *state;
979 temp.flags |= JSREG_FIND_PAREN_COUNT;
980 temp.cp = temp.cpbegin;
981 temp.parenCount = 0;
982 temp.classCount = 0;
983 temp.progLength = 0;
984 temp.treeDepth = 0;
985 temp.classBitmapsMem = 0;
986 for (i = 0; i < CLASS_CACHE_SIZE; i++)
987 temp.classCache[i].start = NULL;
989 if (!ParseRegExp(&temp)) {
990 state->flags |= JSREG_FIND_PAREN_ERROR;
991 return OVERFLOW_VALUE;
993 return temp.parenCount;
997 * Extract and return a decimal value at state->cp. The initial character c
998 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
999 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
1000 * state->flags to discover whether an error occurred under findMax.
1002 static UINT
1003 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
1004 CompilerState *state)
1006 UINT value = JS7_UNDEC(c);
1007 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
1009 /* The following restriction allows simpler overflow checks. */
1010 assert(max <= ((UINT)-1 - 9) / 10);
1011 while (state->cp < state->cpend) {
1012 c = *state->cp;
1013 if (!JS7_ISDEC(c))
1014 break;
1015 value = 10 * value + JS7_UNDEC(c);
1016 if (!overflow && value > max && (!findMax || value > findMax(state)))
1017 overflow = TRUE;
1018 ++state->cp;
1020 return overflow ? OVERFLOW_VALUE : value;
1024 * Calculate the total size of the bitmap required for a class expression.
1026 static BOOL
1027 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1028 const WCHAR *end)
1030 UINT max = 0;
1031 BOOL inRange = FALSE;
1032 WCHAR c, rangeStart = 0;
1033 UINT n, digit, nDigits, i;
1035 target->u.ucclass.bmsize = 0;
1036 target->u.ucclass.sense = TRUE;
1038 if (src == end)
1039 return TRUE;
1041 if (*src == '^') {
1042 ++src;
1043 target->u.ucclass.sense = FALSE;
1046 while (src != end) {
1047 BOOL canStartRange = TRUE;
1048 UINT localMax = 0;
1050 switch (*src) {
1051 case '\\':
1052 ++src;
1053 c = *src++;
1054 switch (c) {
1055 case 'b':
1056 localMax = 0x8;
1057 break;
1058 case 'f':
1059 localMax = 0xC;
1060 break;
1061 case 'n':
1062 localMax = 0xA;
1063 break;
1064 case 'r':
1065 localMax = 0xD;
1066 break;
1067 case 't':
1068 localMax = 0x9;
1069 break;
1070 case 'v':
1071 localMax = 0xB;
1072 break;
1073 case 'c':
1074 if (src < end && RE_IS_LETTER(*src)) {
1075 localMax = (UINT) (*src++) & 0x1F;
1076 } else {
1077 --src;
1078 localMax = '\\';
1080 break;
1081 case 'x':
1082 nDigits = 2;
1083 goto lexHex;
1084 case 'u':
1085 nDigits = 4;
1086 lexHex:
1087 n = 0;
1088 for (i = 0; (i < nDigits) && (src < end); i++) {
1089 c = *src++;
1090 if (!isASCIIHexDigit(c, &digit)) {
1092 * Back off to accepting the original
1093 *'\' as a literal.
1095 src -= i + 1;
1096 n = '\\';
1097 break;
1099 n = (n << 4) | digit;
1101 localMax = n;
1102 break;
1103 case 'd':
1104 canStartRange = FALSE;
1105 if (inRange) {
1106 JS_ReportErrorNumber(state->context,
1107 js_GetErrorMessage, NULL,
1108 JSMSG_BAD_CLASS_RANGE);
1109 return FALSE;
1111 localMax = '9';
1112 break;
1113 case 'D':
1114 case 's':
1115 case 'S':
1116 case 'w':
1117 case 'W':
1118 canStartRange = FALSE;
1119 if (inRange) {
1120 JS_ReportErrorNumber(state->context,
1121 js_GetErrorMessage, NULL,
1122 JSMSG_BAD_CLASS_RANGE);
1123 return FALSE;
1125 max = 65535;
1128 * If this is the start of a range, ensure that it's less than
1129 * the end.
1131 localMax = 0;
1132 break;
1133 case '0':
1134 case '1':
1135 case '2':
1136 case '3':
1137 case '4':
1138 case '5':
1139 case '6':
1140 case '7':
1142 * This is a non-ECMA extension - decimal escapes (in this
1143 * case, octal!) are supposed to be an error inside class
1144 * ranges, but supported here for backwards compatibility.
1147 n = JS7_UNDEC(c);
1148 c = *src;
1149 if ('0' <= c && c <= '7') {
1150 src++;
1151 n = 8 * n + JS7_UNDEC(c);
1152 c = *src;
1153 if ('0' <= c && c <= '7') {
1154 src++;
1155 i = 8 * n + JS7_UNDEC(c);
1156 if (i <= 0377)
1157 n = i;
1158 else
1159 src--;
1162 localMax = n;
1163 break;
1165 default:
1166 localMax = c;
1167 break;
1169 break;
1170 default:
1171 localMax = *src++;
1172 break;
1175 if (inRange) {
1176 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1177 if (rangeStart > localMax) {
1178 JS_ReportErrorNumber(state->context,
1179 js_GetErrorMessage, NULL,
1180 JSMSG_BAD_CLASS_RANGE);
1181 return FALSE;
1183 inRange = FALSE;
1184 } else {
1185 if (canStartRange && src < end - 1) {
1186 if (*src == '-') {
1187 ++src;
1188 inRange = TRUE;
1189 rangeStart = (WCHAR)localMax;
1190 continue;
1193 if (state->flags & JSREG_FOLD)
1194 rangeStart = localMax; /* one run of the uc/dc loop below */
1197 if (state->flags & JSREG_FOLD) {
1198 WCHAR maxch = localMax;
1200 for (i = rangeStart; i <= localMax; i++) {
1201 WCHAR uch, dch;
1203 uch = toupperW(i);
1204 dch = tolowerW(i);
1205 if(maxch < uch)
1206 maxch = uch;
1207 if(maxch < dch)
1208 maxch = dch;
1210 localMax = maxch;
1213 if (localMax > max)
1214 max = localMax;
1216 target->u.ucclass.bmsize = max;
1217 return TRUE;
1220 static INT
1221 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1223 UINT min, max;
1224 WCHAR c;
1225 const WCHAR *errp = state->cp++;
1227 c = *state->cp;
1228 if (JS7_ISDEC(c)) {
1229 ++state->cp;
1230 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1231 c = *state->cp;
1233 if (!ignoreValues && min == OVERFLOW_VALUE)
1234 return JSMSG_MIN_TOO_BIG;
1236 if (c == ',') {
1237 c = *++state->cp;
1238 if (JS7_ISDEC(c)) {
1239 ++state->cp;
1240 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1241 c = *state->cp;
1242 if (!ignoreValues && max == OVERFLOW_VALUE)
1243 return JSMSG_MAX_TOO_BIG;
1244 if (!ignoreValues && min > max)
1245 return JSMSG_OUT_OF_ORDER;
1246 } else {
1247 max = (UINT)-1;
1249 } else {
1250 max = min;
1252 if (c == '}') {
1253 state->result = NewRENode(state, REOP_QUANT);
1254 if (!state->result)
1255 return JSMSG_OUT_OF_MEMORY;
1256 state->result->u.range.min = min;
1257 state->result->u.range.max = max;
1259 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1260 * where <max> is written as compact(max+1) to make
1261 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1263 state->progLength += (1 + GetCompactIndexWidth(min)
1264 + GetCompactIndexWidth(max + 1)
1265 +3);
1266 return 0;
1270 state->cp = errp;
1271 return -1;
1274 static BOOL
1275 ParseQuantifier(CompilerState *state)
1277 RENode *term;
1278 term = state->result;
1279 if (state->cp < state->cpend) {
1280 switch (*state->cp) {
1281 case '+':
1282 state->result = NewRENode(state, REOP_QUANT);
1283 if (!state->result)
1284 return FALSE;
1285 state->result->u.range.min = 1;
1286 state->result->u.range.max = (UINT)-1;
1287 /* <PLUS>, <next> ... <ENDCHILD> */
1288 state->progLength += 4;
1289 goto quantifier;
1290 case '*':
1291 state->result = NewRENode(state, REOP_QUANT);
1292 if (!state->result)
1293 return FALSE;
1294 state->result->u.range.min = 0;
1295 state->result->u.range.max = (UINT)-1;
1296 /* <STAR>, <next> ... <ENDCHILD> */
1297 state->progLength += 4;
1298 goto quantifier;
1299 case '?':
1300 state->result = NewRENode(state, REOP_QUANT);
1301 if (!state->result)
1302 return FALSE;
1303 state->result->u.range.min = 0;
1304 state->result->u.range.max = 1;
1305 /* <OPT>, <next> ... <ENDCHILD> */
1306 state->progLength += 4;
1307 goto quantifier;
1308 case '{': /* balance '}' */
1310 INT err;
1312 err = ParseMinMaxQuantifier(state, FALSE);
1313 if (err == 0)
1314 goto quantifier;
1315 if (err == -1)
1316 return TRUE;
1318 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1319 return FALSE;
1321 default:;
1324 return TRUE;
1326 quantifier:
1327 if (state->treeDepth == TREE_DEPTH_MAX) {
1328 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1329 return FALSE;
1332 ++state->treeDepth;
1333 ++state->cp;
1334 state->result->kid = term;
1335 if (state->cp < state->cpend && *state->cp == '?') {
1336 ++state->cp;
1337 state->result->u.range.greedy = FALSE;
1338 } else {
1339 state->result->u.range.greedy = TRUE;
1341 return TRUE;
1345 * item: assertion An item is either an assertion or
1346 * quantatom a quantified atom.
1348 * assertion: '^' Assertions match beginning of string
1349 * (or line if the class static property
1350 * RegExp.multiline is true).
1351 * '$' End of string (or line if the class
1352 * static property RegExp.multiline is
1353 * true).
1354 * '\b' Word boundary (between \w and \W).
1355 * '\B' Word non-boundary.
1357 * quantatom: atom An unquantified atom.
1358 * quantatom '{' n ',' m '}'
1359 * Atom must occur between n and m times.
1360 * quantatom '{' n ',' '}' Atom must occur at least n times.
1361 * quantatom '{' n '}' Atom must occur exactly n times.
1362 * quantatom '*' Zero or more times (same as {0,}).
1363 * quantatom '+' One or more times (same as {1,}).
1364 * quantatom '?' Zero or one time (same as {0,1}).
1366 * any of which can be optionally followed by '?' for ungreedy
1368 * atom: '(' regexp ')' A parenthesized regexp (what matched
1369 * can be addressed using a backreference,
1370 * see '\' n below).
1371 * '.' Matches any char except '\n'.
1372 * '[' classlist ']' A character class.
1373 * '[' '^' classlist ']' A negated character class.
1374 * '\f' Form Feed.
1375 * '\n' Newline (Line Feed).
1376 * '\r' Carriage Return.
1377 * '\t' Horizontal Tab.
1378 * '\v' Vertical Tab.
1379 * '\d' A digit (same as [0-9]).
1380 * '\D' A non-digit.
1381 * '\w' A word character, [0-9a-z_A-Z].
1382 * '\W' A non-word character.
1383 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1384 * '\S' A non-whitespace character.
1385 * '\' n A backreference to the nth (n decimal
1386 * and positive) parenthesized expression.
1387 * '\' octal An octal escape sequence (octal must be
1388 * two or three digits long, unless it is
1389 * 0 for the null character).
1390 * '\x' hex A hex escape (hex must be two digits).
1391 * '\u' unicode A unicode escape (must be four digits).
1392 * '\c' ctrl A control character, ctrl is a letter.
1393 * '\' literalatomchar Any character except one of the above
1394 * that follow '\' in an atom.
1395 * otheratomchar Any character not first among the other
1396 * atom right-hand sides.
1398 static BOOL
1399 ParseTerm(CompilerState *state)
1401 WCHAR c = *state->cp++;
1402 UINT nDigits;
1403 UINT num, tmp, n, i;
1404 const WCHAR *termStart;
1406 switch (c) {
1407 /* assertions and atoms */
1408 case '^':
1409 state->result = NewRENode(state, REOP_BOL);
1410 if (!state->result)
1411 return FALSE;
1412 state->progLength++;
1413 return TRUE;
1414 case '$':
1415 state->result = NewRENode(state, REOP_EOL);
1416 if (!state->result)
1417 return FALSE;
1418 state->progLength++;
1419 return TRUE;
1420 case '\\':
1421 if (state->cp >= state->cpend) {
1422 /* a trailing '\' is an error */
1423 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1424 return FALSE;
1426 c = *state->cp++;
1427 switch (c) {
1428 /* assertion escapes */
1429 case 'b' :
1430 state->result = NewRENode(state, REOP_WBDRY);
1431 if (!state->result)
1432 return FALSE;
1433 state->progLength++;
1434 return TRUE;
1435 case 'B':
1436 state->result = NewRENode(state, REOP_WNONBDRY);
1437 if (!state->result)
1438 return FALSE;
1439 state->progLength++;
1440 return TRUE;
1441 /* Decimal escape */
1442 case '0':
1443 /* Give a strict warning. See also the note below. */
1444 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1445 doOctal:
1446 num = 0;
1447 while (state->cp < state->cpend) {
1448 c = *state->cp;
1449 if (c < '0' || '7' < c)
1450 break;
1451 state->cp++;
1452 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1453 if (tmp > 0377)
1454 break;
1455 num = tmp;
1457 c = (WCHAR)num;
1458 doFlat:
1459 state->result = NewRENode(state, REOP_FLAT);
1460 if (!state->result)
1461 return FALSE;
1462 state->result->u.flat.chr = c;
1463 state->result->u.flat.length = 1;
1464 state->progLength += 3;
1465 break;
1466 case '1':
1467 case '2':
1468 case '3':
1469 case '4':
1470 case '5':
1471 case '6':
1472 case '7':
1473 case '8':
1474 case '9':
1475 termStart = state->cp - 1;
1476 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1477 if (state->flags & JSREG_FIND_PAREN_ERROR)
1478 return FALSE;
1479 if (num == OVERFLOW_VALUE) {
1480 /* Give a strict mode warning. */
1481 WARN("back-reference exceeds number of capturing parentheses\n");
1484 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1485 * error here. However, for compatibility with IE, we treat the
1486 * whole backref as flat if the first character in it is not a
1487 * valid octal character, and as an octal escape otherwise.
1489 state->cp = termStart;
1490 if (c >= '8') {
1491 /* Treat this as flat. termStart - 1 is the \. */
1492 c = '\\';
1493 goto asFlat;
1496 /* Treat this as an octal escape. */
1497 goto doOctal;
1499 assert(1 <= num && num <= 0x10000);
1500 state->result = NewRENode(state, REOP_BACKREF);
1501 if (!state->result)
1502 return FALSE;
1503 state->result->u.parenIndex = num - 1;
1504 state->progLength
1505 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1506 break;
1507 /* Control escape */
1508 case 'f':
1509 c = 0xC;
1510 goto doFlat;
1511 case 'n':
1512 c = 0xA;
1513 goto doFlat;
1514 case 'r':
1515 c = 0xD;
1516 goto doFlat;
1517 case 't':
1518 c = 0x9;
1519 goto doFlat;
1520 case 'v':
1521 c = 0xB;
1522 goto doFlat;
1523 /* Control letter */
1524 case 'c':
1525 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1526 c = (WCHAR) (*state->cp++ & 0x1F);
1527 } else {
1528 /* back off to accepting the original '\' as a literal */
1529 --state->cp;
1530 c = '\\';
1532 goto doFlat;
1533 /* HexEscapeSequence */
1534 case 'x':
1535 nDigits = 2;
1536 goto lexHex;
1537 /* UnicodeEscapeSequence */
1538 case 'u':
1539 nDigits = 4;
1540 lexHex:
1541 n = 0;
1542 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1543 UINT digit;
1544 c = *state->cp++;
1545 if (!isASCIIHexDigit(c, &digit)) {
1547 * Back off to accepting the original 'u' or 'x' as a
1548 * literal.
1550 state->cp -= i + 2;
1551 n = *state->cp++;
1552 break;
1554 n = (n << 4) | digit;
1556 c = (WCHAR) n;
1557 goto doFlat;
1558 /* Character class escapes */
1559 case 'd':
1560 state->result = NewRENode(state, REOP_DIGIT);
1561 doSimple:
1562 if (!state->result)
1563 return FALSE;
1564 state->progLength++;
1565 break;
1566 case 'D':
1567 state->result = NewRENode(state, REOP_NONDIGIT);
1568 goto doSimple;
1569 case 's':
1570 state->result = NewRENode(state, REOP_SPACE);
1571 goto doSimple;
1572 case 'S':
1573 state->result = NewRENode(state, REOP_NONSPACE);
1574 goto doSimple;
1575 case 'w':
1576 state->result = NewRENode(state, REOP_ALNUM);
1577 goto doSimple;
1578 case 'W':
1579 state->result = NewRENode(state, REOP_NONALNUM);
1580 goto doSimple;
1581 /* IdentityEscape */
1582 default:
1583 state->result = NewRENode(state, REOP_FLAT);
1584 if (!state->result)
1585 return FALSE;
1586 state->result->u.flat.chr = c;
1587 state->result->u.flat.length = 1;
1588 state->result->kid = (void *) (state->cp - 1);
1589 state->progLength += 3;
1590 break;
1592 break;
1593 case '[':
1594 state->result = NewRENode(state, REOP_CLASS);
1595 if (!state->result)
1596 return FALSE;
1597 termStart = state->cp;
1598 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1599 for (;;) {
1600 if (state->cp == state->cpend) {
1601 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1602 JSMSG_UNTERM_CLASS, termStart);
1604 return FALSE;
1606 if (*state->cp == '\\') {
1607 state->cp++;
1608 if (state->cp != state->cpend)
1609 state->cp++;
1610 continue;
1612 if (*state->cp == ']') {
1613 state->result->u.ucclass.kidlen = state->cp - termStart;
1614 break;
1616 state->cp++;
1618 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1619 if (!state->classCache[i].start) {
1620 state->classCache[i].start = termStart;
1621 state->classCache[i].length = state->result->u.ucclass.kidlen;
1622 state->classCache[i].index = state->classCount;
1623 break;
1625 if (state->classCache[i].length ==
1626 state->result->u.ucclass.kidlen) {
1627 for (n = 0; ; n++) {
1628 if (n == state->classCache[i].length) {
1629 state->result->u.ucclass.index
1630 = state->classCache[i].index;
1631 goto claim;
1633 if (state->classCache[i].start[n] != termStart[n])
1634 break;
1638 state->result->u.ucclass.index = state->classCount++;
1640 claim:
1642 * Call CalculateBitmapSize now as we want any errors it finds
1643 * to be reported during the parse phase, not at execution.
1645 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1646 return FALSE;
1648 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1649 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1650 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1652 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1653 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1654 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1655 return FALSE;
1657 state->classBitmapsMem += n;
1658 /* CLASS, <index> */
1659 state->progLength
1660 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1661 break;
1663 case '.':
1664 state->result = NewRENode(state, REOP_DOT);
1665 goto doSimple;
1667 case '{':
1669 const WCHAR *errp = state->cp--;
1670 INT err;
1672 err = ParseMinMaxQuantifier(state, TRUE);
1673 state->cp = errp;
1675 if (err < 0)
1676 goto asFlat;
1678 /* FALL THROUGH */
1680 case '*':
1681 case '+':
1682 case '?':
1683 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1684 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1685 return FALSE;
1686 default:
1687 asFlat:
1688 state->result = NewRENode(state, REOP_FLAT);
1689 if (!state->result)
1690 return FALSE;
1691 state->result->u.flat.chr = c;
1692 state->result->u.flat.length = 1;
1693 state->result->kid = (void *) (state->cp - 1);
1694 state->progLength += 3;
1695 break;
1697 return ParseQuantifier(state);
1701 * Top-down regular expression grammar, based closely on Perl4.
1703 * regexp: altern A regular expression is one or more
1704 * altern '|' regexp alternatives separated by vertical bar.
1706 #define INITIAL_STACK_SIZE 128
1708 static BOOL
1709 ParseRegExp(CompilerState *state)
1711 size_t parenIndex;
1712 RENode *operand;
1713 REOpData *operatorStack;
1714 RENode **operandStack;
1715 REOp op;
1716 INT i;
1717 BOOL result = FALSE;
1719 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1720 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1722 /* Watch out for empty regexp */
1723 if (state->cp == state->cpend) {
1724 state->result = NewRENode(state, REOP_EMPTY);
1725 return (state->result != NULL);
1728 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1729 if (!operatorStack)
1730 return FALSE;
1732 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1733 if (!operandStack)
1734 goto out;
1736 for (;;) {
1737 parenIndex = state->parenCount;
1738 if (state->cp == state->cpend) {
1740 * If we are at the end of the regexp and we're short one or more
1741 * operands, the regexp must have the form /x|/ or some such, with
1742 * left parentheses making us short more than one operand.
1744 if (operatorSP >= operandSP) {
1745 operand = NewRENode(state, REOP_EMPTY);
1746 if (!operand)
1747 goto out;
1748 goto pushOperand;
1750 } else {
1751 switch (*state->cp) {
1752 case '(':
1753 ++state->cp;
1754 if (state->cp + 1 < state->cpend &&
1755 *state->cp == '?' &&
1756 (state->cp[1] == '=' ||
1757 state->cp[1] == '!' ||
1758 state->cp[1] == ':')) {
1759 switch (state->cp[1]) {
1760 case '=':
1761 op = REOP_ASSERT;
1762 /* ASSERT, <next>, ... ASSERTTEST */
1763 state->progLength += 4;
1764 break;
1765 case '!':
1766 op = REOP_ASSERT_NOT;
1767 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1768 state->progLength += 4;
1769 break;
1770 default:
1771 op = REOP_LPARENNON;
1772 break;
1774 state->cp += 2;
1775 } else {
1776 op = REOP_LPAREN;
1777 /* LPAREN, <index>, ... RPAREN, <index> */
1778 state->progLength
1779 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1780 state->parenCount++;
1781 if (state->parenCount == 65535) {
1782 ReportRegExpError(state, JSREPORT_ERROR,
1783 JSMSG_TOO_MANY_PARENS);
1784 goto out;
1787 goto pushOperator;
1789 case ')':
1791 * If there's no stacked open parenthesis, throw syntax error.
1793 for (i = operatorSP - 1; ; i--) {
1794 if (i < 0) {
1795 ReportRegExpError(state, JSREPORT_ERROR,
1796 JSMSG_UNMATCHED_RIGHT_PAREN);
1797 goto out;
1799 if (operatorStack[i].op == REOP_ASSERT ||
1800 operatorStack[i].op == REOP_ASSERT_NOT ||
1801 operatorStack[i].op == REOP_LPARENNON ||
1802 operatorStack[i].op == REOP_LPAREN) {
1803 break;
1806 /* FALL THROUGH */
1808 case '|':
1809 /* Expected an operand before these, so make an empty one */
1810 operand = NewRENode(state, REOP_EMPTY);
1811 if (!operand)
1812 goto out;
1813 goto pushOperand;
1815 default:
1816 if (!ParseTerm(state))
1817 goto out;
1818 operand = state->result;
1819 pushOperand:
1820 if (operandSP == operandStackSize) {
1821 RENode **tmp;
1822 operandStackSize += operandStackSize;
1823 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1824 if (!tmp)
1825 goto out;
1826 operandStack = tmp;
1828 operandStack[operandSP++] = operand;
1829 break;
1833 /* At the end; process remaining operators. */
1834 restartOperator:
1835 if (state->cp == state->cpend) {
1836 while (operatorSP) {
1837 --operatorSP;
1838 if (!ProcessOp(state, &operatorStack[operatorSP],
1839 operandStack, operandSP))
1840 goto out;
1841 --operandSP;
1843 assert(operandSP == 1);
1844 state->result = operandStack[0];
1845 result = TRUE;
1846 goto out;
1849 switch (*state->cp) {
1850 case '|':
1851 /* Process any stacked 'concat' operators */
1852 ++state->cp;
1853 while (operatorSP &&
1854 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1855 --operatorSP;
1856 if (!ProcessOp(state, &operatorStack[operatorSP],
1857 operandStack, operandSP)) {
1858 goto out;
1860 --operandSP;
1862 op = REOP_ALT;
1863 goto pushOperator;
1865 case ')':
1867 * If there's no stacked open parenthesis, throw syntax error.
1869 for (i = operatorSP - 1; ; i--) {
1870 if (i < 0) {
1871 ReportRegExpError(state, JSREPORT_ERROR,
1872 JSMSG_UNMATCHED_RIGHT_PAREN);
1873 goto out;
1875 if (operatorStack[i].op == REOP_ASSERT ||
1876 operatorStack[i].op == REOP_ASSERT_NOT ||
1877 operatorStack[i].op == REOP_LPARENNON ||
1878 operatorStack[i].op == REOP_LPAREN) {
1879 break;
1882 ++state->cp;
1884 /* Process everything on the stack until the open parenthesis. */
1885 for (;;) {
1886 assert(operatorSP);
1887 --operatorSP;
1888 switch (operatorStack[operatorSP].op) {
1889 case REOP_ASSERT:
1890 case REOP_ASSERT_NOT:
1891 case REOP_LPAREN:
1892 operand = NewRENode(state, operatorStack[operatorSP].op);
1893 if (!operand)
1894 goto out;
1895 operand->u.parenIndex =
1896 operatorStack[operatorSP].parenIndex;
1897 assert(operandSP);
1898 operand->kid = operandStack[operandSP - 1];
1899 operandStack[operandSP - 1] = operand;
1900 if (state->treeDepth == TREE_DEPTH_MAX) {
1901 ReportRegExpError(state, JSREPORT_ERROR,
1902 JSMSG_REGEXP_TOO_COMPLEX);
1903 goto out;
1905 ++state->treeDepth;
1906 /* FALL THROUGH */
1908 case REOP_LPARENNON:
1909 state->result = operandStack[operandSP - 1];
1910 if (!ParseQuantifier(state))
1911 goto out;
1912 operandStack[operandSP - 1] = state->result;
1913 goto restartOperator;
1914 default:
1915 if (!ProcessOp(state, &operatorStack[operatorSP],
1916 operandStack, operandSP))
1917 goto out;
1918 --operandSP;
1919 break;
1922 break;
1924 case '{':
1926 const WCHAR *errp = state->cp;
1928 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1930 * This didn't even scan correctly as a quantifier, so we should
1931 * treat it as flat.
1933 op = REOP_CONCAT;
1934 goto pushOperator;
1937 state->cp = errp;
1938 /* FALL THROUGH */
1941 case '+':
1942 case '*':
1943 case '?':
1944 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1945 state->cp);
1946 result = FALSE;
1947 goto out;
1949 default:
1950 /* Anything else is the start of the next term. */
1951 op = REOP_CONCAT;
1952 pushOperator:
1953 if (operatorSP == operatorStackSize) {
1954 REOpData *tmp;
1955 operatorStackSize += operatorStackSize;
1956 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1957 if (!tmp)
1958 goto out;
1959 operatorStack = tmp;
1961 operatorStack[operatorSP].op = op;
1962 operatorStack[operatorSP].errPos = state->cp;
1963 operatorStack[operatorSP++].parenIndex = parenIndex;
1964 break;
1967 out:
1968 heap_free(operatorStack);
1969 heap_free(operandStack);
1970 return result;
1974 * Save the current state of the match - the position in the input
1975 * text as well as the position in the bytecode. The state of any
1976 * parent expressions is also saved (preceding state).
1977 * Contents of parenCount parentheses from parenIndex are also saved.
1979 static REBackTrackData *
1980 PushBackTrackState(REGlobalData *gData, REOp op,
1981 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1982 size_t parenIndex, size_t parenCount)
1984 size_t i;
1985 REBackTrackData *result =
1986 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1988 size_t sz = sizeof(REBackTrackData) +
1989 gData->stateStackTop * sizeof(REProgState) +
1990 parenCount * sizeof(RECapture);
1992 ptrdiff_t btsize = gData->backTrackStackSize;
1993 ptrdiff_t btincr = ((char *)result + sz) -
1994 ((char *)gData->backTrackStack + btsize);
1996 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1998 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1999 if (btincr > 0) {
2000 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
2002 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
2003 btincr = ((btincr+btsize-1)/btsize)*btsize;
2004 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
2005 if (!gData->backTrackStack) {
2006 js_ReportOutOfScriptQuota(gData->cx);
2007 gData->ok = FALSE;
2008 return NULL;
2010 gData->backTrackStackSize = btsize + btincr;
2011 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2013 gData->backTrackSP = result;
2014 result->sz = gData->cursz;
2015 gData->cursz = sz;
2017 result->backtrack_op = op;
2018 result->backtrack_pc = target;
2019 result->cp = cp;
2020 result->parenCount = parenCount;
2021 result->parenIndex = parenIndex;
2023 result->saveStateStackTop = gData->stateStackTop;
2024 assert(gData->stateStackTop);
2025 memcpy(result + 1, gData->stateStack,
2026 sizeof(REProgState) * result->saveStateStackTop);
2028 if (parenCount != 0) {
2029 memcpy((char *)(result + 1) +
2030 sizeof(REProgState) * result->saveStateStackTop,
2031 &x->parens[parenIndex],
2032 sizeof(RECapture) * parenCount);
2033 for (i = 0; i != parenCount; i++)
2034 x->parens[parenIndex + i].index = -1;
2037 return result;
2040 static inline REMatchState *
2041 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2042 size_t length)
2044 size_t i;
2045 assert(gData->cpend >= x->cp);
2046 if (length > (size_t)(gData->cpend - x->cp))
2047 return NULL;
2048 for (i = 0; i != length; i++) {
2049 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2050 return NULL;
2052 x->cp += length;
2053 return x;
2057 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2058 * 2. If E is not a character then go to step 6.
2059 * 3. Let ch be E's character.
2060 * 4. Let A be a one-element RECharSet containing the character ch.
2061 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2062 * 6. E must be an integer. Let n be that integer.
2063 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2064 * 8. Return an internal Matcher closure that takes two arguments, a State x
2065 * and a Continuation c, and performs the following:
2066 * 1. Let cap be x's captures internal array.
2067 * 2. Let s be cap[n].
2068 * 3. If s is undefined, then call c(x) and return its result.
2069 * 4. Let e be x's endIndex.
2070 * 5. Let len be s's length.
2071 * 6. Let f be e+len.
2072 * 7. If f>InputLength, return failure.
2073 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2074 * such that Canonicalize(s[i]) is not the same character as
2075 * Canonicalize(Input [e+i]), then return failure.
2076 * 9. Let y be the State (f, cap).
2077 * 10. Call c(y) and return its result.
2079 static REMatchState *
2080 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2082 size_t len, i;
2083 const WCHAR *parenContent;
2084 RECapture *cap = &x->parens[parenIndex];
2086 if (cap->index == -1)
2087 return x;
2089 len = cap->length;
2090 if (x->cp + len > gData->cpend)
2091 return NULL;
2093 parenContent = &gData->cpbegin[cap->index];
2094 if (gData->regexp->flags & JSREG_FOLD) {
2095 for (i = 0; i < len; i++) {
2096 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2097 return NULL;
2099 } else {
2100 for (i = 0; i < len; i++) {
2101 if (parenContent[i] != x->cp[i])
2102 return NULL;
2105 x->cp += len;
2106 return x;
2109 /* Add a single character to the RECharSet */
2110 static void
2111 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2113 UINT byteIndex = (UINT)(c >> 3);
2114 assert(c <= cs->length);
2115 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2119 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2120 static void
2121 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2123 UINT i;
2125 UINT byteIndex1 = c1 >> 3;
2126 UINT byteIndex2 = c2 >> 3;
2128 assert(c2 <= cs->length && c1 <= c2);
2130 c1 &= 0x7;
2131 c2 &= 0x7;
2133 if (byteIndex1 == byteIndex2) {
2134 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2135 } else {
2136 cs->u.bits[byteIndex1] |= 0xFF << c1;
2137 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2138 cs->u.bits[i] = 0xFF;
2139 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2143 /* Compile the source of the class into a RECharSet */
2144 static BOOL
2145 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2147 const WCHAR *src, *end;
2148 BOOL inRange = FALSE;
2149 WCHAR rangeStart = 0;
2150 UINT byteLength, n;
2151 WCHAR c, thisCh;
2152 INT nDigits, i;
2154 assert(!charSet->converted);
2156 * Assert that startIndex and length points to chars inside [] inside
2157 * source string.
2159 assert(1 <= charSet->u.src.startIndex);
2160 assert(charSet->u.src.startIndex
2161 < jsstr_length(gData->regexp->source));
2162 assert(charSet->u.src.length <= jsstr_length(gData->regexp->source)
2163 - 1 - charSet->u.src.startIndex);
2165 charSet->converted = TRUE;
2166 src = gData->regexp->source->str + charSet->u.src.startIndex;
2168 end = src + charSet->u.src.length;
2170 assert(src[-1] == '[' && end[0] == ']');
2172 byteLength = (charSet->length >> 3) + 1;
2173 charSet->u.bits = heap_alloc(byteLength);
2174 if (!charSet->u.bits) {
2175 JS_ReportOutOfMemory(gData->cx);
2176 gData->ok = FALSE;
2177 return FALSE;
2179 memset(charSet->u.bits, 0, byteLength);
2181 if (src == end)
2182 return TRUE;
2184 if (*src == '^') {
2185 assert(charSet->sense == FALSE);
2186 ++src;
2187 } else {
2188 assert(charSet->sense == TRUE);
2191 while (src != end) {
2192 switch (*src) {
2193 case '\\':
2194 ++src;
2195 c = *src++;
2196 switch (c) {
2197 case 'b':
2198 thisCh = 0x8;
2199 break;
2200 case 'f':
2201 thisCh = 0xC;
2202 break;
2203 case 'n':
2204 thisCh = 0xA;
2205 break;
2206 case 'r':
2207 thisCh = 0xD;
2208 break;
2209 case 't':
2210 thisCh = 0x9;
2211 break;
2212 case 'v':
2213 thisCh = 0xB;
2214 break;
2215 case 'c':
2216 if (src < end && JS_ISWORD(*src)) {
2217 thisCh = (WCHAR)(*src++ & 0x1F);
2218 } else {
2219 --src;
2220 thisCh = '\\';
2222 break;
2223 case 'x':
2224 nDigits = 2;
2225 goto lexHex;
2226 case 'u':
2227 nDigits = 4;
2228 lexHex:
2229 n = 0;
2230 for (i = 0; (i < nDigits) && (src < end); i++) {
2231 UINT digit;
2232 c = *src++;
2233 if (!isASCIIHexDigit(c, &digit)) {
2235 * Back off to accepting the original '\'
2236 * as a literal
2238 src -= i + 1;
2239 n = '\\';
2240 break;
2242 n = (n << 4) | digit;
2244 thisCh = (WCHAR)n;
2245 break;
2246 case '0':
2247 case '1':
2248 case '2':
2249 case '3':
2250 case '4':
2251 case '5':
2252 case '6':
2253 case '7':
2255 * This is a non-ECMA extension - decimal escapes (in this
2256 * case, octal!) are supposed to be an error inside class
2257 * ranges, but supported here for backwards compatibility.
2259 n = JS7_UNDEC(c);
2260 c = *src;
2261 if ('0' <= c && c <= '7') {
2262 src++;
2263 n = 8 * n + JS7_UNDEC(c);
2264 c = *src;
2265 if ('0' <= c && c <= '7') {
2266 src++;
2267 i = 8 * n + JS7_UNDEC(c);
2268 if (i <= 0377)
2269 n = i;
2270 else
2271 src--;
2274 thisCh = (WCHAR)n;
2275 break;
2277 case 'd':
2278 AddCharacterRangeToCharSet(charSet, '0', '9');
2279 continue; /* don't need range processing */
2280 case 'D':
2281 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2282 AddCharacterRangeToCharSet(charSet,
2283 (WCHAR)('9' + 1),
2284 (WCHAR)charSet->length);
2285 continue;
2286 case 's':
2287 for (i = (INT)charSet->length; i >= 0; i--)
2288 if (isspaceW(i))
2289 AddCharacterToCharSet(charSet, (WCHAR)i);
2290 continue;
2291 case 'S':
2292 for (i = (INT)charSet->length; i >= 0; i--)
2293 if (!isspaceW(i))
2294 AddCharacterToCharSet(charSet, (WCHAR)i);
2295 continue;
2296 case 'w':
2297 for (i = (INT)charSet->length; i >= 0; i--)
2298 if (JS_ISWORD(i))
2299 AddCharacterToCharSet(charSet, (WCHAR)i);
2300 continue;
2301 case 'W':
2302 for (i = (INT)charSet->length; i >= 0; i--)
2303 if (!JS_ISWORD(i))
2304 AddCharacterToCharSet(charSet, (WCHAR)i);
2305 continue;
2306 default:
2307 thisCh = c;
2308 break;
2311 break;
2313 default:
2314 thisCh = *src++;
2315 break;
2318 if (inRange) {
2319 if (gData->regexp->flags & JSREG_FOLD) {
2320 assert(rangeStart <= thisCh);
2321 for (i = rangeStart; i <= thisCh; i++) {
2322 WCHAR uch, dch;
2324 AddCharacterToCharSet(charSet, i);
2325 uch = toupperW(i);
2326 dch = tolowerW(i);
2327 if (i != uch)
2328 AddCharacterToCharSet(charSet, uch);
2329 if (i != dch)
2330 AddCharacterToCharSet(charSet, dch);
2332 } else {
2333 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2335 inRange = FALSE;
2336 } else {
2337 if (gData->regexp->flags & JSREG_FOLD) {
2338 AddCharacterToCharSet(charSet, toupperW(thisCh));
2339 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2340 } else {
2341 AddCharacterToCharSet(charSet, thisCh);
2343 if (src < end - 1) {
2344 if (*src == '-') {
2345 ++src;
2346 inRange = TRUE;
2347 rangeStart = thisCh;
2352 return TRUE;
2355 static BOOL
2356 ReallocStateStack(REGlobalData *gData)
2358 size_t limit = gData->stateStackLimit;
2359 size_t sz = sizeof(REProgState) * limit;
2361 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2362 if (!gData->stateStack) {
2363 js_ReportOutOfScriptQuota(gData->cx);
2364 gData->ok = FALSE;
2365 return FALSE;
2367 gData->stateStackLimit = limit + limit;
2368 return TRUE;
2371 #define PUSH_STATE_STACK(data) \
2372 do { \
2373 ++(data)->stateStackTop; \
2374 if ((data)->stateStackTop == (data)->stateStackLimit && \
2375 !ReallocStateStack((data))) { \
2376 return NULL; \
2378 }while(0)
2381 * Apply the current op against the given input to see if it's going to match
2382 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2383 * true, then update the current state's cp. Always update startpc to the next
2384 * op.
2386 static inline REMatchState *
2387 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2388 jsbytecode **startpc, BOOL updatecp)
2390 REMatchState *result = NULL;
2391 WCHAR matchCh;
2392 size_t parenIndex;
2393 size_t offset, length, index;
2394 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2395 WCHAR *source;
2396 const WCHAR *startcp = x->cp;
2397 WCHAR ch;
2398 RECharSet *charSet;
2400 const char *opname = reop_names[op];
2401 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2402 (int)gData->stateStackTop * 2, "", opname);
2404 switch (op) {
2405 case REOP_EMPTY:
2406 result = x;
2407 break;
2408 case REOP_BOL:
2409 if (x->cp != gData->cpbegin) {
2410 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2411 !(gData->regexp->flags & JSREG_MULTILINE)) {
2412 break;
2414 if (!RE_IS_LINE_TERM(x->cp[-1]))
2415 break;
2417 result = x;
2418 break;
2419 case REOP_EOL:
2420 if (x->cp != gData->cpend) {
2421 if (/*!gData->cx->regExpStatics.multiline &&*/
2422 !(gData->regexp->flags & JSREG_MULTILINE)) {
2423 break;
2425 if (!RE_IS_LINE_TERM(*x->cp))
2426 break;
2428 result = x;
2429 break;
2430 case REOP_WBDRY:
2431 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2432 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2433 result = x;
2435 break;
2436 case REOP_WNONBDRY:
2437 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2438 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2439 result = x;
2441 break;
2442 case REOP_DOT:
2443 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2444 result = x;
2445 result->cp++;
2447 break;
2448 case REOP_DIGIT:
2449 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2450 result = x;
2451 result->cp++;
2453 break;
2454 case REOP_NONDIGIT:
2455 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2456 result = x;
2457 result->cp++;
2459 break;
2460 case REOP_ALNUM:
2461 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2462 result = x;
2463 result->cp++;
2465 break;
2466 case REOP_NONALNUM:
2467 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2468 result = x;
2469 result->cp++;
2471 break;
2472 case REOP_SPACE:
2473 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2474 result = x;
2475 result->cp++;
2477 break;
2478 case REOP_NONSPACE:
2479 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2480 result = x;
2481 result->cp++;
2483 break;
2484 case REOP_BACKREF:
2485 pc = ReadCompactIndex(pc, &parenIndex);
2486 assert(parenIndex < gData->regexp->parenCount);
2487 result = BackrefMatcher(gData, x, parenIndex);
2488 break;
2489 case REOP_FLAT:
2490 pc = ReadCompactIndex(pc, &offset);
2491 assert(offset < jsstr_length(gData->regexp->source));
2492 pc = ReadCompactIndex(pc, &length);
2493 assert(1 <= length);
2494 assert(length <= jsstr_length(gData->regexp->source) - offset);
2495 if (length <= (size_t)(gData->cpend - x->cp)) {
2496 source = gData->regexp->source->str + offset;
2497 TRACE("%s\n", debugstr_wn(source, length));
2498 for (index = 0; index != length; index++) {
2499 if (source[index] != x->cp[index])
2500 return NULL;
2502 x->cp += length;
2503 result = x;
2505 break;
2506 case REOP_FLAT1:
2507 matchCh = *pc++;
2508 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2509 if (x->cp != gData->cpend && *x->cp == matchCh) {
2510 result = x;
2511 result->cp++;
2513 break;
2514 case REOP_FLATi:
2515 pc = ReadCompactIndex(pc, &offset);
2516 assert(offset < jsstr_length(gData->regexp->source));
2517 pc = ReadCompactIndex(pc, &length);
2518 assert(1 <= length);
2519 assert(length <= jsstr_length(gData->regexp->source) - offset);
2520 source = gData->regexp->source->str;
2521 result = FlatNIMatcher(gData, x, source + offset, length);
2522 break;
2523 case REOP_FLAT1i:
2524 matchCh = *pc++;
2525 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2526 result = x;
2527 result->cp++;
2529 break;
2530 case REOP_UCFLAT1:
2531 matchCh = GET_ARG(pc);
2532 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2533 pc += ARG_LEN;
2534 if (x->cp != gData->cpend && *x->cp == matchCh) {
2535 result = x;
2536 result->cp++;
2538 break;
2539 case REOP_UCFLAT1i:
2540 matchCh = GET_ARG(pc);
2541 pc += ARG_LEN;
2542 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2543 result = x;
2544 result->cp++;
2546 break;
2547 case REOP_CLASS:
2548 pc = ReadCompactIndex(pc, &index);
2549 assert(index < gData->regexp->classCount);
2550 if (x->cp != gData->cpend) {
2551 charSet = &gData->regexp->classList[index];
2552 assert(charSet->converted);
2553 ch = *x->cp;
2554 index = ch >> 3;
2555 if (charSet->length != 0 &&
2556 ch <= charSet->length &&
2557 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2558 result = x;
2559 result->cp++;
2562 break;
2563 case REOP_NCLASS:
2564 pc = ReadCompactIndex(pc, &index);
2565 assert(index < gData->regexp->classCount);
2566 if (x->cp != gData->cpend) {
2567 charSet = &gData->regexp->classList[index];
2568 assert(charSet->converted);
2569 ch = *x->cp;
2570 index = ch >> 3;
2571 if (charSet->length == 0 ||
2572 ch > charSet->length ||
2573 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2574 result = x;
2575 result->cp++;
2578 break;
2580 default:
2581 assert(FALSE);
2583 if (result) {
2584 if (!updatecp)
2585 x->cp = startcp;
2586 *startpc = pc;
2587 TRACE(" *\n");
2588 return result;
2590 x->cp = startcp;
2591 return NULL;
2594 static inline REMatchState *
2595 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2597 REMatchState *result = NULL;
2598 REBackTrackData *backTrackData;
2599 jsbytecode *nextpc, *testpc;
2600 REOp nextop;
2601 RECapture *cap;
2602 REProgState *curState;
2603 const WCHAR *startcp;
2604 size_t parenIndex, k;
2605 size_t parenSoFar = 0;
2607 WCHAR matchCh1, matchCh2;
2608 RECharSet *charSet;
2610 BOOL anchor;
2611 jsbytecode *pc = gData->regexp->program;
2612 REOp op = (REOp) *pc++;
2615 * If the first node is a simple match, step the index into the string
2616 * until that match is made, or fail if it can't be found at all.
2618 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2619 anchor = FALSE;
2620 while (x->cp <= gData->cpend) {
2621 nextpc = pc; /* reset back to start each time */
2622 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2623 if (result) {
2624 anchor = TRUE;
2625 x = result;
2626 pc = nextpc; /* accept skip to next opcode */
2627 op = (REOp) *pc++;
2628 assert(op < REOP_LIMIT);
2629 break;
2631 gData->skipped++;
2632 x->cp++;
2634 if (!anchor)
2635 goto bad;
2638 for (;;) {
2639 const char *opname = reop_names[op];
2640 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2641 (int)gData->stateStackTop * 2, "", opname);
2643 if (REOP_IS_SIMPLE(op)) {
2644 result = SimpleMatch(gData, x, op, &pc, TRUE);
2645 } else {
2646 curState = &gData->stateStack[gData->stateStackTop];
2647 switch (op) {
2648 case REOP_END:
2649 goto good;
2650 case REOP_ALTPREREQ2:
2651 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2652 pc += ARG_LEN;
2653 matchCh2 = GET_ARG(pc);
2654 pc += ARG_LEN;
2655 k = GET_ARG(pc);
2656 pc += ARG_LEN;
2658 if (x->cp != gData->cpend) {
2659 if (*x->cp == matchCh2)
2660 goto doAlt;
2662 charSet = &gData->regexp->classList[k];
2663 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2664 goto bad;
2665 matchCh1 = *x->cp;
2666 k = matchCh1 >> 3;
2667 if ((charSet->length == 0 ||
2668 matchCh1 > charSet->length ||
2669 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2670 charSet->sense) {
2671 goto doAlt;
2674 result = NULL;
2675 break;
2677 case REOP_ALTPREREQ:
2678 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2679 pc += ARG_LEN;
2680 matchCh1 = GET_ARG(pc);
2681 pc += ARG_LEN;
2682 matchCh2 = GET_ARG(pc);
2683 pc += ARG_LEN;
2684 if (x->cp == gData->cpend ||
2685 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2686 result = NULL;
2687 break;
2689 /* else fall through... */
2691 case REOP_ALT:
2692 doAlt:
2693 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2694 pc += ARG_LEN; /* start of this alternate */
2695 curState->parenSoFar = parenSoFar;
2696 PUSH_STATE_STACK(gData);
2697 op = (REOp) *pc++;
2698 startcp = x->cp;
2699 if (REOP_IS_SIMPLE(op)) {
2700 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2701 op = (REOp) *nextpc++;
2702 pc = nextpc;
2703 continue;
2705 result = x;
2706 op = (REOp) *pc++;
2708 nextop = (REOp) *nextpc++;
2709 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2710 goto bad;
2711 continue;
2714 * Occurs at (successful) end of REOP_ALT,
2716 case REOP_JUMP:
2718 * If we have not gotten a result here, it is because of an
2719 * empty match. Do the same thing REOP_EMPTY would do.
2721 if (!result)
2722 result = x;
2724 --gData->stateStackTop;
2725 pc += GET_OFFSET(pc);
2726 op = (REOp) *pc++;
2727 continue;
2730 * Occurs at last (successful) end of REOP_ALT,
2732 case REOP_ENDALT:
2734 * If we have not gotten a result here, it is because of an
2735 * empty match. Do the same thing REOP_EMPTY would do.
2737 if (!result)
2738 result = x;
2740 --gData->stateStackTop;
2741 op = (REOp) *pc++;
2742 continue;
2744 case REOP_LPAREN:
2745 pc = ReadCompactIndex(pc, &parenIndex);
2746 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2747 assert(parenIndex < gData->regexp->parenCount);
2748 if (parenIndex + 1 > parenSoFar)
2749 parenSoFar = parenIndex + 1;
2750 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2751 x->parens[parenIndex].length = 0;
2752 op = (REOp) *pc++;
2753 continue;
2755 case REOP_RPAREN:
2757 ptrdiff_t delta;
2759 pc = ReadCompactIndex(pc, &parenIndex);
2760 assert(parenIndex < gData->regexp->parenCount);
2761 cap = &x->parens[parenIndex];
2762 delta = x->cp - (gData->cpbegin + cap->index);
2763 cap->length = (delta < 0) ? 0 : (size_t) delta;
2764 op = (REOp) *pc++;
2766 if (!result)
2767 result = x;
2768 continue;
2770 case REOP_ASSERT:
2771 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2772 pc += ARG_LEN; /* start of ASSERT child */
2773 op = (REOp) *pc++;
2774 testpc = pc;
2775 if (REOP_IS_SIMPLE(op) &&
2776 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2777 result = NULL;
2778 break;
2780 curState->u.assertion.top =
2781 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2782 curState->u.assertion.sz = gData->cursz;
2783 curState->index = x->cp - gData->cpbegin;
2784 curState->parenSoFar = parenSoFar;
2785 PUSH_STATE_STACK(gData);
2786 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2787 nextpc, x, x->cp, 0, 0)) {
2788 goto bad;
2790 continue;
2792 case REOP_ASSERT_NOT:
2793 nextpc = pc + GET_OFFSET(pc);
2794 pc += ARG_LEN;
2795 op = (REOp) *pc++;
2796 testpc = pc;
2797 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2798 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2799 *testpc == REOP_ASSERTNOTTEST) {
2800 result = NULL;
2801 break;
2803 curState->u.assertion.top
2804 = (char *)gData->backTrackSP -
2805 (char *)gData->backTrackStack;
2806 curState->u.assertion.sz = gData->cursz;
2807 curState->index = x->cp - gData->cpbegin;
2808 curState->parenSoFar = parenSoFar;
2809 PUSH_STATE_STACK(gData);
2810 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2811 nextpc, x, x->cp, 0, 0)) {
2812 goto bad;
2814 continue;
2816 case REOP_ASSERTTEST:
2817 --gData->stateStackTop;
2818 --curState;
2819 x->cp = gData->cpbegin + curState->index;
2820 gData->backTrackSP =
2821 (REBackTrackData *) ((char *)gData->backTrackStack +
2822 curState->u.assertion.top);
2823 gData->cursz = curState->u.assertion.sz;
2824 if (result)
2825 result = x;
2826 break;
2828 case REOP_ASSERTNOTTEST:
2829 --gData->stateStackTop;
2830 --curState;
2831 x->cp = gData->cpbegin + curState->index;
2832 gData->backTrackSP =
2833 (REBackTrackData *) ((char *)gData->backTrackStack +
2834 curState->u.assertion.top);
2835 gData->cursz = curState->u.assertion.sz;
2836 result = (!result) ? x : NULL;
2837 break;
2838 case REOP_STAR:
2839 curState->u.quantifier.min = 0;
2840 curState->u.quantifier.max = (UINT)-1;
2841 goto quantcommon;
2842 case REOP_PLUS:
2843 curState->u.quantifier.min = 1;
2844 curState->u.quantifier.max = (UINT)-1;
2845 goto quantcommon;
2846 case REOP_OPT:
2847 curState->u.quantifier.min = 0;
2848 curState->u.quantifier.max = 1;
2849 goto quantcommon;
2850 case REOP_QUANT:
2851 pc = ReadCompactIndex(pc, &k);
2852 curState->u.quantifier.min = k;
2853 pc = ReadCompactIndex(pc, &k);
2854 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2855 curState->u.quantifier.max = k - 1;
2856 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2857 quantcommon:
2858 if (curState->u.quantifier.max == 0) {
2859 pc = pc + GET_OFFSET(pc);
2860 op = (REOp) *pc++;
2861 result = x;
2862 continue;
2864 /* Step over <next> */
2865 nextpc = pc + ARG_LEN;
2866 op = (REOp) *nextpc++;
2867 startcp = x->cp;
2868 if (REOP_IS_SIMPLE(op)) {
2869 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2870 if (curState->u.quantifier.min == 0)
2871 result = x;
2872 else
2873 result = NULL;
2874 pc = pc + GET_OFFSET(pc);
2875 break;
2877 op = (REOp) *nextpc++;
2878 result = x;
2880 curState->index = startcp - gData->cpbegin;
2881 curState->continue_op = REOP_REPEAT;
2882 curState->continue_pc = pc;
2883 curState->parenSoFar = parenSoFar;
2884 PUSH_STATE_STACK(gData);
2885 if (curState->u.quantifier.min == 0 &&
2886 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2887 0, 0)) {
2888 goto bad;
2890 pc = nextpc;
2891 continue;
2893 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2894 pc = curState[-1].continue_pc;
2895 op = (REOp) curState[-1].continue_op;
2897 if (!result)
2898 result = x;
2899 continue;
2901 case REOP_REPEAT:
2902 --curState;
2903 do {
2904 --gData->stateStackTop;
2905 if (!result) {
2906 /* Failed, see if we have enough children. */
2907 if (curState->u.quantifier.min == 0)
2908 goto repeatDone;
2909 goto break_switch;
2911 if (curState->u.quantifier.min == 0 &&
2912 x->cp == gData->cpbegin + curState->index) {
2913 /* matched an empty string, that'll get us nowhere */
2914 result = NULL;
2915 goto break_switch;
2917 if (curState->u.quantifier.min != 0)
2918 curState->u.quantifier.min--;
2919 if (curState->u.quantifier.max != (UINT) -1)
2920 curState->u.quantifier.max--;
2921 if (curState->u.quantifier.max == 0)
2922 goto repeatDone;
2923 nextpc = pc + ARG_LEN;
2924 nextop = (REOp) *nextpc;
2925 startcp = x->cp;
2926 if (REOP_IS_SIMPLE(nextop)) {
2927 nextpc++;
2928 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2929 if (curState->u.quantifier.min == 0)
2930 goto repeatDone;
2931 result = NULL;
2932 goto break_switch;
2934 result = x;
2936 curState->index = startcp - gData->cpbegin;
2937 PUSH_STATE_STACK(gData);
2938 if (curState->u.quantifier.min == 0 &&
2939 !PushBackTrackState(gData, REOP_REPEAT,
2940 pc, x, startcp,
2941 curState->parenSoFar,
2942 parenSoFar -
2943 curState->parenSoFar)) {
2944 goto bad;
2946 } while (*nextpc == REOP_ENDCHILD);
2947 pc = nextpc;
2948 op = (REOp) *pc++;
2949 parenSoFar = curState->parenSoFar;
2950 continue;
2952 repeatDone:
2953 result = x;
2954 pc += GET_OFFSET(pc);
2955 goto break_switch;
2957 case REOP_MINIMALSTAR:
2958 curState->u.quantifier.min = 0;
2959 curState->u.quantifier.max = (UINT)-1;
2960 goto minimalquantcommon;
2961 case REOP_MINIMALPLUS:
2962 curState->u.quantifier.min = 1;
2963 curState->u.quantifier.max = (UINT)-1;
2964 goto minimalquantcommon;
2965 case REOP_MINIMALOPT:
2966 curState->u.quantifier.min = 0;
2967 curState->u.quantifier.max = 1;
2968 goto minimalquantcommon;
2969 case REOP_MINIMALQUANT:
2970 pc = ReadCompactIndex(pc, &k);
2971 curState->u.quantifier.min = k;
2972 pc = ReadCompactIndex(pc, &k);
2973 /* See REOP_QUANT comments about k - 1. */
2974 curState->u.quantifier.max = k - 1;
2975 assert(curState->u.quantifier.min
2976 <= curState->u.quantifier.max);
2977 minimalquantcommon:
2978 curState->index = x->cp - gData->cpbegin;
2979 curState->parenSoFar = parenSoFar;
2980 PUSH_STATE_STACK(gData);
2981 if (curState->u.quantifier.min != 0) {
2982 curState->continue_op = REOP_MINIMALREPEAT;
2983 curState->continue_pc = pc;
2984 /* step over <next> */
2985 pc += OFFSET_LEN;
2986 op = (REOp) *pc++;
2987 } else {
2988 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2989 pc, x, x->cp, 0, 0)) {
2990 goto bad;
2992 --gData->stateStackTop;
2993 pc = pc + GET_OFFSET(pc);
2994 op = (REOp) *pc++;
2996 continue;
2998 case REOP_MINIMALREPEAT:
2999 --gData->stateStackTop;
3000 --curState;
3002 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
3003 #define PREPARE_REPEAT() \
3004 do { \
3005 curState->index = x->cp - gData->cpbegin; \
3006 curState->continue_op = REOP_MINIMALREPEAT; \
3007 curState->continue_pc = pc; \
3008 pc += ARG_LEN; \
3009 for (k = curState->parenSoFar; k < parenSoFar; k++) \
3010 x->parens[k].index = -1; \
3011 PUSH_STATE_STACK(gData); \
3012 op = (REOp) *pc++; \
3013 assert(op < REOP_LIMIT); \
3014 }while(0)
3016 if (!result) {
3017 TRACE(" -\n");
3019 * Non-greedy failure - try to consume another child.
3021 if (curState->u.quantifier.max == (UINT) -1 ||
3022 curState->u.quantifier.max > 0) {
3023 PREPARE_REPEAT();
3024 continue;
3026 /* Don't need to adjust pc since we're going to pop. */
3027 break;
3029 if (curState->u.quantifier.min == 0 &&
3030 x->cp == gData->cpbegin + curState->index) {
3031 /* Matched an empty string, that'll get us nowhere. */
3032 result = NULL;
3033 break;
3035 if (curState->u.quantifier.min != 0)
3036 curState->u.quantifier.min--;
3037 if (curState->u.quantifier.max != (UINT) -1)
3038 curState->u.quantifier.max--;
3039 if (curState->u.quantifier.min != 0) {
3040 PREPARE_REPEAT();
3041 continue;
3043 curState->index = x->cp - gData->cpbegin;
3044 curState->parenSoFar = parenSoFar;
3045 PUSH_STATE_STACK(gData);
3046 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3047 pc, x, x->cp,
3048 curState->parenSoFar,
3049 parenSoFar - curState->parenSoFar)) {
3050 goto bad;
3052 --gData->stateStackTop;
3053 pc = pc + GET_OFFSET(pc);
3054 op = (REOp) *pc++;
3055 assert(op < REOP_LIMIT);
3056 continue;
3057 default:
3058 assert(FALSE);
3059 result = NULL;
3061 break_switch:;
3065 * If the match failed and there's a backtrack option, take it.
3066 * Otherwise this is a complete and utter failure.
3068 if (!result) {
3069 if (gData->cursz == 0)
3070 return NULL;
3072 /* Potentially detect explosive regex here. */
3073 gData->backTrackCount++;
3074 if (gData->backTrackLimit &&
3075 gData->backTrackCount >= gData->backTrackLimit) {
3076 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3077 JSMSG_REGEXP_TOO_COMPLEX);
3078 gData->ok = FALSE;
3079 return NULL;
3082 backTrackData = gData->backTrackSP;
3083 gData->cursz = backTrackData->sz;
3084 gData->backTrackSP =
3085 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3086 x->cp = backTrackData->cp;
3087 pc = backTrackData->backtrack_pc;
3088 op = (REOp) backTrackData->backtrack_op;
3089 assert(op < REOP_LIMIT);
3090 gData->stateStackTop = backTrackData->saveStateStackTop;
3091 assert(gData->stateStackTop);
3093 memcpy(gData->stateStack, backTrackData + 1,
3094 sizeof(REProgState) * backTrackData->saveStateStackTop);
3095 curState = &gData->stateStack[gData->stateStackTop - 1];
3097 if (backTrackData->parenCount) {
3098 memcpy(&x->parens[backTrackData->parenIndex],
3099 (char *)(backTrackData + 1) +
3100 sizeof(REProgState) * backTrackData->saveStateStackTop,
3101 sizeof(RECapture) * backTrackData->parenCount);
3102 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3103 } else {
3104 for (k = curState->parenSoFar; k < parenSoFar; k++)
3105 x->parens[k].index = -1;
3106 parenSoFar = curState->parenSoFar;
3109 TRACE("\tBT_Pop: %ld,%ld\n",
3110 (ULONG_PTR)backTrackData->parenIndex,
3111 (ULONG_PTR)backTrackData->parenCount);
3112 continue;
3114 x = result;
3117 * Continue with the expression.
3119 op = (REOp)*pc++;
3120 assert(op < REOP_LIMIT);
3123 bad:
3124 TRACE("\n");
3125 return NULL;
3127 good:
3128 TRACE("\n");
3129 return x;
3132 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3134 REMatchState *result;
3135 const WCHAR *cp = x->cp;
3136 const WCHAR *cp2;
3137 UINT j;
3140 * Have to include the position beyond the last character
3141 * in order to detect end-of-input/line condition.
3143 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3144 gData->skipped = cp2 - cp;
3145 x->cp = cp2;
3146 for (j = 0; j < gData->regexp->parenCount; j++)
3147 x->parens[j].index = -1;
3148 result = ExecuteREBytecode(gData, x);
3149 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3150 return result;
3151 gData->backTrackSP = gData->backTrackStack;
3152 gData->cursz = 0;
3153 gData->stateStackTop = 0;
3154 cp2 = cp + gData->skipped;
3156 return NULL;
3159 #define MIN_BACKTRACK_LIMIT 400000
3161 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3163 REMatchState *result;
3164 UINT i;
3166 gData->backTrackStackSize = INITIAL_BACKTRACK;
3167 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3168 if (!gData->backTrackStack)
3169 goto bad;
3171 gData->backTrackSP = gData->backTrackStack;
3172 gData->cursz = 0;
3173 gData->backTrackCount = 0;
3174 gData->backTrackLimit = 0;
3176 gData->stateStackLimit = INITIAL_STATESTACK;
3177 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3178 if (!gData->stateStack)
3179 goto bad;
3181 gData->stateStackTop = 0;
3182 gData->cx = cx;
3183 gData->regexp = re;
3184 gData->ok = TRUE;
3186 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3187 if (!result)
3188 goto bad;
3190 for (i = 0; i < re->classCount; i++) {
3191 if (!re->classList[i].converted &&
3192 !ProcessCharSet(gData, &re->classList[i])) {
3193 return NULL;
3197 return result;
3199 bad:
3200 js_ReportOutOfScriptQuota(cx);
3201 gData->ok = FALSE;
3202 return NULL;
3205 static void
3206 js_DestroyRegExp(JSRegExp *re)
3208 if (re->classList) {
3209 UINT i;
3210 for (i = 0; i < re->classCount; i++) {
3211 if (re->classList[i].converted)
3212 heap_free(re->classList[i].u.bits);
3213 re->classList[i].u.bits = NULL;
3215 heap_free(re->classList);
3217 heap_free(re);
3220 static JSRegExp *
3221 js_NewRegExp(script_ctx_t *cx, jsstr_t *str, UINT flags, BOOL flat)
3223 JSRegExp *re;
3224 jsheap_t *mark;
3225 CompilerState state;
3226 size_t resize;
3227 jsbytecode *endPC;
3228 UINT i;
3229 size_t len;
3231 re = NULL;
3232 mark = jsheap_mark(&cx->tmp_heap);
3233 len = jsstr_length(str);
3235 state.context = cx;
3236 state.cp = str->str;
3237 if (!state.cp)
3238 goto out;
3239 state.cpbegin = state.cp;
3240 state.cpend = state.cp + len;
3241 state.flags = flags;
3242 state.parenCount = 0;
3243 state.classCount = 0;
3244 state.progLength = 0;
3245 state.treeDepth = 0;
3246 state.classBitmapsMem = 0;
3247 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3248 state.classCache[i].start = NULL;
3250 if (len != 0 && flat) {
3251 state.result = NewRENode(&state, REOP_FLAT);
3252 if (!state.result)
3253 goto out;
3254 state.result->u.flat.chr = *state.cpbegin;
3255 state.result->u.flat.length = len;
3256 state.result->kid = (void *) state.cpbegin;
3257 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3258 state.progLength += 1 + GetCompactIndexWidth(0)
3259 + GetCompactIndexWidth(len);
3260 } else {
3261 if (!ParseRegExp(&state))
3262 goto out;
3264 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3265 re = heap_alloc(resize);
3266 if (!re)
3267 goto out;
3269 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3270 re->classCount = state.classCount;
3271 if (re->classCount) {
3272 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3273 if (!re->classList) {
3274 js_DestroyRegExp(re);
3275 re = NULL;
3276 goto out;
3278 for (i = 0; i < re->classCount; i++)
3279 re->classList[i].converted = FALSE;
3280 } else {
3281 re->classList = NULL;
3283 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3284 if (!endPC) {
3285 js_DestroyRegExp(re);
3286 re = NULL;
3287 goto out;
3289 *endPC++ = REOP_END;
3291 * Check whether size was overestimated and shrink using realloc.
3292 * This is safe since no pointers to newly parsed regexp or its parts
3293 * besides re exist here.
3295 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3296 JSRegExp *tmp;
3297 assert((size_t)(endPC - re->program) < state.progLength + 1);
3298 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3299 tmp = heap_realloc(re, resize);
3300 if (tmp)
3301 re = tmp;
3304 re->flags = flags;
3305 re->parenCount = state.parenCount;
3306 re->source = str;
3308 out:
3309 jsheap_clear(mark);
3310 return re;
3313 static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
3315 return (RegExpInstance*)vdisp->u.jsdisp;
3318 static void set_last_index(RegExpInstance *This, DWORD last_index)
3320 This->last_index = last_index;
3321 jsval_release(This->last_index_val);
3322 This->last_index_val = jsval_number(last_index);
3325 static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, DWORD rem_flags,
3326 jsstr_t *str, const WCHAR **cp, match_result_t **parens, DWORD *parens_size,
3327 DWORD *parens_cnt, match_result_t *ret)
3329 REMatchState *x, *result;
3330 REGlobalData gData;
3331 DWORD matchlen;
3333 gData.cpbegin = str->str;
3334 gData.cpend = str->str + jsstr_length(str);
3335 gData.start = *cp-str->str;
3336 gData.skipped = 0;
3337 gData.pool = &ctx->tmp_heap;
3339 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3340 if(!x) {
3341 WARN("InitMatch failed\n");
3342 return E_FAIL;
3345 x->cp = *cp;
3346 result = MatchRegExp(&gData, x);
3347 if(!gData.ok) {
3348 WARN("MatchRegExp failed\n");
3349 return E_FAIL;
3352 if(!result) {
3353 if(rem_flags & REM_RESET_INDEX)
3354 set_last_index(regexp, 0);
3355 return S_FALSE;
3358 if(parens) {
3359 if(regexp->jsregexp->parenCount > *parens_size) {
3360 match_result_t *new_parens;
3362 if(*parens)
3363 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3364 else
3365 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3366 if(!new_parens)
3367 return E_OUTOFMEMORY;
3369 *parens = new_parens;
3373 if(!(rem_flags & REM_NO_CTX_UPDATE) && ctx->last_match != str) {
3374 jsstr_release(ctx->last_match);
3375 ctx->last_match = jsstr_addref(str);
3378 if(parens) {
3379 DWORD i;
3381 *parens_cnt = regexp->jsregexp->parenCount;
3383 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3384 if(result->parens[i].index == -1) {
3385 (*parens)[i].str = NULL;
3386 (*parens)[i].len = 0;
3387 }else {
3388 (*parens)[i].str = str->str + result->parens[i].index;
3389 (*parens)[i].len = result->parens[i].length;
3394 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3395 DWORD i, n = min(sizeof(ctx->match_parens)/sizeof(ctx->match_parens[0]), regexp->jsregexp->parenCount);
3397 for(i=0; i < n; i++) {
3398 if(result->parens[i].index == -1) {
3399 ctx->match_parens[i].str = NULL;
3400 ctx->match_parens[i].len = 0;
3401 }else {
3402 ctx->match_parens[i].str = ctx->last_match->str + result->parens[i].index;
3403 ctx->match_parens[i].len = result->parens[i].length;
3407 if(n < sizeof(ctx->match_parens)/sizeof(ctx->match_parens[0]))
3408 memset(ctx->match_parens+n, 0, sizeof(ctx->match_parens) - n*sizeof(ctx->match_parens[0]));
3411 matchlen = (result->cp-*cp) - gData.skipped;
3412 *cp = result->cp;
3413 ret->str = result->cp-matchlen;
3414 ret->len = matchlen;
3415 set_last_index(regexp, result->cp-str->str);
3417 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3418 ctx->last_match_index = ret->str-str->str;
3419 ctx->last_match_length = matchlen;
3422 return S_OK;
3425 HRESULT regexp_match_next(script_ctx_t *ctx, jsdisp_t *dispex, DWORD rem_flags, jsstr_t *str,
3426 const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt,
3427 match_result_t *ret)
3429 RegExpInstance *regexp = (RegExpInstance*)dispex;
3430 jsheap_t *mark;
3431 HRESULT hres;
3433 if((rem_flags & REM_CHECK_GLOBAL) && !(regexp->jsregexp->flags & JSREG_GLOB))
3434 return S_FALSE;
3436 mark = jsheap_mark(&ctx->tmp_heap);
3438 hres = do_regexp_match_next(ctx, regexp, rem_flags, str, cp, parens, parens_size, parens_cnt, ret);
3440 jsheap_clear(mark);
3441 return hres;
3444 static HRESULT regexp_match(script_ctx_t *ctx, jsdisp_t *dispex, jsstr_t *str, BOOL gflag,
3445 match_result_t **match_result, DWORD *result_cnt)
3447 RegExpInstance *This = (RegExpInstance*)dispex;
3448 match_result_t *ret = NULL, cres;
3449 const WCHAR *cp = str->str;
3450 DWORD i=0, ret_size = 0;
3451 jsheap_t *mark;
3452 HRESULT hres;
3454 mark = jsheap_mark(&ctx->tmp_heap);
3456 while(1) {
3457 hres = do_regexp_match_next(ctx, This, 0, str, &cp, NULL, NULL, NULL, &cres);
3458 if(hres == S_FALSE) {
3459 hres = S_OK;
3460 break;
3463 if(FAILED(hres))
3464 break;
3466 if(ret_size == i) {
3467 if(ret)
3468 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3469 else
3470 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3471 if(!ret) {
3472 hres = E_OUTOFMEMORY;
3473 break;
3477 ret[i++] = cres;
3479 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3480 hres = S_OK;
3481 break;
3485 jsheap_clear(mark);
3486 if(FAILED(hres)) {
3487 heap_free(ret);
3488 return hres;
3491 *match_result = ret;
3492 *result_cnt = i;
3493 return S_OK;
3496 static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3497 jsval_t *r)
3499 TRACE("\n");
3501 switch(flags) {
3502 case DISPATCH_PROPERTYGET: {
3503 RegExpInstance *This = regexp_from_vdisp(jsthis);
3504 *r = jsval_string(jsstr_addref(This->str));
3505 break;
3507 default:
3508 FIXME("Unimplemented flags %x\n", flags);
3509 return E_NOTIMPL;
3512 return S_OK;
3515 static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3516 jsval_t *r)
3518 FIXME("\n");
3519 return E_NOTIMPL;
3522 static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3523 jsval_t *r)
3525 FIXME("\n");
3526 return E_NOTIMPL;
3529 static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3530 jsval_t *r)
3532 FIXME("\n");
3533 return E_NOTIMPL;
3536 static INT index_from_val(script_ctx_t *ctx, jsval_t v)
3538 double n;
3539 HRESULT hres;
3541 hres = to_number(ctx, v, &n);
3542 if(FAILED(hres)) {
3543 clear_ei(ctx); /* FIXME: Move ignoring exceptions to to_primitive */
3544 return 0;
3547 n = floor(n);
3548 return is_int32(n) ? n : 0;
3551 static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3552 jsval_t *r)
3554 TRACE("\n");
3556 switch(flags) {
3557 case DISPATCH_PROPERTYGET: {
3558 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3560 return jsval_copy(regexp->last_index_val, r);
3562 case DISPATCH_PROPERTYPUT: {
3563 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3564 HRESULT hres;
3566 hres = jsval_copy(argv[0], &regexp->last_index_val);
3567 if(FAILED(hres))
3568 return hres;
3570 regexp->last_index = index_from_val(ctx, argv[0]);
3571 break;
3573 default:
3574 FIXME("unimplemented flags: %x\n", flags);
3575 return E_NOTIMPL;
3578 return S_OK;
3581 static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3582 jsval_t *r)
3584 FIXME("\n");
3585 return E_NOTIMPL;
3588 static HRESULT create_match_array(script_ctx_t *ctx, jsstr_t *input, const match_result_t *result,
3589 const match_result_t *parens, DWORD parens_cnt, IDispatch **ret)
3591 jsdisp_t *array;
3592 jsstr_t *str;
3593 int i;
3594 HRESULT hres = S_OK;
3596 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3597 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3598 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3599 static const WCHAR zeroW[] = {'0',0};
3601 hres = create_array(ctx, parens_cnt+1, &array);
3602 if(FAILED(hres))
3603 return hres;
3605 for(i=0; i < parens_cnt; i++) {
3606 str = jsstr_alloc_len(parens[i].str, parens[i].len);
3607 if(!str) {
3608 hres = E_OUTOFMEMORY;
3609 break;
3612 hres = jsdisp_propput_idx(array, i+1, jsval_string(str));
3613 jsstr_release(str);
3614 if(FAILED(hres))
3615 break;
3618 while(SUCCEEDED(hres)) {
3619 hres = jsdisp_propput_name(array, indexW, jsval_number(result->str-input->str));
3620 if(FAILED(hres))
3621 break;
3623 hres = jsdisp_propput_name(array, lastIndexW, jsval_number(result->str-input->str+result->len));
3624 if(FAILED(hres))
3625 break;
3627 hres = jsdisp_propput_name(array, inputW, jsval_string(jsstr_addref(input)));
3628 if(FAILED(hres))
3629 break;
3631 str = jsstr_alloc_len(result->str, result->len);
3632 if(!str) {
3633 hres = E_OUTOFMEMORY;
3634 break;
3636 hres = jsdisp_propput_name(array, zeroW, jsval_string(str));
3637 jsstr_release(str);
3638 break;
3641 if(FAILED(hres)) {
3642 jsdisp_release(array);
3643 return hres;
3646 *ret = to_disp(array);
3647 return S_OK;
3650 static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, jsval_t arg, jsstr_t **input,
3651 match_result_t *match, match_result_t **parens, DWORD *parens_cnt, BOOL *ret)
3653 RegExpInstance *regexp;
3654 DWORD parens_size = 0, last_index = 0;
3655 const WCHAR *cp;
3656 jsstr_t *string;
3657 HRESULT hres;
3659 if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
3660 FIXME("Not a RegExp\n");
3661 return E_NOTIMPL;
3664 regexp = regexp_from_vdisp(jsthis);
3666 hres = to_string(ctx, arg, &string);
3667 if(FAILED(hres))
3668 return hres;
3670 if(regexp->jsregexp->flags & JSREG_GLOB) {
3671 if(regexp->last_index < 0) {
3672 jsstr_release(string);
3673 set_last_index(regexp, 0);
3674 *ret = FALSE;
3675 if(input)
3676 *input = jsstr_empty();
3677 return S_OK;
3680 last_index = regexp->last_index;
3683 cp = string->str + last_index;
3684 hres = regexp_match_next(ctx, &regexp->dispex, REM_RESET_INDEX, string, &cp, parens,
3685 parens ? &parens_size : NULL, parens_cnt, match);
3686 if(FAILED(hres)) {
3687 jsstr_release(string);
3688 return hres;
3691 *ret = hres == S_OK;
3692 if(input)
3693 *input = string;
3694 else
3695 jsstr_release(string);
3696 return S_OK;
3699 static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3700 jsval_t *r)
3702 match_result_t *parens = NULL, match;
3703 DWORD parens_cnt = 0;
3704 BOOL b;
3705 jsstr_t *string;
3706 HRESULT hres;
3708 TRACE("\n");
3710 hres = run_exec(ctx, jsthis, argc ? argv[0] : jsval_string(jsstr_empty()), &string, &match, &parens, &parens_cnt, &b);
3711 if(FAILED(hres))
3712 return hres;
3714 if(r) {
3715 if(b) {
3716 IDispatch *ret;
3718 hres = create_match_array(ctx, string, &match, parens, parens_cnt, &ret);
3719 if(SUCCEEDED(hres))
3720 *r = jsval_disp(ret);
3721 }else {
3722 *r = jsval_null();
3726 heap_free(parens);
3727 jsstr_release(string);
3728 return hres;
3731 static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3732 jsval_t *r)
3734 match_result_t match;
3735 jsstr_t *undef_str;
3736 BOOL b;
3737 HRESULT hres;
3739 TRACE("\n");
3741 if(!argc) {
3742 undef_str = jsstr_alloc(undefinedW);
3743 if(!undef_str)
3744 return E_OUTOFMEMORY;
3747 hres = run_exec(ctx, jsthis, argc ? argv[0] : jsval_string(undef_str), NULL, &match, NULL, NULL, &b);
3748 if(!argc)
3749 jsstr_release(undef_str);
3750 if(FAILED(hres))
3751 return hres;
3753 if(r)
3754 *r = jsval_bool(b);
3755 return S_OK;
3758 static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3759 jsval_t *r)
3761 TRACE("\n");
3763 switch(flags) {
3764 case INVOKE_FUNC:
3765 return throw_type_error(ctx, JS_E_FUNCTION_EXPECTED, NULL);
3766 default:
3767 FIXME("unimplemented flags %x\n", flags);
3768 return E_NOTIMPL;
3771 return S_OK;
3774 static void RegExp_destructor(jsdisp_t *dispex)
3776 RegExpInstance *This = (RegExpInstance*)dispex;
3778 if(This->jsregexp)
3779 js_DestroyRegExp(This->jsregexp);
3780 jsval_release(This->last_index_val);
3781 jsstr_release(This->str);
3782 heap_free(This);
3785 static const builtin_prop_t RegExp_props[] = {
3786 {execW, RegExp_exec, PROPF_METHOD|1},
3787 {globalW, RegExp_global, 0},
3788 {ignoreCaseW, RegExp_ignoreCase, 0},
3789 {lastIndexW, RegExp_lastIndex, 0},
3790 {multilineW, RegExp_multiline, 0},
3791 {sourceW, RegExp_source, 0},
3792 {testW, RegExp_test, PROPF_METHOD|1},
3793 {toStringW, RegExp_toString, PROPF_METHOD}
3796 static const builtin_info_t RegExp_info = {
3797 JSCLASS_REGEXP,
3798 {NULL, RegExp_value, 0},
3799 sizeof(RegExp_props)/sizeof(*RegExp_props),
3800 RegExp_props,
3801 RegExp_destructor,
3802 NULL
3805 static const builtin_prop_t RegExpInst_props[] = {
3806 {globalW, RegExp_global, 0},
3807 {ignoreCaseW, RegExp_ignoreCase, 0},
3808 {lastIndexW, RegExp_lastIndex, 0},
3809 {multilineW, RegExp_multiline, 0},
3810 {sourceW, RegExp_source, 0}
3813 static const builtin_info_t RegExpInst_info = {
3814 JSCLASS_REGEXP,
3815 {NULL, RegExp_value, 0},
3816 sizeof(RegExpInst_props)/sizeof(*RegExpInst_props),
3817 RegExpInst_props,
3818 RegExp_destructor,
3819 NULL
3822 static HRESULT alloc_regexp(script_ctx_t *ctx, jsdisp_t *object_prototype, RegExpInstance **ret)
3824 RegExpInstance *regexp;
3825 HRESULT hres;
3827 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3828 if(!regexp)
3829 return E_OUTOFMEMORY;
3831 if(object_prototype)
3832 hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
3833 else
3834 hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExpInst_info, ctx->regexp_constr);
3836 if(FAILED(hres)) {
3837 heap_free(regexp);
3838 return hres;
3841 *ret = regexp;
3842 return S_OK;
3845 HRESULT create_regexp(script_ctx_t *ctx, jsstr_t *src, DWORD flags, jsdisp_t **ret)
3847 RegExpInstance *regexp;
3848 HRESULT hres;
3850 TRACE("%s %x\n", debugstr_jsstr(src), flags);
3852 hres = alloc_regexp(ctx, NULL, &regexp);
3853 if(FAILED(hres))
3854 return hres;
3856 regexp->str = jsstr_addref(src);
3857 regexp->last_index_val = jsval_number(0);
3859 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3860 if(!regexp->jsregexp) {
3861 WARN("js_NewRegExp failed\n");
3862 jsdisp_release(&regexp->dispex);
3863 return E_FAIL;
3866 *ret = &regexp->dispex;
3867 return S_OK;
3870 HRESULT create_regexp_var(script_ctx_t *ctx, jsval_t src_arg, jsval_t *flags_arg, jsdisp_t **ret)
3872 jsstr_t *src, *opt = NULL;
3873 DWORD flags;
3874 HRESULT hres;
3876 if(is_object_instance(src_arg)) {
3877 jsdisp_t *obj;
3879 obj = iface_to_jsdisp((IUnknown*)get_object(src_arg));
3880 if(obj) {
3881 if(is_class(obj, JSCLASS_REGEXP)) {
3882 RegExpInstance *regexp = (RegExpInstance*)obj;
3884 hres = create_regexp(ctx, regexp->str, regexp->jsregexp->flags, ret);
3885 jsdisp_release(obj);
3886 return hres;
3889 jsdisp_release(obj);
3893 if(!is_string(src_arg)) {
3894 FIXME("src_arg = %s\n", debugstr_jsval(src_arg));
3895 return E_NOTIMPL;
3898 src = get_string(src_arg);
3900 if(flags_arg) {
3901 if(!is_string(*flags_arg)) {
3902 FIXME("unimplemented for %s\n", debugstr_jsval(*flags_arg));
3903 return E_NOTIMPL;
3906 opt = get_string(*flags_arg);
3909 hres = parse_regexp_flags(opt ? opt->str : NULL, opt ? jsstr_length(opt) : 0, &flags);
3910 if(FAILED(hres))
3911 return hres;
3913 return create_regexp(ctx, src, flags, ret);
3916 HRESULT regexp_string_match(script_ctx_t *ctx, jsdisp_t *re, jsstr_t *str, jsval_t *r)
3918 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3919 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3920 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3922 RegExpInstance *regexp = (RegExpInstance*)re;
3923 match_result_t *match_result;
3924 unsigned match_cnt, i;
3925 jsdisp_t *array;
3926 HRESULT hres;
3928 if(!(regexp->jsregexp->flags & JSREG_GLOB)) {
3929 match_result_t match, *parens = NULL;
3930 DWORD parens_cnt, parens_size = 0;
3931 const WCHAR *cp = str->str;
3933 hres = regexp_match_next(ctx, &regexp->dispex, 0, str, &cp, &parens, &parens_size, &parens_cnt, &match);
3934 if(FAILED(hres))
3935 return hres;
3937 if(r) {
3938 if(hres == S_OK) {
3939 IDispatch *ret;
3941 hres = create_match_array(ctx, str, &match, parens, parens_cnt, &ret);
3942 if(SUCCEEDED(hres))
3943 *r = jsval_disp(ret);
3944 }else {
3945 *r = jsval_null();
3949 heap_free(parens);
3950 return S_OK;
3953 hres = regexp_match(ctx, &regexp->dispex, str, FALSE, &match_result, &match_cnt);
3954 if(FAILED(hres))
3955 return hres;
3957 if(!match_cnt) {
3958 TRACE("no match\n");
3960 if(r)
3961 *r = jsval_null();
3962 return S_OK;
3965 hres = create_array(ctx, match_cnt, &array);
3966 if(FAILED(hres))
3967 return hres;
3969 for(i=0; i < match_cnt; i++) {
3970 jsstr_t *tmp_str;
3972 tmp_str = jsstr_alloc_len(match_result[i].str, match_result[i].len);
3973 if(!tmp_str) {
3974 hres = E_OUTOFMEMORY;
3975 break;
3978 hres = jsdisp_propput_idx(array, i, jsval_string(tmp_str));
3979 jsstr_release(tmp_str);
3980 if(FAILED(hres))
3981 break;
3984 while(SUCCEEDED(hres)) {
3985 hres = jsdisp_propput_name(array, indexW, jsval_number(match_result[match_cnt-1].str-str->str));
3986 if(FAILED(hres))
3987 break;
3989 hres = jsdisp_propput_name(array, lastIndexW,
3990 jsval_number(match_result[match_cnt-1].str-str->str+match_result[match_cnt-1].len));
3991 if(FAILED(hres))
3992 break;
3994 hres = jsdisp_propput_name(array, inputW, jsval_string(str));
3995 break;
3998 heap_free(match_result);
4000 if(SUCCEEDED(hres) && r)
4001 *r = jsval_obj(array);
4002 else
4003 jsdisp_release(array);
4004 return hres;
4007 static HRESULT global_idx(script_ctx_t *ctx, DWORD flags, DWORD idx, jsval_t *r)
4009 switch(flags) {
4010 case DISPATCH_PROPERTYGET: {
4011 jsstr_t *ret;
4013 ret = jsstr_alloc_len(ctx->match_parens[idx].str, ctx->match_parens[idx].len);
4014 if(!ret)
4015 return E_OUTOFMEMORY;
4017 *r = jsval_string(ret);
4018 break;
4020 case DISPATCH_PROPERTYPUT:
4021 break;
4022 default:
4023 FIXME("unsupported flags\n");
4024 return E_NOTIMPL;
4027 return S_OK;
4030 static HRESULT RegExpConstr_idx1(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4031 unsigned argc, jsval_t *argv, jsval_t *r)
4033 TRACE("\n");
4034 return global_idx(ctx, flags, 0, r);
4037 static HRESULT RegExpConstr_idx2(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4038 unsigned argc, jsval_t *argv, jsval_t *r)
4040 TRACE("\n");
4041 return global_idx(ctx, flags, 1, r);
4044 static HRESULT RegExpConstr_idx3(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4045 unsigned argc, jsval_t *argv, jsval_t *r)
4047 TRACE("\n");
4048 return global_idx(ctx, flags, 2, r);
4051 static HRESULT RegExpConstr_idx4(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4052 unsigned argc, jsval_t *argv, jsval_t *r)
4054 TRACE("\n");
4055 return global_idx(ctx, flags, 3, r);
4058 static HRESULT RegExpConstr_idx5(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4059 unsigned argc, jsval_t *argv, jsval_t *r)
4061 TRACE("\n");
4062 return global_idx(ctx, flags, 4, r);
4065 static HRESULT RegExpConstr_idx6(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4066 unsigned argc, jsval_t *argv, jsval_t *r)
4068 TRACE("\n");
4069 return global_idx(ctx, flags, 5, r);
4072 static HRESULT RegExpConstr_idx7(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4073 unsigned argc, jsval_t *argv, jsval_t *r)
4075 TRACE("\n");
4076 return global_idx(ctx, flags, 6, r);
4079 static HRESULT RegExpConstr_idx8(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4080 unsigned argc, jsval_t *argv, jsval_t *r)
4082 TRACE("\n");
4083 return global_idx(ctx, flags, 7, r);
4086 static HRESULT RegExpConstr_idx9(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4087 unsigned argc, jsval_t *argv, jsval_t *r)
4089 TRACE("\n");
4090 return global_idx(ctx, flags, 8, r);
4093 static HRESULT RegExpConstr_leftContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4094 unsigned argc, jsval_t *argv, jsval_t *r)
4096 TRACE("\n");
4098 switch(flags) {
4099 case DISPATCH_PROPERTYGET: {
4100 jsstr_t *ret;
4102 ret = jsstr_alloc_len(ctx->last_match->str, ctx->last_match_index);
4103 if(!ret)
4104 return E_OUTOFMEMORY;
4106 *r = jsval_string(ret);
4107 break;
4109 case DISPATCH_PROPERTYPUT:
4110 break;
4111 default:
4112 FIXME("unsupported flags\n");
4113 return E_NOTIMPL;
4116 return S_OK;
4119 static HRESULT RegExpConstr_rightContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4120 unsigned argc, jsval_t *argv, jsval_t *r)
4122 TRACE("\n");
4124 switch(flags) {
4125 case DISPATCH_PROPERTYGET: {
4126 jsstr_t *ret;
4128 ret = jsstr_alloc(ctx->last_match->str+ctx->last_match_index+ctx->last_match_length);
4129 if(!ret)
4130 return E_OUTOFMEMORY;
4132 *r = jsval_string(ret);
4133 break;
4135 case DISPATCH_PROPERTYPUT:
4136 break;
4137 default:
4138 FIXME("unsupported flags\n");
4139 return E_NOTIMPL;
4142 return S_OK;
4145 static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
4146 jsval_t *r)
4148 TRACE("\n");
4150 switch(flags) {
4151 case DISPATCH_METHOD:
4152 if(argc) {
4153 if(is_object_instance(argv[0])) {
4154 jsdisp_t *jsdisp = iface_to_jsdisp((IUnknown*)get_object(argv[0]));
4155 if(jsdisp) {
4156 if(is_class(jsdisp, JSCLASS_REGEXP)) {
4157 if(argc > 1 && !is_undefined(argv[1])) {
4158 jsdisp_release(jsdisp);
4159 return throw_regexp_error(ctx, JS_E_REGEXP_SYNTAX, NULL);
4162 if(r)
4163 *r = jsval_obj(jsdisp);
4164 else
4165 jsdisp_release(jsdisp);
4166 return S_OK;
4168 jsdisp_release(jsdisp);
4172 /* fall through */
4173 case DISPATCH_CONSTRUCT: {
4174 jsdisp_t *ret;
4175 HRESULT hres;
4177 if(!argc) {
4178 FIXME("no args\n");
4179 return E_NOTIMPL;
4182 hres = create_regexp_var(ctx, argv[0], argc > 1 ? argv+1 : NULL, &ret);
4183 if(FAILED(hres))
4184 return hres;
4186 if(r)
4187 *r = jsval_obj(ret);
4188 else
4189 jsdisp_release(ret);
4190 return S_OK;
4192 default:
4193 FIXME("unimplemented flags: %x\n", flags);
4194 return E_NOTIMPL;
4197 return S_OK;
4200 static const builtin_prop_t RegExpConstr_props[] = {
4201 {idx1W, RegExpConstr_idx1, 0},
4202 {idx2W, RegExpConstr_idx2, 0},
4203 {idx3W, RegExpConstr_idx3, 0},
4204 {idx4W, RegExpConstr_idx4, 0},
4205 {idx5W, RegExpConstr_idx5, 0},
4206 {idx6W, RegExpConstr_idx6, 0},
4207 {idx7W, RegExpConstr_idx7, 0},
4208 {idx8W, RegExpConstr_idx8, 0},
4209 {idx9W, RegExpConstr_idx9, 0},
4210 {leftContextW, RegExpConstr_leftContext, 0},
4211 {rightContextW, RegExpConstr_rightContext, 0}
4214 static const builtin_info_t RegExpConstr_info = {
4215 JSCLASS_FUNCTION,
4216 {NULL, Function_value, 0},
4217 sizeof(RegExpConstr_props)/sizeof(*RegExpConstr_props),
4218 RegExpConstr_props,
4219 NULL,
4220 NULL
4223 HRESULT create_regexp_constr(script_ctx_t *ctx, jsdisp_t *object_prototype, jsdisp_t **ret)
4225 RegExpInstance *regexp;
4226 HRESULT hres;
4228 static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
4230 hres = alloc_regexp(ctx, object_prototype, &regexp);
4231 if(FAILED(hres))
4232 return hres;
4234 hres = create_builtin_constructor(ctx, RegExpConstr_value, RegExpW, &RegExpConstr_info,
4235 PROPF_CONSTR|2, &regexp->dispex, ret);
4237 jsdisp_release(&regexp->dispex);
4238 return hres;
4241 HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
4243 const WCHAR *p;
4244 DWORD flags = 0;
4246 for (p = str; p < str+str_len; p++) {
4247 switch (*p) {
4248 case 'g':
4249 flags |= JSREG_GLOB;
4250 break;
4251 case 'i':
4252 flags |= JSREG_FOLD;
4253 break;
4254 case 'm':
4255 flags |= JSREG_MULTILINE;
4256 break;
4257 case 'y':
4258 flags |= JSREG_STICKY;
4259 break;
4260 default:
4261 WARN("wrong flag %c\n", *p);
4262 return E_FAIL;
4266 *ret = flags;
4267 return S_OK;