winex11: Separate fetching the window icon bits and setting the WM hints.
[wine.git] / dlls / jscript / regexp.c
blobb53b8487437582348a2b981138a0bc95916218cc
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
35 #include <math.h>
37 #include "jscript.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 BSTR source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
81 typedef struct {
82 jsdisp_t dispex;
84 JSRegExp *jsregexp;
85 BSTR str;
86 INT last_index;
87 jsval_t last_index_val;
88 } RegExpInstance;
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
99 static const WCHAR leftContextW[] =
100 {'l','e','f','t','C','o','n','t','e','x','t',0};
101 static const WCHAR rightContextW[] =
102 {'r','i','g','h','t','C','o','n','t','e','x','t',0};
104 static const WCHAR idx1W[] = {'$','1',0};
105 static const WCHAR idx2W[] = {'$','2',0};
106 static const WCHAR idx3W[] = {'$','3',0};
107 static const WCHAR idx4W[] = {'$','4',0};
108 static const WCHAR idx5W[] = {'$','5',0};
109 static const WCHAR idx6W[] = {'$','6',0};
110 static const WCHAR idx7W[] = {'$','7',0};
111 static const WCHAR idx8W[] = {'$','8',0};
112 static const WCHAR idx9W[] = {'$','9',0};
114 static const WCHAR undefinedW[] = {'u','n','d','e','f','i','n','e','d',0};
115 static const WCHAR emptyW[] = {0};
117 /* FIXME: Better error handling */
118 #define ReportRegExpError(a,b,c)
119 #define ReportRegExpErrorHelper(a,b,c,d)
120 #define JS_ReportErrorNumber(a,b,c,d)
121 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
122 #define js_ReportOutOfScriptQuota(a)
123 #define JS_ReportOutOfMemory(a)
124 #define JS_COUNT_OPERATION(a,b)
126 #define JSMSG_MIN_TOO_BIG 47
127 #define JSMSG_MAX_TOO_BIG 48
128 #define JSMSG_OUT_OF_ORDER 49
129 #define JSMSG_OUT_OF_MEMORY 137
131 #define LINE_SEPARATOR 0x2028
132 #define PARA_SEPARATOR 0x2029
134 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
135 ((c >= 'a') && (c <= 'z')) )
136 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
137 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
139 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
141 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
142 #define JS7_UNDEC(c) ((c) - '0')
144 typedef enum REOp {
145 REOP_EMPTY,
146 REOP_BOL,
147 REOP_EOL,
148 REOP_WBDRY,
149 REOP_WNONBDRY,
150 REOP_DOT,
151 REOP_DIGIT,
152 REOP_NONDIGIT,
153 REOP_ALNUM,
154 REOP_NONALNUM,
155 REOP_SPACE,
156 REOP_NONSPACE,
157 REOP_BACKREF,
158 REOP_FLAT,
159 REOP_FLAT1,
160 REOP_FLATi,
161 REOP_FLAT1i,
162 REOP_UCFLAT1,
163 REOP_UCFLAT1i,
164 REOP_UCFLAT,
165 REOP_UCFLATi,
166 REOP_CLASS,
167 REOP_NCLASS,
168 REOP_ALT,
169 REOP_QUANT,
170 REOP_STAR,
171 REOP_PLUS,
172 REOP_OPT,
173 REOP_LPAREN,
174 REOP_RPAREN,
175 REOP_JUMP,
176 REOP_DOTSTAR,
177 REOP_LPARENNON,
178 REOP_ASSERT,
179 REOP_ASSERT_NOT,
180 REOP_ASSERTTEST,
181 REOP_ASSERTNOTTEST,
182 REOP_MINIMALSTAR,
183 REOP_MINIMALPLUS,
184 REOP_MINIMALOPT,
185 REOP_MINIMALQUANT,
186 REOP_ENDCHILD,
187 REOP_REPEAT,
188 REOP_MINIMALREPEAT,
189 REOP_ALTPREREQ,
190 REOP_ALTPREREQ2,
191 REOP_ENDALT,
192 REOP_CONCAT,
193 REOP_END,
194 REOP_LIMIT /* META: no operator >= to this */
195 } REOp;
197 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
199 static const char *reop_names[] = {
200 "empty",
201 "bol",
202 "eol",
203 "wbdry",
204 "wnonbdry",
205 "dot",
206 "digit",
207 "nondigit",
208 "alnum",
209 "nonalnum",
210 "space",
211 "nonspace",
212 "backref",
213 "flat",
214 "flat1",
215 "flati",
216 "flat1i",
217 "ucflat1",
218 "ucflat1i",
219 "ucflat",
220 "ucflati",
221 "class",
222 "nclass",
223 "alt",
224 "quant",
225 "star",
226 "plus",
227 "opt",
228 "lparen",
229 "rparen",
230 "jump",
231 "dotstar",
232 "lparennon",
233 "assert",
234 "assert_not",
235 "asserttest",
236 "assertnottest",
237 "minimalstar",
238 "minimalplus",
239 "minimalopt",
240 "minimalquant",
241 "endchild",
242 "repeat",
243 "minimalrepeat",
244 "altprereq",
245 "alrprereq2",
246 "endalt",
247 "concat",
248 "end",
249 NULL
252 typedef struct RECapture {
253 ptrdiff_t index; /* start of contents, -1 for empty */
254 size_t length; /* length of capture */
255 } RECapture;
257 typedef struct REMatchState {
258 const WCHAR *cp;
259 RECapture parens[1]; /* first of 're->parenCount' captures,
260 allocated at end of this struct */
261 } REMatchState;
263 typedef struct REProgState {
264 jsbytecode *continue_pc; /* current continuation data */
265 jsbytecode continue_op;
266 ptrdiff_t index; /* progress in text */
267 size_t parenSoFar; /* highest indexed paren started */
268 union {
269 struct {
270 UINT min; /* current quantifier limits */
271 UINT max;
272 } quantifier;
273 struct {
274 size_t top; /* backtrack stack state */
275 size_t sz;
276 } assertion;
277 } u;
278 } REProgState;
280 typedef struct REBackTrackData {
281 size_t sz; /* size of previous stack entry */
282 jsbytecode *backtrack_pc; /* where to backtrack to */
283 jsbytecode backtrack_op;
284 const WCHAR *cp; /* index in text of match at backtrack */
285 size_t parenIndex; /* start index of saved paren contents */
286 size_t parenCount; /* # of saved paren contents */
287 size_t saveStateStackTop; /* number of parent states */
288 /* saved parent states follow */
289 /* saved paren contents follow */
290 } REBackTrackData;
292 #define INITIAL_STATESTACK 100
293 #define INITIAL_BACKTRACK 8000
295 typedef struct REGlobalData {
296 script_ctx_t *cx;
297 JSRegExp *regexp; /* the RE in execution */
298 BOOL ok; /* runtime error (out_of_memory only?) */
299 size_t start; /* offset to start at */
300 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
301 const WCHAR *cpbegin; /* text base address */
302 const WCHAR *cpend; /* text limit address */
304 REProgState *stateStack; /* stack of state of current parents */
305 size_t stateStackTop;
306 size_t stateStackLimit;
308 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
309 REBackTrackData *backTrackSP;
310 size_t backTrackStackSize;
311 size_t cursz; /* size of current stack entry */
312 size_t backTrackCount; /* how many times we've backtracked */
313 size_t backTrackLimit; /* upper limit on backtrack states */
315 jsheap_t *pool; /* It's faster to use one malloc'd pool
316 than to malloc/free the three items
317 that are allocated from this pool */
318 } REGlobalData;
320 typedef struct RENode RENode;
321 struct RENode {
322 REOp op; /* r.e. op bytecode */
323 RENode *next; /* next in concatenation order */
324 void *kid; /* first operand */
325 union {
326 void *kid2; /* second operand */
327 INT num; /* could be a number */
328 size_t parenIndex; /* or a parenthesis index */
329 struct { /* or a quantifier range */
330 UINT min;
331 UINT max;
332 JSPackedBool greedy;
333 } range;
334 struct { /* or a character class */
335 size_t startIndex;
336 size_t kidlen; /* length of string at kid, in jschars */
337 size_t index; /* index into class list */
338 WORD bmsize; /* bitmap size, based on max char code */
339 JSPackedBool sense;
340 } ucclass;
341 struct { /* or a literal sequence */
342 WCHAR chr; /* of one character */
343 size_t length; /* or many (via the kid) */
344 } flat;
345 struct {
346 RENode *kid2; /* second operand from ALT */
347 WCHAR ch1; /* match char for ALTPREREQ */
348 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
349 } altprereq;
350 } u;
353 #define CLASS_CACHE_SIZE 4
355 typedef struct CompilerState {
356 script_ctx_t *context;
357 const WCHAR *cpbegin;
358 const WCHAR *cpend;
359 const WCHAR *cp;
360 size_t parenCount;
361 size_t classCount; /* number of [] encountered */
362 size_t treeDepth; /* maximum depth of parse tree */
363 size_t progLength; /* estimated bytecode length */
364 RENode *result;
365 size_t classBitmapsMem; /* memory to hold all class bitmaps */
366 struct {
367 const WCHAR *start; /* small cache of class strings */
368 size_t length; /* since they're often the same */
369 size_t index;
370 } classCache[CLASS_CACHE_SIZE];
371 WORD flags;
372 } CompilerState;
374 typedef struct EmitStateStackEntry {
375 jsbytecode *altHead; /* start of REOP_ALT* opcode */
376 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
377 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
378 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
379 RENode *continueNode; /* original REOP_ALT* node being stacked */
380 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
381 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
382 avoid 16-bit unsigned offset overflow */
383 } EmitStateStackEntry;
386 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
387 * the getters and setters take the pc of the offset, not of the opcode before
388 * the offset.
390 #define ARG_LEN 2
391 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
392 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
393 (pc)[1] = (jsbytecode) (arg))
395 #define OFFSET_LEN ARG_LEN
396 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
397 #define GET_OFFSET(pc) GET_ARG(pc)
399 static BOOL ParseRegExp(CompilerState*);
402 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
403 * For sanity, we limit it to 2^24 bytes.
405 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
408 * The maximum memory that can be allocated for class bitmaps.
409 * For sanity, we limit it to 2^24 bytes.
411 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
414 * Functions to get size and write/read bytecode that represent small indexes
415 * compactly.
416 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
417 * indicates that the following byte brings more bits to the index. Otherwise
418 * this is the last byte in the index bytecode representing highest index bits.
420 static size_t
421 GetCompactIndexWidth(size_t index)
423 size_t width;
425 for (width = 1; (index >>= 7) != 0; ++width) { }
426 return width;
429 static inline jsbytecode *
430 WriteCompactIndex(jsbytecode *pc, size_t index)
432 size_t next;
434 while ((next = index >> 7) != 0) {
435 *pc++ = (jsbytecode)(index | 0x80);
436 index = next;
438 *pc++ = (jsbytecode)index;
439 return pc;
442 static inline jsbytecode *
443 ReadCompactIndex(jsbytecode *pc, size_t *result)
445 size_t nextByte;
447 nextByte = *pc++;
448 if ((nextByte & 0x80) == 0) {
450 * Short-circuit the most common case when compact index <= 127.
452 *result = nextByte;
453 } else {
454 size_t shift = 7;
455 *result = 0x7F & nextByte;
456 do {
457 nextByte = *pc++;
458 *result |= (nextByte & 0x7F) << shift;
459 shift += 7;
460 } while ((nextByte & 0x80) != 0);
462 return pc;
465 /* Construct and initialize an RENode, returning NULL for out-of-memory */
466 static RENode *
467 NewRENode(CompilerState *state, REOp op)
469 RENode *ren;
471 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
472 if (!ren) {
473 /* js_ReportOutOfScriptQuota(cx); */
474 return NULL;
476 ren->op = op;
477 ren->next = NULL;
478 ren->kid = NULL;
479 return ren;
483 * Validates and converts hex ascii value.
485 static BOOL
486 isASCIIHexDigit(WCHAR c, UINT *digit)
488 UINT cv = c;
490 if (cv < '0')
491 return FALSE;
492 if (cv <= '9') {
493 *digit = cv - '0';
494 return TRUE;
496 cv |= 0x20;
497 if (cv >= 'a' && cv <= 'f') {
498 *digit = cv - 'a' + 10;
499 return TRUE;
501 return FALSE;
504 typedef struct {
505 REOp op;
506 const WCHAR *errPos;
507 size_t parenIndex;
508 } REOpData;
510 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
511 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
513 static BOOL
514 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
516 ptrdiff_t offset = target - jump;
518 /* Check that target really points forward. */
519 assert(offset >= 2);
520 if ((size_t)offset > OFFSET_MAX)
521 return FALSE;
523 jump[0] = JUMP_OFFSET_HI(offset);
524 jump[1] = JUMP_OFFSET_LO(offset);
525 return TRUE;
529 * Generate bytecode for the tree rooted at t using an explicit stack instead
530 * of recursion.
532 static jsbytecode *
533 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
534 jsbytecode *pc, RENode *t)
536 EmitStateStackEntry *emitStateSP, *emitStateStack;
537 RECharSet *charSet;
538 REOp op;
540 if (treeDepth == 0) {
541 emitStateStack = NULL;
542 } else {
543 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
544 if (!emitStateStack)
545 return NULL;
547 emitStateSP = emitStateStack;
548 op = t->op;
549 assert(op < REOP_LIMIT);
551 for (;;) {
552 *pc++ = op;
553 switch (op) {
554 case REOP_EMPTY:
555 --pc;
556 break;
558 case REOP_ALTPREREQ2:
559 case REOP_ALTPREREQ:
560 assert(emitStateSP);
561 emitStateSP->altHead = pc - 1;
562 emitStateSP->endTermFixup = pc;
563 pc += OFFSET_LEN;
564 SET_ARG(pc, t->u.altprereq.ch1);
565 pc += ARG_LEN;
566 SET_ARG(pc, t->u.altprereq.ch2);
567 pc += ARG_LEN;
569 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
570 pc += OFFSET_LEN;
572 emitStateSP->continueNode = t;
573 emitStateSP->continueOp = REOP_JUMP;
574 emitStateSP->jumpToJumpFlag = FALSE;
575 ++emitStateSP;
576 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
577 t = t->kid;
578 op = t->op;
579 assert(op < REOP_LIMIT);
580 continue;
582 case REOP_JUMP:
583 emitStateSP->nextTermFixup = pc; /* offset to following term */
584 pc += OFFSET_LEN;
585 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
586 goto jump_too_big;
587 emitStateSP->continueOp = REOP_ENDALT;
588 ++emitStateSP;
589 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
590 t = t->u.kid2;
591 op = t->op;
592 assert(op < REOP_LIMIT);
593 continue;
595 case REOP_ENDALT:
597 * If we already patched emitStateSP->nextTermFixup to jump to
598 * a nearer jump, to avoid 16-bit immediate offset overflow, we
599 * are done here.
601 if (emitStateSP->jumpToJumpFlag)
602 break;
605 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
606 * REOP_ENDALT is executed only on successful match of the last
607 * alternate in a group.
609 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
610 goto jump_too_big;
611 if (t->op != REOP_ALT) {
612 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
613 goto jump_too_big;
617 * If the program is bigger than the REOP_JUMP offset range, then
618 * we must check for alternates before this one that are part of
619 * the same group, and fix up their jump offsets to target jumps
620 * close enough to fit in a 16-bit unsigned offset immediate.
622 if ((size_t)(pc - re->program) > OFFSET_MAX &&
623 emitStateSP > emitStateStack) {
624 EmitStateStackEntry *esp, *esp2;
625 jsbytecode *alt, *jump;
626 ptrdiff_t span, header;
628 esp2 = emitStateSP;
629 alt = esp2->altHead;
630 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
631 if (esp->continueOp == REOP_ENDALT &&
632 !esp->jumpToJumpFlag &&
633 esp->nextTermFixup + OFFSET_LEN == alt &&
634 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
635 ? esp->endTermFixup
636 : esp->nextTermFixup)) > OFFSET_MAX) {
637 alt = esp->altHead;
638 jump = esp->nextTermFixup;
641 * The span must be 1 less than the distance from
642 * jump offset to jump offset, so we actually jump
643 * to a REOP_JUMP bytecode, not to its offset!
645 for (;;) {
646 assert(jump < esp2->nextTermFixup);
647 span = esp2->nextTermFixup - jump - 1;
648 if ((size_t)span <= OFFSET_MAX)
649 break;
650 do {
651 if (--esp2 == esp)
652 goto jump_too_big;
653 } while (esp2->continueOp != REOP_ENDALT);
656 jump[0] = JUMP_OFFSET_HI(span);
657 jump[1] = JUMP_OFFSET_LO(span);
659 if (esp->continueNode->op != REOP_ALT) {
661 * We must patch the offset at esp->endTermFixup
662 * as well, for the REOP_ALTPREREQ{,2} opcodes.
663 * If we're unlucky and endTermFixup is more than
664 * OFFSET_MAX bytes from its target, we cheat by
665 * jumping 6 bytes to the jump whose offset is at
666 * esp->nextTermFixup, which has the same target.
668 jump = esp->endTermFixup;
669 header = esp->nextTermFixup - jump;
670 span += header;
671 if ((size_t)span > OFFSET_MAX)
672 span = header;
674 jump[0] = JUMP_OFFSET_HI(span);
675 jump[1] = JUMP_OFFSET_LO(span);
678 esp->jumpToJumpFlag = TRUE;
682 break;
684 case REOP_ALT:
685 assert(emitStateSP);
686 emitStateSP->altHead = pc - 1;
687 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
688 pc += OFFSET_LEN;
689 emitStateSP->continueNode = t;
690 emitStateSP->continueOp = REOP_JUMP;
691 emitStateSP->jumpToJumpFlag = FALSE;
692 ++emitStateSP;
693 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
694 t = t->kid;
695 op = t->op;
696 assert(op < REOP_LIMIT);
697 continue;
699 case REOP_FLAT:
701 * Coalesce FLATs if possible and if it would not increase bytecode
702 * beyond preallocated limit. The latter happens only when bytecode
703 * size for coalesced string with offset p and length 2 exceeds 6
704 * bytes preallocated for 2 single char nodes, i.e. when
705 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
706 * GetCompactIndexWidth(p) > 4.
707 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
708 * nodes strictly decreases bytecode size, the check has to be
709 * done only for the first coalescing.
711 if (t->kid &&
712 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
714 while (t->next &&
715 t->next->op == REOP_FLAT &&
716 (WCHAR*)t->kid + t->u.flat.length ==
717 t->next->kid) {
718 t->u.flat.length += t->next->u.flat.length;
719 t->next = t->next->next;
722 if (t->kid && t->u.flat.length > 1) {
723 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
724 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
725 pc = WriteCompactIndex(pc, t->u.flat.length);
726 } else if (t->u.flat.chr < 256) {
727 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
728 *pc++ = (jsbytecode) t->u.flat.chr;
729 } else {
730 pc[-1] = (state->flags & JSREG_FOLD)
731 ? REOP_UCFLAT1i
732 : REOP_UCFLAT1;
733 SET_ARG(pc, t->u.flat.chr);
734 pc += ARG_LEN;
736 break;
738 case REOP_LPAREN:
739 assert(emitStateSP);
740 pc = WriteCompactIndex(pc, t->u.parenIndex);
741 emitStateSP->continueNode = t;
742 emitStateSP->continueOp = REOP_RPAREN;
743 ++emitStateSP;
744 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
745 t = t->kid;
746 op = t->op;
747 continue;
749 case REOP_RPAREN:
750 pc = WriteCompactIndex(pc, t->u.parenIndex);
751 break;
753 case REOP_BACKREF:
754 pc = WriteCompactIndex(pc, t->u.parenIndex);
755 break;
757 case REOP_ASSERT:
758 assert(emitStateSP);
759 emitStateSP->nextTermFixup = pc;
760 pc += OFFSET_LEN;
761 emitStateSP->continueNode = t;
762 emitStateSP->continueOp = REOP_ASSERTTEST;
763 ++emitStateSP;
764 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
765 t = t->kid;
766 op = t->op;
767 continue;
769 case REOP_ASSERTTEST:
770 case REOP_ASSERTNOTTEST:
771 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
772 goto jump_too_big;
773 break;
775 case REOP_ASSERT_NOT:
776 assert(emitStateSP);
777 emitStateSP->nextTermFixup = pc;
778 pc += OFFSET_LEN;
779 emitStateSP->continueNode = t;
780 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
781 ++emitStateSP;
782 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
783 t = t->kid;
784 op = t->op;
785 continue;
787 case REOP_QUANT:
788 assert(emitStateSP);
789 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
790 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
791 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
792 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
793 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
794 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
795 } else {
796 if (!t->u.range.greedy)
797 pc[-1] = REOP_MINIMALQUANT;
798 pc = WriteCompactIndex(pc, t->u.range.min);
800 * Write max + 1 to avoid using size_t(max) + 1 bytes
801 * for (UINT)-1 sentinel.
803 pc = WriteCompactIndex(pc, t->u.range.max + 1);
805 emitStateSP->nextTermFixup = pc;
806 pc += OFFSET_LEN;
807 emitStateSP->continueNode = t;
808 emitStateSP->continueOp = REOP_ENDCHILD;
809 ++emitStateSP;
810 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
811 t = t->kid;
812 op = t->op;
813 continue;
815 case REOP_ENDCHILD:
816 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
817 goto jump_too_big;
818 break;
820 case REOP_CLASS:
821 if (!t->u.ucclass.sense)
822 pc[-1] = REOP_NCLASS;
823 pc = WriteCompactIndex(pc, t->u.ucclass.index);
824 charSet = &re->classList[t->u.ucclass.index];
825 charSet->converted = FALSE;
826 charSet->length = t->u.ucclass.bmsize;
827 charSet->u.src.startIndex = t->u.ucclass.startIndex;
828 charSet->u.src.length = t->u.ucclass.kidlen;
829 charSet->sense = t->u.ucclass.sense;
830 break;
832 default:
833 break;
836 t = t->next;
837 if (t) {
838 op = t->op;
839 } else {
840 if (emitStateSP == emitStateStack)
841 break;
842 --emitStateSP;
843 t = emitStateSP->continueNode;
844 op = (REOp) emitStateSP->continueOp;
848 cleanup:
849 heap_free(emitStateStack);
850 return pc;
852 jump_too_big:
853 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
854 pc = NULL;
855 goto cleanup;
859 * Process the op against the two top operands, reducing them to a single
860 * operand in the penultimate slot. Update progLength and treeDepth.
862 static BOOL
863 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
864 INT operandSP)
866 RENode *result;
868 switch (opData->op) {
869 case REOP_ALT:
870 result = NewRENode(state, REOP_ALT);
871 if (!result)
872 return FALSE;
873 result->kid = operandStack[operandSP - 2];
874 result->u.kid2 = operandStack[operandSP - 1];
875 operandStack[operandSP - 2] = result;
877 if (state->treeDepth == TREE_DEPTH_MAX) {
878 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
879 return FALSE;
881 ++state->treeDepth;
884 * Look at both alternates to see if there's a FLAT or a CLASS at
885 * the start of each. If so, use a prerequisite match.
887 if (((RENode *) result->kid)->op == REOP_FLAT &&
888 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
889 (state->flags & JSREG_FOLD) == 0) {
890 result->op = REOP_ALTPREREQ;
891 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
892 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
893 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
894 JUMP, <end> ... ENDALT */
895 state->progLength += 13;
897 else
898 if (((RENode *) result->kid)->op == REOP_CLASS &&
899 ((RENode *) result->kid)->u.ucclass.index < 256 &&
900 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
901 (state->flags & JSREG_FOLD) == 0) {
902 result->op = REOP_ALTPREREQ2;
903 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
904 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
905 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
906 JUMP, <end> ... ENDALT */
907 state->progLength += 13;
909 else
910 if (((RENode *) result->kid)->op == REOP_FLAT &&
911 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
912 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
913 (state->flags & JSREG_FOLD) == 0) {
914 result->op = REOP_ALTPREREQ2;
915 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
916 result->u.altprereq.ch2 =
917 ((RENode *) result->u.kid2)->u.ucclass.index;
918 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
919 JUMP, <end> ... ENDALT */
920 state->progLength += 13;
922 else {
923 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
924 state->progLength += 7;
926 break;
928 case REOP_CONCAT:
929 result = operandStack[operandSP - 2];
930 while (result->next)
931 result = result->next;
932 result->next = operandStack[operandSP - 1];
933 break;
935 case REOP_ASSERT:
936 case REOP_ASSERT_NOT:
937 case REOP_LPARENNON:
938 case REOP_LPAREN:
939 /* These should have been processed by a close paren. */
940 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
941 opData->errPos);
942 return FALSE;
944 default:;
946 return TRUE;
950 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
951 * its being on the stack, and to propagate errors to its callers.
953 #define JSREG_FIND_PAREN_COUNT 0x8000
954 #define JSREG_FIND_PAREN_ERROR 0x4000
957 * Magic return value from FindParenCount and GetDecimalValue, to indicate
958 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
959 * its findMax parameter is non-null.
961 #define OVERFLOW_VALUE ((UINT)-1)
963 static UINT
964 FindParenCount(CompilerState *state)
966 CompilerState temp;
967 int i;
969 if (state->flags & JSREG_FIND_PAREN_COUNT)
970 return OVERFLOW_VALUE;
973 * Copy state into temp, flag it so we never report an invalid backref,
974 * and reset its members to parse the entire regexp. This is obviously
975 * suboptimal, but GetDecimalValue calls us only if a backref appears to
976 * refer to a forward parenthetical, which is rare.
978 temp = *state;
979 temp.flags |= JSREG_FIND_PAREN_COUNT;
980 temp.cp = temp.cpbegin;
981 temp.parenCount = 0;
982 temp.classCount = 0;
983 temp.progLength = 0;
984 temp.treeDepth = 0;
985 temp.classBitmapsMem = 0;
986 for (i = 0; i < CLASS_CACHE_SIZE; i++)
987 temp.classCache[i].start = NULL;
989 if (!ParseRegExp(&temp)) {
990 state->flags |= JSREG_FIND_PAREN_ERROR;
991 return OVERFLOW_VALUE;
993 return temp.parenCount;
997 * Extract and return a decimal value at state->cp. The initial character c
998 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
999 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
1000 * state->flags to discover whether an error occurred under findMax.
1002 static UINT
1003 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
1004 CompilerState *state)
1006 UINT value = JS7_UNDEC(c);
1007 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
1009 /* The following restriction allows simpler overflow checks. */
1010 assert(max <= ((UINT)-1 - 9) / 10);
1011 while (state->cp < state->cpend) {
1012 c = *state->cp;
1013 if (!JS7_ISDEC(c))
1014 break;
1015 value = 10 * value + JS7_UNDEC(c);
1016 if (!overflow && value > max && (!findMax || value > findMax(state)))
1017 overflow = TRUE;
1018 ++state->cp;
1020 return overflow ? OVERFLOW_VALUE : value;
1024 * Calculate the total size of the bitmap required for a class expression.
1026 static BOOL
1027 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1028 const WCHAR *end)
1030 UINT max = 0;
1031 BOOL inRange = FALSE;
1032 WCHAR c, rangeStart = 0;
1033 UINT n, digit, nDigits, i;
1035 target->u.ucclass.bmsize = 0;
1036 target->u.ucclass.sense = TRUE;
1038 if (src == end)
1039 return TRUE;
1041 if (*src == '^') {
1042 ++src;
1043 target->u.ucclass.sense = FALSE;
1046 while (src != end) {
1047 BOOL canStartRange = TRUE;
1048 UINT localMax = 0;
1050 switch (*src) {
1051 case '\\':
1052 ++src;
1053 c = *src++;
1054 switch (c) {
1055 case 'b':
1056 localMax = 0x8;
1057 break;
1058 case 'f':
1059 localMax = 0xC;
1060 break;
1061 case 'n':
1062 localMax = 0xA;
1063 break;
1064 case 'r':
1065 localMax = 0xD;
1066 break;
1067 case 't':
1068 localMax = 0x9;
1069 break;
1070 case 'v':
1071 localMax = 0xB;
1072 break;
1073 case 'c':
1074 if (src < end && RE_IS_LETTER(*src)) {
1075 localMax = (UINT) (*src++) & 0x1F;
1076 } else {
1077 --src;
1078 localMax = '\\';
1080 break;
1081 case 'x':
1082 nDigits = 2;
1083 goto lexHex;
1084 case 'u':
1085 nDigits = 4;
1086 lexHex:
1087 n = 0;
1088 for (i = 0; (i < nDigits) && (src < end); i++) {
1089 c = *src++;
1090 if (!isASCIIHexDigit(c, &digit)) {
1092 * Back off to accepting the original
1093 *'\' as a literal.
1095 src -= i + 1;
1096 n = '\\';
1097 break;
1099 n = (n << 4) | digit;
1101 localMax = n;
1102 break;
1103 case 'd':
1104 canStartRange = FALSE;
1105 if (inRange) {
1106 JS_ReportErrorNumber(state->context,
1107 js_GetErrorMessage, NULL,
1108 JSMSG_BAD_CLASS_RANGE);
1109 return FALSE;
1111 localMax = '9';
1112 break;
1113 case 'D':
1114 case 's':
1115 case 'S':
1116 case 'w':
1117 case 'W':
1118 canStartRange = FALSE;
1119 if (inRange) {
1120 JS_ReportErrorNumber(state->context,
1121 js_GetErrorMessage, NULL,
1122 JSMSG_BAD_CLASS_RANGE);
1123 return FALSE;
1125 max = 65535;
1128 * If this is the start of a range, ensure that it's less than
1129 * the end.
1131 localMax = 0;
1132 break;
1133 case '0':
1134 case '1':
1135 case '2':
1136 case '3':
1137 case '4':
1138 case '5':
1139 case '6':
1140 case '7':
1142 * This is a non-ECMA extension - decimal escapes (in this
1143 * case, octal!) are supposed to be an error inside class
1144 * ranges, but supported here for backwards compatibility.
1147 n = JS7_UNDEC(c);
1148 c = *src;
1149 if ('0' <= c && c <= '7') {
1150 src++;
1151 n = 8 * n + JS7_UNDEC(c);
1152 c = *src;
1153 if ('0' <= c && c <= '7') {
1154 src++;
1155 i = 8 * n + JS7_UNDEC(c);
1156 if (i <= 0377)
1157 n = i;
1158 else
1159 src--;
1162 localMax = n;
1163 break;
1165 default:
1166 localMax = c;
1167 break;
1169 break;
1170 default:
1171 localMax = *src++;
1172 break;
1175 if (inRange) {
1176 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1177 if (rangeStart > localMax) {
1178 JS_ReportErrorNumber(state->context,
1179 js_GetErrorMessage, NULL,
1180 JSMSG_BAD_CLASS_RANGE);
1181 return FALSE;
1183 inRange = FALSE;
1184 } else {
1185 if (canStartRange && src < end - 1) {
1186 if (*src == '-') {
1187 ++src;
1188 inRange = TRUE;
1189 rangeStart = (WCHAR)localMax;
1190 continue;
1193 if (state->flags & JSREG_FOLD)
1194 rangeStart = localMax; /* one run of the uc/dc loop below */
1197 if (state->flags & JSREG_FOLD) {
1198 WCHAR maxch = localMax;
1200 for (i = rangeStart; i <= localMax; i++) {
1201 WCHAR uch, dch;
1203 uch = toupperW(i);
1204 dch = tolowerW(i);
1205 if(maxch < uch)
1206 maxch = uch;
1207 if(maxch < dch)
1208 maxch = dch;
1210 localMax = maxch;
1213 if (localMax > max)
1214 max = localMax;
1216 target->u.ucclass.bmsize = max;
1217 return TRUE;
1220 static INT
1221 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1223 UINT min, max;
1224 WCHAR c;
1225 const WCHAR *errp = state->cp++;
1227 c = *state->cp;
1228 if (JS7_ISDEC(c)) {
1229 ++state->cp;
1230 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1231 c = *state->cp;
1233 if (!ignoreValues && min == OVERFLOW_VALUE)
1234 return JSMSG_MIN_TOO_BIG;
1236 if (c == ',') {
1237 c = *++state->cp;
1238 if (JS7_ISDEC(c)) {
1239 ++state->cp;
1240 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1241 c = *state->cp;
1242 if (!ignoreValues && max == OVERFLOW_VALUE)
1243 return JSMSG_MAX_TOO_BIG;
1244 if (!ignoreValues && min > max)
1245 return JSMSG_OUT_OF_ORDER;
1246 } else {
1247 max = (UINT)-1;
1249 } else {
1250 max = min;
1252 if (c == '}') {
1253 state->result = NewRENode(state, REOP_QUANT);
1254 if (!state->result)
1255 return JSMSG_OUT_OF_MEMORY;
1256 state->result->u.range.min = min;
1257 state->result->u.range.max = max;
1259 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1260 * where <max> is written as compact(max+1) to make
1261 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1263 state->progLength += (1 + GetCompactIndexWidth(min)
1264 + GetCompactIndexWidth(max + 1)
1265 +3);
1266 return 0;
1270 state->cp = errp;
1271 return -1;
1274 static BOOL
1275 ParseQuantifier(CompilerState *state)
1277 RENode *term;
1278 term = state->result;
1279 if (state->cp < state->cpend) {
1280 switch (*state->cp) {
1281 case '+':
1282 state->result = NewRENode(state, REOP_QUANT);
1283 if (!state->result)
1284 return FALSE;
1285 state->result->u.range.min = 1;
1286 state->result->u.range.max = (UINT)-1;
1287 /* <PLUS>, <next> ... <ENDCHILD> */
1288 state->progLength += 4;
1289 goto quantifier;
1290 case '*':
1291 state->result = NewRENode(state, REOP_QUANT);
1292 if (!state->result)
1293 return FALSE;
1294 state->result->u.range.min = 0;
1295 state->result->u.range.max = (UINT)-1;
1296 /* <STAR>, <next> ... <ENDCHILD> */
1297 state->progLength += 4;
1298 goto quantifier;
1299 case '?':
1300 state->result = NewRENode(state, REOP_QUANT);
1301 if (!state->result)
1302 return FALSE;
1303 state->result->u.range.min = 0;
1304 state->result->u.range.max = 1;
1305 /* <OPT>, <next> ... <ENDCHILD> */
1306 state->progLength += 4;
1307 goto quantifier;
1308 case '{': /* balance '}' */
1310 INT err;
1312 err = ParseMinMaxQuantifier(state, FALSE);
1313 if (err == 0)
1314 goto quantifier;
1315 if (err == -1)
1316 return TRUE;
1318 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1319 return FALSE;
1321 default:;
1324 return TRUE;
1326 quantifier:
1327 if (state->treeDepth == TREE_DEPTH_MAX) {
1328 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1329 return FALSE;
1332 ++state->treeDepth;
1333 ++state->cp;
1334 state->result->kid = term;
1335 if (state->cp < state->cpend && *state->cp == '?') {
1336 ++state->cp;
1337 state->result->u.range.greedy = FALSE;
1338 } else {
1339 state->result->u.range.greedy = TRUE;
1341 return TRUE;
1345 * item: assertion An item is either an assertion or
1346 * quantatom a quantified atom.
1348 * assertion: '^' Assertions match beginning of string
1349 * (or line if the class static property
1350 * RegExp.multiline is true).
1351 * '$' End of string (or line if the class
1352 * static property RegExp.multiline is
1353 * true).
1354 * '\b' Word boundary (between \w and \W).
1355 * '\B' Word non-boundary.
1357 * quantatom: atom An unquantified atom.
1358 * quantatom '{' n ',' m '}'
1359 * Atom must occur between n and m times.
1360 * quantatom '{' n ',' '}' Atom must occur at least n times.
1361 * quantatom '{' n '}' Atom must occur exactly n times.
1362 * quantatom '*' Zero or more times (same as {0,}).
1363 * quantatom '+' One or more times (same as {1,}).
1364 * quantatom '?' Zero or one time (same as {0,1}).
1366 * any of which can be optionally followed by '?' for ungreedy
1368 * atom: '(' regexp ')' A parenthesized regexp (what matched
1369 * can be addressed using a backreference,
1370 * see '\' n below).
1371 * '.' Matches any char except '\n'.
1372 * '[' classlist ']' A character class.
1373 * '[' '^' classlist ']' A negated character class.
1374 * '\f' Form Feed.
1375 * '\n' Newline (Line Feed).
1376 * '\r' Carriage Return.
1377 * '\t' Horizontal Tab.
1378 * '\v' Vertical Tab.
1379 * '\d' A digit (same as [0-9]).
1380 * '\D' A non-digit.
1381 * '\w' A word character, [0-9a-z_A-Z].
1382 * '\W' A non-word character.
1383 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1384 * '\S' A non-whitespace character.
1385 * '\' n A backreference to the nth (n decimal
1386 * and positive) parenthesized expression.
1387 * '\' octal An octal escape sequence (octal must be
1388 * two or three digits long, unless it is
1389 * 0 for the null character).
1390 * '\x' hex A hex escape (hex must be two digits).
1391 * '\u' unicode A unicode escape (must be four digits).
1392 * '\c' ctrl A control character, ctrl is a letter.
1393 * '\' literalatomchar Any character except one of the above
1394 * that follow '\' in an atom.
1395 * otheratomchar Any character not first among the other
1396 * atom right-hand sides.
1398 static BOOL
1399 ParseTerm(CompilerState *state)
1401 WCHAR c = *state->cp++;
1402 UINT nDigits;
1403 UINT num, tmp, n, i;
1404 const WCHAR *termStart;
1406 switch (c) {
1407 /* assertions and atoms */
1408 case '^':
1409 state->result = NewRENode(state, REOP_BOL);
1410 if (!state->result)
1411 return FALSE;
1412 state->progLength++;
1413 return TRUE;
1414 case '$':
1415 state->result = NewRENode(state, REOP_EOL);
1416 if (!state->result)
1417 return FALSE;
1418 state->progLength++;
1419 return TRUE;
1420 case '\\':
1421 if (state->cp >= state->cpend) {
1422 /* a trailing '\' is an error */
1423 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1424 return FALSE;
1426 c = *state->cp++;
1427 switch (c) {
1428 /* assertion escapes */
1429 case 'b' :
1430 state->result = NewRENode(state, REOP_WBDRY);
1431 if (!state->result)
1432 return FALSE;
1433 state->progLength++;
1434 return TRUE;
1435 case 'B':
1436 state->result = NewRENode(state, REOP_WNONBDRY);
1437 if (!state->result)
1438 return FALSE;
1439 state->progLength++;
1440 return TRUE;
1441 /* Decimal escape */
1442 case '0':
1443 /* Give a strict warning. See also the note below. */
1444 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1445 doOctal:
1446 num = 0;
1447 while (state->cp < state->cpend) {
1448 c = *state->cp;
1449 if (c < '0' || '7' < c)
1450 break;
1451 state->cp++;
1452 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1453 if (tmp > 0377)
1454 break;
1455 num = tmp;
1457 c = (WCHAR)num;
1458 doFlat:
1459 state->result = NewRENode(state, REOP_FLAT);
1460 if (!state->result)
1461 return FALSE;
1462 state->result->u.flat.chr = c;
1463 state->result->u.flat.length = 1;
1464 state->progLength += 3;
1465 break;
1466 case '1':
1467 case '2':
1468 case '3':
1469 case '4':
1470 case '5':
1471 case '6':
1472 case '7':
1473 case '8':
1474 case '9':
1475 termStart = state->cp - 1;
1476 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1477 if (state->flags & JSREG_FIND_PAREN_ERROR)
1478 return FALSE;
1479 if (num == OVERFLOW_VALUE) {
1480 /* Give a strict mode warning. */
1481 WARN("back-reference exceeds number of capturing parentheses\n");
1484 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1485 * error here. However, for compatibility with IE, we treat the
1486 * whole backref as flat if the first character in it is not a
1487 * valid octal character, and as an octal escape otherwise.
1489 state->cp = termStart;
1490 if (c >= '8') {
1491 /* Treat this as flat. termStart - 1 is the \. */
1492 c = '\\';
1493 goto asFlat;
1496 /* Treat this as an octal escape. */
1497 goto doOctal;
1499 assert(1 <= num && num <= 0x10000);
1500 state->result = NewRENode(state, REOP_BACKREF);
1501 if (!state->result)
1502 return FALSE;
1503 state->result->u.parenIndex = num - 1;
1504 state->progLength
1505 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1506 break;
1507 /* Control escape */
1508 case 'f':
1509 c = 0xC;
1510 goto doFlat;
1511 case 'n':
1512 c = 0xA;
1513 goto doFlat;
1514 case 'r':
1515 c = 0xD;
1516 goto doFlat;
1517 case 't':
1518 c = 0x9;
1519 goto doFlat;
1520 case 'v':
1521 c = 0xB;
1522 goto doFlat;
1523 /* Control letter */
1524 case 'c':
1525 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1526 c = (WCHAR) (*state->cp++ & 0x1F);
1527 } else {
1528 /* back off to accepting the original '\' as a literal */
1529 --state->cp;
1530 c = '\\';
1532 goto doFlat;
1533 /* HexEscapeSequence */
1534 case 'x':
1535 nDigits = 2;
1536 goto lexHex;
1537 /* UnicodeEscapeSequence */
1538 case 'u':
1539 nDigits = 4;
1540 lexHex:
1541 n = 0;
1542 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1543 UINT digit;
1544 c = *state->cp++;
1545 if (!isASCIIHexDigit(c, &digit)) {
1547 * Back off to accepting the original 'u' or 'x' as a
1548 * literal.
1550 state->cp -= i + 2;
1551 n = *state->cp++;
1552 break;
1554 n = (n << 4) | digit;
1556 c = (WCHAR) n;
1557 goto doFlat;
1558 /* Character class escapes */
1559 case 'd':
1560 state->result = NewRENode(state, REOP_DIGIT);
1561 doSimple:
1562 if (!state->result)
1563 return FALSE;
1564 state->progLength++;
1565 break;
1566 case 'D':
1567 state->result = NewRENode(state, REOP_NONDIGIT);
1568 goto doSimple;
1569 case 's':
1570 state->result = NewRENode(state, REOP_SPACE);
1571 goto doSimple;
1572 case 'S':
1573 state->result = NewRENode(state, REOP_NONSPACE);
1574 goto doSimple;
1575 case 'w':
1576 state->result = NewRENode(state, REOP_ALNUM);
1577 goto doSimple;
1578 case 'W':
1579 state->result = NewRENode(state, REOP_NONALNUM);
1580 goto doSimple;
1581 /* IdentityEscape */
1582 default:
1583 state->result = NewRENode(state, REOP_FLAT);
1584 if (!state->result)
1585 return FALSE;
1586 state->result->u.flat.chr = c;
1587 state->result->u.flat.length = 1;
1588 state->result->kid = (void *) (state->cp - 1);
1589 state->progLength += 3;
1590 break;
1592 break;
1593 case '[':
1594 state->result = NewRENode(state, REOP_CLASS);
1595 if (!state->result)
1596 return FALSE;
1597 termStart = state->cp;
1598 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1599 for (;;) {
1600 if (state->cp == state->cpend) {
1601 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1602 JSMSG_UNTERM_CLASS, termStart);
1604 return FALSE;
1606 if (*state->cp == '\\') {
1607 state->cp++;
1608 if (state->cp != state->cpend)
1609 state->cp++;
1610 continue;
1612 if (*state->cp == ']') {
1613 state->result->u.ucclass.kidlen = state->cp - termStart;
1614 break;
1616 state->cp++;
1618 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1619 if (!state->classCache[i].start) {
1620 state->classCache[i].start = termStart;
1621 state->classCache[i].length = state->result->u.ucclass.kidlen;
1622 state->classCache[i].index = state->classCount;
1623 break;
1625 if (state->classCache[i].length ==
1626 state->result->u.ucclass.kidlen) {
1627 for (n = 0; ; n++) {
1628 if (n == state->classCache[i].length) {
1629 state->result->u.ucclass.index
1630 = state->classCache[i].index;
1631 goto claim;
1633 if (state->classCache[i].start[n] != termStart[n])
1634 break;
1638 state->result->u.ucclass.index = state->classCount++;
1640 claim:
1642 * Call CalculateBitmapSize now as we want any errors it finds
1643 * to be reported during the parse phase, not at execution.
1645 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1646 return FALSE;
1648 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1649 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1650 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1652 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1653 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1654 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1655 return FALSE;
1657 state->classBitmapsMem += n;
1658 /* CLASS, <index> */
1659 state->progLength
1660 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1661 break;
1663 case '.':
1664 state->result = NewRENode(state, REOP_DOT);
1665 goto doSimple;
1667 case '{':
1669 const WCHAR *errp = state->cp--;
1670 INT err;
1672 err = ParseMinMaxQuantifier(state, TRUE);
1673 state->cp = errp;
1675 if (err < 0)
1676 goto asFlat;
1678 /* FALL THROUGH */
1680 case '*':
1681 case '+':
1682 case '?':
1683 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1684 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1685 return FALSE;
1686 default:
1687 asFlat:
1688 state->result = NewRENode(state, REOP_FLAT);
1689 if (!state->result)
1690 return FALSE;
1691 state->result->u.flat.chr = c;
1692 state->result->u.flat.length = 1;
1693 state->result->kid = (void *) (state->cp - 1);
1694 state->progLength += 3;
1695 break;
1697 return ParseQuantifier(state);
1701 * Top-down regular expression grammar, based closely on Perl4.
1703 * regexp: altern A regular expression is one or more
1704 * altern '|' regexp alternatives separated by vertical bar.
1706 #define INITIAL_STACK_SIZE 128
1708 static BOOL
1709 ParseRegExp(CompilerState *state)
1711 size_t parenIndex;
1712 RENode *operand;
1713 REOpData *operatorStack;
1714 RENode **operandStack;
1715 REOp op;
1716 INT i;
1717 BOOL result = FALSE;
1719 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1720 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1722 /* Watch out for empty regexp */
1723 if (state->cp == state->cpend) {
1724 state->result = NewRENode(state, REOP_EMPTY);
1725 return (state->result != NULL);
1728 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1729 if (!operatorStack)
1730 return FALSE;
1732 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1733 if (!operandStack)
1734 goto out;
1736 for (;;) {
1737 parenIndex = state->parenCount;
1738 if (state->cp == state->cpend) {
1740 * If we are at the end of the regexp and we're short one or more
1741 * operands, the regexp must have the form /x|/ or some such, with
1742 * left parentheses making us short more than one operand.
1744 if (operatorSP >= operandSP) {
1745 operand = NewRENode(state, REOP_EMPTY);
1746 if (!operand)
1747 goto out;
1748 goto pushOperand;
1750 } else {
1751 switch (*state->cp) {
1752 case '(':
1753 ++state->cp;
1754 if (state->cp + 1 < state->cpend &&
1755 *state->cp == '?' &&
1756 (state->cp[1] == '=' ||
1757 state->cp[1] == '!' ||
1758 state->cp[1] == ':')) {
1759 switch (state->cp[1]) {
1760 case '=':
1761 op = REOP_ASSERT;
1762 /* ASSERT, <next>, ... ASSERTTEST */
1763 state->progLength += 4;
1764 break;
1765 case '!':
1766 op = REOP_ASSERT_NOT;
1767 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1768 state->progLength += 4;
1769 break;
1770 default:
1771 op = REOP_LPARENNON;
1772 break;
1774 state->cp += 2;
1775 } else {
1776 op = REOP_LPAREN;
1777 /* LPAREN, <index>, ... RPAREN, <index> */
1778 state->progLength
1779 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1780 state->parenCount++;
1781 if (state->parenCount == 65535) {
1782 ReportRegExpError(state, JSREPORT_ERROR,
1783 JSMSG_TOO_MANY_PARENS);
1784 goto out;
1787 goto pushOperator;
1789 case ')':
1791 * If there's no stacked open parenthesis, throw syntax error.
1793 for (i = operatorSP - 1; ; i--) {
1794 if (i < 0) {
1795 ReportRegExpError(state, JSREPORT_ERROR,
1796 JSMSG_UNMATCHED_RIGHT_PAREN);
1797 goto out;
1799 if (operatorStack[i].op == REOP_ASSERT ||
1800 operatorStack[i].op == REOP_ASSERT_NOT ||
1801 operatorStack[i].op == REOP_LPARENNON ||
1802 operatorStack[i].op == REOP_LPAREN) {
1803 break;
1806 /* FALL THROUGH */
1808 case '|':
1809 /* Expected an operand before these, so make an empty one */
1810 operand = NewRENode(state, REOP_EMPTY);
1811 if (!operand)
1812 goto out;
1813 goto pushOperand;
1815 default:
1816 if (!ParseTerm(state))
1817 goto out;
1818 operand = state->result;
1819 pushOperand:
1820 if (operandSP == operandStackSize) {
1821 RENode **tmp;
1822 operandStackSize += operandStackSize;
1823 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1824 if (!tmp)
1825 goto out;
1826 operandStack = tmp;
1828 operandStack[operandSP++] = operand;
1829 break;
1833 /* At the end; process remaining operators. */
1834 restartOperator:
1835 if (state->cp == state->cpend) {
1836 while (operatorSP) {
1837 --operatorSP;
1838 if (!ProcessOp(state, &operatorStack[operatorSP],
1839 operandStack, operandSP))
1840 goto out;
1841 --operandSP;
1843 assert(operandSP == 1);
1844 state->result = operandStack[0];
1845 result = TRUE;
1846 goto out;
1849 switch (*state->cp) {
1850 case '|':
1851 /* Process any stacked 'concat' operators */
1852 ++state->cp;
1853 while (operatorSP &&
1854 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1855 --operatorSP;
1856 if (!ProcessOp(state, &operatorStack[operatorSP],
1857 operandStack, operandSP)) {
1858 goto out;
1860 --operandSP;
1862 op = REOP_ALT;
1863 goto pushOperator;
1865 case ')':
1867 * If there's no stacked open parenthesis, throw syntax error.
1869 for (i = operatorSP - 1; ; i--) {
1870 if (i < 0) {
1871 ReportRegExpError(state, JSREPORT_ERROR,
1872 JSMSG_UNMATCHED_RIGHT_PAREN);
1873 goto out;
1875 if (operatorStack[i].op == REOP_ASSERT ||
1876 operatorStack[i].op == REOP_ASSERT_NOT ||
1877 operatorStack[i].op == REOP_LPARENNON ||
1878 operatorStack[i].op == REOP_LPAREN) {
1879 break;
1882 ++state->cp;
1884 /* Process everything on the stack until the open parenthesis. */
1885 for (;;) {
1886 assert(operatorSP);
1887 --operatorSP;
1888 switch (operatorStack[operatorSP].op) {
1889 case REOP_ASSERT:
1890 case REOP_ASSERT_NOT:
1891 case REOP_LPAREN:
1892 operand = NewRENode(state, operatorStack[operatorSP].op);
1893 if (!operand)
1894 goto out;
1895 operand->u.parenIndex =
1896 operatorStack[operatorSP].parenIndex;
1897 assert(operandSP);
1898 operand->kid = operandStack[operandSP - 1];
1899 operandStack[operandSP - 1] = operand;
1900 if (state->treeDepth == TREE_DEPTH_MAX) {
1901 ReportRegExpError(state, JSREPORT_ERROR,
1902 JSMSG_REGEXP_TOO_COMPLEX);
1903 goto out;
1905 ++state->treeDepth;
1906 /* FALL THROUGH */
1908 case REOP_LPARENNON:
1909 state->result = operandStack[operandSP - 1];
1910 if (!ParseQuantifier(state))
1911 goto out;
1912 operandStack[operandSP - 1] = state->result;
1913 goto restartOperator;
1914 default:
1915 if (!ProcessOp(state, &operatorStack[operatorSP],
1916 operandStack, operandSP))
1917 goto out;
1918 --operandSP;
1919 break;
1922 break;
1924 case '{':
1926 const WCHAR *errp = state->cp;
1928 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1930 * This didn't even scan correctly as a quantifier, so we should
1931 * treat it as flat.
1933 op = REOP_CONCAT;
1934 goto pushOperator;
1937 state->cp = errp;
1938 /* FALL THROUGH */
1941 case '+':
1942 case '*':
1943 case '?':
1944 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1945 state->cp);
1946 result = FALSE;
1947 goto out;
1949 default:
1950 /* Anything else is the start of the next term. */
1951 op = REOP_CONCAT;
1952 pushOperator:
1953 if (operatorSP == operatorStackSize) {
1954 REOpData *tmp;
1955 operatorStackSize += operatorStackSize;
1956 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1957 if (!tmp)
1958 goto out;
1959 operatorStack = tmp;
1961 operatorStack[operatorSP].op = op;
1962 operatorStack[operatorSP].errPos = state->cp;
1963 operatorStack[operatorSP++].parenIndex = parenIndex;
1964 break;
1967 out:
1968 heap_free(operatorStack);
1969 heap_free(operandStack);
1970 return result;
1974 * Save the current state of the match - the position in the input
1975 * text as well as the position in the bytecode. The state of any
1976 * parent expressions is also saved (preceding state).
1977 * Contents of parenCount parentheses from parenIndex are also saved.
1979 static REBackTrackData *
1980 PushBackTrackState(REGlobalData *gData, REOp op,
1981 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1982 size_t parenIndex, size_t parenCount)
1984 size_t i;
1985 REBackTrackData *result =
1986 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1988 size_t sz = sizeof(REBackTrackData) +
1989 gData->stateStackTop * sizeof(REProgState) +
1990 parenCount * sizeof(RECapture);
1992 ptrdiff_t btsize = gData->backTrackStackSize;
1993 ptrdiff_t btincr = ((char *)result + sz) -
1994 ((char *)gData->backTrackStack + btsize);
1996 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1998 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1999 if (btincr > 0) {
2000 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
2002 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
2003 btincr = ((btincr+btsize-1)/btsize)*btsize;
2004 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
2005 if (!gData->backTrackStack) {
2006 js_ReportOutOfScriptQuota(gData->cx);
2007 gData->ok = FALSE;
2008 return NULL;
2010 gData->backTrackStackSize = btsize + btincr;
2011 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
2013 gData->backTrackSP = result;
2014 result->sz = gData->cursz;
2015 gData->cursz = sz;
2017 result->backtrack_op = op;
2018 result->backtrack_pc = target;
2019 result->cp = cp;
2020 result->parenCount = parenCount;
2021 result->parenIndex = parenIndex;
2023 result->saveStateStackTop = gData->stateStackTop;
2024 assert(gData->stateStackTop);
2025 memcpy(result + 1, gData->stateStack,
2026 sizeof(REProgState) * result->saveStateStackTop);
2028 if (parenCount != 0) {
2029 memcpy((char *)(result + 1) +
2030 sizeof(REProgState) * result->saveStateStackTop,
2031 &x->parens[parenIndex],
2032 sizeof(RECapture) * parenCount);
2033 for (i = 0; i != parenCount; i++)
2034 x->parens[parenIndex + i].index = -1;
2037 return result;
2040 static inline REMatchState *
2041 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2042 size_t length)
2044 size_t i;
2045 assert(gData->cpend >= x->cp);
2046 if (length > (size_t)(gData->cpend - x->cp))
2047 return NULL;
2048 for (i = 0; i != length; i++) {
2049 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2050 return NULL;
2052 x->cp += length;
2053 return x;
2057 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2058 * 2. If E is not a character then go to step 6.
2059 * 3. Let ch be E's character.
2060 * 4. Let A be a one-element RECharSet containing the character ch.
2061 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2062 * 6. E must be an integer. Let n be that integer.
2063 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2064 * 8. Return an internal Matcher closure that takes two arguments, a State x
2065 * and a Continuation c, and performs the following:
2066 * 1. Let cap be x's captures internal array.
2067 * 2. Let s be cap[n].
2068 * 3. If s is undefined, then call c(x) and return its result.
2069 * 4. Let e be x's endIndex.
2070 * 5. Let len be s's length.
2071 * 6. Let f be e+len.
2072 * 7. If f>InputLength, return failure.
2073 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2074 * such that Canonicalize(s[i]) is not the same character as
2075 * Canonicalize(Input [e+i]), then return failure.
2076 * 9. Let y be the State (f, cap).
2077 * 10. Call c(y) and return its result.
2079 static REMatchState *
2080 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2082 size_t len, i;
2083 const WCHAR *parenContent;
2084 RECapture *cap = &x->parens[parenIndex];
2086 if (cap->index == -1)
2087 return x;
2089 len = cap->length;
2090 if (x->cp + len > gData->cpend)
2091 return NULL;
2093 parenContent = &gData->cpbegin[cap->index];
2094 if (gData->regexp->flags & JSREG_FOLD) {
2095 for (i = 0; i < len; i++) {
2096 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2097 return NULL;
2099 } else {
2100 for (i = 0; i < len; i++) {
2101 if (parenContent[i] != x->cp[i])
2102 return NULL;
2105 x->cp += len;
2106 return x;
2109 /* Add a single character to the RECharSet */
2110 static void
2111 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2113 UINT byteIndex = (UINT)(c >> 3);
2114 assert(c <= cs->length);
2115 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2119 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2120 static void
2121 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2123 UINT i;
2125 UINT byteIndex1 = c1 >> 3;
2126 UINT byteIndex2 = c2 >> 3;
2128 assert(c2 <= cs->length && c1 <= c2);
2130 c1 &= 0x7;
2131 c2 &= 0x7;
2133 if (byteIndex1 == byteIndex2) {
2134 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2135 } else {
2136 cs->u.bits[byteIndex1] |= 0xFF << c1;
2137 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2138 cs->u.bits[i] = 0xFF;
2139 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2143 /* Compile the source of the class into a RECharSet */
2144 static BOOL
2145 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2147 const WCHAR *src, *end;
2148 BOOL inRange = FALSE;
2149 WCHAR rangeStart = 0;
2150 UINT byteLength, n;
2151 WCHAR c, thisCh;
2152 INT nDigits, i;
2154 assert(!charSet->converted);
2156 * Assert that startIndex and length points to chars inside [] inside
2157 * source string.
2159 assert(1 <= charSet->u.src.startIndex);
2160 assert(charSet->u.src.startIndex
2161 < SysStringLen(gData->regexp->source));
2162 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2163 - 1 - charSet->u.src.startIndex);
2165 charSet->converted = TRUE;
2166 src = gData->regexp->source + charSet->u.src.startIndex;
2168 end = src + charSet->u.src.length;
2170 assert(src[-1] == '[' && end[0] == ']');
2172 byteLength = (charSet->length >> 3) + 1;
2173 charSet->u.bits = heap_alloc(byteLength);
2174 if (!charSet->u.bits) {
2175 JS_ReportOutOfMemory(gData->cx);
2176 gData->ok = FALSE;
2177 return FALSE;
2179 memset(charSet->u.bits, 0, byteLength);
2181 if (src == end)
2182 return TRUE;
2184 if (*src == '^') {
2185 assert(charSet->sense == FALSE);
2186 ++src;
2187 } else {
2188 assert(charSet->sense == TRUE);
2191 while (src != end) {
2192 switch (*src) {
2193 case '\\':
2194 ++src;
2195 c = *src++;
2196 switch (c) {
2197 case 'b':
2198 thisCh = 0x8;
2199 break;
2200 case 'f':
2201 thisCh = 0xC;
2202 break;
2203 case 'n':
2204 thisCh = 0xA;
2205 break;
2206 case 'r':
2207 thisCh = 0xD;
2208 break;
2209 case 't':
2210 thisCh = 0x9;
2211 break;
2212 case 'v':
2213 thisCh = 0xB;
2214 break;
2215 case 'c':
2216 if (src < end && JS_ISWORD(*src)) {
2217 thisCh = (WCHAR)(*src++ & 0x1F);
2218 } else {
2219 --src;
2220 thisCh = '\\';
2222 break;
2223 case 'x':
2224 nDigits = 2;
2225 goto lexHex;
2226 case 'u':
2227 nDigits = 4;
2228 lexHex:
2229 n = 0;
2230 for (i = 0; (i < nDigits) && (src < end); i++) {
2231 UINT digit;
2232 c = *src++;
2233 if (!isASCIIHexDigit(c, &digit)) {
2235 * Back off to accepting the original '\'
2236 * as a literal
2238 src -= i + 1;
2239 n = '\\';
2240 break;
2242 n = (n << 4) | digit;
2244 thisCh = (WCHAR)n;
2245 break;
2246 case '0':
2247 case '1':
2248 case '2':
2249 case '3':
2250 case '4':
2251 case '5':
2252 case '6':
2253 case '7':
2255 * This is a non-ECMA extension - decimal escapes (in this
2256 * case, octal!) are supposed to be an error inside class
2257 * ranges, but supported here for backwards compatibility.
2259 n = JS7_UNDEC(c);
2260 c = *src;
2261 if ('0' <= c && c <= '7') {
2262 src++;
2263 n = 8 * n + JS7_UNDEC(c);
2264 c = *src;
2265 if ('0' <= c && c <= '7') {
2266 src++;
2267 i = 8 * n + JS7_UNDEC(c);
2268 if (i <= 0377)
2269 n = i;
2270 else
2271 src--;
2274 thisCh = (WCHAR)n;
2275 break;
2277 case 'd':
2278 AddCharacterRangeToCharSet(charSet, '0', '9');
2279 continue; /* don't need range processing */
2280 case 'D':
2281 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2282 AddCharacterRangeToCharSet(charSet,
2283 (WCHAR)('9' + 1),
2284 (WCHAR)charSet->length);
2285 continue;
2286 case 's':
2287 for (i = (INT)charSet->length; i >= 0; i--)
2288 if (isspaceW(i))
2289 AddCharacterToCharSet(charSet, (WCHAR)i);
2290 continue;
2291 case 'S':
2292 for (i = (INT)charSet->length; i >= 0; i--)
2293 if (!isspaceW(i))
2294 AddCharacterToCharSet(charSet, (WCHAR)i);
2295 continue;
2296 case 'w':
2297 for (i = (INT)charSet->length; i >= 0; i--)
2298 if (JS_ISWORD(i))
2299 AddCharacterToCharSet(charSet, (WCHAR)i);
2300 continue;
2301 case 'W':
2302 for (i = (INT)charSet->length; i >= 0; i--)
2303 if (!JS_ISWORD(i))
2304 AddCharacterToCharSet(charSet, (WCHAR)i);
2305 continue;
2306 default:
2307 thisCh = c;
2308 break;
2311 break;
2313 default:
2314 thisCh = *src++;
2315 break;
2318 if (inRange) {
2319 if (gData->regexp->flags & JSREG_FOLD) {
2320 assert(rangeStart <= thisCh);
2321 for (i = rangeStart; i <= thisCh; i++) {
2322 WCHAR uch, dch;
2324 AddCharacterToCharSet(charSet, i);
2325 uch = toupperW(i);
2326 dch = tolowerW(i);
2327 if (i != uch)
2328 AddCharacterToCharSet(charSet, uch);
2329 if (i != dch)
2330 AddCharacterToCharSet(charSet, dch);
2332 } else {
2333 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2335 inRange = FALSE;
2336 } else {
2337 if (gData->regexp->flags & JSREG_FOLD) {
2338 AddCharacterToCharSet(charSet, toupperW(thisCh));
2339 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2340 } else {
2341 AddCharacterToCharSet(charSet, thisCh);
2343 if (src < end - 1) {
2344 if (*src == '-') {
2345 ++src;
2346 inRange = TRUE;
2347 rangeStart = thisCh;
2352 return TRUE;
2355 static BOOL
2356 ReallocStateStack(REGlobalData *gData)
2358 size_t limit = gData->stateStackLimit;
2359 size_t sz = sizeof(REProgState) * limit;
2361 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2362 if (!gData->stateStack) {
2363 js_ReportOutOfScriptQuota(gData->cx);
2364 gData->ok = FALSE;
2365 return FALSE;
2367 gData->stateStackLimit = limit + limit;
2368 return TRUE;
2371 #define PUSH_STATE_STACK(data) \
2372 do { \
2373 ++(data)->stateStackTop; \
2374 if ((data)->stateStackTop == (data)->stateStackLimit && \
2375 !ReallocStateStack((data))) { \
2376 return NULL; \
2378 }while(0)
2381 * Apply the current op against the given input to see if it's going to match
2382 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2383 * true, then update the current state's cp. Always update startpc to the next
2384 * op.
2386 static inline REMatchState *
2387 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2388 jsbytecode **startpc, BOOL updatecp)
2390 REMatchState *result = NULL;
2391 WCHAR matchCh;
2392 size_t parenIndex;
2393 size_t offset, length, index;
2394 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2395 WCHAR *source;
2396 const WCHAR *startcp = x->cp;
2397 WCHAR ch;
2398 RECharSet *charSet;
2400 const char *opname = reop_names[op];
2401 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2402 (int)gData->stateStackTop * 2, "", opname);
2404 switch (op) {
2405 case REOP_EMPTY:
2406 result = x;
2407 break;
2408 case REOP_BOL:
2409 if (x->cp != gData->cpbegin) {
2410 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2411 !(gData->regexp->flags & JSREG_MULTILINE)) {
2412 break;
2414 if (!RE_IS_LINE_TERM(x->cp[-1]))
2415 break;
2417 result = x;
2418 break;
2419 case REOP_EOL:
2420 if (x->cp != gData->cpend) {
2421 if (/*!gData->cx->regExpStatics.multiline &&*/
2422 !(gData->regexp->flags & JSREG_MULTILINE)) {
2423 break;
2425 if (!RE_IS_LINE_TERM(*x->cp))
2426 break;
2428 result = x;
2429 break;
2430 case REOP_WBDRY:
2431 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2432 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2433 result = x;
2435 break;
2436 case REOP_WNONBDRY:
2437 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2438 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2439 result = x;
2441 break;
2442 case REOP_DOT:
2443 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2444 result = x;
2445 result->cp++;
2447 break;
2448 case REOP_DIGIT:
2449 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2450 result = x;
2451 result->cp++;
2453 break;
2454 case REOP_NONDIGIT:
2455 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2456 result = x;
2457 result->cp++;
2459 break;
2460 case REOP_ALNUM:
2461 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2462 result = x;
2463 result->cp++;
2465 break;
2466 case REOP_NONALNUM:
2467 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2468 result = x;
2469 result->cp++;
2471 break;
2472 case REOP_SPACE:
2473 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2474 result = x;
2475 result->cp++;
2477 break;
2478 case REOP_NONSPACE:
2479 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2480 result = x;
2481 result->cp++;
2483 break;
2484 case REOP_BACKREF:
2485 pc = ReadCompactIndex(pc, &parenIndex);
2486 assert(parenIndex < gData->regexp->parenCount);
2487 result = BackrefMatcher(gData, x, parenIndex);
2488 break;
2489 case REOP_FLAT:
2490 pc = ReadCompactIndex(pc, &offset);
2491 assert(offset < SysStringLen(gData->regexp->source));
2492 pc = ReadCompactIndex(pc, &length);
2493 assert(1 <= length);
2494 assert(length <= SysStringLen(gData->regexp->source) - offset);
2495 if (length <= (size_t)(gData->cpend - x->cp)) {
2496 source = gData->regexp->source + offset;
2497 TRACE("%s\n", debugstr_wn(source, length));
2498 for (index = 0; index != length; index++) {
2499 if (source[index] != x->cp[index])
2500 return NULL;
2502 x->cp += length;
2503 result = x;
2505 break;
2506 case REOP_FLAT1:
2507 matchCh = *pc++;
2508 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2509 if (x->cp != gData->cpend && *x->cp == matchCh) {
2510 result = x;
2511 result->cp++;
2513 break;
2514 case REOP_FLATi:
2515 pc = ReadCompactIndex(pc, &offset);
2516 assert(offset < SysStringLen(gData->regexp->source));
2517 pc = ReadCompactIndex(pc, &length);
2518 assert(1 <= length);
2519 assert(length <= SysStringLen(gData->regexp->source) - offset);
2520 source = gData->regexp->source;
2521 result = FlatNIMatcher(gData, x, source + offset, length);
2522 break;
2523 case REOP_FLAT1i:
2524 matchCh = *pc++;
2525 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2526 result = x;
2527 result->cp++;
2529 break;
2530 case REOP_UCFLAT1:
2531 matchCh = GET_ARG(pc);
2532 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2533 pc += ARG_LEN;
2534 if (x->cp != gData->cpend && *x->cp == matchCh) {
2535 result = x;
2536 result->cp++;
2538 break;
2539 case REOP_UCFLAT1i:
2540 matchCh = GET_ARG(pc);
2541 pc += ARG_LEN;
2542 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2543 result = x;
2544 result->cp++;
2546 break;
2547 case REOP_CLASS:
2548 pc = ReadCompactIndex(pc, &index);
2549 assert(index < gData->regexp->classCount);
2550 if (x->cp != gData->cpend) {
2551 charSet = &gData->regexp->classList[index];
2552 assert(charSet->converted);
2553 ch = *x->cp;
2554 index = ch >> 3;
2555 if (charSet->length != 0 &&
2556 ch <= charSet->length &&
2557 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2558 result = x;
2559 result->cp++;
2562 break;
2563 case REOP_NCLASS:
2564 pc = ReadCompactIndex(pc, &index);
2565 assert(index < gData->regexp->classCount);
2566 if (x->cp != gData->cpend) {
2567 charSet = &gData->regexp->classList[index];
2568 assert(charSet->converted);
2569 ch = *x->cp;
2570 index = ch >> 3;
2571 if (charSet->length == 0 ||
2572 ch > charSet->length ||
2573 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2574 result = x;
2575 result->cp++;
2578 break;
2580 default:
2581 assert(FALSE);
2583 if (result) {
2584 if (!updatecp)
2585 x->cp = startcp;
2586 *startpc = pc;
2587 TRACE(" *\n");
2588 return result;
2590 x->cp = startcp;
2591 return NULL;
2594 static inline REMatchState *
2595 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2597 REMatchState *result = NULL;
2598 REBackTrackData *backTrackData;
2599 jsbytecode *nextpc, *testpc;
2600 REOp nextop;
2601 RECapture *cap;
2602 REProgState *curState;
2603 const WCHAR *startcp;
2604 size_t parenIndex, k;
2605 size_t parenSoFar = 0;
2607 WCHAR matchCh1, matchCh2;
2608 RECharSet *charSet;
2610 BOOL anchor;
2611 jsbytecode *pc = gData->regexp->program;
2612 REOp op = (REOp) *pc++;
2615 * If the first node is a simple match, step the index into the string
2616 * until that match is made, or fail if it can't be found at all.
2618 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2619 anchor = FALSE;
2620 while (x->cp <= gData->cpend) {
2621 nextpc = pc; /* reset back to start each time */
2622 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2623 if (result) {
2624 anchor = TRUE;
2625 x = result;
2626 pc = nextpc; /* accept skip to next opcode */
2627 op = (REOp) *pc++;
2628 assert(op < REOP_LIMIT);
2629 break;
2631 gData->skipped++;
2632 x->cp++;
2634 if (!anchor)
2635 goto bad;
2638 for (;;) {
2639 const char *opname = reop_names[op];
2640 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2641 (int)gData->stateStackTop * 2, "", opname);
2643 if (REOP_IS_SIMPLE(op)) {
2644 result = SimpleMatch(gData, x, op, &pc, TRUE);
2645 } else {
2646 curState = &gData->stateStack[gData->stateStackTop];
2647 switch (op) {
2648 case REOP_END:
2649 goto good;
2650 case REOP_ALTPREREQ2:
2651 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2652 pc += ARG_LEN;
2653 matchCh2 = GET_ARG(pc);
2654 pc += ARG_LEN;
2655 k = GET_ARG(pc);
2656 pc += ARG_LEN;
2658 if (x->cp != gData->cpend) {
2659 if (*x->cp == matchCh2)
2660 goto doAlt;
2662 charSet = &gData->regexp->classList[k];
2663 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2664 goto bad;
2665 matchCh1 = *x->cp;
2666 k = matchCh1 >> 3;
2667 if ((charSet->length == 0 ||
2668 matchCh1 > charSet->length ||
2669 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2670 charSet->sense) {
2671 goto doAlt;
2674 result = NULL;
2675 break;
2677 case REOP_ALTPREREQ:
2678 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2679 pc += ARG_LEN;
2680 matchCh1 = GET_ARG(pc);
2681 pc += ARG_LEN;
2682 matchCh2 = GET_ARG(pc);
2683 pc += ARG_LEN;
2684 if (x->cp == gData->cpend ||
2685 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2686 result = NULL;
2687 break;
2689 /* else fall through... */
2691 case REOP_ALT:
2692 doAlt:
2693 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2694 pc += ARG_LEN; /* start of this alternate */
2695 curState->parenSoFar = parenSoFar;
2696 PUSH_STATE_STACK(gData);
2697 op = (REOp) *pc++;
2698 startcp = x->cp;
2699 if (REOP_IS_SIMPLE(op)) {
2700 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2701 op = (REOp) *nextpc++;
2702 pc = nextpc;
2703 continue;
2705 result = x;
2706 op = (REOp) *pc++;
2708 nextop = (REOp) *nextpc++;
2709 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2710 goto bad;
2711 continue;
2714 * Occurs at (successful) end of REOP_ALT,
2716 case REOP_JUMP:
2718 * If we have not gotten a result here, it is because of an
2719 * empty match. Do the same thing REOP_EMPTY would do.
2721 if (!result)
2722 result = x;
2724 --gData->stateStackTop;
2725 pc += GET_OFFSET(pc);
2726 op = (REOp) *pc++;
2727 continue;
2730 * Occurs at last (successful) end of REOP_ALT,
2732 case REOP_ENDALT:
2734 * If we have not gotten a result here, it is because of an
2735 * empty match. Do the same thing REOP_EMPTY would do.
2737 if (!result)
2738 result = x;
2740 --gData->stateStackTop;
2741 op = (REOp) *pc++;
2742 continue;
2744 case REOP_LPAREN:
2745 pc = ReadCompactIndex(pc, &parenIndex);
2746 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2747 assert(parenIndex < gData->regexp->parenCount);
2748 if (parenIndex + 1 > parenSoFar)
2749 parenSoFar = parenIndex + 1;
2750 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2751 x->parens[parenIndex].length = 0;
2752 op = (REOp) *pc++;
2753 continue;
2755 case REOP_RPAREN:
2757 ptrdiff_t delta;
2759 pc = ReadCompactIndex(pc, &parenIndex);
2760 assert(parenIndex < gData->regexp->parenCount);
2761 cap = &x->parens[parenIndex];
2762 delta = x->cp - (gData->cpbegin + cap->index);
2763 cap->length = (delta < 0) ? 0 : (size_t) delta;
2764 op = (REOp) *pc++;
2766 if (!result)
2767 result = x;
2768 continue;
2770 case REOP_ASSERT:
2771 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2772 pc += ARG_LEN; /* start of ASSERT child */
2773 op = (REOp) *pc++;
2774 testpc = pc;
2775 if (REOP_IS_SIMPLE(op) &&
2776 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2777 result = NULL;
2778 break;
2780 curState->u.assertion.top =
2781 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2782 curState->u.assertion.sz = gData->cursz;
2783 curState->index = x->cp - gData->cpbegin;
2784 curState->parenSoFar = parenSoFar;
2785 PUSH_STATE_STACK(gData);
2786 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2787 nextpc, x, x->cp, 0, 0)) {
2788 goto bad;
2790 continue;
2792 case REOP_ASSERT_NOT:
2793 nextpc = pc + GET_OFFSET(pc);
2794 pc += ARG_LEN;
2795 op = (REOp) *pc++;
2796 testpc = pc;
2797 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2798 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2799 *testpc == REOP_ASSERTNOTTEST) {
2800 result = NULL;
2801 break;
2803 curState->u.assertion.top
2804 = (char *)gData->backTrackSP -
2805 (char *)gData->backTrackStack;
2806 curState->u.assertion.sz = gData->cursz;
2807 curState->index = x->cp - gData->cpbegin;
2808 curState->parenSoFar = parenSoFar;
2809 PUSH_STATE_STACK(gData);
2810 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2811 nextpc, x, x->cp, 0, 0)) {
2812 goto bad;
2814 continue;
2816 case REOP_ASSERTTEST:
2817 --gData->stateStackTop;
2818 --curState;
2819 x->cp = gData->cpbegin + curState->index;
2820 gData->backTrackSP =
2821 (REBackTrackData *) ((char *)gData->backTrackStack +
2822 curState->u.assertion.top);
2823 gData->cursz = curState->u.assertion.sz;
2824 if (result)
2825 result = x;
2826 break;
2828 case REOP_ASSERTNOTTEST:
2829 --gData->stateStackTop;
2830 --curState;
2831 x->cp = gData->cpbegin + curState->index;
2832 gData->backTrackSP =
2833 (REBackTrackData *) ((char *)gData->backTrackStack +
2834 curState->u.assertion.top);
2835 gData->cursz = curState->u.assertion.sz;
2836 result = (!result) ? x : NULL;
2837 break;
2838 case REOP_STAR:
2839 curState->u.quantifier.min = 0;
2840 curState->u.quantifier.max = (UINT)-1;
2841 goto quantcommon;
2842 case REOP_PLUS:
2843 curState->u.quantifier.min = 1;
2844 curState->u.quantifier.max = (UINT)-1;
2845 goto quantcommon;
2846 case REOP_OPT:
2847 curState->u.quantifier.min = 0;
2848 curState->u.quantifier.max = 1;
2849 goto quantcommon;
2850 case REOP_QUANT:
2851 pc = ReadCompactIndex(pc, &k);
2852 curState->u.quantifier.min = k;
2853 pc = ReadCompactIndex(pc, &k);
2854 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2855 curState->u.quantifier.max = k - 1;
2856 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2857 quantcommon:
2858 if (curState->u.quantifier.max == 0) {
2859 pc = pc + GET_OFFSET(pc);
2860 op = (REOp) *pc++;
2861 result = x;
2862 continue;
2864 /* Step over <next> */
2865 nextpc = pc + ARG_LEN;
2866 op = (REOp) *nextpc++;
2867 startcp = x->cp;
2868 if (REOP_IS_SIMPLE(op)) {
2869 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2870 if (curState->u.quantifier.min == 0)
2871 result = x;
2872 else
2873 result = NULL;
2874 pc = pc + GET_OFFSET(pc);
2875 break;
2877 op = (REOp) *nextpc++;
2878 result = x;
2880 curState->index = startcp - gData->cpbegin;
2881 curState->continue_op = REOP_REPEAT;
2882 curState->continue_pc = pc;
2883 curState->parenSoFar = parenSoFar;
2884 PUSH_STATE_STACK(gData);
2885 if (curState->u.quantifier.min == 0 &&
2886 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2887 0, 0)) {
2888 goto bad;
2890 pc = nextpc;
2891 continue;
2893 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2894 pc = curState[-1].continue_pc;
2895 op = (REOp) curState[-1].continue_op;
2897 if (!result)
2898 result = x;
2899 continue;
2901 case REOP_REPEAT:
2902 --curState;
2903 do {
2904 --gData->stateStackTop;
2905 if (!result) {
2906 /* Failed, see if we have enough children. */
2907 if (curState->u.quantifier.min == 0)
2908 goto repeatDone;
2909 goto break_switch;
2911 if (curState->u.quantifier.min == 0 &&
2912 x->cp == gData->cpbegin + curState->index) {
2913 /* matched an empty string, that'll get us nowhere */
2914 result = NULL;
2915 goto break_switch;
2917 if (curState->u.quantifier.min != 0)
2918 curState->u.quantifier.min--;
2919 if (curState->u.quantifier.max != (UINT) -1)
2920 curState->u.quantifier.max--;
2921 if (curState->u.quantifier.max == 0)
2922 goto repeatDone;
2923 nextpc = pc + ARG_LEN;
2924 nextop = (REOp) *nextpc;
2925 startcp = x->cp;
2926 if (REOP_IS_SIMPLE(nextop)) {
2927 nextpc++;
2928 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2929 if (curState->u.quantifier.min == 0)
2930 goto repeatDone;
2931 result = NULL;
2932 goto break_switch;
2934 result = x;
2936 curState->index = startcp - gData->cpbegin;
2937 PUSH_STATE_STACK(gData);
2938 if (curState->u.quantifier.min == 0 &&
2939 !PushBackTrackState(gData, REOP_REPEAT,
2940 pc, x, startcp,
2941 curState->parenSoFar,
2942 parenSoFar -
2943 curState->parenSoFar)) {
2944 goto bad;
2946 } while (*nextpc == REOP_ENDCHILD);
2947 pc = nextpc;
2948 op = (REOp) *pc++;
2949 parenSoFar = curState->parenSoFar;
2950 continue;
2952 repeatDone:
2953 result = x;
2954 pc += GET_OFFSET(pc);
2955 goto break_switch;
2957 case REOP_MINIMALSTAR:
2958 curState->u.quantifier.min = 0;
2959 curState->u.quantifier.max = (UINT)-1;
2960 goto minimalquantcommon;
2961 case REOP_MINIMALPLUS:
2962 curState->u.quantifier.min = 1;
2963 curState->u.quantifier.max = (UINT)-1;
2964 goto minimalquantcommon;
2965 case REOP_MINIMALOPT:
2966 curState->u.quantifier.min = 0;
2967 curState->u.quantifier.max = 1;
2968 goto minimalquantcommon;
2969 case REOP_MINIMALQUANT:
2970 pc = ReadCompactIndex(pc, &k);
2971 curState->u.quantifier.min = k;
2972 pc = ReadCompactIndex(pc, &k);
2973 /* See REOP_QUANT comments about k - 1. */
2974 curState->u.quantifier.max = k - 1;
2975 assert(curState->u.quantifier.min
2976 <= curState->u.quantifier.max);
2977 minimalquantcommon:
2978 curState->index = x->cp - gData->cpbegin;
2979 curState->parenSoFar = parenSoFar;
2980 PUSH_STATE_STACK(gData);
2981 if (curState->u.quantifier.min != 0) {
2982 curState->continue_op = REOP_MINIMALREPEAT;
2983 curState->continue_pc = pc;
2984 /* step over <next> */
2985 pc += OFFSET_LEN;
2986 op = (REOp) *pc++;
2987 } else {
2988 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2989 pc, x, x->cp, 0, 0)) {
2990 goto bad;
2992 --gData->stateStackTop;
2993 pc = pc + GET_OFFSET(pc);
2994 op = (REOp) *pc++;
2996 continue;
2998 case REOP_MINIMALREPEAT:
2999 --gData->stateStackTop;
3000 --curState;
3002 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
3003 #define PREPARE_REPEAT() \
3004 do { \
3005 curState->index = x->cp - gData->cpbegin; \
3006 curState->continue_op = REOP_MINIMALREPEAT; \
3007 curState->continue_pc = pc; \
3008 pc += ARG_LEN; \
3009 for (k = curState->parenSoFar; k < parenSoFar; k++) \
3010 x->parens[k].index = -1; \
3011 PUSH_STATE_STACK(gData); \
3012 op = (REOp) *pc++; \
3013 assert(op < REOP_LIMIT); \
3014 }while(0)
3016 if (!result) {
3017 TRACE(" -\n");
3019 * Non-greedy failure - try to consume another child.
3021 if (curState->u.quantifier.max == (UINT) -1 ||
3022 curState->u.quantifier.max > 0) {
3023 PREPARE_REPEAT();
3024 continue;
3026 /* Don't need to adjust pc since we're going to pop. */
3027 break;
3029 if (curState->u.quantifier.min == 0 &&
3030 x->cp == gData->cpbegin + curState->index) {
3031 /* Matched an empty string, that'll get us nowhere. */
3032 result = NULL;
3033 break;
3035 if (curState->u.quantifier.min != 0)
3036 curState->u.quantifier.min--;
3037 if (curState->u.quantifier.max != (UINT) -1)
3038 curState->u.quantifier.max--;
3039 if (curState->u.quantifier.min != 0) {
3040 PREPARE_REPEAT();
3041 continue;
3043 curState->index = x->cp - gData->cpbegin;
3044 curState->parenSoFar = parenSoFar;
3045 PUSH_STATE_STACK(gData);
3046 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3047 pc, x, x->cp,
3048 curState->parenSoFar,
3049 parenSoFar - curState->parenSoFar)) {
3050 goto bad;
3052 --gData->stateStackTop;
3053 pc = pc + GET_OFFSET(pc);
3054 op = (REOp) *pc++;
3055 assert(op < REOP_LIMIT);
3056 continue;
3057 default:
3058 assert(FALSE);
3059 result = NULL;
3061 break_switch:;
3065 * If the match failed and there's a backtrack option, take it.
3066 * Otherwise this is a complete and utter failure.
3068 if (!result) {
3069 if (gData->cursz == 0)
3070 return NULL;
3072 /* Potentially detect explosive regex here. */
3073 gData->backTrackCount++;
3074 if (gData->backTrackLimit &&
3075 gData->backTrackCount >= gData->backTrackLimit) {
3076 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3077 JSMSG_REGEXP_TOO_COMPLEX);
3078 gData->ok = FALSE;
3079 return NULL;
3082 backTrackData = gData->backTrackSP;
3083 gData->cursz = backTrackData->sz;
3084 gData->backTrackSP =
3085 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3086 x->cp = backTrackData->cp;
3087 pc = backTrackData->backtrack_pc;
3088 op = (REOp) backTrackData->backtrack_op;
3089 assert(op < REOP_LIMIT);
3090 gData->stateStackTop = backTrackData->saveStateStackTop;
3091 assert(gData->stateStackTop);
3093 memcpy(gData->stateStack, backTrackData + 1,
3094 sizeof(REProgState) * backTrackData->saveStateStackTop);
3095 curState = &gData->stateStack[gData->stateStackTop - 1];
3097 if (backTrackData->parenCount) {
3098 memcpy(&x->parens[backTrackData->parenIndex],
3099 (char *)(backTrackData + 1) +
3100 sizeof(REProgState) * backTrackData->saveStateStackTop,
3101 sizeof(RECapture) * backTrackData->parenCount);
3102 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3103 } else {
3104 for (k = curState->parenSoFar; k < parenSoFar; k++)
3105 x->parens[k].index = -1;
3106 parenSoFar = curState->parenSoFar;
3109 TRACE("\tBT_Pop: %ld,%ld\n",
3110 (ULONG_PTR)backTrackData->parenIndex,
3111 (ULONG_PTR)backTrackData->parenCount);
3112 continue;
3114 x = result;
3117 * Continue with the expression.
3119 op = (REOp)*pc++;
3120 assert(op < REOP_LIMIT);
3123 bad:
3124 TRACE("\n");
3125 return NULL;
3127 good:
3128 TRACE("\n");
3129 return x;
3132 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3134 REMatchState *result;
3135 const WCHAR *cp = x->cp;
3136 const WCHAR *cp2;
3137 UINT j;
3140 * Have to include the position beyond the last character
3141 * in order to detect end-of-input/line condition.
3143 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3144 gData->skipped = cp2 - cp;
3145 x->cp = cp2;
3146 for (j = 0; j < gData->regexp->parenCount; j++)
3147 x->parens[j].index = -1;
3148 result = ExecuteREBytecode(gData, x);
3149 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3150 return result;
3151 gData->backTrackSP = gData->backTrackStack;
3152 gData->cursz = 0;
3153 gData->stateStackTop = 0;
3154 cp2 = cp + gData->skipped;
3156 return NULL;
3159 #define MIN_BACKTRACK_LIMIT 400000
3161 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3163 REMatchState *result;
3164 UINT i;
3166 gData->backTrackStackSize = INITIAL_BACKTRACK;
3167 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3168 if (!gData->backTrackStack)
3169 goto bad;
3171 gData->backTrackSP = gData->backTrackStack;
3172 gData->cursz = 0;
3173 gData->backTrackCount = 0;
3174 gData->backTrackLimit = 0;
3176 gData->stateStackLimit = INITIAL_STATESTACK;
3177 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3178 if (!gData->stateStack)
3179 goto bad;
3181 gData->stateStackTop = 0;
3182 gData->cx = cx;
3183 gData->regexp = re;
3184 gData->ok = TRUE;
3186 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3187 if (!result)
3188 goto bad;
3190 for (i = 0; i < re->classCount; i++) {
3191 if (!re->classList[i].converted &&
3192 !ProcessCharSet(gData, &re->classList[i])) {
3193 return NULL;
3197 return result;
3199 bad:
3200 js_ReportOutOfScriptQuota(cx);
3201 gData->ok = FALSE;
3202 return NULL;
3205 static void
3206 js_DestroyRegExp(JSRegExp *re)
3208 if (re->classList) {
3209 UINT i;
3210 for (i = 0; i < re->classCount; i++) {
3211 if (re->classList[i].converted)
3212 heap_free(re->classList[i].u.bits);
3213 re->classList[i].u.bits = NULL;
3215 heap_free(re->classList);
3217 heap_free(re);
3220 static JSRegExp *
3221 js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
3223 JSRegExp *re;
3224 jsheap_t *mark;
3225 CompilerState state;
3226 size_t resize;
3227 jsbytecode *endPC;
3228 UINT i;
3229 size_t len;
3231 re = NULL;
3232 mark = jsheap_mark(&cx->tmp_heap);
3233 len = SysStringLen(str);
3235 state.context = cx;
3236 state.cp = str;
3237 if (!state.cp)
3238 goto out;
3239 state.cpbegin = state.cp;
3240 state.cpend = state.cp + len;
3241 state.flags = flags;
3242 state.parenCount = 0;
3243 state.classCount = 0;
3244 state.progLength = 0;
3245 state.treeDepth = 0;
3246 state.classBitmapsMem = 0;
3247 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3248 state.classCache[i].start = NULL;
3250 if (len != 0 && flat) {
3251 state.result = NewRENode(&state, REOP_FLAT);
3252 if (!state.result)
3253 goto out;
3254 state.result->u.flat.chr = *state.cpbegin;
3255 state.result->u.flat.length = len;
3256 state.result->kid = (void *) state.cpbegin;
3257 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3258 state.progLength += 1 + GetCompactIndexWidth(0)
3259 + GetCompactIndexWidth(len);
3260 } else {
3261 if (!ParseRegExp(&state))
3262 goto out;
3264 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3265 re = heap_alloc(resize);
3266 if (!re)
3267 goto out;
3269 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3270 re->classCount = state.classCount;
3271 if (re->classCount) {
3272 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3273 if (!re->classList) {
3274 js_DestroyRegExp(re);
3275 re = NULL;
3276 goto out;
3278 for (i = 0; i < re->classCount; i++)
3279 re->classList[i].converted = FALSE;
3280 } else {
3281 re->classList = NULL;
3283 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3284 if (!endPC) {
3285 js_DestroyRegExp(re);
3286 re = NULL;
3287 goto out;
3289 *endPC++ = REOP_END;
3291 * Check whether size was overestimated and shrink using realloc.
3292 * This is safe since no pointers to newly parsed regexp or its parts
3293 * besides re exist here.
3295 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3296 JSRegExp *tmp;
3297 assert((size_t)(endPC - re->program) < state.progLength + 1);
3298 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3299 tmp = heap_realloc(re, resize);
3300 if (tmp)
3301 re = tmp;
3304 re->flags = flags;
3305 re->parenCount = state.parenCount;
3306 re->source = str;
3308 out:
3309 jsheap_clear(mark);
3310 return re;
3313 static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
3315 return (RegExpInstance*)vdisp->u.jsdisp;
3318 static void set_last_index(RegExpInstance *This, DWORD last_index)
3320 This->last_index = last_index;
3321 jsval_release(This->last_index_val);
3322 This->last_index_val = jsval_number(last_index);
3325 static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, DWORD rem_flags,
3326 const WCHAR *str, DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size,
3327 DWORD *parens_cnt, match_result_t *ret)
3329 REMatchState *x, *result;
3330 REGlobalData gData;
3331 DWORD matchlen;
3333 gData.cpbegin = str;
3334 gData.cpend = str + len;
3335 gData.start = *cp-str;
3336 gData.skipped = 0;
3337 gData.pool = &ctx->tmp_heap;
3339 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3340 if(!x) {
3341 WARN("InitMatch failed\n");
3342 return E_FAIL;
3345 x->cp = *cp;
3346 result = MatchRegExp(&gData, x);
3347 if(!gData.ok) {
3348 WARN("MatchRegExp failed\n");
3349 return E_FAIL;
3352 if(!result) {
3353 if(rem_flags & REM_RESET_INDEX)
3354 set_last_index(regexp, 0);
3355 return S_FALSE;
3358 if(parens) {
3359 if(regexp->jsregexp->parenCount > *parens_size) {
3360 match_result_t *new_parens;
3362 if(*parens)
3363 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3364 else
3365 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3366 if(!new_parens)
3367 return E_OUTOFMEMORY;
3369 *parens = new_parens;
3373 /* FIXME: We often already have a copy of input string that we could use to store last match */
3374 if(!(rem_flags & REM_NO_CTX_UPDATE) &&
3375 (!ctx->last_match || len != SysStringLen(ctx->last_match) || strncmpW(ctx->last_match, str, len))) {
3376 BSTR last_match;
3378 last_match = SysAllocStringLen(str, len);
3379 if(!last_match)
3380 return E_OUTOFMEMORY;
3381 SysFreeString(ctx->last_match);
3382 ctx->last_match = last_match;
3385 if(parens) {
3386 DWORD i;
3388 *parens_cnt = regexp->jsregexp->parenCount;
3390 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3391 if(result->parens[i].index == -1) {
3392 (*parens)[i].str = NULL;
3393 (*parens)[i].len = 0;
3394 }else {
3395 (*parens)[i].str = str + result->parens[i].index;
3396 (*parens)[i].len = result->parens[i].length;
3401 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3402 DWORD i, n = min(sizeof(ctx->match_parens)/sizeof(ctx->match_parens[0]), regexp->jsregexp->parenCount);
3404 for(i=0; i < n; i++) {
3405 if(result->parens[i].index == -1) {
3406 ctx->match_parens[i].str = NULL;
3407 ctx->match_parens[i].len = 0;
3408 }else {
3409 ctx->match_parens[i].str = ctx->last_match + result->parens[i].index;
3410 ctx->match_parens[i].len = result->parens[i].length;
3414 if(n < sizeof(ctx->match_parens)/sizeof(ctx->match_parens[0]))
3415 memset(ctx->match_parens+n, 0, sizeof(ctx->match_parens) - n*sizeof(ctx->match_parens[0]));
3418 matchlen = (result->cp-*cp) - gData.skipped;
3419 *cp = result->cp;
3420 ret->str = result->cp-matchlen;
3421 ret->len = matchlen;
3422 set_last_index(regexp, result->cp-str);
3424 if(!(rem_flags & REM_NO_CTX_UPDATE)) {
3425 ctx->last_match_index = ret->str-str;
3426 ctx->last_match_length = matchlen;
3429 return S_OK;
3432 HRESULT regexp_match_next(script_ctx_t *ctx, jsdisp_t *dispex, DWORD rem_flags, const WCHAR *str,
3433 DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt,
3434 match_result_t *ret)
3436 RegExpInstance *regexp = (RegExpInstance*)dispex;
3437 jsheap_t *mark;
3438 HRESULT hres;
3440 if((rem_flags & REM_CHECK_GLOBAL) && !(regexp->jsregexp->flags & JSREG_GLOB))
3441 return S_FALSE;
3443 mark = jsheap_mark(&ctx->tmp_heap);
3445 hres = do_regexp_match_next(ctx, regexp, rem_flags, str, len, cp, parens, parens_size, parens_cnt, ret);
3447 jsheap_clear(mark);
3448 return hres;
3451 HRESULT regexp_match(script_ctx_t *ctx, jsdisp_t *dispex, const WCHAR *str, DWORD len, BOOL gflag,
3452 match_result_t **match_result, DWORD *result_cnt)
3454 RegExpInstance *This = (RegExpInstance*)dispex;
3455 match_result_t *ret = NULL, cres;
3456 const WCHAR *cp = str;
3457 DWORD i=0, ret_size = 0;
3458 jsheap_t *mark;
3459 HRESULT hres;
3461 mark = jsheap_mark(&ctx->tmp_heap);
3463 while(1) {
3464 hres = do_regexp_match_next(ctx, This, 0, str, len, &cp, NULL, NULL, NULL, &cres);
3465 if(hres == S_FALSE) {
3466 hres = S_OK;
3467 break;
3470 if(FAILED(hres))
3471 break;
3473 if(ret_size == i) {
3474 if(ret)
3475 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3476 else
3477 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3478 if(!ret) {
3479 hres = E_OUTOFMEMORY;
3480 break;
3484 ret[i++] = cres;
3486 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3487 hres = S_OK;
3488 break;
3492 jsheap_clear(mark);
3493 if(FAILED(hres)) {
3494 heap_free(ret);
3495 return hres;
3498 *match_result = ret;
3499 *result_cnt = i;
3500 return S_OK;
3503 static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3504 jsval_t *r)
3506 TRACE("\n");
3508 switch(flags) {
3509 case DISPATCH_PROPERTYGET: {
3510 RegExpInstance *This = regexp_from_vdisp(jsthis);
3511 BSTR ret = SysAllocString(This->str);
3512 if(!ret)
3513 return E_OUTOFMEMORY;
3514 *r = jsval_string(ret);
3515 break;
3517 default:
3518 FIXME("Unimplemented flags %x\n", flags);
3519 return E_NOTIMPL;
3522 return S_OK;
3525 static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3526 jsval_t *r)
3528 FIXME("\n");
3529 return E_NOTIMPL;
3532 static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3533 jsval_t *r)
3535 FIXME("\n");
3536 return E_NOTIMPL;
3539 static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3540 jsval_t *r)
3542 FIXME("\n");
3543 return E_NOTIMPL;
3546 static INT index_from_val(script_ctx_t *ctx, jsval_t v)
3548 double n;
3549 HRESULT hres;
3551 hres = to_number(ctx, v, &n);
3552 if(FAILED(hres)) {
3553 clear_ei(ctx); /* FIXME: Move ignoring exceptions to to_primitive */
3554 return 0;
3557 n = floor(n);
3558 return is_int32(n) ? n : 0;
3561 static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3562 jsval_t *r)
3564 TRACE("\n");
3566 switch(flags) {
3567 case DISPATCH_PROPERTYGET: {
3568 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3570 return jsval_copy(regexp->last_index_val, r);
3572 case DISPATCH_PROPERTYPUT: {
3573 RegExpInstance *regexp = regexp_from_vdisp(jsthis);
3574 HRESULT hres;
3576 hres = jsval_copy(argv[0], &regexp->last_index_val);
3577 if(FAILED(hres))
3578 return hres;
3580 regexp->last_index = index_from_val(ctx, argv[0]);
3581 break;
3583 default:
3584 FIXME("unimplemented flags: %x\n", flags);
3585 return E_NOTIMPL;
3588 return S_OK;
3591 static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3592 jsval_t *r)
3594 FIXME("\n");
3595 return E_NOTIMPL;
3598 static HRESULT create_match_array(script_ctx_t *ctx, BSTR input, const match_result_t *result,
3599 const match_result_t *parens, DWORD parens_cnt, IDispatch **ret)
3601 jsdisp_t *array;
3602 BSTR str;
3603 int i;
3604 HRESULT hres = S_OK;
3606 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3607 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3608 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3609 static const WCHAR zeroW[] = {'0',0};
3611 hres = create_array(ctx, parens_cnt+1, &array);
3612 if(FAILED(hres))
3613 return hres;
3615 for(i=0; i < parens_cnt; i++) {
3616 str = SysAllocStringLen(parens[i].str, parens[i].len);
3617 if(!str) {
3618 hres = E_OUTOFMEMORY;
3619 break;
3622 hres = jsdisp_propput_idx(array, i+1, jsval_string(str));
3623 SysFreeString(str);
3624 if(FAILED(hres))
3625 break;
3628 while(SUCCEEDED(hres)) {
3629 hres = jsdisp_propput_name(array, indexW, jsval_number(result->str-input));
3630 if(FAILED(hres))
3631 break;
3633 hres = jsdisp_propput_name(array, lastIndexW, jsval_number(result->str-input+result->len));
3634 if(FAILED(hres))
3635 break;
3637 hres = jsdisp_propput_name(array, inputW, jsval_string(input));
3638 if(FAILED(hres))
3639 break;
3641 str = SysAllocStringLen(result->str, result->len);
3642 if(!str) {
3643 hres = E_OUTOFMEMORY;
3644 break;
3646 hres = jsdisp_propput_name(array, zeroW, jsval_string(str));
3647 SysFreeString(str);
3648 break;
3651 if(FAILED(hres)) {
3652 jsdisp_release(array);
3653 return hres;
3656 *ret = to_disp(array);
3657 return S_OK;
3660 static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, jsval_t arg, BSTR *input,
3661 match_result_t *match, match_result_t **parens, DWORD *parens_cnt, BOOL *ret)
3663 RegExpInstance *regexp;
3664 DWORD parens_size = 0, last_index = 0, length;
3665 const WCHAR *cp;
3666 BSTR string;
3667 HRESULT hres;
3669 if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
3670 FIXME("Not a RegExp\n");
3671 return E_NOTIMPL;
3674 regexp = regexp_from_vdisp(jsthis);
3676 hres = to_string(ctx, arg, &string);
3677 if(FAILED(hres))
3678 return hres;
3679 length = SysStringLen(string);
3681 if(regexp->jsregexp->flags & JSREG_GLOB) {
3682 if(regexp->last_index < 0) {
3683 SysFreeString(string);
3684 set_last_index(regexp, 0);
3685 *ret = FALSE;
3686 if(input) {
3687 *input = NULL;
3689 return S_OK;
3692 last_index = regexp->last_index;
3695 cp = string + last_index;
3696 hres = regexp_match_next(ctx, &regexp->dispex, REM_RESET_INDEX, string, length, &cp, parens,
3697 parens ? &parens_size : NULL, parens_cnt, match);
3698 if(FAILED(hres)) {
3699 SysFreeString(string);
3700 return hres;
3703 *ret = hres == S_OK;
3704 if(input) {
3705 *input = string;
3706 }else {
3707 SysFreeString(string);
3709 return S_OK;
3712 static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3713 jsval_t *r)
3715 match_result_t *parens = NULL, match;
3716 DWORD parens_cnt = 0;
3717 BOOL b;
3718 BSTR string;
3719 HRESULT hres;
3721 TRACE("\n");
3723 hres = run_exec(ctx, jsthis, argc ? argv[0] : jsval_string(NULL), &string, &match, &parens, &parens_cnt, &b);
3724 if(FAILED(hres))
3725 return hres;
3727 if(r) {
3728 if(b) {
3729 IDispatch *ret;
3731 hres = create_match_array(ctx, string, &match, parens, parens_cnt, &ret);
3732 if(SUCCEEDED(hres))
3733 *r = jsval_disp(ret);
3734 }else {
3735 *r = jsval_null();
3739 heap_free(parens);
3740 SysFreeString(string);
3741 return hres;
3744 static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3745 jsval_t *r)
3747 match_result_t match;
3748 BSTR undef_str;
3749 BOOL b;
3750 HRESULT hres;
3752 TRACE("\n");
3754 if(!argc) {
3755 undef_str = SysAllocString(undefinedW);
3756 if(!undef_str)
3757 return E_OUTOFMEMORY;
3760 hres = run_exec(ctx, jsthis, argc ? argv[0] : jsval_string(undef_str), NULL, &match, NULL, NULL, &b);
3761 if(!argc)
3762 SysFreeString(undef_str);
3763 if(FAILED(hres))
3764 return hres;
3766 if(r)
3767 *r = jsval_bool(b);
3768 return S_OK;
3771 static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
3772 jsval_t *r)
3774 TRACE("\n");
3776 switch(flags) {
3777 case INVOKE_FUNC:
3778 return throw_type_error(ctx, JS_E_FUNCTION_EXPECTED, NULL);
3779 default:
3780 FIXME("unimplemented flags %x\n", flags);
3781 return E_NOTIMPL;
3784 return S_OK;
3787 static void RegExp_destructor(jsdisp_t *dispex)
3789 RegExpInstance *This = (RegExpInstance*)dispex;
3791 if(This->jsregexp)
3792 js_DestroyRegExp(This->jsregexp);
3793 jsval_release(This->last_index_val);
3794 SysFreeString(This->str);
3795 heap_free(This);
3798 static const builtin_prop_t RegExp_props[] = {
3799 {execW, RegExp_exec, PROPF_METHOD|1},
3800 {globalW, RegExp_global, 0},
3801 {ignoreCaseW, RegExp_ignoreCase, 0},
3802 {lastIndexW, RegExp_lastIndex, 0},
3803 {multilineW, RegExp_multiline, 0},
3804 {sourceW, RegExp_source, 0},
3805 {testW, RegExp_test, PROPF_METHOD|1},
3806 {toStringW, RegExp_toString, PROPF_METHOD}
3809 static const builtin_info_t RegExp_info = {
3810 JSCLASS_REGEXP,
3811 {NULL, RegExp_value, 0},
3812 sizeof(RegExp_props)/sizeof(*RegExp_props),
3813 RegExp_props,
3814 RegExp_destructor,
3815 NULL
3818 static const builtin_prop_t RegExpInst_props[] = {
3819 {globalW, RegExp_global, 0},
3820 {ignoreCaseW, RegExp_ignoreCase, 0},
3821 {lastIndexW, RegExp_lastIndex, 0},
3822 {multilineW, RegExp_multiline, 0},
3823 {sourceW, RegExp_source, 0}
3826 static const builtin_info_t RegExpInst_info = {
3827 JSCLASS_REGEXP,
3828 {NULL, RegExp_value, 0},
3829 sizeof(RegExpInst_props)/sizeof(*RegExpInst_props),
3830 RegExpInst_props,
3831 RegExp_destructor,
3832 NULL
3835 static HRESULT alloc_regexp(script_ctx_t *ctx, jsdisp_t *object_prototype, RegExpInstance **ret)
3837 RegExpInstance *regexp;
3838 HRESULT hres;
3840 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3841 if(!regexp)
3842 return E_OUTOFMEMORY;
3844 if(object_prototype)
3845 hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
3846 else
3847 hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExpInst_info, ctx->regexp_constr);
3849 if(FAILED(hres)) {
3850 heap_free(regexp);
3851 return hres;
3854 *ret = regexp;
3855 return S_OK;
3858 HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, jsdisp_t **ret)
3860 RegExpInstance *regexp;
3861 HRESULT hres;
3863 TRACE("%s %x\n", debugstr_wn(exp, len), flags);
3865 hres = alloc_regexp(ctx, NULL, &regexp);
3866 if(FAILED(hres))
3867 return hres;
3869 if(len == -1)
3870 regexp->str = SysAllocString(exp);
3871 else
3872 regexp->str = SysAllocStringLen(exp, len);
3873 if(!regexp->str) {
3874 jsdisp_release(&regexp->dispex);
3875 return E_OUTOFMEMORY;
3878 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3879 if(!regexp->jsregexp) {
3880 WARN("js_NewRegExp failed\n");
3881 jsdisp_release(&regexp->dispex);
3882 return E_FAIL;
3885 regexp->last_index_val = jsval_number(0);
3887 *ret = &regexp->dispex;
3888 return S_OK;
3891 HRESULT create_regexp_var(script_ctx_t *ctx, jsval_t src_arg, jsval_t *flags_arg, jsdisp_t **ret)
3893 const WCHAR *opt = emptyW, *src;
3894 DWORD flags;
3895 HRESULT hres;
3897 if(is_object_instance(src_arg)) {
3898 jsdisp_t *obj;
3900 obj = iface_to_jsdisp((IUnknown*)get_object(src_arg));
3901 if(obj) {
3902 if(is_class(obj, JSCLASS_REGEXP)) {
3903 RegExpInstance *regexp = (RegExpInstance*)obj;
3905 hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, ret);
3906 jsdisp_release(obj);
3907 return hres;
3910 jsdisp_release(obj);
3914 if(!is_string(src_arg)) {
3915 FIXME("src_arg = %s\n", debugstr_jsval(src_arg));
3916 return E_NOTIMPL;
3919 src = get_string(src_arg);
3921 if(flags_arg) {
3922 if(!is_string(*flags_arg)) {
3923 FIXME("unimplemented for %s\n", debugstr_jsval(*flags_arg));
3924 return E_NOTIMPL;
3927 opt = get_string(*flags_arg);
3930 hres = parse_regexp_flags(opt, strlenW(opt), &flags);
3931 if(FAILED(hres))
3932 return hres;
3934 return create_regexp(ctx, src, -1, flags, ret);
3937 HRESULT regexp_string_match(script_ctx_t *ctx, jsdisp_t *re, BSTR str, jsval_t *r)
3939 static const WCHAR indexW[] = {'i','n','d','e','x',0};
3940 static const WCHAR inputW[] = {'i','n','p','u','t',0};
3941 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
3943 RegExpInstance *regexp = (RegExpInstance*)re;
3944 match_result_t *match_result;
3945 DWORD match_cnt, i, length;
3946 jsdisp_t *array;
3947 HRESULT hres;
3949 length = SysStringLen(str);
3951 if(!(regexp->jsregexp->flags & JSREG_GLOB)) {
3952 match_result_t match, *parens = NULL;
3953 DWORD parens_cnt, parens_size = 0;
3954 const WCHAR *cp = str;
3956 hres = regexp_match_next(ctx, &regexp->dispex, 0, str, length, &cp, &parens, &parens_size, &parens_cnt, &match);
3957 if(FAILED(hres))
3958 return hres;
3960 if(r) {
3961 if(hres == S_OK) {
3962 IDispatch *ret;
3964 hres = create_match_array(ctx, str, &match, parens, parens_cnt, &ret);
3965 if(SUCCEEDED(hres))
3966 *r = jsval_disp(ret);
3967 }else {
3968 *r = jsval_null();
3972 heap_free(parens);
3973 return S_OK;
3976 hres = regexp_match(ctx, &regexp->dispex, str, length, FALSE, &match_result, &match_cnt);
3977 if(FAILED(hres))
3978 return hres;
3980 if(!match_cnt) {
3981 TRACE("no match\n");
3983 if(r)
3984 *r = jsval_null();
3985 return S_OK;
3988 hres = create_array(ctx, match_cnt, &array);
3989 if(FAILED(hres))
3990 return hres;
3992 for(i=0; i < match_cnt; i++) {
3993 BSTR tmp_str;
3995 tmp_str = SysAllocStringLen(match_result[i].str, match_result[i].len);
3996 if(!tmp_str) {
3997 hres = E_OUTOFMEMORY;
3998 break;
4001 hres = jsdisp_propput_idx(array, i, jsval_string(tmp_str));
4002 SysFreeString(tmp_str);
4003 if(FAILED(hres))
4004 break;
4007 while(SUCCEEDED(hres)) {
4008 hres = jsdisp_propput_name(array, indexW, jsval_number(match_result[match_cnt-1].str-str));
4009 if(FAILED(hres))
4010 break;
4012 hres = jsdisp_propput_name(array, lastIndexW,
4013 jsval_number(match_result[match_cnt-1].str-str+match_result[match_cnt-1].len));
4014 if(FAILED(hres))
4015 break;
4017 hres = jsdisp_propput_name(array, inputW, jsval_string(str));
4018 break;
4021 heap_free(match_result);
4023 if(SUCCEEDED(hres) && r)
4024 *r = jsval_obj(array);
4025 else
4026 jsdisp_release(array);
4027 return hres;
4030 static HRESULT global_idx(script_ctx_t *ctx, DWORD flags, DWORD idx, jsval_t *r)
4032 switch(flags) {
4033 case DISPATCH_PROPERTYGET: {
4034 BSTR ret = NULL;
4036 ret = SysAllocStringLen(ctx->match_parens[idx].str, ctx->match_parens[idx].len);
4037 if(!ret)
4038 return E_OUTOFMEMORY;
4040 *r = jsval_string(ret);
4041 break;
4043 case DISPATCH_PROPERTYPUT:
4044 break;
4045 default:
4046 FIXME("unsupported flags\n");
4047 return E_NOTIMPL;
4050 return S_OK;
4053 static HRESULT RegExpConstr_idx1(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4054 unsigned argc, jsval_t *argv, jsval_t *r)
4056 TRACE("\n");
4057 return global_idx(ctx, flags, 0, r);
4060 static HRESULT RegExpConstr_idx2(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4061 unsigned argc, jsval_t *argv, jsval_t *r)
4063 TRACE("\n");
4064 return global_idx(ctx, flags, 1, r);
4067 static HRESULT RegExpConstr_idx3(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4068 unsigned argc, jsval_t *argv, jsval_t *r)
4070 TRACE("\n");
4071 return global_idx(ctx, flags, 2, r);
4074 static HRESULT RegExpConstr_idx4(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4075 unsigned argc, jsval_t *argv, jsval_t *r)
4077 TRACE("\n");
4078 return global_idx(ctx, flags, 3, r);
4081 static HRESULT RegExpConstr_idx5(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4082 unsigned argc, jsval_t *argv, jsval_t *r)
4084 TRACE("\n");
4085 return global_idx(ctx, flags, 4, r);
4088 static HRESULT RegExpConstr_idx6(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4089 unsigned argc, jsval_t *argv, jsval_t *r)
4091 TRACE("\n");
4092 return global_idx(ctx, flags, 5, r);
4095 static HRESULT RegExpConstr_idx7(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4096 unsigned argc, jsval_t *argv, jsval_t *r)
4098 TRACE("\n");
4099 return global_idx(ctx, flags, 6, r);
4102 static HRESULT RegExpConstr_idx8(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4103 unsigned argc, jsval_t *argv, jsval_t *r)
4105 TRACE("\n");
4106 return global_idx(ctx, flags, 7, r);
4109 static HRESULT RegExpConstr_idx9(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4110 unsigned argc, jsval_t *argv, jsval_t *r)
4112 TRACE("\n");
4113 return global_idx(ctx, flags, 8, r);
4116 static HRESULT RegExpConstr_leftContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4117 unsigned argc, jsval_t *argv, jsval_t *r)
4119 TRACE("\n");
4121 switch(flags) {
4122 case DISPATCH_PROPERTYGET: {
4123 BSTR ret;
4125 ret = SysAllocStringLen(ctx->last_match, ctx->last_match_index);
4126 if(!ret)
4127 return E_OUTOFMEMORY;
4129 *r = jsval_string(ret);
4130 break;
4132 case DISPATCH_PROPERTYPUT:
4133 break;
4134 default:
4135 FIXME("unsupported flags\n");
4136 return E_NOTIMPL;
4139 return S_OK;
4142 static HRESULT RegExpConstr_rightContext(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags,
4143 unsigned argc, jsval_t *argv, jsval_t *r)
4145 TRACE("\n");
4147 switch(flags) {
4148 case DISPATCH_PROPERTYGET: {
4149 BSTR ret;
4151 ret = SysAllocString(ctx->last_match+ctx->last_match_index+ctx->last_match_length);
4152 if(!ret)
4153 return E_OUTOFMEMORY;
4155 *r = jsval_string(ret);
4156 break;
4158 case DISPATCH_PROPERTYPUT:
4159 break;
4160 default:
4161 FIXME("unsupported flags\n");
4162 return E_NOTIMPL;
4165 return S_OK;
4168 static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, unsigned argc, jsval_t *argv,
4169 jsval_t *r)
4171 TRACE("\n");
4173 switch(flags) {
4174 case DISPATCH_METHOD:
4175 if(argc) {
4176 if(is_object_instance(argv[0])) {
4177 jsdisp_t *jsdisp = iface_to_jsdisp((IUnknown*)get_object(argv[0]));
4178 if(jsdisp) {
4179 if(is_class(jsdisp, JSCLASS_REGEXP)) {
4180 if(argc > 1 && !is_undefined(argv[1])) {
4181 jsdisp_release(jsdisp);
4182 return throw_regexp_error(ctx, JS_E_REGEXP_SYNTAX, NULL);
4185 if(r)
4186 *r = jsval_obj(jsdisp);
4187 else
4188 jsdisp_release(jsdisp);
4189 return S_OK;
4191 jsdisp_release(jsdisp);
4195 /* fall through */
4196 case DISPATCH_CONSTRUCT: {
4197 jsdisp_t *ret;
4198 HRESULT hres;
4200 if(!argc) {
4201 FIXME("no args\n");
4202 return E_NOTIMPL;
4205 hres = create_regexp_var(ctx, argv[0], argc > 1 ? argv+1 : NULL, &ret);
4206 if(FAILED(hres))
4207 return hres;
4209 if(r)
4210 *r = jsval_obj(ret);
4211 else
4212 jsdisp_release(ret);
4213 return S_OK;
4215 default:
4216 FIXME("unimplemented flags: %x\n", flags);
4217 return E_NOTIMPL;
4220 return S_OK;
4223 static const builtin_prop_t RegExpConstr_props[] = {
4224 {idx1W, RegExpConstr_idx1, 0},
4225 {idx2W, RegExpConstr_idx2, 0},
4226 {idx3W, RegExpConstr_idx3, 0},
4227 {idx4W, RegExpConstr_idx4, 0},
4228 {idx5W, RegExpConstr_idx5, 0},
4229 {idx6W, RegExpConstr_idx6, 0},
4230 {idx7W, RegExpConstr_idx7, 0},
4231 {idx8W, RegExpConstr_idx8, 0},
4232 {idx9W, RegExpConstr_idx9, 0},
4233 {leftContextW, RegExpConstr_leftContext, 0},
4234 {rightContextW, RegExpConstr_rightContext, 0}
4237 static const builtin_info_t RegExpConstr_info = {
4238 JSCLASS_FUNCTION,
4239 {NULL, Function_value, 0},
4240 sizeof(RegExpConstr_props)/sizeof(*RegExpConstr_props),
4241 RegExpConstr_props,
4242 NULL,
4243 NULL
4246 HRESULT create_regexp_constr(script_ctx_t *ctx, jsdisp_t *object_prototype, jsdisp_t **ret)
4248 RegExpInstance *regexp;
4249 HRESULT hres;
4251 static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
4253 hres = alloc_regexp(ctx, object_prototype, &regexp);
4254 if(FAILED(hres))
4255 return hres;
4257 hres = create_builtin_constructor(ctx, RegExpConstr_value, RegExpW, &RegExpConstr_info,
4258 PROPF_CONSTR|2, &regexp->dispex, ret);
4260 jsdisp_release(&regexp->dispex);
4261 return hres;
4264 HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
4266 const WCHAR *p;
4267 DWORD flags = 0;
4269 for (p = str; p < str+str_len; p++) {
4270 switch (*p) {
4271 case 'g':
4272 flags |= JSREG_GLOB;
4273 break;
4274 case 'i':
4275 flags |= JSREG_FOLD;
4276 break;
4277 case 'm':
4278 flags |= JSREG_MULTILINE;
4279 break;
4280 case 'y':
4281 flags |= JSREG_STICKY;
4282 break;
4283 default:
4284 WARN("wrong flag %c\n", *p);
4285 return E_FAIL;
4289 *ret = flags;
4290 return S_OK;