mshtml: Wine Gecko 2.34-beta2 release.
[wine/wine-gecko.git] / dlls / vbscript / regexp.c
blob4777d5069e55fd63544d0c678f6832cf1b7606a6
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
36 #include "vbscript.h"
37 #include "regexp.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(vbscript);
43 /* FIXME: Better error handling */
44 #define ReportRegExpError(a,b,c)
45 #define ReportRegExpErrorHelper(a,b,c,d)
46 #define JS_ReportErrorNumber(a,b,c,d)
47 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
48 #define js_ReportOutOfScriptQuota(a)
49 #define JS_ReportOutOfMemory(a)
50 #define JS_COUNT_OPERATION(a,b)
53 typedef BYTE JSPackedBool;
56 * This struct holds a bitmap representation of a class from a regexp.
57 * There's a list of these referenced by the classList field in the regexp_t
58 * struct below. The initial state has startIndex set to the offset in the
59 * original regexp source of the beginning of the class contents. The first
60 * use of the class converts the source representation into a bitmap.
63 typedef struct RECharSet {
64 JSPackedBool converted;
65 JSPackedBool sense;
66 WORD length;
67 union {
68 BYTE *bits;
69 struct {
70 size_t startIndex;
71 size_t length;
72 } src;
73 } u;
74 } RECharSet;
76 #define JSMSG_MIN_TOO_BIG 47
77 #define JSMSG_MAX_TOO_BIG 48
78 #define JSMSG_OUT_OF_ORDER 49
79 #define JSMSG_OUT_OF_MEMORY 137
81 #define LINE_SEPARATOR 0x2028
82 #define PARA_SEPARATOR 0x2029
84 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
85 ((c >= 'a') && (c <= 'z')) )
86 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
87 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
89 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
91 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
92 #define JS7_UNDEC(c) ((c) - '0')
94 typedef enum REOp {
95 REOP_EMPTY,
96 REOP_BOL,
97 REOP_EOL,
98 REOP_WBDRY,
99 REOP_WNONBDRY,
100 REOP_DOT,
101 REOP_DIGIT,
102 REOP_NONDIGIT,
103 REOP_ALNUM,
104 REOP_NONALNUM,
105 REOP_SPACE,
106 REOP_NONSPACE,
107 REOP_BACKREF,
108 REOP_FLAT,
109 REOP_FLAT1,
110 REOP_FLATi,
111 REOP_FLAT1i,
112 REOP_UCFLAT1,
113 REOP_UCFLAT1i,
114 REOP_UCFLAT,
115 REOP_UCFLATi,
116 REOP_CLASS,
117 REOP_NCLASS,
118 REOP_ALT,
119 REOP_QUANT,
120 REOP_STAR,
121 REOP_PLUS,
122 REOP_OPT,
123 REOP_LPAREN,
124 REOP_RPAREN,
125 REOP_JUMP,
126 REOP_DOTSTAR,
127 REOP_LPARENNON,
128 REOP_ASSERT,
129 REOP_ASSERT_NOT,
130 REOP_ASSERTTEST,
131 REOP_ASSERTNOTTEST,
132 REOP_MINIMALSTAR,
133 REOP_MINIMALPLUS,
134 REOP_MINIMALOPT,
135 REOP_MINIMALQUANT,
136 REOP_ENDCHILD,
137 REOP_REPEAT,
138 REOP_MINIMALREPEAT,
139 REOP_ALTPREREQ,
140 REOP_ALTPREREQ2,
141 REOP_ENDALT,
142 REOP_CONCAT,
143 REOP_END,
144 REOP_LIMIT /* META: no operator >= to this */
145 } REOp;
147 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
149 static const char *reop_names[] = {
150 "empty",
151 "bol",
152 "eol",
153 "wbdry",
154 "wnonbdry",
155 "dot",
156 "digit",
157 "nondigit",
158 "alnum",
159 "nonalnum",
160 "space",
161 "nonspace",
162 "backref",
163 "flat",
164 "flat1",
165 "flati",
166 "flat1i",
167 "ucflat1",
168 "ucflat1i",
169 "ucflat",
170 "ucflati",
171 "class",
172 "nclass",
173 "alt",
174 "quant",
175 "star",
176 "plus",
177 "opt",
178 "lparen",
179 "rparen",
180 "jump",
181 "dotstar",
182 "lparennon",
183 "assert",
184 "assert_not",
185 "asserttest",
186 "assertnottest",
187 "minimalstar",
188 "minimalplus",
189 "minimalopt",
190 "minimalquant",
191 "endchild",
192 "repeat",
193 "minimalrepeat",
194 "altprereq",
195 "alrprereq2",
196 "endalt",
197 "concat",
198 "end",
199 NULL
202 typedef struct REProgState {
203 jsbytecode *continue_pc; /* current continuation data */
204 jsbytecode continue_op;
205 ptrdiff_t index; /* progress in text */
206 size_t parenSoFar; /* highest indexed paren started */
207 union {
208 struct {
209 UINT min; /* current quantifier limits */
210 UINT max;
211 } quantifier;
212 struct {
213 size_t top; /* backtrack stack state */
214 size_t sz;
215 } assertion;
216 } u;
217 } REProgState;
219 typedef struct REBackTrackData {
220 size_t sz; /* size of previous stack entry */
221 jsbytecode *backtrack_pc; /* where to backtrack to */
222 jsbytecode backtrack_op;
223 const WCHAR *cp; /* index in text of match at backtrack */
224 size_t parenIndex; /* start index of saved paren contents */
225 size_t parenCount; /* # of saved paren contents */
226 size_t saveStateStackTop; /* number of parent states */
227 /* saved parent states follow */
228 /* saved paren contents follow */
229 } REBackTrackData;
231 #define INITIAL_STATESTACK 100
232 #define INITIAL_BACKTRACK 8000
234 typedef struct REGlobalData {
235 void *cx;
236 regexp_t *regexp; /* the RE in execution */
237 BOOL ok; /* runtime error (out_of_memory only?) */
238 size_t start; /* offset to start at */
239 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
240 const WCHAR *cpbegin; /* text base address */
241 const WCHAR *cpend; /* text limit address */
243 REProgState *stateStack; /* stack of state of current parents */
244 size_t stateStackTop;
245 size_t stateStackLimit;
247 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
248 REBackTrackData *backTrackSP;
249 size_t backTrackStackSize;
250 size_t cursz; /* size of current stack entry */
251 size_t backTrackCount; /* how many times we've backtracked */
252 size_t backTrackLimit; /* upper limit on backtrack states */
254 heap_pool_t *pool; /* It's faster to use one malloc'd pool
255 than to malloc/free the three items
256 that are allocated from this pool */
257 } REGlobalData;
259 typedef struct RENode RENode;
260 struct RENode {
261 REOp op; /* r.e. op bytecode */
262 RENode *next; /* next in concatenation order */
263 void *kid; /* first operand */
264 union {
265 void *kid2; /* second operand */
266 INT num; /* could be a number */
267 size_t parenIndex; /* or a parenthesis index */
268 struct { /* or a quantifier range */
269 UINT min;
270 UINT max;
271 JSPackedBool greedy;
272 } range;
273 struct { /* or a character class */
274 size_t startIndex;
275 size_t kidlen; /* length of string at kid, in jschars */
276 size_t index; /* index into class list */
277 WORD bmsize; /* bitmap size, based on max char code */
278 JSPackedBool sense;
279 } ucclass;
280 struct { /* or a literal sequence */
281 WCHAR chr; /* of one character */
282 size_t length; /* or many (via the kid) */
283 } flat;
284 struct {
285 RENode *kid2; /* second operand from ALT */
286 WCHAR ch1; /* match char for ALTPREREQ */
287 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
288 } altprereq;
289 } u;
292 #define CLASS_CACHE_SIZE 4
294 typedef struct CompilerState {
295 void *context;
296 const WCHAR *cpbegin;
297 const WCHAR *cpend;
298 const WCHAR *cp;
299 size_t parenCount;
300 size_t classCount; /* number of [] encountered */
301 size_t treeDepth; /* maximum depth of parse tree */
302 size_t progLength; /* estimated bytecode length */
303 RENode *result;
304 size_t classBitmapsMem; /* memory to hold all class bitmaps */
305 struct {
306 const WCHAR *start; /* small cache of class strings */
307 size_t length; /* since they're often the same */
308 size_t index;
309 } classCache[CLASS_CACHE_SIZE];
310 WORD flags;
312 heap_pool_t *pool; /* It's faster to use one malloc'd pool
313 than to malloc/free */
314 } CompilerState;
316 typedef struct EmitStateStackEntry {
317 jsbytecode *altHead; /* start of REOP_ALT* opcode */
318 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
319 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
320 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
321 RENode *continueNode; /* original REOP_ALT* node being stacked */
322 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
323 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
324 avoid 16-bit unsigned offset overflow */
325 } EmitStateStackEntry;
328 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
329 * the getters and setters take the pc of the offset, not of the opcode before
330 * the offset.
332 #define ARG_LEN 2
333 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
334 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
335 (pc)[1] = (jsbytecode) (arg))
337 #define OFFSET_LEN ARG_LEN
338 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
339 #define GET_OFFSET(pc) GET_ARG(pc)
341 static BOOL ParseRegExp(CompilerState*);
344 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
345 * For sanity, we limit it to 2^24 bytes.
347 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
350 * The maximum memory that can be allocated for class bitmaps.
351 * For sanity, we limit it to 2^24 bytes.
353 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
356 * Functions to get size and write/read bytecode that represent small indexes
357 * compactly.
358 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
359 * indicates that the following byte brings more bits to the index. Otherwise
360 * this is the last byte in the index bytecode representing highest index bits.
362 static size_t
363 GetCompactIndexWidth(size_t index)
365 size_t width;
367 for (width = 1; (index >>= 7) != 0; ++width) { }
368 return width;
371 static inline jsbytecode *
372 WriteCompactIndex(jsbytecode *pc, size_t index)
374 size_t next;
376 while ((next = index >> 7) != 0) {
377 *pc++ = (jsbytecode)(index | 0x80);
378 index = next;
380 *pc++ = (jsbytecode)index;
381 return pc;
384 static inline jsbytecode *
385 ReadCompactIndex(jsbytecode *pc, size_t *result)
387 size_t nextByte;
389 nextByte = *pc++;
390 if ((nextByte & 0x80) == 0) {
392 * Short-circuit the most common case when compact index <= 127.
394 *result = nextByte;
395 } else {
396 size_t shift = 7;
397 *result = 0x7F & nextByte;
398 do {
399 nextByte = *pc++;
400 *result |= (nextByte & 0x7F) << shift;
401 shift += 7;
402 } while ((nextByte & 0x80) != 0);
404 return pc;
407 /* Construct and initialize an RENode, returning NULL for out-of-memory */
408 static RENode *
409 NewRENode(CompilerState *state, REOp op)
411 RENode *ren;
413 ren = heap_pool_alloc(state->pool, sizeof(*ren));
414 if (!ren) {
415 /* js_ReportOutOfScriptQuota(cx); */
416 return NULL;
418 ren->op = op;
419 ren->next = NULL;
420 ren->kid = NULL;
421 return ren;
425 * Validates and converts hex ascii value.
427 static BOOL
428 isASCIIHexDigit(WCHAR c, UINT *digit)
430 UINT cv = c;
432 if (cv < '0')
433 return FALSE;
434 if (cv <= '9') {
435 *digit = cv - '0';
436 return TRUE;
438 cv |= 0x20;
439 if (cv >= 'a' && cv <= 'f') {
440 *digit = cv - 'a' + 10;
441 return TRUE;
443 return FALSE;
446 typedef struct {
447 REOp op;
448 const WCHAR *errPos;
449 size_t parenIndex;
450 } REOpData;
452 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
453 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
455 static BOOL
456 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
458 ptrdiff_t offset = target - jump;
460 /* Check that target really points forward. */
461 assert(offset >= 2);
462 if ((size_t)offset > OFFSET_MAX)
463 return FALSE;
465 jump[0] = JUMP_OFFSET_HI(offset);
466 jump[1] = JUMP_OFFSET_LO(offset);
467 return TRUE;
471 * Generate bytecode for the tree rooted at t using an explicit stack instead
472 * of recursion.
474 static jsbytecode *
475 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
476 jsbytecode *pc, RENode *t)
478 EmitStateStackEntry *emitStateSP, *emitStateStack;
479 RECharSet *charSet;
480 REOp op;
482 if (treeDepth == 0) {
483 emitStateStack = NULL;
484 } else {
485 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
486 if (!emitStateStack)
487 return NULL;
489 emitStateSP = emitStateStack;
490 op = t->op;
491 assert(op < REOP_LIMIT);
493 for (;;) {
494 *pc++ = op;
495 switch (op) {
496 case REOP_EMPTY:
497 --pc;
498 break;
500 case REOP_ALTPREREQ2:
501 case REOP_ALTPREREQ:
502 assert(emitStateSP);
503 emitStateSP->altHead = pc - 1;
504 emitStateSP->endTermFixup = pc;
505 pc += OFFSET_LEN;
506 SET_ARG(pc, t->u.altprereq.ch1);
507 pc += ARG_LEN;
508 SET_ARG(pc, t->u.altprereq.ch2);
509 pc += ARG_LEN;
511 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
512 pc += OFFSET_LEN;
514 emitStateSP->continueNode = t;
515 emitStateSP->continueOp = REOP_JUMP;
516 emitStateSP->jumpToJumpFlag = FALSE;
517 ++emitStateSP;
518 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
519 t = t->kid;
520 op = t->op;
521 assert(op < REOP_LIMIT);
522 continue;
524 case REOP_JUMP:
525 emitStateSP->nextTermFixup = pc; /* offset to following term */
526 pc += OFFSET_LEN;
527 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
528 goto jump_too_big;
529 emitStateSP->continueOp = REOP_ENDALT;
530 ++emitStateSP;
531 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
532 t = t->u.kid2;
533 op = t->op;
534 assert(op < REOP_LIMIT);
535 continue;
537 case REOP_ENDALT:
539 * If we already patched emitStateSP->nextTermFixup to jump to
540 * a nearer jump, to avoid 16-bit immediate offset overflow, we
541 * are done here.
543 if (emitStateSP->jumpToJumpFlag)
544 break;
547 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
548 * REOP_ENDALT is executed only on successful match of the last
549 * alternate in a group.
551 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
552 goto jump_too_big;
553 if (t->op != REOP_ALT) {
554 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
555 goto jump_too_big;
559 * If the program is bigger than the REOP_JUMP offset range, then
560 * we must check for alternates before this one that are part of
561 * the same group, and fix up their jump offsets to target jumps
562 * close enough to fit in a 16-bit unsigned offset immediate.
564 if ((size_t)(pc - re->program) > OFFSET_MAX &&
565 emitStateSP > emitStateStack) {
566 EmitStateStackEntry *esp, *esp2;
567 jsbytecode *alt, *jump;
568 ptrdiff_t span, header;
570 esp2 = emitStateSP;
571 alt = esp2->altHead;
572 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
573 if (esp->continueOp == REOP_ENDALT &&
574 !esp->jumpToJumpFlag &&
575 esp->nextTermFixup + OFFSET_LEN == alt &&
576 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
577 ? esp->endTermFixup
578 : esp->nextTermFixup)) > OFFSET_MAX) {
579 alt = esp->altHead;
580 jump = esp->nextTermFixup;
583 * The span must be 1 less than the distance from
584 * jump offset to jump offset, so we actually jump
585 * to a REOP_JUMP bytecode, not to its offset!
587 for (;;) {
588 assert(jump < esp2->nextTermFixup);
589 span = esp2->nextTermFixup - jump - 1;
590 if ((size_t)span <= OFFSET_MAX)
591 break;
592 do {
593 if (--esp2 == esp)
594 goto jump_too_big;
595 } while (esp2->continueOp != REOP_ENDALT);
598 jump[0] = JUMP_OFFSET_HI(span);
599 jump[1] = JUMP_OFFSET_LO(span);
601 if (esp->continueNode->op != REOP_ALT) {
603 * We must patch the offset at esp->endTermFixup
604 * as well, for the REOP_ALTPREREQ{,2} opcodes.
605 * If we're unlucky and endTermFixup is more than
606 * OFFSET_MAX bytes from its target, we cheat by
607 * jumping 6 bytes to the jump whose offset is at
608 * esp->nextTermFixup, which has the same target.
610 jump = esp->endTermFixup;
611 header = esp->nextTermFixup - jump;
612 span += header;
613 if ((size_t)span > OFFSET_MAX)
614 span = header;
616 jump[0] = JUMP_OFFSET_HI(span);
617 jump[1] = JUMP_OFFSET_LO(span);
620 esp->jumpToJumpFlag = TRUE;
624 break;
626 case REOP_ALT:
627 assert(emitStateSP);
628 emitStateSP->altHead = pc - 1;
629 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
630 pc += OFFSET_LEN;
631 emitStateSP->continueNode = t;
632 emitStateSP->continueOp = REOP_JUMP;
633 emitStateSP->jumpToJumpFlag = FALSE;
634 ++emitStateSP;
635 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
636 t = t->kid;
637 op = t->op;
638 assert(op < REOP_LIMIT);
639 continue;
641 case REOP_FLAT:
643 * Coalesce FLATs if possible and if it would not increase bytecode
644 * beyond preallocated limit. The latter happens only when bytecode
645 * size for coalesced string with offset p and length 2 exceeds 6
646 * bytes preallocated for 2 single char nodes, i.e. when
647 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
648 * GetCompactIndexWidth(p) > 4.
649 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
650 * nodes strictly decreases bytecode size, the check has to be
651 * done only for the first coalescing.
653 if (t->kid &&
654 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
656 while (t->next &&
657 t->next->op == REOP_FLAT &&
658 (WCHAR*)t->kid + t->u.flat.length ==
659 t->next->kid) {
660 t->u.flat.length += t->next->u.flat.length;
661 t->next = t->next->next;
664 if (t->kid && t->u.flat.length > 1) {
665 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
666 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
667 pc = WriteCompactIndex(pc, t->u.flat.length);
668 } else if (t->u.flat.chr < 256) {
669 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
670 *pc++ = (jsbytecode) t->u.flat.chr;
671 } else {
672 pc[-1] = (state->flags & REG_FOLD)
673 ? REOP_UCFLAT1i
674 : REOP_UCFLAT1;
675 SET_ARG(pc, t->u.flat.chr);
676 pc += ARG_LEN;
678 break;
680 case REOP_LPAREN:
681 assert(emitStateSP);
682 pc = WriteCompactIndex(pc, t->u.parenIndex);
683 emitStateSP->continueNode = t;
684 emitStateSP->continueOp = REOP_RPAREN;
685 ++emitStateSP;
686 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
687 t = t->kid;
688 op = t->op;
689 continue;
691 case REOP_RPAREN:
692 pc = WriteCompactIndex(pc, t->u.parenIndex);
693 break;
695 case REOP_BACKREF:
696 pc = WriteCompactIndex(pc, t->u.parenIndex);
697 break;
699 case REOP_ASSERT:
700 assert(emitStateSP);
701 emitStateSP->nextTermFixup = pc;
702 pc += OFFSET_LEN;
703 emitStateSP->continueNode = t;
704 emitStateSP->continueOp = REOP_ASSERTTEST;
705 ++emitStateSP;
706 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
707 t = t->kid;
708 op = t->op;
709 continue;
711 case REOP_ASSERTTEST:
712 case REOP_ASSERTNOTTEST:
713 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
714 goto jump_too_big;
715 break;
717 case REOP_ASSERT_NOT:
718 assert(emitStateSP);
719 emitStateSP->nextTermFixup = pc;
720 pc += OFFSET_LEN;
721 emitStateSP->continueNode = t;
722 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
723 ++emitStateSP;
724 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
725 t = t->kid;
726 op = t->op;
727 continue;
729 case REOP_QUANT:
730 assert(emitStateSP);
731 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
732 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
733 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
734 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
735 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
736 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
737 } else {
738 if (!t->u.range.greedy)
739 pc[-1] = REOP_MINIMALQUANT;
740 pc = WriteCompactIndex(pc, t->u.range.min);
742 * Write max + 1 to avoid using size_t(max) + 1 bytes
743 * for (UINT)-1 sentinel.
745 pc = WriteCompactIndex(pc, t->u.range.max + 1);
747 emitStateSP->nextTermFixup = pc;
748 pc += OFFSET_LEN;
749 emitStateSP->continueNode = t;
750 emitStateSP->continueOp = REOP_ENDCHILD;
751 ++emitStateSP;
752 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
753 t = t->kid;
754 op = t->op;
755 continue;
757 case REOP_ENDCHILD:
758 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
759 goto jump_too_big;
760 break;
762 case REOP_CLASS:
763 if (!t->u.ucclass.sense)
764 pc[-1] = REOP_NCLASS;
765 pc = WriteCompactIndex(pc, t->u.ucclass.index);
766 charSet = &re->classList[t->u.ucclass.index];
767 charSet->converted = FALSE;
768 charSet->length = t->u.ucclass.bmsize;
769 charSet->u.src.startIndex = t->u.ucclass.startIndex;
770 charSet->u.src.length = t->u.ucclass.kidlen;
771 charSet->sense = t->u.ucclass.sense;
772 break;
774 default:
775 break;
778 t = t->next;
779 if (t) {
780 op = t->op;
781 } else {
782 if (emitStateSP == emitStateStack)
783 break;
784 --emitStateSP;
785 t = emitStateSP->continueNode;
786 op = (REOp) emitStateSP->continueOp;
790 cleanup:
791 heap_free(emitStateStack);
792 return pc;
794 jump_too_big:
795 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
796 pc = NULL;
797 goto cleanup;
801 * Process the op against the two top operands, reducing them to a single
802 * operand in the penultimate slot. Update progLength and treeDepth.
804 static BOOL
805 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
806 INT operandSP)
808 RENode *result;
810 switch (opData->op) {
811 case REOP_ALT:
812 result = NewRENode(state, REOP_ALT);
813 if (!result)
814 return FALSE;
815 result->kid = operandStack[operandSP - 2];
816 result->u.kid2 = operandStack[operandSP - 1];
817 operandStack[operandSP - 2] = result;
819 if (state->treeDepth == TREE_DEPTH_MAX) {
820 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
821 return FALSE;
823 ++state->treeDepth;
826 * Look at both alternates to see if there's a FLAT or a CLASS at
827 * the start of each. If so, use a prerequisite match.
829 if (((RENode *) result->kid)->op == REOP_FLAT &&
830 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
831 (state->flags & REG_FOLD) == 0) {
832 result->op = REOP_ALTPREREQ;
833 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
834 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
835 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
836 JUMP, <end> ... ENDALT */
837 state->progLength += 13;
839 else
840 if (((RENode *) result->kid)->op == REOP_CLASS &&
841 ((RENode *) result->kid)->u.ucclass.index < 256 &&
842 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
843 (state->flags & REG_FOLD) == 0) {
844 result->op = REOP_ALTPREREQ2;
845 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
846 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
847 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
848 JUMP, <end> ... ENDALT */
849 state->progLength += 13;
851 else
852 if (((RENode *) result->kid)->op == REOP_FLAT &&
853 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
854 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
855 (state->flags & REG_FOLD) == 0) {
856 result->op = REOP_ALTPREREQ2;
857 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
858 result->u.altprereq.ch2 =
859 ((RENode *) result->u.kid2)->u.ucclass.index;
860 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
861 JUMP, <end> ... ENDALT */
862 state->progLength += 13;
864 else {
865 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
866 state->progLength += 7;
868 break;
870 case REOP_CONCAT:
871 result = operandStack[operandSP - 2];
872 while (result->next)
873 result = result->next;
874 result->next = operandStack[operandSP - 1];
875 break;
877 case REOP_ASSERT:
878 case REOP_ASSERT_NOT:
879 case REOP_LPARENNON:
880 case REOP_LPAREN:
881 /* These should have been processed by a close paren. */
882 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
883 opData->errPos);
884 return FALSE;
886 default:;
888 return TRUE;
892 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
893 * it being on the stack, and to propagate errors to its callers.
895 #define JSREG_FIND_PAREN_COUNT 0x8000
896 #define JSREG_FIND_PAREN_ERROR 0x4000
899 * Magic return value from FindParenCount and GetDecimalValue, to indicate
900 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
901 * its findMax parameter is non-null.
903 #define OVERFLOW_VALUE ((UINT)-1)
905 static UINT
906 FindParenCount(CompilerState *state)
908 CompilerState temp;
909 int i;
911 if (state->flags & JSREG_FIND_PAREN_COUNT)
912 return OVERFLOW_VALUE;
915 * Copy state into temp, flag it so we never report an invalid backref,
916 * and reset its members to parse the entire regexp. This is obviously
917 * suboptimal, but GetDecimalValue calls us only if a backref appears to
918 * refer to a forward parenthetical, which is rare.
920 temp = *state;
921 temp.flags |= JSREG_FIND_PAREN_COUNT;
922 temp.cp = temp.cpbegin;
923 temp.parenCount = 0;
924 temp.classCount = 0;
925 temp.progLength = 0;
926 temp.treeDepth = 0;
927 temp.classBitmapsMem = 0;
928 for (i = 0; i < CLASS_CACHE_SIZE; i++)
929 temp.classCache[i].start = NULL;
931 if (!ParseRegExp(&temp)) {
932 state->flags |= JSREG_FIND_PAREN_ERROR;
933 return OVERFLOW_VALUE;
935 return temp.parenCount;
939 * Extract and return a decimal value at state->cp. The initial character c
940 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
941 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
942 * state->flags to discover whether an error occurred under findMax.
944 static UINT
945 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
946 CompilerState *state)
948 UINT value = JS7_UNDEC(c);
949 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
951 /* The following restriction allows simpler overflow checks. */
952 assert(max <= ((UINT)-1 - 9) / 10);
953 while (state->cp < state->cpend) {
954 c = *state->cp;
955 if (!JS7_ISDEC(c))
956 break;
957 value = 10 * value + JS7_UNDEC(c);
958 if (!overflow && value > max && (!findMax || value > findMax(state)))
959 overflow = TRUE;
960 ++state->cp;
962 return overflow ? OVERFLOW_VALUE : value;
966 * Calculate the total size of the bitmap required for a class expression.
968 static BOOL
969 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
970 const WCHAR *end)
972 UINT max = 0;
973 BOOL inRange = FALSE;
974 WCHAR c, rangeStart = 0;
975 UINT n, digit, nDigits, i;
977 target->u.ucclass.bmsize = 0;
978 target->u.ucclass.sense = TRUE;
980 if (src == end)
981 return TRUE;
983 if (*src == '^') {
984 ++src;
985 target->u.ucclass.sense = FALSE;
988 while (src != end) {
989 BOOL canStartRange = TRUE;
990 UINT localMax = 0;
992 switch (*src) {
993 case '\\':
994 ++src;
995 c = *src++;
996 switch (c) {
997 case 'b':
998 localMax = 0x8;
999 break;
1000 case 'f':
1001 localMax = 0xC;
1002 break;
1003 case 'n':
1004 localMax = 0xA;
1005 break;
1006 case 'r':
1007 localMax = 0xD;
1008 break;
1009 case 't':
1010 localMax = 0x9;
1011 break;
1012 case 'v':
1013 localMax = 0xB;
1014 break;
1015 case 'c':
1016 if (src < end && RE_IS_LETTER(*src)) {
1017 localMax = (UINT) (*src++) & 0x1F;
1018 } else {
1019 --src;
1020 localMax = '\\';
1022 break;
1023 case 'x':
1024 nDigits = 2;
1025 goto lexHex;
1026 case 'u':
1027 nDigits = 4;
1028 lexHex:
1029 n = 0;
1030 for (i = 0; (i < nDigits) && (src < end); i++) {
1031 c = *src++;
1032 if (!isASCIIHexDigit(c, &digit)) {
1034 * Back off to accepting the original
1035 *'\' as a literal.
1037 src -= i + 1;
1038 n = '\\';
1039 break;
1041 n = (n << 4) | digit;
1043 localMax = n;
1044 break;
1045 case 'd':
1046 canStartRange = FALSE;
1047 if (inRange) {
1048 JS_ReportErrorNumber(state->context,
1049 js_GetErrorMessage, NULL,
1050 JSMSG_BAD_CLASS_RANGE);
1051 return FALSE;
1053 localMax = '9';
1054 break;
1055 case 'D':
1056 case 's':
1057 case 'S':
1058 case 'w':
1059 case 'W':
1060 canStartRange = FALSE;
1061 if (inRange) {
1062 JS_ReportErrorNumber(state->context,
1063 js_GetErrorMessage, NULL,
1064 JSMSG_BAD_CLASS_RANGE);
1065 return FALSE;
1067 max = 65535;
1070 * If this is the start of a range, ensure that it's less than
1071 * the end.
1073 localMax = 0;
1074 break;
1075 case '0':
1076 case '1':
1077 case '2':
1078 case '3':
1079 case '4':
1080 case '5':
1081 case '6':
1082 case '7':
1084 * This is a non-ECMA extension - decimal escapes (in this
1085 * case, octal!) are supposed to be an error inside class
1086 * ranges, but supported here for backwards compatibility.
1089 n = JS7_UNDEC(c);
1090 c = *src;
1091 if ('0' <= c && c <= '7') {
1092 src++;
1093 n = 8 * n + JS7_UNDEC(c);
1094 c = *src;
1095 if ('0' <= c && c <= '7') {
1096 src++;
1097 i = 8 * n + JS7_UNDEC(c);
1098 if (i <= 0377)
1099 n = i;
1100 else
1101 src--;
1104 localMax = n;
1105 break;
1107 default:
1108 localMax = c;
1109 break;
1111 break;
1112 default:
1113 localMax = *src++;
1114 break;
1117 if (inRange) {
1118 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1119 if (rangeStart > localMax) {
1120 JS_ReportErrorNumber(state->context,
1121 js_GetErrorMessage, NULL,
1122 JSMSG_BAD_CLASS_RANGE);
1123 return FALSE;
1125 inRange = FALSE;
1126 } else {
1127 if (canStartRange && src < end - 1) {
1128 if (*src == '-') {
1129 ++src;
1130 inRange = TRUE;
1131 rangeStart = (WCHAR)localMax;
1132 continue;
1135 if (state->flags & REG_FOLD)
1136 rangeStart = localMax; /* one run of the uc/dc loop below */
1139 if (state->flags & REG_FOLD) {
1140 WCHAR maxch = localMax;
1142 for (i = rangeStart; i <= localMax; i++) {
1143 WCHAR uch, dch;
1145 uch = toupperW(i);
1146 dch = tolowerW(i);
1147 if(maxch < uch)
1148 maxch = uch;
1149 if(maxch < dch)
1150 maxch = dch;
1152 localMax = maxch;
1155 if (localMax > max)
1156 max = localMax;
1158 target->u.ucclass.bmsize = max;
1159 return TRUE;
1162 static INT
1163 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1165 UINT min, max;
1166 WCHAR c;
1167 const WCHAR *errp = state->cp++;
1169 c = *state->cp;
1170 if (JS7_ISDEC(c)) {
1171 ++state->cp;
1172 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1173 c = *state->cp;
1175 if (!ignoreValues && min == OVERFLOW_VALUE)
1176 return JSMSG_MIN_TOO_BIG;
1178 if (c == ',') {
1179 c = *++state->cp;
1180 if (JS7_ISDEC(c)) {
1181 ++state->cp;
1182 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1183 c = *state->cp;
1184 if (!ignoreValues && max == OVERFLOW_VALUE)
1185 return JSMSG_MAX_TOO_BIG;
1186 if (!ignoreValues && min > max)
1187 return JSMSG_OUT_OF_ORDER;
1188 } else {
1189 max = (UINT)-1;
1191 } else {
1192 max = min;
1194 if (c == '}') {
1195 state->result = NewRENode(state, REOP_QUANT);
1196 if (!state->result)
1197 return JSMSG_OUT_OF_MEMORY;
1198 state->result->u.range.min = min;
1199 state->result->u.range.max = max;
1201 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1202 * where <max> is written as compact(max+1) to make
1203 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1205 state->progLength += (1 + GetCompactIndexWidth(min)
1206 + GetCompactIndexWidth(max + 1)
1207 +3);
1208 return 0;
1212 state->cp = errp;
1213 return -1;
1216 static BOOL
1217 ParseQuantifier(CompilerState *state)
1219 RENode *term;
1220 term = state->result;
1221 if (state->cp < state->cpend) {
1222 switch (*state->cp) {
1223 case '+':
1224 state->result = NewRENode(state, REOP_QUANT);
1225 if (!state->result)
1226 return FALSE;
1227 state->result->u.range.min = 1;
1228 state->result->u.range.max = (UINT)-1;
1229 /* <PLUS>, <next> ... <ENDCHILD> */
1230 state->progLength += 4;
1231 goto quantifier;
1232 case '*':
1233 state->result = NewRENode(state, REOP_QUANT);
1234 if (!state->result)
1235 return FALSE;
1236 state->result->u.range.min = 0;
1237 state->result->u.range.max = (UINT)-1;
1238 /* <STAR>, <next> ... <ENDCHILD> */
1239 state->progLength += 4;
1240 goto quantifier;
1241 case '?':
1242 state->result = NewRENode(state, REOP_QUANT);
1243 if (!state->result)
1244 return FALSE;
1245 state->result->u.range.min = 0;
1246 state->result->u.range.max = 1;
1247 /* <OPT>, <next> ... <ENDCHILD> */
1248 state->progLength += 4;
1249 goto quantifier;
1250 case '{': /* balance '}' */
1252 INT err;
1254 err = ParseMinMaxQuantifier(state, FALSE);
1255 if (err == 0)
1256 goto quantifier;
1257 if (err == -1)
1258 return TRUE;
1260 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1261 return FALSE;
1263 default:;
1266 return TRUE;
1268 quantifier:
1269 if (state->treeDepth == TREE_DEPTH_MAX) {
1270 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1271 return FALSE;
1274 ++state->treeDepth;
1275 ++state->cp;
1276 state->result->kid = term;
1277 if (state->cp < state->cpend && *state->cp == '?') {
1278 ++state->cp;
1279 state->result->u.range.greedy = FALSE;
1280 } else {
1281 state->result->u.range.greedy = TRUE;
1283 return TRUE;
1287 * item: assertion An item is either an assertion or
1288 * quantatom a quantified atom.
1290 * assertion: '^' Assertions match beginning of string
1291 * (or line if the class static property
1292 * RegExp.multiline is true).
1293 * '$' End of string (or line if the class
1294 * static property RegExp.multiline is
1295 * true).
1296 * '\b' Word boundary (between \w and \W).
1297 * '\B' Word non-boundary.
1299 * quantatom: atom An unquantified atom.
1300 * quantatom '{' n ',' m '}'
1301 * Atom must occur between n and m times.
1302 * quantatom '{' n ',' '}' Atom must occur at least n times.
1303 * quantatom '{' n '}' Atom must occur exactly n times.
1304 * quantatom '*' Zero or more times (same as {0,}).
1305 * quantatom '+' One or more times (same as {1,}).
1306 * quantatom '?' Zero or one time (same as {0,1}).
1308 * any of which can be optionally followed by '?' for ungreedy
1310 * atom: '(' regexp ')' A parenthesized regexp (what matched
1311 * can be addressed using a backreference,
1312 * see '\' n below).
1313 * '.' Matches any char except '\n'.
1314 * '[' classlist ']' A character class.
1315 * '[' '^' classlist ']' A negated character class.
1316 * '\f' Form Feed.
1317 * '\n' Newline (Line Feed).
1318 * '\r' Carriage Return.
1319 * '\t' Horizontal Tab.
1320 * '\v' Vertical Tab.
1321 * '\d' A digit (same as [0-9]).
1322 * '\D' A non-digit.
1323 * '\w' A word character, [0-9a-z_A-Z].
1324 * '\W' A non-word character.
1325 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1326 * '\S' A non-whitespace character.
1327 * '\' n A backreference to the nth (n decimal
1328 * and positive) parenthesized expression.
1329 * '\' octal An octal escape sequence (octal must be
1330 * two or three digits long, unless it is
1331 * 0 for the null character).
1332 * '\x' hex A hex escape (hex must be two digits).
1333 * '\u' unicode A unicode escape (must be four digits).
1334 * '\c' ctrl A control character, ctrl is a letter.
1335 * '\' literalatomchar Any character except one of the above
1336 * that follow '\' in an atom.
1337 * otheratomchar Any character not first among the other
1338 * atom right-hand sides.
1340 static BOOL
1341 ParseTerm(CompilerState *state)
1343 WCHAR c = *state->cp++;
1344 UINT nDigits;
1345 UINT num, tmp, n, i;
1346 const WCHAR *termStart;
1348 switch (c) {
1349 /* assertions and atoms */
1350 case '^':
1351 state->result = NewRENode(state, REOP_BOL);
1352 if (!state->result)
1353 return FALSE;
1354 state->progLength++;
1355 return TRUE;
1356 case '$':
1357 state->result = NewRENode(state, REOP_EOL);
1358 if (!state->result)
1359 return FALSE;
1360 state->progLength++;
1361 return TRUE;
1362 case '\\':
1363 if (state->cp >= state->cpend) {
1364 /* a trailing '\' is an error */
1365 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1366 return FALSE;
1368 c = *state->cp++;
1369 switch (c) {
1370 /* assertion escapes */
1371 case 'b' :
1372 state->result = NewRENode(state, REOP_WBDRY);
1373 if (!state->result)
1374 return FALSE;
1375 state->progLength++;
1376 return TRUE;
1377 case 'B':
1378 state->result = NewRENode(state, REOP_WNONBDRY);
1379 if (!state->result)
1380 return FALSE;
1381 state->progLength++;
1382 return TRUE;
1383 /* Decimal escape */
1384 case '0':
1385 /* Give a strict warning. See also the note below. */
1386 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1387 doOctal:
1388 num = 0;
1389 while (state->cp < state->cpend) {
1390 c = *state->cp;
1391 if (c < '0' || '7' < c)
1392 break;
1393 state->cp++;
1394 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1395 if (tmp > 0377)
1396 break;
1397 num = tmp;
1399 c = (WCHAR)num;
1400 doFlat:
1401 state->result = NewRENode(state, REOP_FLAT);
1402 if (!state->result)
1403 return FALSE;
1404 state->result->u.flat.chr = c;
1405 state->result->u.flat.length = 1;
1406 state->progLength += 3;
1407 break;
1408 case '1':
1409 case '2':
1410 case '3':
1411 case '4':
1412 case '5':
1413 case '6':
1414 case '7':
1415 case '8':
1416 case '9':
1417 termStart = state->cp - 1;
1418 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1419 if (state->flags & JSREG_FIND_PAREN_ERROR)
1420 return FALSE;
1421 if (num == OVERFLOW_VALUE) {
1422 /* Give a strict mode warning. */
1423 WARN("back-reference exceeds number of capturing parentheses\n");
1426 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1427 * error here. However, for compatibility with IE, we treat the
1428 * whole backref as flat if the first character in it is not a
1429 * valid octal character, and as an octal escape otherwise.
1431 state->cp = termStart;
1432 if (c >= '8') {
1433 /* Treat this as flat. termStart - 1 is the \. */
1434 c = '\\';
1435 goto asFlat;
1438 /* Treat this as an octal escape. */
1439 goto doOctal;
1441 assert(1 <= num && num <= 0x10000);
1442 state->result = NewRENode(state, REOP_BACKREF);
1443 if (!state->result)
1444 return FALSE;
1445 state->result->u.parenIndex = num - 1;
1446 state->progLength
1447 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1448 break;
1449 /* Control escape */
1450 case 'f':
1451 c = 0xC;
1452 goto doFlat;
1453 case 'n':
1454 c = 0xA;
1455 goto doFlat;
1456 case 'r':
1457 c = 0xD;
1458 goto doFlat;
1459 case 't':
1460 c = 0x9;
1461 goto doFlat;
1462 case 'v':
1463 c = 0xB;
1464 goto doFlat;
1465 /* Control letter */
1466 case 'c':
1467 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1468 c = (WCHAR) (*state->cp++ & 0x1F);
1469 } else {
1470 /* back off to accepting the original '\' as a literal */
1471 --state->cp;
1472 c = '\\';
1474 goto doFlat;
1475 /* HexEscapeSequence */
1476 case 'x':
1477 nDigits = 2;
1478 goto lexHex;
1479 /* UnicodeEscapeSequence */
1480 case 'u':
1481 nDigits = 4;
1482 lexHex:
1483 n = 0;
1484 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1485 UINT digit;
1486 c = *state->cp++;
1487 if (!isASCIIHexDigit(c, &digit)) {
1489 * Back off to accepting the original 'u' or 'x' as a
1490 * literal.
1492 state->cp -= i + 2;
1493 n = *state->cp++;
1494 break;
1496 n = (n << 4) | digit;
1498 c = (WCHAR) n;
1499 goto doFlat;
1500 /* Character class escapes */
1501 case 'd':
1502 state->result = NewRENode(state, REOP_DIGIT);
1503 doSimple:
1504 if (!state->result)
1505 return FALSE;
1506 state->progLength++;
1507 break;
1508 case 'D':
1509 state->result = NewRENode(state, REOP_NONDIGIT);
1510 goto doSimple;
1511 case 's':
1512 state->result = NewRENode(state, REOP_SPACE);
1513 goto doSimple;
1514 case 'S':
1515 state->result = NewRENode(state, REOP_NONSPACE);
1516 goto doSimple;
1517 case 'w':
1518 state->result = NewRENode(state, REOP_ALNUM);
1519 goto doSimple;
1520 case 'W':
1521 state->result = NewRENode(state, REOP_NONALNUM);
1522 goto doSimple;
1523 /* IdentityEscape */
1524 default:
1525 state->result = NewRENode(state, REOP_FLAT);
1526 if (!state->result)
1527 return FALSE;
1528 state->result->u.flat.chr = c;
1529 state->result->u.flat.length = 1;
1530 state->result->kid = (void *) (state->cp - 1);
1531 state->progLength += 3;
1532 break;
1534 break;
1535 case '[':
1536 state->result = NewRENode(state, REOP_CLASS);
1537 if (!state->result)
1538 return FALSE;
1539 termStart = state->cp;
1540 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1541 for (;;) {
1542 if (state->cp == state->cpend) {
1543 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1544 JSMSG_UNTERM_CLASS, termStart);
1546 return FALSE;
1548 if (*state->cp == '\\') {
1549 state->cp++;
1550 if (state->cp != state->cpend)
1551 state->cp++;
1552 continue;
1554 if (*state->cp == ']') {
1555 state->result->u.ucclass.kidlen = state->cp - termStart;
1556 break;
1558 state->cp++;
1560 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1561 if (!state->classCache[i].start) {
1562 state->classCache[i].start = termStart;
1563 state->classCache[i].length = state->result->u.ucclass.kidlen;
1564 state->classCache[i].index = state->classCount;
1565 break;
1567 if (state->classCache[i].length ==
1568 state->result->u.ucclass.kidlen) {
1569 for (n = 0; ; n++) {
1570 if (n == state->classCache[i].length) {
1571 state->result->u.ucclass.index
1572 = state->classCache[i].index;
1573 goto claim;
1575 if (state->classCache[i].start[n] != termStart[n])
1576 break;
1580 state->result->u.ucclass.index = state->classCount++;
1582 claim:
1584 * Call CalculateBitmapSize now as we want any errors it finds
1585 * to be reported during the parse phase, not at execution.
1587 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1588 return FALSE;
1590 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1591 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1592 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1594 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1595 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1596 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1597 return FALSE;
1599 state->classBitmapsMem += n;
1600 /* CLASS, <index> */
1601 state->progLength
1602 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1603 break;
1605 case '.':
1606 state->result = NewRENode(state, REOP_DOT);
1607 goto doSimple;
1609 case '{':
1611 const WCHAR *errp = state->cp--;
1612 INT err;
1614 err = ParseMinMaxQuantifier(state, TRUE);
1615 state->cp = errp;
1617 if (err < 0)
1618 goto asFlat;
1620 /* FALL THROUGH */
1622 case '*':
1623 case '+':
1624 case '?':
1625 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1626 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1627 return FALSE;
1628 default:
1629 asFlat:
1630 state->result = NewRENode(state, REOP_FLAT);
1631 if (!state->result)
1632 return FALSE;
1633 state->result->u.flat.chr = c;
1634 state->result->u.flat.length = 1;
1635 state->result->kid = (void *) (state->cp - 1);
1636 state->progLength += 3;
1637 break;
1639 return ParseQuantifier(state);
1643 * Top-down regular expression grammar, based closely on Perl4.
1645 * regexp: altern A regular expression is one or more
1646 * altern '|' regexp alternatives separated by vertical bar.
1648 #define INITIAL_STACK_SIZE 128
1650 static BOOL
1651 ParseRegExp(CompilerState *state)
1653 size_t parenIndex;
1654 RENode *operand;
1655 REOpData *operatorStack;
1656 RENode **operandStack;
1657 REOp op;
1658 INT i;
1659 BOOL result = FALSE;
1661 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1662 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1664 /* Watch out for empty regexp */
1665 if (state->cp == state->cpend) {
1666 state->result = NewRENode(state, REOP_EMPTY);
1667 return (state->result != NULL);
1670 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1671 if (!operatorStack)
1672 return FALSE;
1674 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1675 if (!operandStack)
1676 goto out;
1678 for (;;) {
1679 parenIndex = state->parenCount;
1680 if (state->cp == state->cpend) {
1682 * If we are at the end of the regexp and we're short one or more
1683 * operands, the regexp must have the form /x|/ or some such, with
1684 * left parentheses making us short more than one operand.
1686 if (operatorSP >= operandSP) {
1687 operand = NewRENode(state, REOP_EMPTY);
1688 if (!operand)
1689 goto out;
1690 goto pushOperand;
1692 } else {
1693 switch (*state->cp) {
1694 case '(':
1695 ++state->cp;
1696 if (state->cp + 1 < state->cpend &&
1697 *state->cp == '?' &&
1698 (state->cp[1] == '=' ||
1699 state->cp[1] == '!' ||
1700 state->cp[1] == ':')) {
1701 switch (state->cp[1]) {
1702 case '=':
1703 op = REOP_ASSERT;
1704 /* ASSERT, <next>, ... ASSERTTEST */
1705 state->progLength += 4;
1706 break;
1707 case '!':
1708 op = REOP_ASSERT_NOT;
1709 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1710 state->progLength += 4;
1711 break;
1712 default:
1713 op = REOP_LPARENNON;
1714 break;
1716 state->cp += 2;
1717 } else {
1718 op = REOP_LPAREN;
1719 /* LPAREN, <index>, ... RPAREN, <index> */
1720 state->progLength
1721 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1722 state->parenCount++;
1723 if (state->parenCount == 65535) {
1724 ReportRegExpError(state, JSREPORT_ERROR,
1725 JSMSG_TOO_MANY_PARENS);
1726 goto out;
1729 goto pushOperator;
1731 case ')':
1733 * If there's no stacked open parenthesis, throw syntax error.
1735 for (i = operatorSP - 1; ; i--) {
1736 if (i < 0) {
1737 ReportRegExpError(state, JSREPORT_ERROR,
1738 JSMSG_UNMATCHED_RIGHT_PAREN);
1739 goto out;
1741 if (operatorStack[i].op == REOP_ASSERT ||
1742 operatorStack[i].op == REOP_ASSERT_NOT ||
1743 operatorStack[i].op == REOP_LPARENNON ||
1744 operatorStack[i].op == REOP_LPAREN) {
1745 break;
1748 /* FALL THROUGH */
1750 case '|':
1751 /* Expected an operand before these, so make an empty one */
1752 operand = NewRENode(state, REOP_EMPTY);
1753 if (!operand)
1754 goto out;
1755 goto pushOperand;
1757 default:
1758 if (!ParseTerm(state))
1759 goto out;
1760 operand = state->result;
1761 pushOperand:
1762 if (operandSP == operandStackSize) {
1763 RENode **tmp;
1764 operandStackSize += operandStackSize;
1765 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1766 if (!tmp)
1767 goto out;
1768 operandStack = tmp;
1770 operandStack[operandSP++] = operand;
1771 break;
1775 /* At the end; process remaining operators. */
1776 restartOperator:
1777 if (state->cp == state->cpend) {
1778 while (operatorSP) {
1779 --operatorSP;
1780 if (!ProcessOp(state, &operatorStack[operatorSP],
1781 operandStack, operandSP))
1782 goto out;
1783 --operandSP;
1785 assert(operandSP == 1);
1786 state->result = operandStack[0];
1787 result = TRUE;
1788 goto out;
1791 switch (*state->cp) {
1792 case '|':
1793 /* Process any stacked 'concat' operators */
1794 ++state->cp;
1795 while (operatorSP &&
1796 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1797 --operatorSP;
1798 if (!ProcessOp(state, &operatorStack[operatorSP],
1799 operandStack, operandSP)) {
1800 goto out;
1802 --operandSP;
1804 op = REOP_ALT;
1805 goto pushOperator;
1807 case ')':
1809 * If there's no stacked open parenthesis, throw syntax error.
1811 for (i = operatorSP - 1; ; i--) {
1812 if (i < 0) {
1813 ReportRegExpError(state, JSREPORT_ERROR,
1814 JSMSG_UNMATCHED_RIGHT_PAREN);
1815 goto out;
1817 if (operatorStack[i].op == REOP_ASSERT ||
1818 operatorStack[i].op == REOP_ASSERT_NOT ||
1819 operatorStack[i].op == REOP_LPARENNON ||
1820 operatorStack[i].op == REOP_LPAREN) {
1821 break;
1824 ++state->cp;
1826 /* Process everything on the stack until the open parenthesis. */
1827 for (;;) {
1828 assert(operatorSP);
1829 --operatorSP;
1830 switch (operatorStack[operatorSP].op) {
1831 case REOP_ASSERT:
1832 case REOP_ASSERT_NOT:
1833 case REOP_LPAREN:
1834 operand = NewRENode(state, operatorStack[operatorSP].op);
1835 if (!operand)
1836 goto out;
1837 operand->u.parenIndex =
1838 operatorStack[operatorSP].parenIndex;
1839 assert(operandSP);
1840 operand->kid = operandStack[operandSP - 1];
1841 operandStack[operandSP - 1] = operand;
1842 if (state->treeDepth == TREE_DEPTH_MAX) {
1843 ReportRegExpError(state, JSREPORT_ERROR,
1844 JSMSG_REGEXP_TOO_COMPLEX);
1845 goto out;
1847 ++state->treeDepth;
1848 /* FALL THROUGH */
1850 case REOP_LPARENNON:
1851 state->result = operandStack[operandSP - 1];
1852 if (!ParseQuantifier(state))
1853 goto out;
1854 operandStack[operandSP - 1] = state->result;
1855 goto restartOperator;
1856 default:
1857 if (!ProcessOp(state, &operatorStack[operatorSP],
1858 operandStack, operandSP))
1859 goto out;
1860 --operandSP;
1861 break;
1864 break;
1866 case '{':
1868 const WCHAR *errp = state->cp;
1870 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1872 * This didn't even scan correctly as a quantifier, so we should
1873 * treat it as flat.
1875 op = REOP_CONCAT;
1876 goto pushOperator;
1879 state->cp = errp;
1880 /* FALL THROUGH */
1883 case '+':
1884 case '*':
1885 case '?':
1886 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1887 state->cp);
1888 result = FALSE;
1889 goto out;
1891 default:
1892 /* Anything else is the start of the next term. */
1893 op = REOP_CONCAT;
1894 pushOperator:
1895 if (operatorSP == operatorStackSize) {
1896 REOpData *tmp;
1897 operatorStackSize += operatorStackSize;
1898 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1899 if (!tmp)
1900 goto out;
1901 operatorStack = tmp;
1903 operatorStack[operatorSP].op = op;
1904 operatorStack[operatorSP].errPos = state->cp;
1905 operatorStack[operatorSP++].parenIndex = parenIndex;
1906 break;
1909 out:
1910 heap_free(operatorStack);
1911 heap_free(operandStack);
1912 return result;
1916 * Save the current state of the match - the position in the input
1917 * text as well as the position in the bytecode. The state of any
1918 * parent expressions is also saved (preceding state).
1919 * Contents of parenCount parentheses from parenIndex are also saved.
1921 static REBackTrackData *
1922 PushBackTrackState(REGlobalData *gData, REOp op,
1923 jsbytecode *target, match_state_t *x, const WCHAR *cp,
1924 size_t parenIndex, size_t parenCount)
1926 size_t i;
1927 REBackTrackData *result =
1928 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1930 size_t sz = sizeof(REBackTrackData) +
1931 gData->stateStackTop * sizeof(REProgState) +
1932 parenCount * sizeof(RECapture);
1934 ptrdiff_t btsize = gData->backTrackStackSize;
1935 ptrdiff_t btincr = ((char *)result + sz) -
1936 ((char *)gData->backTrackStack + btsize);
1938 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1940 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1941 if (btincr > 0) {
1942 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1944 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1945 btincr = ((btincr+btsize-1)/btsize)*btsize;
1946 gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1947 if (!gData->backTrackStack) {
1948 js_ReportOutOfScriptQuota(gData->cx);
1949 gData->ok = FALSE;
1950 return NULL;
1952 gData->backTrackStackSize = btsize + btincr;
1953 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1955 gData->backTrackSP = result;
1956 result->sz = gData->cursz;
1957 gData->cursz = sz;
1959 result->backtrack_op = op;
1960 result->backtrack_pc = target;
1961 result->cp = cp;
1962 result->parenCount = parenCount;
1963 result->parenIndex = parenIndex;
1965 result->saveStateStackTop = gData->stateStackTop;
1966 assert(gData->stateStackTop);
1967 memcpy(result + 1, gData->stateStack,
1968 sizeof(REProgState) * result->saveStateStackTop);
1970 if (parenCount != 0) {
1971 memcpy((char *)(result + 1) +
1972 sizeof(REProgState) * result->saveStateStackTop,
1973 &x->parens[parenIndex],
1974 sizeof(RECapture) * parenCount);
1975 for (i = 0; i != parenCount; i++)
1976 x->parens[parenIndex + i].index = -1;
1979 return result;
1982 static inline match_state_t *
1983 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1984 size_t length)
1986 size_t i;
1987 assert(gData->cpend >= x->cp);
1988 if (length > (size_t)(gData->cpend - x->cp))
1989 return NULL;
1990 for (i = 0; i != length; i++) {
1991 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
1992 return NULL;
1994 x->cp += length;
1995 return x;
1999 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2000 * 2. If E is not a character then go to step 6.
2001 * 3. Let ch be E's character.
2002 * 4. Let A be a one-element RECharSet containing the character ch.
2003 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2004 * 6. E must be an integer. Let n be that integer.
2005 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2006 * 8. Return an internal Matcher closure that takes two arguments, a State x
2007 * and a Continuation c, and performs the following:
2008 * 1. Let cap be x's captures internal array.
2009 * 2. Let s be cap[n].
2010 * 3. If s is undefined, then call c(x) and return its result.
2011 * 4. Let e be x's endIndex.
2012 * 5. Let len be s's length.
2013 * 6. Let f be e+len.
2014 * 7. If f>InputLength, return failure.
2015 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2016 * such that Canonicalize(s[i]) is not the same character as
2017 * Canonicalize(Input [e+i]), then return failure.
2018 * 9. Let y be the State (f, cap).
2019 * 10. Call c(y) and return its result.
2021 static match_state_t *
2022 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2024 size_t len, i;
2025 const WCHAR *parenContent;
2026 RECapture *cap = &x->parens[parenIndex];
2028 if (cap->index == -1)
2029 return x;
2031 len = cap->length;
2032 if (x->cp + len > gData->cpend)
2033 return NULL;
2035 parenContent = &gData->cpbegin[cap->index];
2036 if (gData->regexp->flags & REG_FOLD) {
2037 for (i = 0; i < len; i++) {
2038 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2039 return NULL;
2041 } else {
2042 for (i = 0; i < len; i++) {
2043 if (parenContent[i] != x->cp[i])
2044 return NULL;
2047 x->cp += len;
2048 return x;
2051 /* Add a single character to the RECharSet */
2052 static void
2053 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2055 UINT byteIndex = (UINT)(c >> 3);
2056 assert(c <= cs->length);
2057 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2061 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2062 static void
2063 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2065 UINT i;
2067 UINT byteIndex1 = c1 >> 3;
2068 UINT byteIndex2 = c2 >> 3;
2070 assert(c2 <= cs->length && c1 <= c2);
2072 c1 &= 0x7;
2073 c2 &= 0x7;
2075 if (byteIndex1 == byteIndex2) {
2076 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2077 } else {
2078 cs->u.bits[byteIndex1] |= 0xFF << c1;
2079 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2080 cs->u.bits[i] = 0xFF;
2081 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2085 /* Compile the source of the class into a RECharSet */
2086 static BOOL
2087 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2089 const WCHAR *src, *end;
2090 BOOL inRange = FALSE;
2091 WCHAR rangeStart = 0;
2092 UINT byteLength, n;
2093 WCHAR c, thisCh;
2094 INT nDigits, i;
2096 assert(!charSet->converted);
2098 * Assert that startIndex and length points to chars inside [] inside
2099 * source string.
2101 assert(1 <= charSet->u.src.startIndex);
2102 assert(charSet->u.src.startIndex < gData->regexp->source_len);
2103 assert(charSet->u.src.length <= gData->regexp->source_len
2104 - 1 - charSet->u.src.startIndex);
2106 charSet->converted = TRUE;
2107 src = gData->regexp->source + charSet->u.src.startIndex;
2109 end = src + charSet->u.src.length;
2111 assert(src[-1] == '[' && end[0] == ']');
2113 byteLength = (charSet->length >> 3) + 1;
2114 charSet->u.bits = heap_alloc(byteLength);
2115 if (!charSet->u.bits) {
2116 JS_ReportOutOfMemory(gData->cx);
2117 gData->ok = FALSE;
2118 return FALSE;
2120 memset(charSet->u.bits, 0, byteLength);
2122 if (src == end)
2123 return TRUE;
2125 if (*src == '^') {
2126 assert(charSet->sense == FALSE);
2127 ++src;
2128 } else {
2129 assert(charSet->sense == TRUE);
2132 while (src != end) {
2133 switch (*src) {
2134 case '\\':
2135 ++src;
2136 c = *src++;
2137 switch (c) {
2138 case 'b':
2139 thisCh = 0x8;
2140 break;
2141 case 'f':
2142 thisCh = 0xC;
2143 break;
2144 case 'n':
2145 thisCh = 0xA;
2146 break;
2147 case 'r':
2148 thisCh = 0xD;
2149 break;
2150 case 't':
2151 thisCh = 0x9;
2152 break;
2153 case 'v':
2154 thisCh = 0xB;
2155 break;
2156 case 'c':
2157 if (src < end && JS_ISWORD(*src)) {
2158 thisCh = (WCHAR)(*src++ & 0x1F);
2159 } else {
2160 --src;
2161 thisCh = '\\';
2163 break;
2164 case 'x':
2165 nDigits = 2;
2166 goto lexHex;
2167 case 'u':
2168 nDigits = 4;
2169 lexHex:
2170 n = 0;
2171 for (i = 0; (i < nDigits) && (src < end); i++) {
2172 UINT digit;
2173 c = *src++;
2174 if (!isASCIIHexDigit(c, &digit)) {
2176 * Back off to accepting the original '\'
2177 * as a literal
2179 src -= i + 1;
2180 n = '\\';
2181 break;
2183 n = (n << 4) | digit;
2185 thisCh = (WCHAR)n;
2186 break;
2187 case '0':
2188 case '1':
2189 case '2':
2190 case '3':
2191 case '4':
2192 case '5':
2193 case '6':
2194 case '7':
2196 * This is a non-ECMA extension - decimal escapes (in this
2197 * case, octal!) are supposed to be an error inside class
2198 * ranges, but supported here for backwards compatibility.
2200 n = JS7_UNDEC(c);
2201 c = *src;
2202 if ('0' <= c && c <= '7') {
2203 src++;
2204 n = 8 * n + JS7_UNDEC(c);
2205 c = *src;
2206 if ('0' <= c && c <= '7') {
2207 src++;
2208 i = 8 * n + JS7_UNDEC(c);
2209 if (i <= 0377)
2210 n = i;
2211 else
2212 src--;
2215 thisCh = (WCHAR)n;
2216 break;
2218 case 'd':
2219 AddCharacterRangeToCharSet(charSet, '0', '9');
2220 continue; /* don't need range processing */
2221 case 'D':
2222 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2223 AddCharacterRangeToCharSet(charSet,
2224 (WCHAR)('9' + 1),
2225 (WCHAR)charSet->length);
2226 continue;
2227 case 's':
2228 for (i = (INT)charSet->length; i >= 0; i--)
2229 if (isspaceW(i))
2230 AddCharacterToCharSet(charSet, (WCHAR)i);
2231 continue;
2232 case 'S':
2233 for (i = (INT)charSet->length; i >= 0; i--)
2234 if (!isspaceW(i))
2235 AddCharacterToCharSet(charSet, (WCHAR)i);
2236 continue;
2237 case 'w':
2238 for (i = (INT)charSet->length; i >= 0; i--)
2239 if (JS_ISWORD(i))
2240 AddCharacterToCharSet(charSet, (WCHAR)i);
2241 continue;
2242 case 'W':
2243 for (i = (INT)charSet->length; i >= 0; i--)
2244 if (!JS_ISWORD(i))
2245 AddCharacterToCharSet(charSet, (WCHAR)i);
2246 continue;
2247 default:
2248 thisCh = c;
2249 break;
2252 break;
2254 default:
2255 thisCh = *src++;
2256 break;
2259 if (inRange) {
2260 if (gData->regexp->flags & REG_FOLD) {
2261 assert(rangeStart <= thisCh);
2262 for (i = rangeStart; i <= thisCh; i++) {
2263 WCHAR uch, dch;
2265 AddCharacterToCharSet(charSet, i);
2266 uch = toupperW(i);
2267 dch = tolowerW(i);
2268 if (i != uch)
2269 AddCharacterToCharSet(charSet, uch);
2270 if (i != dch)
2271 AddCharacterToCharSet(charSet, dch);
2273 } else {
2274 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2276 inRange = FALSE;
2277 } else {
2278 if (gData->regexp->flags & REG_FOLD) {
2279 AddCharacterToCharSet(charSet, toupperW(thisCh));
2280 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2281 } else {
2282 AddCharacterToCharSet(charSet, thisCh);
2284 if (src < end - 1) {
2285 if (*src == '-') {
2286 ++src;
2287 inRange = TRUE;
2288 rangeStart = thisCh;
2293 return TRUE;
2296 static BOOL
2297 ReallocStateStack(REGlobalData *gData)
2299 size_t limit = gData->stateStackLimit;
2300 size_t sz = sizeof(REProgState) * limit;
2302 gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2303 if (!gData->stateStack) {
2304 js_ReportOutOfScriptQuota(gData->cx);
2305 gData->ok = FALSE;
2306 return FALSE;
2308 gData->stateStackLimit = limit + limit;
2309 return TRUE;
2312 #define PUSH_STATE_STACK(data) \
2313 do { \
2314 ++(data)->stateStackTop; \
2315 if ((data)->stateStackTop == (data)->stateStackLimit && \
2316 !ReallocStateStack((data))) { \
2317 return NULL; \
2319 }while(0)
2322 * Apply the current op against the given input to see if it's going to match
2323 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2324 * true, then update the current state's cp. Always update startpc to the next
2325 * op.
2327 static inline match_state_t *
2328 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2329 jsbytecode **startpc, BOOL updatecp)
2331 match_state_t *result = NULL;
2332 WCHAR matchCh;
2333 size_t parenIndex;
2334 size_t offset, length, index;
2335 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2336 const WCHAR *source;
2337 const WCHAR *startcp = x->cp;
2338 WCHAR ch;
2339 RECharSet *charSet;
2341 const char *opname = reop_names[op];
2342 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2343 (int)gData->stateStackTop * 2, "", opname);
2345 switch (op) {
2346 case REOP_EMPTY:
2347 result = x;
2348 break;
2349 case REOP_BOL:
2350 if (x->cp != gData->cpbegin) {
2351 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2352 !(gData->regexp->flags & REG_MULTILINE)) {
2353 break;
2355 if (!RE_IS_LINE_TERM(x->cp[-1]))
2356 break;
2358 result = x;
2359 break;
2360 case REOP_EOL:
2361 if (x->cp != gData->cpend) {
2362 if (/*!gData->cx->regExpStatics.multiline &&*/
2363 !(gData->regexp->flags & REG_MULTILINE)) {
2364 break;
2366 if (!RE_IS_LINE_TERM(*x->cp))
2367 break;
2369 result = x;
2370 break;
2371 case REOP_WBDRY:
2372 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2373 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2374 result = x;
2376 break;
2377 case REOP_WNONBDRY:
2378 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2379 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2380 result = x;
2382 break;
2383 case REOP_DOT:
2384 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2385 result = x;
2386 result->cp++;
2388 break;
2389 case REOP_DIGIT:
2390 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2391 result = x;
2392 result->cp++;
2394 break;
2395 case REOP_NONDIGIT:
2396 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2397 result = x;
2398 result->cp++;
2400 break;
2401 case REOP_ALNUM:
2402 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2403 result = x;
2404 result->cp++;
2406 break;
2407 case REOP_NONALNUM:
2408 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2409 result = x;
2410 result->cp++;
2412 break;
2413 case REOP_SPACE:
2414 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2415 result = x;
2416 result->cp++;
2418 break;
2419 case REOP_NONSPACE:
2420 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2421 result = x;
2422 result->cp++;
2424 break;
2425 case REOP_BACKREF:
2426 pc = ReadCompactIndex(pc, &parenIndex);
2427 assert(parenIndex < gData->regexp->parenCount);
2428 result = BackrefMatcher(gData, x, parenIndex);
2429 break;
2430 case REOP_FLAT:
2431 pc = ReadCompactIndex(pc, &offset);
2432 assert(offset < gData->regexp->source_len);
2433 pc = ReadCompactIndex(pc, &length);
2434 assert(1 <= length);
2435 assert(length <= gData->regexp->source_len - offset);
2436 if (length <= (size_t)(gData->cpend - x->cp)) {
2437 source = gData->regexp->source + offset;
2438 TRACE("%s\n", debugstr_wn(source, length));
2439 for (index = 0; index != length; index++) {
2440 if (source[index] != x->cp[index])
2441 return NULL;
2443 x->cp += length;
2444 result = x;
2446 break;
2447 case REOP_FLAT1:
2448 matchCh = *pc++;
2449 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2450 if (x->cp != gData->cpend && *x->cp == matchCh) {
2451 result = x;
2452 result->cp++;
2454 break;
2455 case REOP_FLATi:
2456 pc = ReadCompactIndex(pc, &offset);
2457 assert(offset < gData->regexp->source_len);
2458 pc = ReadCompactIndex(pc, &length);
2459 assert(1 <= length);
2460 assert(length <= gData->regexp->source_len - offset);
2461 source = gData->regexp->source;
2462 result = FlatNIMatcher(gData, x, source + offset, length);
2463 break;
2464 case REOP_FLAT1i:
2465 matchCh = *pc++;
2466 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2467 result = x;
2468 result->cp++;
2470 break;
2471 case REOP_UCFLAT1:
2472 matchCh = GET_ARG(pc);
2473 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2474 pc += ARG_LEN;
2475 if (x->cp != gData->cpend && *x->cp == matchCh) {
2476 result = x;
2477 result->cp++;
2479 break;
2480 case REOP_UCFLAT1i:
2481 matchCh = GET_ARG(pc);
2482 pc += ARG_LEN;
2483 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2484 result = x;
2485 result->cp++;
2487 break;
2488 case REOP_CLASS:
2489 pc = ReadCompactIndex(pc, &index);
2490 assert(index < gData->regexp->classCount);
2491 if (x->cp != gData->cpend) {
2492 charSet = &gData->regexp->classList[index];
2493 assert(charSet->converted);
2494 ch = *x->cp;
2495 index = ch >> 3;
2496 if (charSet->length != 0 &&
2497 ch <= charSet->length &&
2498 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2499 result = x;
2500 result->cp++;
2503 break;
2504 case REOP_NCLASS:
2505 pc = ReadCompactIndex(pc, &index);
2506 assert(index < gData->regexp->classCount);
2507 if (x->cp != gData->cpend) {
2508 charSet = &gData->regexp->classList[index];
2509 assert(charSet->converted);
2510 ch = *x->cp;
2511 index = ch >> 3;
2512 if (charSet->length == 0 ||
2513 ch > charSet->length ||
2514 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2515 result = x;
2516 result->cp++;
2519 break;
2521 default:
2522 assert(FALSE);
2524 if (result) {
2525 if (!updatecp)
2526 x->cp = startcp;
2527 *startpc = pc;
2528 TRACE(" *\n");
2529 return result;
2531 x->cp = startcp;
2532 return NULL;
2535 static inline match_state_t *
2536 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2538 match_state_t *result = NULL;
2539 REBackTrackData *backTrackData;
2540 jsbytecode *nextpc, *testpc;
2541 REOp nextop;
2542 RECapture *cap;
2543 REProgState *curState;
2544 const WCHAR *startcp;
2545 size_t parenIndex, k;
2546 size_t parenSoFar = 0;
2548 WCHAR matchCh1, matchCh2;
2549 RECharSet *charSet;
2551 BOOL anchor;
2552 jsbytecode *pc = gData->regexp->program;
2553 REOp op = (REOp) *pc++;
2556 * If the first node is a simple match, step the index into the string
2557 * until that match is made, or fail if it can't be found at all.
2559 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2560 anchor = FALSE;
2561 while (x->cp <= gData->cpend) {
2562 nextpc = pc; /* reset back to start each time */
2563 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2564 if (result) {
2565 anchor = TRUE;
2566 x = result;
2567 pc = nextpc; /* accept skip to next opcode */
2568 op = (REOp) *pc++;
2569 assert(op < REOP_LIMIT);
2570 break;
2572 gData->skipped++;
2573 x->cp++;
2575 if (!anchor)
2576 goto bad;
2579 for (;;) {
2580 const char *opname = reop_names[op];
2581 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2582 (int)gData->stateStackTop * 2, "", opname);
2584 if (REOP_IS_SIMPLE(op)) {
2585 result = SimpleMatch(gData, x, op, &pc, TRUE);
2586 } else {
2587 curState = &gData->stateStack[gData->stateStackTop];
2588 switch (op) {
2589 case REOP_END:
2590 goto good;
2591 case REOP_ALTPREREQ2:
2592 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2593 pc += ARG_LEN;
2594 matchCh2 = GET_ARG(pc);
2595 pc += ARG_LEN;
2596 k = GET_ARG(pc);
2597 pc += ARG_LEN;
2599 if (x->cp != gData->cpend) {
2600 if (*x->cp == matchCh2)
2601 goto doAlt;
2603 charSet = &gData->regexp->classList[k];
2604 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2605 goto bad;
2606 matchCh1 = *x->cp;
2607 k = matchCh1 >> 3;
2608 if ((charSet->length == 0 ||
2609 matchCh1 > charSet->length ||
2610 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2611 charSet->sense) {
2612 goto doAlt;
2615 result = NULL;
2616 break;
2618 case REOP_ALTPREREQ:
2619 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2620 pc += ARG_LEN;
2621 matchCh1 = GET_ARG(pc);
2622 pc += ARG_LEN;
2623 matchCh2 = GET_ARG(pc);
2624 pc += ARG_LEN;
2625 if (x->cp == gData->cpend ||
2626 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2627 result = NULL;
2628 break;
2630 /* else fall through... */
2632 case REOP_ALT:
2633 doAlt:
2634 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2635 pc += ARG_LEN; /* start of this alternate */
2636 curState->parenSoFar = parenSoFar;
2637 PUSH_STATE_STACK(gData);
2638 op = (REOp) *pc++;
2639 startcp = x->cp;
2640 if (REOP_IS_SIMPLE(op)) {
2641 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2642 op = (REOp) *nextpc++;
2643 pc = nextpc;
2644 continue;
2646 result = x;
2647 op = (REOp) *pc++;
2649 nextop = (REOp) *nextpc++;
2650 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2651 goto bad;
2652 continue;
2655 * Occurs at (successful) end of REOP_ALT,
2657 case REOP_JUMP:
2659 * If we have not gotten a result here, it is because of an
2660 * empty match. Do the same thing REOP_EMPTY would do.
2662 if (!result)
2663 result = x;
2665 --gData->stateStackTop;
2666 pc += GET_OFFSET(pc);
2667 op = (REOp) *pc++;
2668 continue;
2671 * Occurs at last (successful) end of REOP_ALT,
2673 case REOP_ENDALT:
2675 * If we have not gotten a result here, it is because of an
2676 * empty match. Do the same thing REOP_EMPTY would do.
2678 if (!result)
2679 result = x;
2681 --gData->stateStackTop;
2682 op = (REOp) *pc++;
2683 continue;
2685 case REOP_LPAREN:
2686 pc = ReadCompactIndex(pc, &parenIndex);
2687 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2688 assert(parenIndex < gData->regexp->parenCount);
2689 if (parenIndex + 1 > parenSoFar)
2690 parenSoFar = parenIndex + 1;
2691 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2692 x->parens[parenIndex].length = 0;
2693 op = (REOp) *pc++;
2694 continue;
2696 case REOP_RPAREN:
2698 ptrdiff_t delta;
2700 pc = ReadCompactIndex(pc, &parenIndex);
2701 assert(parenIndex < gData->regexp->parenCount);
2702 cap = &x->parens[parenIndex];
2703 delta = x->cp - (gData->cpbegin + cap->index);
2704 cap->length = (delta < 0) ? 0 : (size_t) delta;
2705 op = (REOp) *pc++;
2707 if (!result)
2708 result = x;
2709 continue;
2711 case REOP_ASSERT:
2712 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2713 pc += ARG_LEN; /* start of ASSERT child */
2714 op = (REOp) *pc++;
2715 testpc = pc;
2716 if (REOP_IS_SIMPLE(op) &&
2717 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2718 result = NULL;
2719 break;
2721 curState->u.assertion.top =
2722 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2723 curState->u.assertion.sz = gData->cursz;
2724 curState->index = x->cp - gData->cpbegin;
2725 curState->parenSoFar = parenSoFar;
2726 PUSH_STATE_STACK(gData);
2727 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2728 nextpc, x, x->cp, 0, 0)) {
2729 goto bad;
2731 continue;
2733 case REOP_ASSERT_NOT:
2734 nextpc = pc + GET_OFFSET(pc);
2735 pc += ARG_LEN;
2736 op = (REOp) *pc++;
2737 testpc = pc;
2738 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2739 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2740 *testpc == REOP_ASSERTNOTTEST) {
2741 result = NULL;
2742 break;
2744 curState->u.assertion.top
2745 = (char *)gData->backTrackSP -
2746 (char *)gData->backTrackStack;
2747 curState->u.assertion.sz = gData->cursz;
2748 curState->index = x->cp - gData->cpbegin;
2749 curState->parenSoFar = parenSoFar;
2750 PUSH_STATE_STACK(gData);
2751 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2752 nextpc, x, x->cp, 0, 0)) {
2753 goto bad;
2755 continue;
2757 case REOP_ASSERTTEST:
2758 --gData->stateStackTop;
2759 --curState;
2760 x->cp = gData->cpbegin + curState->index;
2761 gData->backTrackSP =
2762 (REBackTrackData *) ((char *)gData->backTrackStack +
2763 curState->u.assertion.top);
2764 gData->cursz = curState->u.assertion.sz;
2765 if (result)
2766 result = x;
2767 break;
2769 case REOP_ASSERTNOTTEST:
2770 --gData->stateStackTop;
2771 --curState;
2772 x->cp = gData->cpbegin + curState->index;
2773 gData->backTrackSP =
2774 (REBackTrackData *) ((char *)gData->backTrackStack +
2775 curState->u.assertion.top);
2776 gData->cursz = curState->u.assertion.sz;
2777 result = (!result) ? x : NULL;
2778 break;
2779 case REOP_STAR:
2780 curState->u.quantifier.min = 0;
2781 curState->u.quantifier.max = (UINT)-1;
2782 goto quantcommon;
2783 case REOP_PLUS:
2784 curState->u.quantifier.min = 1;
2785 curState->u.quantifier.max = (UINT)-1;
2786 goto quantcommon;
2787 case REOP_OPT:
2788 curState->u.quantifier.min = 0;
2789 curState->u.quantifier.max = 1;
2790 goto quantcommon;
2791 case REOP_QUANT:
2792 pc = ReadCompactIndex(pc, &k);
2793 curState->u.quantifier.min = k;
2794 pc = ReadCompactIndex(pc, &k);
2795 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2796 curState->u.quantifier.max = k - 1;
2797 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2798 quantcommon:
2799 if (curState->u.quantifier.max == 0) {
2800 pc = pc + GET_OFFSET(pc);
2801 op = (REOp) *pc++;
2802 result = x;
2803 continue;
2805 /* Step over <next> */
2806 nextpc = pc + ARG_LEN;
2807 op = (REOp) *nextpc++;
2808 startcp = x->cp;
2809 if (REOP_IS_SIMPLE(op)) {
2810 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2811 if (curState->u.quantifier.min == 0)
2812 result = x;
2813 else
2814 result = NULL;
2815 pc = pc + GET_OFFSET(pc);
2816 break;
2818 op = (REOp) *nextpc++;
2819 result = x;
2821 curState->index = startcp - gData->cpbegin;
2822 curState->continue_op = REOP_REPEAT;
2823 curState->continue_pc = pc;
2824 curState->parenSoFar = parenSoFar;
2825 PUSH_STATE_STACK(gData);
2826 if (curState->u.quantifier.min == 0 &&
2827 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2828 0, 0)) {
2829 goto bad;
2831 pc = nextpc;
2832 continue;
2834 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2835 pc = curState[-1].continue_pc;
2836 op = (REOp) curState[-1].continue_op;
2838 if (!result)
2839 result = x;
2840 continue;
2842 case REOP_REPEAT:
2843 --curState;
2844 do {
2845 --gData->stateStackTop;
2846 if (!result) {
2847 /* Failed, see if we have enough children. */
2848 if (curState->u.quantifier.min == 0)
2849 goto repeatDone;
2850 goto break_switch;
2852 if (curState->u.quantifier.min == 0 &&
2853 x->cp == gData->cpbegin + curState->index) {
2854 /* matched an empty string, that'll get us nowhere */
2855 result = NULL;
2856 goto break_switch;
2858 if (curState->u.quantifier.min != 0)
2859 curState->u.quantifier.min--;
2860 if (curState->u.quantifier.max != (UINT) -1)
2861 curState->u.quantifier.max--;
2862 if (curState->u.quantifier.max == 0)
2863 goto repeatDone;
2864 nextpc = pc + ARG_LEN;
2865 nextop = (REOp) *nextpc;
2866 startcp = x->cp;
2867 if (REOP_IS_SIMPLE(nextop)) {
2868 nextpc++;
2869 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2870 if (curState->u.quantifier.min == 0)
2871 goto repeatDone;
2872 result = NULL;
2873 goto break_switch;
2875 result = x;
2877 curState->index = startcp - gData->cpbegin;
2878 PUSH_STATE_STACK(gData);
2879 if (curState->u.quantifier.min == 0 &&
2880 !PushBackTrackState(gData, REOP_REPEAT,
2881 pc, x, startcp,
2882 curState->parenSoFar,
2883 parenSoFar -
2884 curState->parenSoFar)) {
2885 goto bad;
2887 } while (*nextpc == REOP_ENDCHILD);
2888 pc = nextpc;
2889 op = (REOp) *pc++;
2890 parenSoFar = curState->parenSoFar;
2891 continue;
2893 repeatDone:
2894 result = x;
2895 pc += GET_OFFSET(pc);
2896 goto break_switch;
2898 case REOP_MINIMALSTAR:
2899 curState->u.quantifier.min = 0;
2900 curState->u.quantifier.max = (UINT)-1;
2901 goto minimalquantcommon;
2902 case REOP_MINIMALPLUS:
2903 curState->u.quantifier.min = 1;
2904 curState->u.quantifier.max = (UINT)-1;
2905 goto minimalquantcommon;
2906 case REOP_MINIMALOPT:
2907 curState->u.quantifier.min = 0;
2908 curState->u.quantifier.max = 1;
2909 goto minimalquantcommon;
2910 case REOP_MINIMALQUANT:
2911 pc = ReadCompactIndex(pc, &k);
2912 curState->u.quantifier.min = k;
2913 pc = ReadCompactIndex(pc, &k);
2914 /* See REOP_QUANT comments about k - 1. */
2915 curState->u.quantifier.max = k - 1;
2916 assert(curState->u.quantifier.min
2917 <= curState->u.quantifier.max);
2918 minimalquantcommon:
2919 curState->index = x->cp - gData->cpbegin;
2920 curState->parenSoFar = parenSoFar;
2921 PUSH_STATE_STACK(gData);
2922 if (curState->u.quantifier.min != 0) {
2923 curState->continue_op = REOP_MINIMALREPEAT;
2924 curState->continue_pc = pc;
2925 /* step over <next> */
2926 pc += OFFSET_LEN;
2927 op = (REOp) *pc++;
2928 } else {
2929 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2930 pc, x, x->cp, 0, 0)) {
2931 goto bad;
2933 --gData->stateStackTop;
2934 pc = pc + GET_OFFSET(pc);
2935 op = (REOp) *pc++;
2937 continue;
2939 case REOP_MINIMALREPEAT:
2940 --gData->stateStackTop;
2941 --curState;
2943 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2944 #define PREPARE_REPEAT() \
2945 do { \
2946 curState->index = x->cp - gData->cpbegin; \
2947 curState->continue_op = REOP_MINIMALREPEAT; \
2948 curState->continue_pc = pc; \
2949 pc += ARG_LEN; \
2950 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2951 x->parens[k].index = -1; \
2952 PUSH_STATE_STACK(gData); \
2953 op = (REOp) *pc++; \
2954 assert(op < REOP_LIMIT); \
2955 }while(0)
2957 if (!result) {
2958 TRACE(" -\n");
2960 * Non-greedy failure - try to consume another child.
2962 if (curState->u.quantifier.max == (UINT) -1 ||
2963 curState->u.quantifier.max > 0) {
2964 PREPARE_REPEAT();
2965 continue;
2967 /* Don't need to adjust pc since we're going to pop. */
2968 break;
2970 if (curState->u.quantifier.min == 0 &&
2971 x->cp == gData->cpbegin + curState->index) {
2972 /* Matched an empty string, that'll get us nowhere. */
2973 result = NULL;
2974 break;
2976 if (curState->u.quantifier.min != 0)
2977 curState->u.quantifier.min--;
2978 if (curState->u.quantifier.max != (UINT) -1)
2979 curState->u.quantifier.max--;
2980 if (curState->u.quantifier.min != 0) {
2981 PREPARE_REPEAT();
2982 continue;
2984 curState->index = x->cp - gData->cpbegin;
2985 curState->parenSoFar = parenSoFar;
2986 PUSH_STATE_STACK(gData);
2987 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2988 pc, x, x->cp,
2989 curState->parenSoFar,
2990 parenSoFar - curState->parenSoFar)) {
2991 goto bad;
2993 --gData->stateStackTop;
2994 pc = pc + GET_OFFSET(pc);
2995 op = (REOp) *pc++;
2996 assert(op < REOP_LIMIT);
2997 continue;
2998 default:
2999 assert(FALSE);
3000 result = NULL;
3002 break_switch:;
3006 * If the match failed and there's a backtrack option, take it.
3007 * Otherwise this is a complete and utter failure.
3009 if (!result) {
3010 if (gData->cursz == 0)
3011 return NULL;
3013 /* Potentially detect explosive regex here. */
3014 gData->backTrackCount++;
3015 if (gData->backTrackLimit &&
3016 gData->backTrackCount >= gData->backTrackLimit) {
3017 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3018 JSMSG_REGEXP_TOO_COMPLEX);
3019 gData->ok = FALSE;
3020 return NULL;
3023 backTrackData = gData->backTrackSP;
3024 gData->cursz = backTrackData->sz;
3025 gData->backTrackSP =
3026 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3027 x->cp = backTrackData->cp;
3028 pc = backTrackData->backtrack_pc;
3029 op = (REOp) backTrackData->backtrack_op;
3030 assert(op < REOP_LIMIT);
3031 gData->stateStackTop = backTrackData->saveStateStackTop;
3032 assert(gData->stateStackTop);
3034 memcpy(gData->stateStack, backTrackData + 1,
3035 sizeof(REProgState) * backTrackData->saveStateStackTop);
3036 curState = &gData->stateStack[gData->stateStackTop - 1];
3038 if (backTrackData->parenCount) {
3039 memcpy(&x->parens[backTrackData->parenIndex],
3040 (char *)(backTrackData + 1) +
3041 sizeof(REProgState) * backTrackData->saveStateStackTop,
3042 sizeof(RECapture) * backTrackData->parenCount);
3043 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3044 } else {
3045 for (k = curState->parenSoFar; k < parenSoFar; k++)
3046 x->parens[k].index = -1;
3047 parenSoFar = curState->parenSoFar;
3050 TRACE("\tBT_Pop: %ld,%ld\n",
3051 (ULONG_PTR)backTrackData->parenIndex,
3052 (ULONG_PTR)backTrackData->parenCount);
3053 continue;
3055 x = result;
3058 * Continue with the expression.
3060 op = (REOp)*pc++;
3061 assert(op < REOP_LIMIT);
3064 bad:
3065 TRACE("\n");
3066 return NULL;
3068 good:
3069 TRACE("\n");
3070 return x;
3073 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3075 match_state_t *result;
3076 const WCHAR *cp = x->cp;
3077 const WCHAR *cp2;
3078 UINT j;
3081 * Have to include the position beyond the last character
3082 * in order to detect end-of-input/line condition.
3084 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3085 gData->skipped = cp2 - cp;
3086 x->cp = cp2;
3087 for (j = 0; j < gData->regexp->parenCount; j++)
3088 x->parens[j].index = -1;
3089 result = ExecuteREBytecode(gData, x);
3090 if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3091 return result;
3092 gData->backTrackSP = gData->backTrackStack;
3093 gData->cursz = 0;
3094 gData->stateStackTop = 0;
3095 cp2 = cp + gData->skipped;
3097 return NULL;
3100 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3102 UINT i;
3104 gData->backTrackStackSize = INITIAL_BACKTRACK;
3105 gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3106 if (!gData->backTrackStack)
3107 goto bad;
3109 gData->backTrackSP = gData->backTrackStack;
3110 gData->cursz = 0;
3111 gData->backTrackCount = 0;
3112 gData->backTrackLimit = 0;
3114 gData->stateStackLimit = INITIAL_STATESTACK;
3115 gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3116 if (!gData->stateStack)
3117 goto bad;
3119 gData->stateStackTop = 0;
3120 gData->cx = cx;
3121 gData->pool = pool;
3122 gData->regexp = re;
3123 gData->ok = TRUE;
3125 for (i = 0; i < re->classCount; i++) {
3126 if (!re->classList[i].converted &&
3127 !ProcessCharSet(gData, &re->classList[i])) {
3128 return E_FAIL;
3132 return S_OK;
3134 bad:
3135 js_ReportOutOfScriptQuota(cx);
3136 gData->ok = FALSE;
3137 return E_OUTOFMEMORY;
3140 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3141 const WCHAR *str, DWORD str_len, match_state_t *result)
3143 match_state_t *res;
3144 REGlobalData gData;
3145 heap_pool_t *mark = heap_pool_mark(pool);
3146 const WCHAR *str_beg = result->cp;
3147 HRESULT hres;
3149 assert(result->cp != NULL);
3151 gData.cpbegin = str;
3152 gData.cpend = str+str_len;
3153 gData.start = result->cp-str;
3154 gData.skipped = 0;
3155 gData.pool = pool;
3157 hres = InitMatch(regexp, cx, pool, &gData);
3158 if(FAILED(hres)) {
3159 WARN("InitMatch failed\n");
3160 heap_pool_clear(mark);
3161 return hres;
3164 res = MatchRegExp(&gData, result);
3165 heap_pool_clear(mark);
3166 if(!gData.ok) {
3167 WARN("MatchRegExp failed\n");
3168 return E_FAIL;
3171 if(!res) {
3172 result->match_len = 0;
3173 return S_FALSE;
3176 result->match_len = (result->cp-str_beg) - gData.skipped;
3177 result->paren_count = regexp->parenCount;
3178 return S_OK;
3181 void regexp_destroy(regexp_t *re)
3183 if (re->classList) {
3184 UINT i;
3185 for (i = 0; i < re->classCount; i++) {
3186 if (re->classList[i].converted)
3187 heap_free(re->classList[i].u.bits);
3188 re->classList[i].u.bits = NULL;
3190 heap_free(re->classList);
3192 heap_free(re);
3195 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3196 DWORD str_len, WORD flags, BOOL flat)
3198 regexp_t *re;
3199 heap_pool_t *mark;
3200 CompilerState state;
3201 size_t resize;
3202 jsbytecode *endPC;
3203 UINT i;
3204 size_t len;
3206 re = NULL;
3207 mark = heap_pool_mark(pool);
3208 len = str_len;
3210 state.context = cx;
3211 state.pool = pool;
3212 state.cp = str;
3213 if (!state.cp)
3214 goto out;
3215 state.cpbegin = state.cp;
3216 state.cpend = state.cp + len;
3217 state.flags = flags;
3218 state.parenCount = 0;
3219 state.classCount = 0;
3220 state.progLength = 0;
3221 state.treeDepth = 0;
3222 state.classBitmapsMem = 0;
3223 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3224 state.classCache[i].start = NULL;
3226 if (len != 0 && flat) {
3227 state.result = NewRENode(&state, REOP_FLAT);
3228 if (!state.result)
3229 goto out;
3230 state.result->u.flat.chr = *state.cpbegin;
3231 state.result->u.flat.length = len;
3232 state.result->kid = (void *) state.cpbegin;
3233 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3234 state.progLength += 1 + GetCompactIndexWidth(0)
3235 + GetCompactIndexWidth(len);
3236 } else {
3237 if (!ParseRegExp(&state))
3238 goto out;
3240 resize = offsetof(regexp_t, program) + state.progLength + 1;
3241 re = heap_alloc(resize);
3242 if (!re)
3243 goto out;
3245 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3246 re->classCount = state.classCount;
3247 if (re->classCount) {
3248 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3249 if (!re->classList) {
3250 regexp_destroy(re);
3251 re = NULL;
3252 goto out;
3254 for (i = 0; i < re->classCount; i++)
3255 re->classList[i].converted = FALSE;
3256 } else {
3257 re->classList = NULL;
3259 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3260 if (!endPC) {
3261 regexp_destroy(re);
3262 re = NULL;
3263 goto out;
3265 *endPC++ = REOP_END;
3267 * Check whether size was overestimated and shrink using realloc.
3268 * This is safe since no pointers to newly parsed regexp or its parts
3269 * besides re exist here.
3271 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3272 regexp_t *tmp;
3273 assert((size_t)(endPC - re->program) < state.progLength + 1);
3274 resize = offsetof(regexp_t, program) + (endPC - re->program);
3275 tmp = heap_realloc(re, resize);
3276 if (tmp)
3277 re = tmp;
3280 re->flags = flags;
3281 re->parenCount = state.parenCount;
3282 re->source = str;
3283 re->source_len = str_len;
3285 out:
3286 heap_pool_clear(mark);
3287 return re;
3290 HRESULT regexp_set_flags(regexp_t **regexp, void *cx, heap_pool_t *pool, WORD flags)
3292 if(((*regexp)->flags & REG_FOLD) != (flags & REG_FOLD)) {
3293 regexp_t *new_regexp = regexp_new(cx, pool, (*regexp)->source,
3294 (*regexp)->source_len, flags, FALSE);
3296 if(!new_regexp)
3297 return E_FAIL;
3299 regexp_destroy(*regexp);
3300 *regexp = new_regexp;
3301 }else {
3302 (*regexp)->flags = flags;
3305 return S_OK;