wow32: Use spec file imports.
[wine.git] / dlls / jscript / regexp.c
blob325b5ad56d6c635b2af570644000689eff911e70
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
34 #include <assert.h>
36 #include "jscript.h"
37 #include "regexp.h"
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 /* FIXME: Better error handling */
44 #define ReportRegExpError(a,b,c) throw_error((a)->context, E_FAIL, L"")
45 #define ReportRegExpErrorHelper(a,b,c,d) throw_error((a)->context, E_FAIL, L"")
46 #define JS_ReportErrorNumber(a,b,c,d) throw_error((a), E_FAIL, L"")
47 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f) throw_error((a), E_FAIL, L"")
48 #define JS_COUNT_OPERATION(a,b) throw_error((a), E_FAIL, L"")
51 typedef BYTE JSPackedBool;
54 * This struct holds a bitmap representation of a class from a regexp.
55 * There's a list of these referenced by the classList field in the regexp_t
56 * struct below. The initial state has startIndex set to the offset in the
57 * original regexp source of the beginning of the class contents. The first
58 * use of the class converts the source representation into a bitmap.
61 typedef struct RECharSet {
62 JSPackedBool converted;
63 JSPackedBool sense;
64 WORD length;
65 union {
66 BYTE *bits;
67 struct {
68 size_t startIndex;
69 size_t length;
70 } src;
71 } u;
72 } RECharSet;
74 #define JSMSG_MIN_TOO_BIG 47
75 #define JSMSG_MAX_TOO_BIG 48
76 #define JSMSG_OUT_OF_ORDER 49
77 #define JSMSG_OUT_OF_MEMORY 137
79 #define LINE_SEPARATOR 0x2028
80 #define PARA_SEPARATOR 0x2029
82 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
83 ((c >= 'a') && (c <= 'z')) )
84 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
85 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
87 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
89 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
90 #define JS7_UNDEC(c) ((c) - '0')
92 typedef enum REOp {
93 REOP_EMPTY,
94 REOP_BOL,
95 REOP_EOL,
96 REOP_WBDRY,
97 REOP_WNONBDRY,
98 REOP_DOT,
99 REOP_DIGIT,
100 REOP_NONDIGIT,
101 REOP_ALNUM,
102 REOP_NONALNUM,
103 REOP_SPACE,
104 REOP_NONSPACE,
105 REOP_BACKREF,
106 REOP_FLAT,
107 REOP_FLAT1,
108 REOP_FLATi,
109 REOP_FLAT1i,
110 REOP_UCFLAT1,
111 REOP_UCFLAT1i,
112 REOP_UCFLAT,
113 REOP_UCFLATi,
114 REOP_CLASS,
115 REOP_NCLASS,
116 REOP_ALT,
117 REOP_QUANT,
118 REOP_STAR,
119 REOP_PLUS,
120 REOP_OPT,
121 REOP_LPAREN,
122 REOP_RPAREN,
123 REOP_JUMP,
124 REOP_DOTSTAR,
125 REOP_LPARENNON,
126 REOP_ASSERT,
127 REOP_ASSERT_NOT,
128 REOP_ASSERTTEST,
129 REOP_ASSERTNOTTEST,
130 REOP_MINIMALSTAR,
131 REOP_MINIMALPLUS,
132 REOP_MINIMALOPT,
133 REOP_MINIMALQUANT,
134 REOP_ENDCHILD,
135 REOP_REPEAT,
136 REOP_MINIMALREPEAT,
137 REOP_ALTPREREQ,
138 REOP_ALTPREREQ2,
139 REOP_ENDALT,
140 REOP_CONCAT,
141 REOP_END,
142 REOP_LIMIT /* META: no operator >= to this */
143 } REOp;
145 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
147 static const char *reop_names[] = {
148 "empty",
149 "bol",
150 "eol",
151 "wbdry",
152 "wnonbdry",
153 "dot",
154 "digit",
155 "nondigit",
156 "alnum",
157 "nonalnum",
158 "space",
159 "nonspace",
160 "backref",
161 "flat",
162 "flat1",
163 "flati",
164 "flat1i",
165 "ucflat1",
166 "ucflat1i",
167 "ucflat",
168 "ucflati",
169 "class",
170 "nclass",
171 "alt",
172 "quant",
173 "star",
174 "plus",
175 "opt",
176 "lparen",
177 "rparen",
178 "jump",
179 "dotstar",
180 "lparennon",
181 "assert",
182 "assert_not",
183 "asserttest",
184 "assertnottest",
185 "minimalstar",
186 "minimalplus",
187 "minimalopt",
188 "minimalquant",
189 "endchild",
190 "repeat",
191 "minimalrepeat",
192 "altprereq",
193 "alrprereq2",
194 "endalt",
195 "concat",
196 "end",
197 NULL
200 typedef struct REProgState {
201 jsbytecode *continue_pc; /* current continuation data */
202 jsbytecode continue_op;
203 ptrdiff_t index; /* progress in text */
204 size_t parenSoFar; /* highest indexed paren started */
205 union {
206 struct {
207 UINT min; /* current quantifier limits */
208 UINT max;
209 } quantifier;
210 struct {
211 size_t top; /* backtrack stack state */
212 size_t sz;
213 } assertion;
214 } u;
215 } REProgState;
217 typedef struct REBackTrackData {
218 size_t sz; /* size of previous stack entry */
219 jsbytecode *backtrack_pc; /* where to backtrack to */
220 jsbytecode backtrack_op;
221 const WCHAR *cp; /* index in text of match at backtrack */
222 size_t parenIndex; /* start index of saved paren contents */
223 size_t parenCount; /* # of saved paren contents */
224 size_t saveStateStackTop; /* number of parent states */
225 /* saved parent states follow */
226 /* saved paren contents follow */
227 } REBackTrackData;
229 #define INITIAL_STATESTACK 100
230 #define INITIAL_BACKTRACK 8000
232 typedef struct REGlobalData {
233 void *cx;
234 regexp_t *regexp; /* the RE in execution */
235 BOOL ok; /* runtime error (out_of_memory only?) */
236 size_t start; /* offset to start at */
237 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
238 const WCHAR *cpbegin; /* text base address */
239 const WCHAR *cpend; /* text limit address */
241 REProgState *stateStack; /* stack of state of current parents */
242 size_t stateStackTop;
243 size_t stateStackLimit;
245 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
246 REBackTrackData *backTrackSP;
247 size_t backTrackStackSize;
248 size_t cursz; /* size of current stack entry */
249 size_t backTrackCount; /* how many times we've backtracked */
250 size_t backTrackLimit; /* upper limit on backtrack states */
252 heap_pool_t *pool; /* It's faster to use one malloc'd pool
253 than to malloc/free the three items
254 that are allocated from this pool */
255 } REGlobalData;
257 typedef struct RENode RENode;
258 struct RENode {
259 REOp op; /* r.e. op bytecode */
260 RENode *next; /* next in concatenation order */
261 void *kid; /* first operand */
262 union {
263 void *kid2; /* second operand */
264 INT num; /* could be a number */
265 size_t parenIndex; /* or a parenthesis index */
266 struct { /* or a quantifier range */
267 UINT min;
268 UINT max;
269 JSPackedBool greedy;
270 } range;
271 struct { /* or a character class */
272 size_t startIndex;
273 size_t kidlen; /* length of string at kid, in jschars */
274 size_t index; /* index into class list */
275 WORD bmsize; /* bitmap size, based on max char code */
276 JSPackedBool sense;
277 } ucclass;
278 struct { /* or a literal sequence */
279 WCHAR chr; /* of one character */
280 size_t length; /* or many (via the kid) */
281 } flat;
282 struct {
283 RENode *kid2; /* second operand from ALT */
284 WCHAR ch1; /* match char for ALTPREREQ */
285 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
286 } altprereq;
287 } u;
290 #define CLASS_CACHE_SIZE 4
292 typedef struct CompilerState {
293 script_ctx_t *context;
294 const WCHAR *cpbegin;
295 const WCHAR *cpend;
296 const WCHAR *cp;
297 size_t parenCount;
298 size_t classCount; /* number of [] encountered */
299 size_t treeDepth; /* maximum depth of parse tree */
300 size_t progLength; /* estimated bytecode length */
301 RENode *result;
302 size_t classBitmapsMem; /* memory to hold all class bitmaps */
303 struct {
304 const WCHAR *start; /* small cache of class strings */
305 size_t length; /* since they're often the same */
306 size_t index;
307 } classCache[CLASS_CACHE_SIZE];
308 WORD flags;
310 heap_pool_t *pool; /* It's faster to use one malloc'd pool
311 than to malloc/free */
312 } CompilerState;
314 typedef struct EmitStateStackEntry {
315 jsbytecode *altHead; /* start of REOP_ALT* opcode */
316 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
317 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
318 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
319 RENode *continueNode; /* original REOP_ALT* node being stacked */
320 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
321 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
322 avoid 16-bit unsigned offset overflow */
323 } EmitStateStackEntry;
326 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
327 * the getters and setters take the pc of the offset, not of the opcode before
328 * the offset.
330 #define ARG_LEN 2
331 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
332 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
333 (pc)[1] = (jsbytecode) (arg))
335 #define OFFSET_LEN ARG_LEN
336 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
337 #define GET_OFFSET(pc) GET_ARG(pc)
339 static BOOL ParseRegExp(CompilerState*);
342 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
343 * For sanity, we limit it to 2^24 bytes.
345 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
348 * The maximum memory that can be allocated for class bitmaps.
349 * For sanity, we limit it to 2^24 bytes.
351 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
354 * Functions to get size and write/read bytecode that represent small indexes
355 * compactly.
356 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
357 * indicates that the following byte brings more bits to the index. Otherwise
358 * this is the last byte in the index bytecode representing highest index bits.
360 static size_t
361 GetCompactIndexWidth(size_t index)
363 size_t width;
365 for (width = 1; (index >>= 7) != 0; ++width) { }
366 return width;
369 static inline jsbytecode *
370 WriteCompactIndex(jsbytecode *pc, size_t index)
372 size_t next;
374 while ((next = index >> 7) != 0) {
375 *pc++ = (jsbytecode)(index | 0x80);
376 index = next;
378 *pc++ = (jsbytecode)index;
379 return pc;
382 static inline jsbytecode *
383 ReadCompactIndex(jsbytecode *pc, size_t *result)
385 size_t nextByte;
387 nextByte = *pc++;
388 if ((nextByte & 0x80) == 0) {
390 * Short-circuit the most common case when compact index <= 127.
392 *result = nextByte;
393 } else {
394 size_t shift = 7;
395 *result = 0x7F & nextByte;
396 do {
397 nextByte = *pc++;
398 *result |= (nextByte & 0x7F) << shift;
399 shift += 7;
400 } while ((nextByte & 0x80) != 0);
402 return pc;
405 /* Construct and initialize an RENode, returning NULL for out-of-memory */
406 static RENode *
407 NewRENode(CompilerState *state, REOp op)
409 RENode *ren;
411 ren = heap_pool_alloc(state->pool, sizeof(*ren));
412 if (!ren) {
413 throw_error(state->context, E_OUTOFMEMORY, L"");
414 return NULL;
416 ren->op = op;
417 ren->next = NULL;
418 ren->kid = NULL;
419 return ren;
423 * Validates and converts hex ascii value.
425 static BOOL
426 isASCIIHexDigit(WCHAR c, UINT *digit)
428 UINT cv = c;
430 if (cv < '0')
431 return FALSE;
432 if (cv <= '9') {
433 *digit = cv - '0';
434 return TRUE;
436 cv |= 0x20;
437 if (cv >= 'a' && cv <= 'f') {
438 *digit = cv - 'a' + 10;
439 return TRUE;
441 return FALSE;
444 typedef struct {
445 REOp op;
446 const WCHAR *errPos;
447 size_t parenIndex;
448 } REOpData;
450 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
451 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
453 static BOOL
454 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
456 ptrdiff_t offset = target - jump;
458 /* Check that target really points forward. */
459 assert(offset >= 2);
460 if ((size_t)offset > OFFSET_MAX)
461 return FALSE;
463 jump[0] = JUMP_OFFSET_HI(offset);
464 jump[1] = JUMP_OFFSET_LO(offset);
465 return TRUE;
469 * Generate bytecode for the tree rooted at t using an explicit stack instead
470 * of recursion.
472 static jsbytecode *
473 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
474 jsbytecode *pc, RENode *t)
476 EmitStateStackEntry *emitStateSP, *emitStateStack;
477 RECharSet *charSet;
478 REOp op;
480 if (treeDepth == 0) {
481 emitStateStack = NULL;
482 } else {
483 emitStateStack = malloc(sizeof(EmitStateStackEntry) * treeDepth);
484 if (!emitStateStack)
485 return NULL;
487 emitStateSP = emitStateStack;
488 op = t->op;
489 assert(op < REOP_LIMIT);
491 for (;;) {
492 *pc++ = op;
493 switch (op) {
494 case REOP_EMPTY:
495 --pc;
496 break;
498 case REOP_ALTPREREQ2:
499 case REOP_ALTPREREQ:
500 assert(emitStateSP);
501 emitStateSP->altHead = pc - 1;
502 emitStateSP->endTermFixup = pc;
503 pc += OFFSET_LEN;
504 SET_ARG(pc, t->u.altprereq.ch1);
505 pc += ARG_LEN;
506 SET_ARG(pc, t->u.altprereq.ch2);
507 pc += ARG_LEN;
509 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
510 pc += OFFSET_LEN;
512 emitStateSP->continueNode = t;
513 emitStateSP->continueOp = REOP_JUMP;
514 emitStateSP->jumpToJumpFlag = FALSE;
515 ++emitStateSP;
516 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
517 t = t->kid;
518 op = t->op;
519 assert(op < REOP_LIMIT);
520 continue;
522 case REOP_JUMP:
523 emitStateSP->nextTermFixup = pc; /* offset to following term */
524 pc += OFFSET_LEN;
525 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
526 goto jump_too_big;
527 emitStateSP->continueOp = REOP_ENDALT;
528 ++emitStateSP;
529 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
530 t = t->u.kid2;
531 op = t->op;
532 assert(op < REOP_LIMIT);
533 continue;
535 case REOP_ENDALT:
537 * If we already patched emitStateSP->nextTermFixup to jump to
538 * a nearer jump, to avoid 16-bit immediate offset overflow, we
539 * are done here.
541 if (emitStateSP->jumpToJumpFlag)
542 break;
545 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
546 * REOP_ENDALT is executed only on successful match of the last
547 * alternate in a group.
549 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
550 goto jump_too_big;
551 if (t->op != REOP_ALT) {
552 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
553 goto jump_too_big;
557 * If the program is bigger than the REOP_JUMP offset range, then
558 * we must check for alternates before this one that are part of
559 * the same group, and fix up their jump offsets to target jumps
560 * close enough to fit in a 16-bit unsigned offset immediate.
562 if ((size_t)(pc - re->program) > OFFSET_MAX &&
563 emitStateSP > emitStateStack) {
564 EmitStateStackEntry *esp, *esp2;
565 jsbytecode *alt, *jump;
566 ptrdiff_t span, header;
568 esp2 = emitStateSP;
569 alt = esp2->altHead;
570 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
571 if (esp->continueOp == REOP_ENDALT &&
572 !esp->jumpToJumpFlag &&
573 esp->nextTermFixup + OFFSET_LEN == alt &&
574 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
575 ? esp->endTermFixup
576 : esp->nextTermFixup)) > OFFSET_MAX) {
577 alt = esp->altHead;
578 jump = esp->nextTermFixup;
581 * The span must be 1 less than the distance from
582 * jump offset to jump offset, so we actually jump
583 * to a REOP_JUMP bytecode, not to its offset!
585 for (;;) {
586 assert(jump < esp2->nextTermFixup);
587 span = esp2->nextTermFixup - jump - 1;
588 if ((size_t)span <= OFFSET_MAX)
589 break;
590 do {
591 if (--esp2 == esp)
592 goto jump_too_big;
593 } while (esp2->continueOp != REOP_ENDALT);
596 jump[0] = JUMP_OFFSET_HI(span);
597 jump[1] = JUMP_OFFSET_LO(span);
599 if (esp->continueNode->op != REOP_ALT) {
601 * We must patch the offset at esp->endTermFixup
602 * as well, for the REOP_ALTPREREQ{,2} opcodes.
603 * If we're unlucky and endTermFixup is more than
604 * OFFSET_MAX bytes from its target, we cheat by
605 * jumping 6 bytes to the jump whose offset is at
606 * esp->nextTermFixup, which has the same target.
608 jump = esp->endTermFixup;
609 header = esp->nextTermFixup - jump;
610 span += header;
611 if ((size_t)span > OFFSET_MAX)
612 span = header;
614 jump[0] = JUMP_OFFSET_HI(span);
615 jump[1] = JUMP_OFFSET_LO(span);
618 esp->jumpToJumpFlag = TRUE;
622 break;
624 case REOP_ALT:
625 assert(emitStateSP);
626 emitStateSP->altHead = pc - 1;
627 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
628 pc += OFFSET_LEN;
629 emitStateSP->continueNode = t;
630 emitStateSP->continueOp = REOP_JUMP;
631 emitStateSP->jumpToJumpFlag = FALSE;
632 ++emitStateSP;
633 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
634 t = t->kid;
635 op = t->op;
636 assert(op < REOP_LIMIT);
637 continue;
639 case REOP_FLAT:
641 * Coalesce FLATs if possible and if it would not increase bytecode
642 * beyond preallocated limit. The latter happens only when bytecode
643 * size for coalesced string with offset p and length 2 exceeds 6
644 * bytes preallocated for 2 single char nodes, i.e. when
645 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
646 * GetCompactIndexWidth(p) > 4.
647 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
648 * nodes strictly decreases bytecode size, the check has to be
649 * done only for the first coalescing.
651 if (t->kid &&
652 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
654 while (t->next &&
655 t->next->op == REOP_FLAT &&
656 (WCHAR*)t->kid + t->u.flat.length ==
657 t->next->kid) {
658 t->u.flat.length += t->next->u.flat.length;
659 t->next = t->next->next;
662 if (t->kid && t->u.flat.length > 1) {
663 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
664 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
665 pc = WriteCompactIndex(pc, t->u.flat.length);
666 } else if (t->u.flat.chr < 256) {
667 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
668 *pc++ = (jsbytecode) t->u.flat.chr;
669 } else {
670 pc[-1] = (state->flags & REG_FOLD)
671 ? REOP_UCFLAT1i
672 : REOP_UCFLAT1;
673 SET_ARG(pc, t->u.flat.chr);
674 pc += ARG_LEN;
676 break;
678 case REOP_LPAREN:
679 assert(emitStateSP);
680 pc = WriteCompactIndex(pc, t->u.parenIndex);
681 emitStateSP->continueNode = t;
682 emitStateSP->continueOp = REOP_RPAREN;
683 ++emitStateSP;
684 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
685 t = t->kid;
686 op = t->op;
687 continue;
689 case REOP_RPAREN:
690 pc = WriteCompactIndex(pc, t->u.parenIndex);
691 break;
693 case REOP_BACKREF:
694 pc = WriteCompactIndex(pc, t->u.parenIndex);
695 break;
697 case REOP_ASSERT:
698 assert(emitStateSP);
699 emitStateSP->nextTermFixup = pc;
700 pc += OFFSET_LEN;
701 emitStateSP->continueNode = t;
702 emitStateSP->continueOp = REOP_ASSERTTEST;
703 ++emitStateSP;
704 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
705 t = t->kid;
706 op = t->op;
707 continue;
709 case REOP_ASSERTTEST:
710 case REOP_ASSERTNOTTEST:
711 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
712 goto jump_too_big;
713 break;
715 case REOP_ASSERT_NOT:
716 assert(emitStateSP);
717 emitStateSP->nextTermFixup = pc;
718 pc += OFFSET_LEN;
719 emitStateSP->continueNode = t;
720 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
721 ++emitStateSP;
722 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
723 t = t->kid;
724 op = t->op;
725 continue;
727 case REOP_QUANT:
728 assert(emitStateSP);
729 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
730 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
731 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
732 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
733 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
734 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
735 } else {
736 if (!t->u.range.greedy)
737 pc[-1] = REOP_MINIMALQUANT;
738 pc = WriteCompactIndex(pc, t->u.range.min);
740 * Write max + 1 to avoid using size_t(max) + 1 bytes
741 * for (UINT)-1 sentinel.
743 pc = WriteCompactIndex(pc, t->u.range.max + 1);
745 emitStateSP->nextTermFixup = pc;
746 pc += OFFSET_LEN;
747 emitStateSP->continueNode = t;
748 emitStateSP->continueOp = REOP_ENDCHILD;
749 ++emitStateSP;
750 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
751 t = t->kid;
752 op = t->op;
753 continue;
755 case REOP_ENDCHILD:
756 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
757 goto jump_too_big;
758 break;
760 case REOP_CLASS:
761 if (!t->u.ucclass.sense)
762 pc[-1] = REOP_NCLASS;
763 pc = WriteCompactIndex(pc, t->u.ucclass.index);
764 charSet = &re->classList[t->u.ucclass.index];
765 charSet->converted = FALSE;
766 charSet->length = t->u.ucclass.bmsize;
767 charSet->u.src.startIndex = t->u.ucclass.startIndex;
768 charSet->u.src.length = t->u.ucclass.kidlen;
769 charSet->sense = t->u.ucclass.sense;
770 break;
772 default:
773 break;
776 t = t->next;
777 if (t) {
778 op = t->op;
779 } else {
780 if (emitStateSP == emitStateStack)
781 break;
782 --emitStateSP;
783 t = emitStateSP->continueNode;
784 op = (REOp) emitStateSP->continueOp;
788 cleanup:
789 free(emitStateStack);
790 return pc;
792 jump_too_big:
793 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
794 pc = NULL;
795 goto cleanup;
799 * Process the op against the two top operands, reducing them to a single
800 * operand in the penultimate slot. Update progLength and treeDepth.
802 static BOOL
803 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
804 INT operandSP)
806 RENode *result;
808 switch (opData->op) {
809 case REOP_ALT:
810 result = NewRENode(state, REOP_ALT);
811 if (!result)
812 return FALSE;
813 result->kid = operandStack[operandSP - 2];
814 result->u.kid2 = operandStack[operandSP - 1];
815 operandStack[operandSP - 2] = result;
817 if (state->treeDepth == TREE_DEPTH_MAX) {
818 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
819 return FALSE;
821 ++state->treeDepth;
824 * Look at both alternates to see if there's a FLAT or a CLASS at
825 * the start of each. If so, use a prerequisite match.
827 if (((RENode *) result->kid)->op == REOP_FLAT &&
828 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
829 (state->flags & REG_FOLD) == 0) {
830 result->op = REOP_ALTPREREQ;
831 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
832 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
833 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
834 JUMP, <end> ... ENDALT */
835 state->progLength += 13;
837 else
838 if (((RENode *) result->kid)->op == REOP_CLASS &&
839 ((RENode *) result->kid)->u.ucclass.index < 256 &&
840 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
841 (state->flags & REG_FOLD) == 0) {
842 result->op = REOP_ALTPREREQ2;
843 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
844 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
845 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
846 JUMP, <end> ... ENDALT */
847 state->progLength += 13;
849 else
850 if (((RENode *) result->kid)->op == REOP_FLAT &&
851 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
852 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
853 (state->flags & REG_FOLD) == 0) {
854 result->op = REOP_ALTPREREQ2;
855 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
856 result->u.altprereq.ch2 =
857 ((RENode *) result->u.kid2)->u.ucclass.index;
858 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
859 JUMP, <end> ... ENDALT */
860 state->progLength += 13;
862 else {
863 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
864 state->progLength += 7;
866 break;
868 case REOP_CONCAT:
869 result = operandStack[operandSP - 2];
870 while (result->next)
871 result = result->next;
872 result->next = operandStack[operandSP - 1];
873 break;
875 case REOP_ASSERT:
876 case REOP_ASSERT_NOT:
877 case REOP_LPARENNON:
878 case REOP_LPAREN:
879 /* These should have been processed by a close paren. */
880 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
881 opData->errPos);
882 return FALSE;
884 default:;
886 return TRUE;
890 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
891 * it being on the stack, and to propagate errors to its callers.
893 #define JSREG_FIND_PAREN_COUNT 0x8000
894 #define JSREG_FIND_PAREN_ERROR 0x4000
897 * Magic return value from FindParenCount and GetDecimalValue, to indicate
898 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
899 * its findMax parameter is non-null.
901 #define OVERFLOW_VALUE ((UINT)-1)
903 static UINT
904 FindParenCount(CompilerState *state)
906 CompilerState temp;
907 int i;
909 if (state->flags & JSREG_FIND_PAREN_COUNT)
910 return OVERFLOW_VALUE;
913 * Copy state into temp, flag it so we never report an invalid backref,
914 * and reset its members to parse the entire regexp. This is obviously
915 * suboptimal, but GetDecimalValue calls us only if a backref appears to
916 * refer to a forward parenthetical, which is rare.
918 temp = *state;
919 temp.flags |= JSREG_FIND_PAREN_COUNT;
920 temp.cp = temp.cpbegin;
921 temp.parenCount = 0;
922 temp.classCount = 0;
923 temp.progLength = 0;
924 temp.treeDepth = 0;
925 temp.classBitmapsMem = 0;
926 for (i = 0; i < CLASS_CACHE_SIZE; i++)
927 temp.classCache[i].start = NULL;
929 if (!ParseRegExp(&temp)) {
930 state->flags |= JSREG_FIND_PAREN_ERROR;
931 return OVERFLOW_VALUE;
933 return temp.parenCount;
937 * Extract and return a decimal value at state->cp. The initial character c
938 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
939 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
940 * state->flags to discover whether an error occurred under findMax.
942 static UINT
943 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
944 CompilerState *state)
946 UINT value = JS7_UNDEC(c);
947 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
949 /* The following restriction allows simpler overflow checks. */
950 assert(max <= ((UINT)-1 - 9) / 10);
951 while (state->cp < state->cpend) {
952 c = *state->cp;
953 if (!JS7_ISDEC(c))
954 break;
955 value = 10 * value + JS7_UNDEC(c);
956 if (!overflow && value > max && (!findMax || value > findMax(state)))
957 overflow = TRUE;
958 ++state->cp;
960 return overflow ? OVERFLOW_VALUE : value;
964 * Calculate the total size of the bitmap required for a class expression.
966 static BOOL
967 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
968 const WCHAR *end)
970 UINT max = 0;
971 BOOL inRange = FALSE;
972 WCHAR c, rangeStart = 0;
973 UINT n, digit, nDigits, i;
975 target->u.ucclass.bmsize = 0;
976 target->u.ucclass.sense = TRUE;
978 if (src == end)
979 return TRUE;
981 if (*src == '^') {
982 ++src;
983 target->u.ucclass.sense = FALSE;
986 while (src != end) {
987 BOOL canStartRange = TRUE;
988 UINT localMax = 0;
990 switch (*src) {
991 case '\\':
992 ++src;
993 c = *src++;
994 switch (c) {
995 case 'b':
996 localMax = 0x8;
997 break;
998 case 'f':
999 localMax = 0xC;
1000 break;
1001 case 'n':
1002 localMax = 0xA;
1003 break;
1004 case 'r':
1005 localMax = 0xD;
1006 break;
1007 case 't':
1008 localMax = 0x9;
1009 break;
1010 case 'v':
1011 localMax = 0xB;
1012 break;
1013 case 'c':
1014 if (src < end && RE_IS_LETTER(*src)) {
1015 localMax = (UINT) (*src++) & 0x1F;
1016 } else {
1017 --src;
1018 localMax = '\\';
1020 break;
1021 case 'x':
1022 nDigits = 2;
1023 goto lexHex;
1024 case 'u':
1025 nDigits = 4;
1026 lexHex:
1027 n = 0;
1028 for (i = 0; (i < nDigits) && (src < end); i++) {
1029 c = *src++;
1030 if (!isASCIIHexDigit(c, &digit)) {
1032 * Back off to accepting the original
1033 *'\' as a literal.
1035 src -= i + 1;
1036 n = '\\';
1037 break;
1039 n = (n << 4) | digit;
1041 localMax = n;
1042 break;
1043 case 'd':
1044 canStartRange = FALSE;
1045 if (inRange) {
1046 JS_ReportErrorNumber(state->context,
1047 js_GetErrorMessage, NULL,
1048 JSMSG_BAD_CLASS_RANGE);
1049 return FALSE;
1051 localMax = '9';
1052 break;
1053 case 'D':
1054 case 's':
1055 case 'S':
1056 case 'w':
1057 case 'W':
1058 canStartRange = FALSE;
1059 if (inRange) {
1060 JS_ReportErrorNumber(state->context,
1061 js_GetErrorMessage, NULL,
1062 JSMSG_BAD_CLASS_RANGE);
1063 return FALSE;
1065 max = 65535;
1068 * If this is the start of a range, ensure that it's less than
1069 * the end.
1071 localMax = 0;
1072 break;
1073 case '0':
1074 case '1':
1075 case '2':
1076 case '3':
1077 case '4':
1078 case '5':
1079 case '6':
1080 case '7':
1082 * This is a non-ECMA extension - decimal escapes (in this
1083 * case, octal!) are supposed to be an error inside class
1084 * ranges, but supported here for backwards compatibility.
1087 n = JS7_UNDEC(c);
1088 c = *src;
1089 if ('0' <= c && c <= '7') {
1090 src++;
1091 n = 8 * n + JS7_UNDEC(c);
1092 c = *src;
1093 if ('0' <= c && c <= '7') {
1094 src++;
1095 i = 8 * n + JS7_UNDEC(c);
1096 if (i <= 0377)
1097 n = i;
1098 else
1099 src--;
1102 localMax = n;
1103 break;
1105 default:
1106 localMax = c;
1107 break;
1109 break;
1110 default:
1111 localMax = *src++;
1112 break;
1115 if (inRange) {
1116 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1117 if (rangeStart > localMax) {
1118 JS_ReportErrorNumber(state->context,
1119 js_GetErrorMessage, NULL,
1120 JSMSG_BAD_CLASS_RANGE);
1121 return FALSE;
1123 inRange = FALSE;
1124 } else {
1125 if (canStartRange && src < end - 1) {
1126 if (*src == '-') {
1127 ++src;
1128 inRange = TRUE;
1129 rangeStart = (WCHAR)localMax;
1130 continue;
1133 if (state->flags & REG_FOLD)
1134 rangeStart = localMax; /* one run of the uc/dc loop below */
1137 if (state->flags & REG_FOLD) {
1138 WCHAR maxch = localMax;
1140 for (i = rangeStart; i <= localMax; i++) {
1141 WCHAR uch, dch;
1143 uch = towupper(i);
1144 dch = towlower(i);
1145 if(maxch < uch)
1146 maxch = uch;
1147 if(maxch < dch)
1148 maxch = dch;
1150 localMax = maxch;
1153 if (localMax > max)
1154 max = localMax;
1156 target->u.ucclass.bmsize = max;
1157 return TRUE;
1160 static INT
1161 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1163 UINT min, max;
1164 WCHAR c;
1165 const WCHAR *errp = state->cp++;
1167 c = *state->cp;
1168 if (JS7_ISDEC(c)) {
1169 ++state->cp;
1170 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1171 c = *state->cp;
1173 if (!ignoreValues && min == OVERFLOW_VALUE)
1174 return JSMSG_MIN_TOO_BIG;
1176 if (c == ',') {
1177 c = *++state->cp;
1178 if (JS7_ISDEC(c)) {
1179 ++state->cp;
1180 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1181 c = *state->cp;
1182 if (!ignoreValues && max == OVERFLOW_VALUE)
1183 return JSMSG_MAX_TOO_BIG;
1184 if (!ignoreValues && min > max)
1185 return JSMSG_OUT_OF_ORDER;
1186 } else {
1187 max = (UINT)-1;
1189 } else {
1190 max = min;
1192 if (c == '}') {
1193 state->result = NewRENode(state, REOP_QUANT);
1194 if (!state->result)
1195 return JSMSG_OUT_OF_MEMORY;
1196 state->result->u.range.min = min;
1197 state->result->u.range.max = max;
1199 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1200 * where <max> is written as compact(max+1) to make
1201 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1203 state->progLength += (1 + GetCompactIndexWidth(min)
1204 + GetCompactIndexWidth(max + 1)
1205 +3);
1206 return 0;
1210 state->cp = errp;
1211 return -1;
1214 static BOOL
1215 ParseQuantifier(CompilerState *state)
1217 RENode *term;
1218 term = state->result;
1219 if (state->cp < state->cpend) {
1220 switch (*state->cp) {
1221 case '+':
1222 state->result = NewRENode(state, REOP_QUANT);
1223 if (!state->result)
1224 return FALSE;
1225 state->result->u.range.min = 1;
1226 state->result->u.range.max = (UINT)-1;
1227 /* <PLUS>, <next> ... <ENDCHILD> */
1228 state->progLength += 4;
1229 goto quantifier;
1230 case '*':
1231 state->result = NewRENode(state, REOP_QUANT);
1232 if (!state->result)
1233 return FALSE;
1234 state->result->u.range.min = 0;
1235 state->result->u.range.max = (UINT)-1;
1236 /* <STAR>, <next> ... <ENDCHILD> */
1237 state->progLength += 4;
1238 goto quantifier;
1239 case '?':
1240 state->result = NewRENode(state, REOP_QUANT);
1241 if (!state->result)
1242 return FALSE;
1243 state->result->u.range.min = 0;
1244 state->result->u.range.max = 1;
1245 /* <OPT>, <next> ... <ENDCHILD> */
1246 state->progLength += 4;
1247 goto quantifier;
1248 case '{': /* balance '}' */
1250 INT err;
1252 err = ParseMinMaxQuantifier(state, FALSE);
1253 if (err == 0)
1254 goto quantifier;
1255 if (err == -1)
1256 return TRUE;
1258 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1259 return FALSE;
1261 default:;
1264 return TRUE;
1266 quantifier:
1267 if (state->treeDepth == TREE_DEPTH_MAX) {
1268 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1269 return FALSE;
1272 ++state->treeDepth;
1273 ++state->cp;
1274 state->result->kid = term;
1275 if (state->cp < state->cpend && *state->cp == '?') {
1276 ++state->cp;
1277 state->result->u.range.greedy = FALSE;
1278 } else {
1279 state->result->u.range.greedy = TRUE;
1281 return TRUE;
1285 * item: assertion An item is either an assertion or
1286 * quantatom a quantified atom.
1288 * assertion: '^' Assertions match beginning of string
1289 * (or line if the class static property
1290 * RegExp.multiline is true).
1291 * '$' End of string (or line if the class
1292 * static property RegExp.multiline is
1293 * true).
1294 * '\b' Word boundary (between \w and \W).
1295 * '\B' Word non-boundary.
1297 * quantatom: atom An unquantified atom.
1298 * quantatom '{' n ',' m '}'
1299 * Atom must occur between n and m times.
1300 * quantatom '{' n ',' '}' Atom must occur at least n times.
1301 * quantatom '{' n '}' Atom must occur exactly n times.
1302 * quantatom '*' Zero or more times (same as {0,}).
1303 * quantatom '+' One or more times (same as {1,}).
1304 * quantatom '?' Zero or one time (same as {0,1}).
1306 * any of which can be optionally followed by '?' for ungreedy
1308 * atom: '(' regexp ')' A parenthesized regexp (what matched
1309 * can be addressed using a backreference,
1310 * see '\' n below).
1311 * '.' Matches any char except '\n'.
1312 * '[' classlist ']' A character class.
1313 * '[' '^' classlist ']' A negated character class.
1314 * '\f' Form Feed.
1315 * '\n' Newline (Line Feed).
1316 * '\r' Carriage Return.
1317 * '\t' Horizontal Tab.
1318 * '\v' Vertical Tab.
1319 * '\d' A digit (same as [0-9]).
1320 * '\D' A non-digit.
1321 * '\w' A word character, [0-9a-z_A-Z].
1322 * '\W' A non-word character.
1323 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1324 * '\S' A non-whitespace character.
1325 * '\' n A backreference to the nth (n decimal
1326 * and positive) parenthesized expression.
1327 * '\' octal An octal escape sequence (octal must be
1328 * two or three digits long, unless it is
1329 * 0 for the null character).
1330 * '\x' hex A hex escape (hex must be two digits).
1331 * '\u' unicode A unicode escape (must be four digits).
1332 * '\c' ctrl A control character, ctrl is a letter.
1333 * '\' literalatomchar Any character except one of the above
1334 * that follow '\' in an atom.
1335 * otheratomchar Any character not first among the other
1336 * atom right-hand sides.
1338 static BOOL
1339 ParseTerm(CompilerState *state)
1341 WCHAR c = *state->cp++;
1342 UINT nDigits;
1343 UINT num, tmp, n, i;
1344 const WCHAR *termStart;
1346 switch (c) {
1347 /* assertions and atoms */
1348 case '^':
1349 state->result = NewRENode(state, REOP_BOL);
1350 if (!state->result)
1351 return FALSE;
1352 state->progLength++;
1353 return TRUE;
1354 case '$':
1355 state->result = NewRENode(state, REOP_EOL);
1356 if (!state->result)
1357 return FALSE;
1358 state->progLength++;
1359 return TRUE;
1360 case '\\':
1361 if (state->cp >= state->cpend) {
1362 /* a trailing '\' is an error */
1363 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1364 return FALSE;
1366 c = *state->cp++;
1367 switch (c) {
1368 /* assertion escapes */
1369 case 'b' :
1370 state->result = NewRENode(state, REOP_WBDRY);
1371 if (!state->result)
1372 return FALSE;
1373 state->progLength++;
1374 return TRUE;
1375 case 'B':
1376 state->result = NewRENode(state, REOP_WNONBDRY);
1377 if (!state->result)
1378 return FALSE;
1379 state->progLength++;
1380 return TRUE;
1381 /* Decimal escape */
1382 case '0':
1383 /* Give a strict warning. See also the note below. */
1384 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1385 doOctal:
1386 num = 0;
1387 while (state->cp < state->cpend) {
1388 c = *state->cp;
1389 if (c < '0' || '7' < c)
1390 break;
1391 state->cp++;
1392 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1393 if (tmp > 0377)
1394 break;
1395 num = tmp;
1397 c = (WCHAR)num;
1398 doFlat:
1399 state->result = NewRENode(state, REOP_FLAT);
1400 if (!state->result)
1401 return FALSE;
1402 state->result->u.flat.chr = c;
1403 state->result->u.flat.length = 1;
1404 state->progLength += 3;
1405 break;
1406 case '1':
1407 case '2':
1408 case '3':
1409 case '4':
1410 case '5':
1411 case '6':
1412 case '7':
1413 case '8':
1414 case '9':
1415 termStart = state->cp - 1;
1416 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1417 if (state->flags & JSREG_FIND_PAREN_ERROR)
1418 return FALSE;
1419 if (num == OVERFLOW_VALUE) {
1420 /* Give a strict mode warning. */
1421 WARN("back-reference exceeds number of capturing parentheses\n");
1424 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1425 * error here. However, for compatibility with IE, we treat the
1426 * whole backref as flat if the first character in it is not a
1427 * valid octal character, and as an octal escape otherwise.
1429 state->cp = termStart;
1430 if (c >= '8') {
1431 /* Treat this as flat. termStart - 1 is the \. */
1432 c = '\\';
1433 goto asFlat;
1436 /* Treat this as an octal escape. */
1437 goto doOctal;
1439 assert(1 <= num && num <= 0x10000);
1440 state->result = NewRENode(state, REOP_BACKREF);
1441 if (!state->result)
1442 return FALSE;
1443 state->result->u.parenIndex = num - 1;
1444 state->progLength
1445 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1446 break;
1447 /* Control escape */
1448 case 'f':
1449 c = 0xC;
1450 goto doFlat;
1451 case 'n':
1452 c = 0xA;
1453 goto doFlat;
1454 case 'r':
1455 c = 0xD;
1456 goto doFlat;
1457 case 't':
1458 c = 0x9;
1459 goto doFlat;
1460 case 'v':
1461 c = 0xB;
1462 goto doFlat;
1463 /* Control letter */
1464 case 'c':
1465 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1466 c = (WCHAR) (*state->cp++ & 0x1F);
1467 } else {
1468 /* back off to accepting the original '\' as a literal */
1469 --state->cp;
1470 c = '\\';
1472 goto doFlat;
1473 /* HexEscapeSequence */
1474 case 'x':
1475 nDigits = 2;
1476 goto lexHex;
1477 /* UnicodeEscapeSequence */
1478 case 'u':
1479 nDigits = 4;
1480 lexHex:
1481 n = 0;
1482 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1483 UINT digit;
1484 c = *state->cp++;
1485 if (!isASCIIHexDigit(c, &digit)) {
1487 * Back off to accepting the original 'u' or 'x' as a
1488 * literal.
1490 state->cp -= i + 2;
1491 n = *state->cp++;
1492 break;
1494 n = (n << 4) | digit;
1496 c = (WCHAR) n;
1497 goto doFlat;
1498 /* Character class escapes */
1499 case 'd':
1500 state->result = NewRENode(state, REOP_DIGIT);
1501 doSimple:
1502 if (!state->result)
1503 return FALSE;
1504 state->progLength++;
1505 break;
1506 case 'D':
1507 state->result = NewRENode(state, REOP_NONDIGIT);
1508 goto doSimple;
1509 case 's':
1510 state->result = NewRENode(state, REOP_SPACE);
1511 goto doSimple;
1512 case 'S':
1513 state->result = NewRENode(state, REOP_NONSPACE);
1514 goto doSimple;
1515 case 'w':
1516 state->result = NewRENode(state, REOP_ALNUM);
1517 goto doSimple;
1518 case 'W':
1519 state->result = NewRENode(state, REOP_NONALNUM);
1520 goto doSimple;
1521 /* IdentityEscape */
1522 default:
1523 state->result = NewRENode(state, REOP_FLAT);
1524 if (!state->result)
1525 return FALSE;
1526 state->result->u.flat.chr = c;
1527 state->result->u.flat.length = 1;
1528 state->result->kid = (void *) (state->cp - 1);
1529 state->progLength += 3;
1530 break;
1532 break;
1533 case '[':
1534 state->result = NewRENode(state, REOP_CLASS);
1535 if (!state->result)
1536 return FALSE;
1537 termStart = state->cp;
1538 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1539 for (;;) {
1540 if (state->cp == state->cpend) {
1541 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1542 JSMSG_UNTERM_CLASS, termStart);
1544 return FALSE;
1546 if (*state->cp == '\\') {
1547 state->cp++;
1548 if (state->cp != state->cpend)
1549 state->cp++;
1550 continue;
1552 if (*state->cp == ']') {
1553 state->result->u.ucclass.kidlen = state->cp - termStart;
1554 break;
1556 state->cp++;
1558 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1559 if (!state->classCache[i].start) {
1560 state->classCache[i].start = termStart;
1561 state->classCache[i].length = state->result->u.ucclass.kidlen;
1562 state->classCache[i].index = state->classCount;
1563 break;
1565 if (state->classCache[i].length ==
1566 state->result->u.ucclass.kidlen) {
1567 for (n = 0; ; n++) {
1568 if (n == state->classCache[i].length) {
1569 state->result->u.ucclass.index
1570 = state->classCache[i].index;
1571 goto claim;
1573 if (state->classCache[i].start[n] != termStart[n])
1574 break;
1578 state->result->u.ucclass.index = state->classCount++;
1580 claim:
1582 * Call CalculateBitmapSize now as we want any errors it finds
1583 * to be reported during the parse phase, not at execution.
1585 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1586 return FALSE;
1588 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1589 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1590 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1592 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1593 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1594 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1595 return FALSE;
1597 state->classBitmapsMem += n;
1598 /* CLASS, <index> */
1599 state->progLength
1600 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1601 break;
1603 case '.':
1604 state->result = NewRENode(state, REOP_DOT);
1605 goto doSimple;
1607 case '{':
1609 const WCHAR *errp = state->cp--;
1610 INT err;
1612 err = ParseMinMaxQuantifier(state, TRUE);
1613 state->cp = errp;
1615 if (err < 0)
1616 goto asFlat;
1618 /* FALL THROUGH */
1620 case '*':
1621 case '+':
1622 case '?':
1623 throw_error(state->context, state->context->version < SCRIPTLANGUAGEVERSION_ES5 ? JS_E_REGEXP_SYNTAX : JS_E_UNEXPECTED_QUANTIFIER, L"");
1624 return FALSE;
1625 default:
1626 asFlat:
1627 state->result = NewRENode(state, REOP_FLAT);
1628 if (!state->result)
1629 return FALSE;
1630 state->result->u.flat.chr = c;
1631 state->result->u.flat.length = 1;
1632 state->result->kid = (void *) (state->cp - 1);
1633 state->progLength += 3;
1634 break;
1636 return ParseQuantifier(state);
1640 * Top-down regular expression grammar, based closely on Perl4.
1642 * regexp: altern A regular expression is one or more
1643 * altern '|' regexp alternatives separated by vertical bar.
1645 #define INITIAL_STACK_SIZE 128
1647 static BOOL
1648 ParseRegExp(CompilerState *state)
1650 size_t parenIndex;
1651 RENode *operand;
1652 REOpData *operatorStack;
1653 RENode **operandStack;
1654 REOp op;
1655 INT i;
1656 BOOL result = FALSE;
1658 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1659 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1661 /* Watch out for empty regexp */
1662 if (state->cp == state->cpend) {
1663 state->result = NewRENode(state, REOP_EMPTY);
1664 return (state->result != NULL);
1667 operatorStack = malloc(sizeof(REOpData) * operatorStackSize);
1668 if (!operatorStack)
1669 return FALSE;
1671 operandStack = malloc(sizeof(RENode *) * operandStackSize);
1672 if (!operandStack)
1673 goto out;
1675 for (;;) {
1676 parenIndex = state->parenCount;
1677 if (state->cp == state->cpend) {
1679 * If we are at the end of the regexp and we're short one or more
1680 * operands, the regexp must have the form /x|/ or some such, with
1681 * left parentheses making us short more than one operand.
1683 if (operatorSP >= operandSP) {
1684 operand = NewRENode(state, REOP_EMPTY);
1685 if (!operand)
1686 goto out;
1687 goto pushOperand;
1689 } else {
1690 switch (*state->cp) {
1691 case '(':
1692 ++state->cp;
1693 if (state->cp + 1 < state->cpend &&
1694 *state->cp == '?' &&
1695 (state->cp[1] == '=' ||
1696 state->cp[1] == '!' ||
1697 state->cp[1] == ':')) {
1698 switch (state->cp[1]) {
1699 case '=':
1700 op = REOP_ASSERT;
1701 /* ASSERT, <next>, ... ASSERTTEST */
1702 state->progLength += 4;
1703 break;
1704 case '!':
1705 op = REOP_ASSERT_NOT;
1706 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1707 state->progLength += 4;
1708 break;
1709 default:
1710 op = REOP_LPARENNON;
1711 break;
1713 state->cp += 2;
1714 } else {
1715 op = REOP_LPAREN;
1716 /* LPAREN, <index>, ... RPAREN, <index> */
1717 state->progLength
1718 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1719 state->parenCount++;
1720 if (state->parenCount == 65535) {
1721 ReportRegExpError(state, JSREPORT_ERROR,
1722 JSMSG_TOO_MANY_PARENS);
1723 goto out;
1726 goto pushOperator;
1728 case ')':
1730 * If there's no stacked open parenthesis, throw syntax error.
1732 for (i = operatorSP - 1; ; i--) {
1733 if (i < 0) {
1734 ReportRegExpError(state, JSREPORT_ERROR,
1735 JSMSG_UNMATCHED_RIGHT_PAREN);
1736 goto out;
1738 if (operatorStack[i].op == REOP_ASSERT ||
1739 operatorStack[i].op == REOP_ASSERT_NOT ||
1740 operatorStack[i].op == REOP_LPARENNON ||
1741 operatorStack[i].op == REOP_LPAREN) {
1742 break;
1745 /* FALL THROUGH */
1747 case '|':
1748 /* Expected an operand before these, so make an empty one */
1749 operand = NewRENode(state, REOP_EMPTY);
1750 if (!operand)
1751 goto out;
1752 goto pushOperand;
1754 default:
1755 if (!ParseTerm(state))
1756 goto out;
1757 operand = state->result;
1758 pushOperand:
1759 if (operandSP == operandStackSize) {
1760 RENode **tmp;
1761 operandStackSize += operandStackSize;
1762 tmp = realloc(operandStack, sizeof(RENode *) * operandStackSize);
1763 if (!tmp)
1764 goto out;
1765 operandStack = tmp;
1767 operandStack[operandSP++] = operand;
1768 break;
1772 /* At the end; process remaining operators. */
1773 restartOperator:
1774 if (state->cp == state->cpend) {
1775 while (operatorSP) {
1776 --operatorSP;
1777 if (!ProcessOp(state, &operatorStack[operatorSP],
1778 operandStack, operandSP))
1779 goto out;
1780 --operandSP;
1782 assert(operandSP == 1);
1783 state->result = operandStack[0];
1784 result = TRUE;
1785 goto out;
1788 switch (*state->cp) {
1789 case '|':
1790 /* Process any stacked 'concat' operators */
1791 ++state->cp;
1792 while (operatorSP &&
1793 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1794 --operatorSP;
1795 if (!ProcessOp(state, &operatorStack[operatorSP],
1796 operandStack, operandSP)) {
1797 goto out;
1799 --operandSP;
1801 op = REOP_ALT;
1802 goto pushOperator;
1804 case ')':
1806 * If there's no stacked open parenthesis, throw syntax error.
1808 for (i = operatorSP - 1; ; i--) {
1809 if (i < 0) {
1810 ReportRegExpError(state, JSREPORT_ERROR,
1811 JSMSG_UNMATCHED_RIGHT_PAREN);
1812 goto out;
1814 if (operatorStack[i].op == REOP_ASSERT ||
1815 operatorStack[i].op == REOP_ASSERT_NOT ||
1816 operatorStack[i].op == REOP_LPARENNON ||
1817 operatorStack[i].op == REOP_LPAREN) {
1818 break;
1821 ++state->cp;
1823 /* Process everything on the stack until the open parenthesis. */
1824 for (;;) {
1825 assert(operatorSP);
1826 --operatorSP;
1827 switch (operatorStack[operatorSP].op) {
1828 case REOP_ASSERT:
1829 case REOP_ASSERT_NOT:
1830 case REOP_LPAREN:
1831 operand = NewRENode(state, operatorStack[operatorSP].op);
1832 if (!operand)
1833 goto out;
1834 operand->u.parenIndex =
1835 operatorStack[operatorSP].parenIndex;
1836 assert(operandSP);
1837 operand->kid = operandStack[operandSP - 1];
1838 operandStack[operandSP - 1] = operand;
1839 if (state->treeDepth == TREE_DEPTH_MAX) {
1840 ReportRegExpError(state, JSREPORT_ERROR,
1841 JSMSG_REGEXP_TOO_COMPLEX);
1842 goto out;
1844 ++state->treeDepth;
1845 /* FALL THROUGH */
1847 case REOP_LPARENNON:
1848 state->result = operandStack[operandSP - 1];
1849 if (!ParseQuantifier(state))
1850 goto out;
1851 operandStack[operandSP - 1] = state->result;
1852 goto restartOperator;
1853 default:
1854 if (!ProcessOp(state, &operatorStack[operatorSP],
1855 operandStack, operandSP))
1856 goto out;
1857 --operandSP;
1858 break;
1861 break;
1863 case '{':
1865 const WCHAR *errp = state->cp;
1867 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1869 * This didn't even scan correctly as a quantifier, so we should
1870 * treat it as flat.
1872 op = REOP_CONCAT;
1873 goto pushOperator;
1876 state->cp = errp;
1877 /* FALL THROUGH */
1880 case '+':
1881 case '*':
1882 case '?':
1883 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1884 state->cp);
1885 result = FALSE;
1886 goto out;
1888 default:
1889 /* Anything else is the start of the next term. */
1890 op = REOP_CONCAT;
1891 pushOperator:
1892 if (operatorSP == operatorStackSize) {
1893 REOpData *tmp;
1894 operatorStackSize += operatorStackSize;
1895 tmp = realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1896 if (!tmp)
1897 goto out;
1898 operatorStack = tmp;
1900 operatorStack[operatorSP].op = op;
1901 operatorStack[operatorSP].errPos = state->cp;
1902 operatorStack[operatorSP++].parenIndex = parenIndex;
1903 break;
1906 out:
1907 free(operatorStack);
1908 free(operandStack);
1909 return result;
1913 * Save the current state of the match - the position in the input
1914 * text as well as the position in the bytecode. The state of any
1915 * parent expressions is also saved (preceding state).
1916 * Contents of parenCount parentheses from parenIndex are also saved.
1918 static REBackTrackData *
1919 PushBackTrackState(REGlobalData *gData, REOp op,
1920 jsbytecode *target, match_state_t *x, const WCHAR *cp,
1921 size_t parenIndex, size_t parenCount)
1923 size_t i;
1924 REBackTrackData *result =
1925 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1927 size_t sz = sizeof(REBackTrackData) +
1928 gData->stateStackTop * sizeof(REProgState) +
1929 parenCount * sizeof(RECapture);
1931 ptrdiff_t btsize = gData->backTrackStackSize;
1932 ptrdiff_t btincr = ((char *)result + sz) -
1933 ((char *)gData->backTrackStack + btsize);
1935 TRACE("\tBT_Push: %Iu,%Iu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1937 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1938 if (btincr > 0) {
1939 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1941 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1942 btincr = ((btincr+btsize-1)/btsize)*btsize;
1943 gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1944 if (!gData->backTrackStack) {
1945 throw_error(gData->cx, E_OUTOFMEMORY, L"");
1946 gData->ok = FALSE;
1947 return NULL;
1949 gData->backTrackStackSize = btsize + btincr;
1950 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1952 gData->backTrackSP = result;
1953 result->sz = gData->cursz;
1954 gData->cursz = sz;
1956 result->backtrack_op = op;
1957 result->backtrack_pc = target;
1958 result->cp = cp;
1959 result->parenCount = parenCount;
1960 result->parenIndex = parenIndex;
1962 result->saveStateStackTop = gData->stateStackTop;
1963 assert(gData->stateStackTop);
1964 memcpy(result + 1, gData->stateStack,
1965 sizeof(REProgState) * result->saveStateStackTop);
1967 if (parenCount != 0) {
1968 memcpy((char *)(result + 1) +
1969 sizeof(REProgState) * result->saveStateStackTop,
1970 &x->parens[parenIndex],
1971 sizeof(RECapture) * parenCount);
1972 for (i = 0; i != parenCount; i++)
1973 x->parens[parenIndex + i].index = -1;
1976 return result;
1979 static inline match_state_t *
1980 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1981 size_t length)
1983 size_t i;
1984 assert(gData->cpend >= x->cp);
1985 if (length > (size_t)(gData->cpend - x->cp))
1986 return NULL;
1987 for (i = 0; i != length; i++) {
1988 if (towupper(matchChars[i]) != towupper(x->cp[i]))
1989 return NULL;
1991 x->cp += length;
1992 return x;
1996 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1997 * 2. If E is not a character then go to step 6.
1998 * 3. Let ch be E's character.
1999 * 4. Let A be a one-element RECharSet containing the character ch.
2000 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2001 * 6. E must be an integer. Let n be that integer.
2002 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2003 * 8. Return an internal Matcher closure that takes two arguments, a State x
2004 * and a Continuation c, and performs the following:
2005 * 1. Let cap be x's captures internal array.
2006 * 2. Let s be cap[n].
2007 * 3. If s is undefined, then call c(x) and return its result.
2008 * 4. Let e be x's endIndex.
2009 * 5. Let len be s's length.
2010 * 6. Let f be e+len.
2011 * 7. If f>InputLength, return failure.
2012 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2013 * such that Canonicalize(s[i]) is not the same character as
2014 * Canonicalize(Input [e+i]), then return failure.
2015 * 9. Let y be the State (f, cap).
2016 * 10. Call c(y) and return its result.
2018 static match_state_t *
2019 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2021 size_t len, i;
2022 const WCHAR *parenContent;
2023 RECapture *cap = &x->parens[parenIndex];
2025 if (cap->index == -1)
2026 return x;
2028 len = cap->length;
2029 if (x->cp + len > gData->cpend)
2030 return NULL;
2032 parenContent = &gData->cpbegin[cap->index];
2033 if (gData->regexp->flags & REG_FOLD) {
2034 for (i = 0; i < len; i++) {
2035 if (towupper(parenContent[i]) != towupper(x->cp[i]))
2036 return NULL;
2038 } else {
2039 for (i = 0; i < len; i++) {
2040 if (parenContent[i] != x->cp[i])
2041 return NULL;
2044 x->cp += len;
2045 return x;
2048 /* Add a single character to the RECharSet */
2049 static void
2050 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2052 UINT byteIndex = (UINT)(c >> 3);
2053 assert(c <= cs->length);
2054 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2058 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2059 static void
2060 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2062 UINT i;
2064 UINT byteIndex1 = c1 >> 3;
2065 UINT byteIndex2 = c2 >> 3;
2067 assert(c2 <= cs->length && c1 <= c2);
2069 c1 &= 0x7;
2070 c2 &= 0x7;
2072 if (byteIndex1 == byteIndex2) {
2073 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2074 } else {
2075 cs->u.bits[byteIndex1] |= 0xFF << c1;
2076 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2077 cs->u.bits[i] = 0xFF;
2078 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2082 /* Compile the source of the class into a RECharSet */
2083 static BOOL
2084 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2086 const WCHAR *src, *end;
2087 BOOL inRange = FALSE;
2088 WCHAR rangeStart = 0;
2089 UINT byteLength, n;
2090 WCHAR c, thisCh;
2091 INT nDigits, i;
2093 assert(!charSet->converted);
2095 * Assert that startIndex and length points to chars inside [] inside
2096 * source string.
2098 assert(1 <= charSet->u.src.startIndex);
2099 assert(charSet->u.src.startIndex < gData->regexp->source_len);
2100 assert(charSet->u.src.length <= gData->regexp->source_len
2101 - 1 - charSet->u.src.startIndex);
2103 charSet->converted = TRUE;
2104 src = gData->regexp->source + charSet->u.src.startIndex;
2106 end = src + charSet->u.src.length;
2108 assert(src[-1] == '[' && end[0] == ']');
2110 byteLength = (charSet->length >> 3) + 1;
2111 charSet->u.bits = malloc(byteLength);
2112 if (!charSet->u.bits) {
2113 throw_error(gData->cx, E_OUTOFMEMORY, L"");
2114 gData->ok = FALSE;
2115 return FALSE;
2117 memset(charSet->u.bits, 0, byteLength);
2119 if (src == end)
2120 return TRUE;
2122 if (*src == '^') {
2123 assert(charSet->sense == FALSE);
2124 ++src;
2125 } else {
2126 assert(charSet->sense == TRUE);
2129 while (src != end) {
2130 switch (*src) {
2131 case '\\':
2132 ++src;
2133 c = *src++;
2134 switch (c) {
2135 case 'b':
2136 thisCh = 0x8;
2137 break;
2138 case 'f':
2139 thisCh = 0xC;
2140 break;
2141 case 'n':
2142 thisCh = 0xA;
2143 break;
2144 case 'r':
2145 thisCh = 0xD;
2146 break;
2147 case 't':
2148 thisCh = 0x9;
2149 break;
2150 case 'v':
2151 thisCh = 0xB;
2152 break;
2153 case 'c':
2154 if (src < end && JS_ISWORD(*src)) {
2155 thisCh = (WCHAR)(*src++ & 0x1F);
2156 } else {
2157 --src;
2158 thisCh = '\\';
2160 break;
2161 case 'x':
2162 nDigits = 2;
2163 goto lexHex;
2164 case 'u':
2165 nDigits = 4;
2166 lexHex:
2167 n = 0;
2168 for (i = 0; (i < nDigits) && (src < end); i++) {
2169 UINT digit;
2170 c = *src++;
2171 if (!isASCIIHexDigit(c, &digit)) {
2173 * Back off to accepting the original '\'
2174 * as a literal
2176 src -= i + 1;
2177 n = '\\';
2178 break;
2180 n = (n << 4) | digit;
2182 thisCh = (WCHAR)n;
2183 break;
2184 case '0':
2185 case '1':
2186 case '2':
2187 case '3':
2188 case '4':
2189 case '5':
2190 case '6':
2191 case '7':
2193 * This is a non-ECMA extension - decimal escapes (in this
2194 * case, octal!) are supposed to be an error inside class
2195 * ranges, but supported here for backwards compatibility.
2197 n = JS7_UNDEC(c);
2198 c = *src;
2199 if ('0' <= c && c <= '7') {
2200 src++;
2201 n = 8 * n + JS7_UNDEC(c);
2202 c = *src;
2203 if ('0' <= c && c <= '7') {
2204 src++;
2205 i = 8 * n + JS7_UNDEC(c);
2206 if (i <= 0377)
2207 n = i;
2208 else
2209 src--;
2212 thisCh = (WCHAR)n;
2213 break;
2215 case 'd':
2216 AddCharacterRangeToCharSet(charSet, '0', '9');
2217 continue; /* don't need range processing */
2218 case 'D':
2219 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2220 AddCharacterRangeToCharSet(charSet,
2221 (WCHAR)('9' + 1),
2222 (WCHAR)charSet->length);
2223 continue;
2224 case 's':
2225 for (i = (INT)charSet->length; i >= 0; i--)
2226 if (iswspace(i))
2227 AddCharacterToCharSet(charSet, (WCHAR)i);
2228 continue;
2229 case 'S':
2230 for (i = (INT)charSet->length; i >= 0; i--)
2231 if (!iswspace(i))
2232 AddCharacterToCharSet(charSet, (WCHAR)i);
2233 continue;
2234 case 'w':
2235 for (i = (INT)charSet->length; i >= 0; i--)
2236 if (JS_ISWORD(i))
2237 AddCharacterToCharSet(charSet, (WCHAR)i);
2238 continue;
2239 case 'W':
2240 for (i = (INT)charSet->length; i >= 0; i--)
2241 if (!JS_ISWORD(i))
2242 AddCharacterToCharSet(charSet, (WCHAR)i);
2243 continue;
2244 default:
2245 thisCh = c;
2246 break;
2249 break;
2251 default:
2252 thisCh = *src++;
2253 break;
2256 if (inRange) {
2257 if (gData->regexp->flags & REG_FOLD) {
2258 assert(rangeStart <= thisCh);
2259 for (i = rangeStart; i <= thisCh; i++) {
2260 WCHAR uch, dch;
2262 AddCharacterToCharSet(charSet, i);
2263 uch = towupper(i);
2264 dch = towlower(i);
2265 if (i != uch)
2266 AddCharacterToCharSet(charSet, uch);
2267 if (i != dch)
2268 AddCharacterToCharSet(charSet, dch);
2270 } else {
2271 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2273 inRange = FALSE;
2274 } else {
2275 if (gData->regexp->flags & REG_FOLD) {
2276 AddCharacterToCharSet(charSet, towupper(thisCh));
2277 AddCharacterToCharSet(charSet, towlower(thisCh));
2278 } else {
2279 AddCharacterToCharSet(charSet, thisCh);
2281 if (src < end - 1) {
2282 if (*src == '-') {
2283 ++src;
2284 inRange = TRUE;
2285 rangeStart = thisCh;
2290 return TRUE;
2293 static BOOL
2294 ReallocStateStack(REGlobalData *gData)
2296 size_t limit = gData->stateStackLimit;
2297 size_t sz = sizeof(REProgState) * limit;
2299 gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2300 if (!gData->stateStack) {
2301 throw_error(gData->cx, E_OUTOFMEMORY, L"");
2302 gData->ok = FALSE;
2303 return FALSE;
2305 gData->stateStackLimit = limit + limit;
2306 return TRUE;
2309 #define PUSH_STATE_STACK(data) \
2310 do { \
2311 ++(data)->stateStackTop; \
2312 if ((data)->stateStackTop == (data)->stateStackLimit && \
2313 !ReallocStateStack((data))) { \
2314 return NULL; \
2316 }while(0)
2319 * Apply the current op against the given input to see if it's going to match
2320 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2321 * true, then update the current state's cp. Always update startpc to the next
2322 * op.
2324 static inline match_state_t *
2325 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2326 jsbytecode **startpc, BOOL updatecp)
2328 match_state_t *result = NULL;
2329 WCHAR matchCh;
2330 size_t parenIndex;
2331 size_t offset, length, index;
2332 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2333 const WCHAR *source;
2334 const WCHAR *startcp = x->cp;
2335 WCHAR ch;
2336 RECharSet *charSet;
2338 const char *opname = reop_names[op];
2339 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2340 (int)gData->stateStackTop * 2, "", opname);
2342 switch (op) {
2343 case REOP_EMPTY:
2344 result = x;
2345 break;
2346 case REOP_BOL:
2347 if (x->cp != gData->cpbegin) {
2348 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2349 !(gData->regexp->flags & REG_MULTILINE)) {
2350 break;
2352 if (!RE_IS_LINE_TERM(x->cp[-1]))
2353 break;
2355 result = x;
2356 break;
2357 case REOP_EOL:
2358 if (x->cp != gData->cpend) {
2359 if (/*!gData->cx->regExpStatics.multiline &&*/
2360 !(gData->regexp->flags & REG_MULTILINE)) {
2361 break;
2363 if (!RE_IS_LINE_TERM(*x->cp))
2364 break;
2366 result = x;
2367 break;
2368 case REOP_WBDRY:
2369 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2370 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2371 result = x;
2373 break;
2374 case REOP_WNONBDRY:
2375 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2376 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2377 result = x;
2379 break;
2380 case REOP_DOT:
2381 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2382 result = x;
2383 result->cp++;
2385 break;
2386 case REOP_DIGIT:
2387 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2388 result = x;
2389 result->cp++;
2391 break;
2392 case REOP_NONDIGIT:
2393 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2394 result = x;
2395 result->cp++;
2397 break;
2398 case REOP_ALNUM:
2399 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2400 result = x;
2401 result->cp++;
2403 break;
2404 case REOP_NONALNUM:
2405 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2406 result = x;
2407 result->cp++;
2409 break;
2410 case REOP_SPACE:
2411 if (x->cp != gData->cpend && iswspace(*x->cp)) {
2412 result = x;
2413 result->cp++;
2415 break;
2416 case REOP_NONSPACE:
2417 if (x->cp != gData->cpend && !iswspace(*x->cp)) {
2418 result = x;
2419 result->cp++;
2421 break;
2422 case REOP_BACKREF:
2423 pc = ReadCompactIndex(pc, &parenIndex);
2424 assert(parenIndex < gData->regexp->parenCount);
2425 result = BackrefMatcher(gData, x, parenIndex);
2426 break;
2427 case REOP_FLAT:
2428 pc = ReadCompactIndex(pc, &offset);
2429 assert(offset < gData->regexp->source_len);
2430 pc = ReadCompactIndex(pc, &length);
2431 assert(1 <= length);
2432 assert(length <= gData->regexp->source_len - offset);
2433 if (length <= (size_t)(gData->cpend - x->cp)) {
2434 source = gData->regexp->source + offset;
2435 TRACE("%s\n", debugstr_wn(source, length));
2436 for (index = 0; index != length; index++) {
2437 if (source[index] != x->cp[index])
2438 return NULL;
2440 x->cp += length;
2441 result = x;
2443 break;
2444 case REOP_FLAT1:
2445 matchCh = *pc++;
2446 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2447 if (x->cp != gData->cpend && *x->cp == matchCh) {
2448 result = x;
2449 result->cp++;
2451 break;
2452 case REOP_FLATi:
2453 pc = ReadCompactIndex(pc, &offset);
2454 assert(offset < gData->regexp->source_len);
2455 pc = ReadCompactIndex(pc, &length);
2456 assert(1 <= length);
2457 assert(length <= gData->regexp->source_len - offset);
2458 source = gData->regexp->source;
2459 result = FlatNIMatcher(gData, x, source + offset, length);
2460 break;
2461 case REOP_FLAT1i:
2462 matchCh = *pc++;
2463 if (x->cp != gData->cpend && towupper(*x->cp) == towupper(matchCh)) {
2464 result = x;
2465 result->cp++;
2467 break;
2468 case REOP_UCFLAT1:
2469 matchCh = GET_ARG(pc);
2470 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2471 pc += ARG_LEN;
2472 if (x->cp != gData->cpend && *x->cp == matchCh) {
2473 result = x;
2474 result->cp++;
2476 break;
2477 case REOP_UCFLAT1i:
2478 matchCh = GET_ARG(pc);
2479 pc += ARG_LEN;
2480 if (x->cp != gData->cpend && towupper(*x->cp) == towupper(matchCh)) {
2481 result = x;
2482 result->cp++;
2484 break;
2485 case REOP_CLASS:
2486 pc = ReadCompactIndex(pc, &index);
2487 assert(index < gData->regexp->classCount);
2488 if (x->cp != gData->cpend) {
2489 charSet = &gData->regexp->classList[index];
2490 assert(charSet->converted);
2491 ch = *x->cp;
2492 index = ch >> 3;
2493 if (charSet->length != 0 &&
2494 ch <= charSet->length &&
2495 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2496 result = x;
2497 result->cp++;
2500 break;
2501 case REOP_NCLASS:
2502 pc = ReadCompactIndex(pc, &index);
2503 assert(index < gData->regexp->classCount);
2504 if (x->cp != gData->cpend) {
2505 charSet = &gData->regexp->classList[index];
2506 assert(charSet->converted);
2507 ch = *x->cp;
2508 index = ch >> 3;
2509 if (charSet->length == 0 ||
2510 ch > charSet->length ||
2511 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2512 result = x;
2513 result->cp++;
2516 break;
2518 default:
2519 assert(FALSE);
2521 if (result) {
2522 if (!updatecp)
2523 x->cp = startcp;
2524 *startpc = pc;
2525 TRACE(" *\n");
2526 return result;
2528 x->cp = startcp;
2529 return NULL;
2532 static inline match_state_t *
2533 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2535 match_state_t *result = NULL;
2536 REBackTrackData *backTrackData;
2537 jsbytecode *nextpc, *testpc;
2538 REOp nextop;
2539 RECapture *cap;
2540 REProgState *curState;
2541 const WCHAR *startcp;
2542 size_t parenIndex, k;
2543 size_t parenSoFar = 0;
2545 WCHAR matchCh1, matchCh2;
2546 RECharSet *charSet;
2548 BOOL anchor;
2549 jsbytecode *pc = gData->regexp->program;
2550 REOp op = (REOp) *pc++;
2553 * If the first node is a simple match, step the index into the string
2554 * until that match is made, or fail if it can't be found at all.
2556 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2557 anchor = FALSE;
2558 while (x->cp <= gData->cpend) {
2559 nextpc = pc; /* reset back to start each time */
2560 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2561 if (result) {
2562 anchor = TRUE;
2563 x = result;
2564 pc = nextpc; /* accept skip to next opcode */
2565 op = (REOp) *pc++;
2566 assert(op < REOP_LIMIT);
2567 break;
2569 gData->skipped++;
2570 x->cp++;
2572 if (!anchor)
2573 goto bad;
2576 for (;;) {
2577 const char *opname = reop_names[op];
2578 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2579 (int)gData->stateStackTop * 2, "", opname);
2581 if (REOP_IS_SIMPLE(op)) {
2582 result = SimpleMatch(gData, x, op, &pc, TRUE);
2583 } else {
2584 curState = &gData->stateStack[gData->stateStackTop];
2585 switch (op) {
2586 case REOP_END:
2587 goto good;
2588 case REOP_ALTPREREQ2:
2589 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2590 pc += ARG_LEN;
2591 matchCh2 = GET_ARG(pc);
2592 pc += ARG_LEN;
2593 k = GET_ARG(pc);
2594 pc += ARG_LEN;
2596 if (x->cp != gData->cpend) {
2597 if (*x->cp == matchCh2)
2598 goto doAlt;
2600 charSet = &gData->regexp->classList[k];
2601 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2602 goto bad;
2603 matchCh1 = *x->cp;
2604 k = matchCh1 >> 3;
2605 if ((charSet->length == 0 ||
2606 matchCh1 > charSet->length ||
2607 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2608 charSet->sense) {
2609 goto doAlt;
2612 result = NULL;
2613 break;
2615 case REOP_ALTPREREQ:
2616 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2617 pc += ARG_LEN;
2618 matchCh1 = GET_ARG(pc);
2619 pc += ARG_LEN;
2620 matchCh2 = GET_ARG(pc);
2621 pc += ARG_LEN;
2622 if (x->cp == gData->cpend ||
2623 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2624 result = NULL;
2625 break;
2627 /* else fall through... */
2629 case REOP_ALT:
2630 doAlt:
2631 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2632 pc += ARG_LEN; /* start of this alternate */
2633 curState->parenSoFar = parenSoFar;
2634 PUSH_STATE_STACK(gData);
2635 op = (REOp) *pc++;
2636 startcp = x->cp;
2637 if (REOP_IS_SIMPLE(op)) {
2638 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2639 op = (REOp) *nextpc++;
2640 pc = nextpc;
2641 continue;
2643 result = x;
2644 op = (REOp) *pc++;
2646 nextop = (REOp) *nextpc++;
2647 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2648 goto bad;
2649 continue;
2652 * Occurs at (successful) end of REOP_ALT,
2654 case REOP_JUMP:
2656 * If we have not gotten a result here, it is because of an
2657 * empty match. Do the same thing REOP_EMPTY would do.
2659 if (!result)
2660 result = x;
2662 --gData->stateStackTop;
2663 pc += GET_OFFSET(pc);
2664 op = (REOp) *pc++;
2665 continue;
2668 * Occurs at last (successful) end of REOP_ALT,
2670 case REOP_ENDALT:
2672 * If we have not gotten a result here, it is because of an
2673 * empty match. Do the same thing REOP_EMPTY would do.
2675 if (!result)
2676 result = x;
2678 --gData->stateStackTop;
2679 op = (REOp) *pc++;
2680 continue;
2682 case REOP_LPAREN:
2683 pc = ReadCompactIndex(pc, &parenIndex);
2684 TRACE("[ %Iu ]\n", (ULONG_PTR)parenIndex);
2685 assert(parenIndex < gData->regexp->parenCount);
2686 if (parenIndex + 1 > parenSoFar)
2687 parenSoFar = parenIndex + 1;
2688 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2689 x->parens[parenIndex].length = 0;
2690 op = (REOp) *pc++;
2691 continue;
2693 case REOP_RPAREN:
2695 ptrdiff_t delta;
2697 pc = ReadCompactIndex(pc, &parenIndex);
2698 assert(parenIndex < gData->regexp->parenCount);
2699 cap = &x->parens[parenIndex];
2700 delta = x->cp - (gData->cpbegin + cap->index);
2701 cap->length = (delta < 0) ? 0 : (size_t) delta;
2702 op = (REOp) *pc++;
2704 if (!result)
2705 result = x;
2706 continue;
2708 case REOP_ASSERT:
2709 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2710 pc += ARG_LEN; /* start of ASSERT child */
2711 op = (REOp) *pc++;
2712 testpc = pc;
2713 if (REOP_IS_SIMPLE(op) &&
2714 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2715 result = NULL;
2716 break;
2718 curState->u.assertion.top =
2719 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2720 curState->u.assertion.sz = gData->cursz;
2721 curState->index = x->cp - gData->cpbegin;
2722 curState->parenSoFar = parenSoFar;
2723 PUSH_STATE_STACK(gData);
2724 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2725 nextpc, x, x->cp, 0, 0)) {
2726 goto bad;
2728 continue;
2730 case REOP_ASSERT_NOT:
2731 nextpc = pc + GET_OFFSET(pc);
2732 pc += ARG_LEN;
2733 op = (REOp) *pc++;
2734 testpc = pc;
2735 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2736 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2737 *testpc == REOP_ASSERTNOTTEST) {
2738 result = NULL;
2739 break;
2741 curState->u.assertion.top
2742 = (char *)gData->backTrackSP -
2743 (char *)gData->backTrackStack;
2744 curState->u.assertion.sz = gData->cursz;
2745 curState->index = x->cp - gData->cpbegin;
2746 curState->parenSoFar = parenSoFar;
2747 PUSH_STATE_STACK(gData);
2748 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2749 nextpc, x, x->cp, 0, 0)) {
2750 goto bad;
2752 continue;
2754 case REOP_ASSERTTEST:
2755 --gData->stateStackTop;
2756 --curState;
2757 x->cp = gData->cpbegin + curState->index;
2758 gData->backTrackSP =
2759 (REBackTrackData *) ((char *)gData->backTrackStack +
2760 curState->u.assertion.top);
2761 gData->cursz = curState->u.assertion.sz;
2762 if (result)
2763 result = x;
2764 break;
2766 case REOP_ASSERTNOTTEST:
2767 --gData->stateStackTop;
2768 --curState;
2769 x->cp = gData->cpbegin + curState->index;
2770 gData->backTrackSP =
2771 (REBackTrackData *) ((char *)gData->backTrackStack +
2772 curState->u.assertion.top);
2773 gData->cursz = curState->u.assertion.sz;
2774 result = (!result) ? x : NULL;
2775 break;
2776 case REOP_STAR:
2777 curState->u.quantifier.min = 0;
2778 curState->u.quantifier.max = (UINT)-1;
2779 goto quantcommon;
2780 case REOP_PLUS:
2781 curState->u.quantifier.min = 1;
2782 curState->u.quantifier.max = (UINT)-1;
2783 goto quantcommon;
2784 case REOP_OPT:
2785 curState->u.quantifier.min = 0;
2786 curState->u.quantifier.max = 1;
2787 goto quantcommon;
2788 case REOP_QUANT:
2789 pc = ReadCompactIndex(pc, &k);
2790 curState->u.quantifier.min = k;
2791 pc = ReadCompactIndex(pc, &k);
2792 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2793 curState->u.quantifier.max = k - 1;
2794 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2795 quantcommon:
2796 if (curState->u.quantifier.max == 0) {
2797 pc = pc + GET_OFFSET(pc);
2798 op = (REOp) *pc++;
2799 result = x;
2800 continue;
2802 /* Step over <next> */
2803 nextpc = pc + ARG_LEN;
2804 op = (REOp) *nextpc++;
2805 startcp = x->cp;
2806 if (REOP_IS_SIMPLE(op)) {
2807 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2808 if (curState->u.quantifier.min == 0)
2809 result = x;
2810 else
2811 result = NULL;
2812 pc = pc + GET_OFFSET(pc);
2813 break;
2815 op = (REOp) *nextpc++;
2816 result = x;
2818 curState->index = startcp - gData->cpbegin;
2819 curState->continue_op = REOP_REPEAT;
2820 curState->continue_pc = pc;
2821 curState->parenSoFar = parenSoFar;
2822 PUSH_STATE_STACK(gData);
2823 if (curState->u.quantifier.min == 0 &&
2824 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2825 0, 0)) {
2826 goto bad;
2828 pc = nextpc;
2829 continue;
2831 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2832 pc = curState[-1].continue_pc;
2833 op = (REOp) curState[-1].continue_op;
2835 if (!result)
2836 result = x;
2837 continue;
2839 case REOP_REPEAT:
2840 --curState;
2841 do {
2842 --gData->stateStackTop;
2843 if (!result) {
2844 /* Failed, see if we have enough children. */
2845 if (curState->u.quantifier.min == 0)
2846 goto repeatDone;
2847 goto break_switch;
2849 if (curState->u.quantifier.min == 0 &&
2850 x->cp == gData->cpbegin + curState->index) {
2851 /* matched an empty string, that'll get us nowhere */
2852 result = NULL;
2853 goto break_switch;
2855 if (curState->u.quantifier.min != 0)
2856 curState->u.quantifier.min--;
2857 if (curState->u.quantifier.max != (UINT) -1)
2858 curState->u.quantifier.max--;
2859 if (curState->u.quantifier.max == 0)
2860 goto repeatDone;
2861 nextpc = pc + ARG_LEN;
2862 nextop = (REOp) *nextpc;
2863 startcp = x->cp;
2864 if (REOP_IS_SIMPLE(nextop)) {
2865 nextpc++;
2866 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2867 if (curState->u.quantifier.min == 0)
2868 goto repeatDone;
2869 result = NULL;
2870 goto break_switch;
2872 result = x;
2874 curState->index = startcp - gData->cpbegin;
2875 PUSH_STATE_STACK(gData);
2876 if (curState->u.quantifier.min == 0 &&
2877 !PushBackTrackState(gData, REOP_REPEAT,
2878 pc, x, startcp,
2879 curState->parenSoFar,
2880 parenSoFar -
2881 curState->parenSoFar)) {
2882 goto bad;
2884 } while (*nextpc == REOP_ENDCHILD);
2885 pc = nextpc;
2886 op = (REOp) *pc++;
2887 parenSoFar = curState->parenSoFar;
2888 continue;
2890 repeatDone:
2891 result = x;
2892 pc += GET_OFFSET(pc);
2893 goto break_switch;
2895 case REOP_MINIMALSTAR:
2896 curState->u.quantifier.min = 0;
2897 curState->u.quantifier.max = (UINT)-1;
2898 goto minimalquantcommon;
2899 case REOP_MINIMALPLUS:
2900 curState->u.quantifier.min = 1;
2901 curState->u.quantifier.max = (UINT)-1;
2902 goto minimalquantcommon;
2903 case REOP_MINIMALOPT:
2904 curState->u.quantifier.min = 0;
2905 curState->u.quantifier.max = 1;
2906 goto minimalquantcommon;
2907 case REOP_MINIMALQUANT:
2908 pc = ReadCompactIndex(pc, &k);
2909 curState->u.quantifier.min = k;
2910 pc = ReadCompactIndex(pc, &k);
2911 /* See REOP_QUANT comments about k - 1. */
2912 curState->u.quantifier.max = k - 1;
2913 assert(curState->u.quantifier.min
2914 <= curState->u.quantifier.max);
2915 minimalquantcommon:
2916 curState->index = x->cp - gData->cpbegin;
2917 curState->parenSoFar = parenSoFar;
2918 PUSH_STATE_STACK(gData);
2919 if (curState->u.quantifier.min != 0) {
2920 curState->continue_op = REOP_MINIMALREPEAT;
2921 curState->continue_pc = pc;
2922 /* step over <next> */
2923 pc += OFFSET_LEN;
2924 op = (REOp) *pc++;
2925 } else {
2926 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2927 pc, x, x->cp, 0, 0)) {
2928 goto bad;
2930 --gData->stateStackTop;
2931 pc = pc + GET_OFFSET(pc);
2932 op = (REOp) *pc++;
2934 continue;
2936 case REOP_MINIMALREPEAT:
2937 --gData->stateStackTop;
2938 --curState;
2940 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2941 #define PREPARE_REPEAT() \
2942 do { \
2943 curState->index = x->cp - gData->cpbegin; \
2944 curState->continue_op = REOP_MINIMALREPEAT; \
2945 curState->continue_pc = pc; \
2946 pc += ARG_LEN; \
2947 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2948 x->parens[k].index = -1; \
2949 PUSH_STATE_STACK(gData); \
2950 op = (REOp) *pc++; \
2951 assert(op < REOP_LIMIT); \
2952 }while(0)
2954 if (!result) {
2955 TRACE(" -\n");
2957 * Non-greedy failure - try to consume another child.
2959 if (curState->u.quantifier.max == (UINT) -1 ||
2960 curState->u.quantifier.max > 0) {
2961 PREPARE_REPEAT();
2962 continue;
2964 /* Don't need to adjust pc since we're going to pop. */
2965 break;
2967 if (curState->u.quantifier.min == 0 &&
2968 x->cp == gData->cpbegin + curState->index) {
2969 /* Matched an empty string, that'll get us nowhere. */
2970 result = NULL;
2971 break;
2973 if (curState->u.quantifier.min != 0)
2974 curState->u.quantifier.min--;
2975 if (curState->u.quantifier.max != (UINT) -1)
2976 curState->u.quantifier.max--;
2977 if (curState->u.quantifier.min != 0) {
2978 PREPARE_REPEAT();
2979 continue;
2981 curState->index = x->cp - gData->cpbegin;
2982 curState->parenSoFar = parenSoFar;
2983 PUSH_STATE_STACK(gData);
2984 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2985 pc, x, x->cp,
2986 curState->parenSoFar,
2987 parenSoFar - curState->parenSoFar)) {
2988 goto bad;
2990 --gData->stateStackTop;
2991 pc = pc + GET_OFFSET(pc);
2992 op = (REOp) *pc++;
2993 assert(op < REOP_LIMIT);
2994 continue;
2995 default:
2996 assert(FALSE);
2997 result = NULL;
2999 break_switch:;
3003 * If the match failed and there's a backtrack option, take it.
3004 * Otherwise this is a complete and utter failure.
3006 if (!result) {
3007 if (gData->cursz == 0)
3008 return NULL;
3010 /* Potentially detect explosive regex here. */
3011 gData->backTrackCount++;
3012 if (gData->backTrackLimit &&
3013 gData->backTrackCount >= gData->backTrackLimit) {
3014 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3015 JSMSG_REGEXP_TOO_COMPLEX);
3016 gData->ok = FALSE;
3017 return NULL;
3020 backTrackData = gData->backTrackSP;
3021 gData->cursz = backTrackData->sz;
3022 gData->backTrackSP =
3023 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3024 x->cp = backTrackData->cp;
3025 pc = backTrackData->backtrack_pc;
3026 op = (REOp) backTrackData->backtrack_op;
3027 assert(op < REOP_LIMIT);
3028 gData->stateStackTop = backTrackData->saveStateStackTop;
3029 assert(gData->stateStackTop);
3031 memcpy(gData->stateStack, backTrackData + 1,
3032 sizeof(REProgState) * backTrackData->saveStateStackTop);
3033 curState = &gData->stateStack[gData->stateStackTop - 1];
3035 if (backTrackData->parenCount) {
3036 memcpy(&x->parens[backTrackData->parenIndex],
3037 (char *)(backTrackData + 1) +
3038 sizeof(REProgState) * backTrackData->saveStateStackTop,
3039 sizeof(RECapture) * backTrackData->parenCount);
3040 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3041 } else {
3042 for (k = curState->parenSoFar; k < parenSoFar; k++)
3043 x->parens[k].index = -1;
3044 parenSoFar = curState->parenSoFar;
3047 TRACE("\tBT_Pop: %Id,%Id\n",
3048 (ULONG_PTR)backTrackData->parenIndex,
3049 (ULONG_PTR)backTrackData->parenCount);
3050 continue;
3052 x = result;
3055 * Continue with the expression.
3057 op = (REOp)*pc++;
3058 assert(op < REOP_LIMIT);
3061 bad:
3062 TRACE("\n");
3063 return NULL;
3065 good:
3066 TRACE("\n");
3067 return x;
3070 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3072 match_state_t *result;
3073 const WCHAR *cp = x->cp;
3074 const WCHAR *cp2;
3075 UINT j;
3078 * Have to include the position beyond the last character
3079 * in order to detect end-of-input/line condition.
3081 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3082 gData->skipped = cp2 - cp;
3083 x->cp = cp2;
3084 for (j = 0; j < gData->regexp->parenCount; j++)
3085 x->parens[j].index = -1;
3086 result = ExecuteREBytecode(gData, x);
3087 if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3088 return result;
3089 gData->backTrackSP = gData->backTrackStack;
3090 gData->cursz = 0;
3091 gData->stateStackTop = 0;
3092 cp2 = cp + gData->skipped;
3094 return NULL;
3097 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3099 UINT i;
3101 gData->backTrackStackSize = INITIAL_BACKTRACK;
3102 gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3103 if (!gData->backTrackStack)
3104 goto bad;
3106 gData->backTrackSP = gData->backTrackStack;
3107 gData->cursz = 0;
3108 gData->backTrackCount = 0;
3109 gData->backTrackLimit = 0;
3111 gData->stateStackLimit = INITIAL_STATESTACK;
3112 gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3113 if (!gData->stateStack)
3114 goto bad;
3116 gData->stateStackTop = 0;
3117 gData->cx = cx;
3118 gData->pool = pool;
3119 gData->regexp = re;
3120 gData->ok = TRUE;
3122 for (i = 0; i < re->classCount; i++) {
3123 if (!re->classList[i].converted &&
3124 !ProcessCharSet(gData, &re->classList[i])) {
3125 return E_FAIL;
3129 return S_OK;
3131 bad:
3132 gData->ok = FALSE;
3133 return E_OUTOFMEMORY;
3136 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3137 const WCHAR *str, DWORD str_len, match_state_t *result)
3139 match_state_t *res;
3140 REGlobalData gData;
3141 heap_pool_t *mark = heap_pool_mark(pool);
3142 const WCHAR *str_beg = result->cp;
3143 HRESULT hres;
3145 assert(result->cp != NULL);
3147 gData.cpbegin = str;
3148 gData.cpend = str+str_len;
3149 gData.start = result->cp-str;
3150 gData.skipped = 0;
3151 gData.pool = pool;
3153 hres = InitMatch(regexp, cx, pool, &gData);
3154 if(FAILED(hres)) {
3155 WARN("InitMatch failed\n");
3156 heap_pool_clear(mark);
3157 return hres;
3160 res = MatchRegExp(&gData, result);
3161 heap_pool_clear(mark);
3162 if(!gData.ok) {
3163 WARN("MatchRegExp failed\n");
3164 return E_FAIL;
3167 if(!res) {
3168 result->match_len = 0;
3169 return S_FALSE;
3172 result->match_len = (result->cp-str_beg) - gData.skipped;
3173 result->paren_count = regexp->parenCount;
3174 return S_OK;
3177 void regexp_destroy(regexp_t *re)
3179 if (re->classList) {
3180 UINT i;
3181 for (i = 0; i < re->classCount; i++) {
3182 if (re->classList[i].converted)
3183 free(re->classList[i].u.bits);
3184 re->classList[i].u.bits = NULL;
3186 free(re->classList);
3188 free(re);
3191 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3192 DWORD str_len, WORD flags, BOOL flat)
3194 regexp_t *re;
3195 heap_pool_t *mark;
3196 CompilerState state;
3197 size_t resize;
3198 jsbytecode *endPC;
3199 UINT i;
3200 size_t len;
3202 re = NULL;
3203 mark = heap_pool_mark(pool);
3204 len = str_len;
3206 state.context = cx;
3207 state.pool = pool;
3208 state.cp = str;
3209 if (!state.cp)
3210 goto out;
3211 state.cpbegin = state.cp;
3212 state.cpend = state.cp + len;
3213 state.flags = flags;
3214 state.parenCount = 0;
3215 state.classCount = 0;
3216 state.progLength = 0;
3217 state.treeDepth = 0;
3218 state.classBitmapsMem = 0;
3219 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3220 state.classCache[i].start = NULL;
3222 if (len != 0 && flat) {
3223 state.result = NewRENode(&state, REOP_FLAT);
3224 if (!state.result)
3225 goto out;
3226 state.result->u.flat.chr = *state.cpbegin;
3227 state.result->u.flat.length = len;
3228 state.result->kid = (void *) state.cpbegin;
3229 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3230 state.progLength += 1 + GetCompactIndexWidth(0)
3231 + GetCompactIndexWidth(len);
3232 } else {
3233 if (!ParseRegExp(&state))
3234 goto out;
3236 resize = offsetof(regexp_t, program) + state.progLength + 1;
3237 re = malloc(resize);
3238 if (!re)
3239 goto out;
3241 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3242 re->classCount = state.classCount;
3243 if (re->classCount) {
3244 re->classList = malloc(re->classCount * sizeof(RECharSet));
3245 if (!re->classList) {
3246 regexp_destroy(re);
3247 re = NULL;
3248 goto out;
3250 for (i = 0; i < re->classCount; i++)
3251 re->classList[i].converted = FALSE;
3252 } else {
3253 re->classList = NULL;
3255 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3256 if (!endPC) {
3257 regexp_destroy(re);
3258 re = NULL;
3259 goto out;
3261 *endPC++ = REOP_END;
3263 * Check whether size was overestimated and shrink using realloc.
3264 * This is safe since no pointers to newly parsed regexp or its parts
3265 * besides re exist here.
3267 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3268 regexp_t *tmp;
3269 assert((size_t)(endPC - re->program) < state.progLength + 1);
3270 resize = offsetof(regexp_t, program) + (endPC - re->program);
3271 tmp = realloc(re, resize);
3272 if (tmp)
3273 re = tmp;
3276 re->flags = flags;
3277 re->parenCount = state.parenCount;
3278 re->source = str;
3279 re->source_len = str_len;
3281 out:
3282 heap_pool_clear(mark);
3283 return re;