Bug 559408: Arena pool macros to methods. (r=gal)
[mozilla-central.git] / js / src / jsregexp.cpp
blobb6abe92c15c8a6a130ed57bebe9d6819f569e749
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set sw=4 ts=8 et tw=78:
4 * ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
15 * License.
17 * The Original Code is Mozilla Communicator client code, released
18 * March 31, 1998.
20 * The Initial Developer of the Original Code is
21 * Netscape Communications Corporation.
22 * Portions created by the Initial Developer are Copyright (C) 1998
23 * the Initial Developer. All Rights Reserved.
25 * Contributor(s):
27 * Alternatively, the contents of this file may be used under the terms of
28 * either of the GNU General Public License Version 2 or later (the "GPL"),
29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
39 * ***** END LICENSE BLOCK ***** */
42 * JS regular expressions, after Perl.
44 #include <stdlib.h>
45 #include <string.h>
46 #include <stdarg.h>
47 #include "jstypes.h"
48 #include "jsstdint.h"
49 #include "jsarena.h" /* Added by JSIFY */
50 #include "jsutil.h" /* Added by JSIFY */
51 #include "jsapi.h"
52 #include "jsarray.h"
53 #include "jsatom.h"
54 #include "jsbuiltins.h"
55 #include "jscntxt.h"
56 #include "jsversion.h"
57 #include "jsfun.h"
58 #include "jsgc.h"
59 #include "jsinterp.h"
60 #include "jslock.h"
61 #include "jsnum.h"
62 #include "jsobj.h"
63 #include "jsopcode.h"
64 #include "jsregexp.h"
65 #include "jsscan.h"
66 #include "jsscope.h"
67 #include "jsstaticcheck.h"
68 #include "jsstr.h"
69 #include "jsvector.h"
71 #ifdef JS_TRACER
72 #include "jstracer.h"
73 using namespace avmplus;
74 using namespace nanojit;
75 #endif
77 #include "jsobjinlines.h"
79 using namespace js;
81 typedef enum REOp {
82 #define REOP_DEF(opcode, name) opcode,
83 #include "jsreops.tbl"
84 #undef REOP_DEF
85 REOP_LIMIT /* META: no operator >= to this */
86 } REOp;
88 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
90 #ifdef REGEXP_DEBUG
91 const char *reop_names[] = {
92 #define REOP_DEF(opcode, name) name,
93 #include "jsreops.tbl"
94 #undef REOP_DEF
95 NULL
97 #endif
99 #ifdef __GNUC__
100 static int
101 re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2)));
102 #endif
104 #ifdef REGEXP_DEBUG
105 static int
106 re_debug(const char *fmt, ...)
108 va_list ap;
109 int retval;
111 va_start(ap, fmt);
112 retval = vprintf(fmt, ap);
113 va_end(ap);
114 return retval;
117 static void
118 re_debug_chars(const jschar *chrs, size_t length)
120 int i = 0;
122 printf(" \"");
123 while (*chrs && i++ < length) {
124 putchar((char)*chrs++);
126 printf("\"");
128 #else /* !REGEXP_DEBUG */
129 /* This should be optimized to a no-op by our tier-1 compilers. */
130 static int
131 re_debug(const char *fmt, ...)
133 return 0;
136 static void
137 re_debug_chars(const jschar *chrs, size_t length)
140 #endif /* !REGEXP_DEBUG */
142 struct RENode {
143 REOp op; /* r.e. op bytecode */
144 RENode *next; /* next in concatenation order */
145 void *kid; /* first operand */
146 union {
147 void *kid2; /* second operand */
148 jsint num; /* could be a number */
149 size_t parenIndex; /* or a parenthesis index */
150 struct { /* or a quantifier range */
151 uintN min;
152 uintN max;
153 JSPackedBool greedy;
154 } range;
155 struct { /* or a character class */
156 size_t startIndex;
157 size_t kidlen; /* length of string at kid, in jschars */
158 size_t index; /* index into class list */
159 uint16 bmsize; /* bitmap size, based on max char code */
160 JSPackedBool sense;
161 } ucclass;
162 struct { /* or a literal sequence */
163 jschar chr; /* of one character */
164 size_t length; /* or many (via the kid) */
165 } flat;
166 struct {
167 RENode *kid2; /* second operand from ALT */
168 jschar ch1; /* match char for ALTPREREQ */
169 jschar ch2; /* ditto, or class index for ALTPREREQ2 */
170 } altprereq;
171 } u;
174 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
175 ((c >= 'a') && (c <= 'z')) )
176 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
177 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
179 #define CLASS_CACHE_SIZE 4
181 typedef struct CompilerState {
182 JSContext *context;
183 TokenStream *tokenStream; /* For reporting errors */
184 const jschar *cpbegin;
185 const jschar *cpend;
186 const jschar *cp;
187 size_t parenCount;
188 size_t classCount; /* number of [] encountered */
189 size_t treeDepth; /* maximum depth of parse tree */
190 size_t progLength; /* estimated bytecode length */
191 RENode *result;
192 size_t classBitmapsMem; /* memory to hold all class bitmaps */
193 struct {
194 const jschar *start; /* small cache of class strings */
195 size_t length; /* since they're often the same */
196 size_t index;
197 } classCache[CLASS_CACHE_SIZE];
198 uint16 flags;
199 } CompilerState;
201 typedef struct EmitStateStackEntry {
202 jsbytecode *altHead; /* start of REOP_ALT* opcode */
203 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
204 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
205 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
206 RENode *continueNode; /* original REOP_ALT* node being stacked */
207 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
208 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
209 avoid 16-bit unsigned offset overflow */
210 } EmitStateStackEntry;
213 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
214 * the getters and setters take the pc of the offset, not of the opcode before
215 * the offset.
217 #define ARG_LEN 2
218 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))
219 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
220 (pc)[1] = (jsbytecode) (arg))
222 #define OFFSET_LEN ARG_LEN
223 #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)
224 #define GET_OFFSET(pc) GET_ARG(pc)
227 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
228 * For sanity, we limit it to 2^24 bytes.
230 #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))
233 * The maximum memory that can be allocated for class bitmaps.
234 * For sanity, we limit it to 2^24 bytes.
236 #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
239 * Functions to get size and write/read bytecode that represent small indexes
240 * compactly.
241 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
242 * indicates that the following byte brings more bits to the index. Otherwise
243 * this is the last byte in the index bytecode representing highest index bits.
245 static size_t
246 GetCompactIndexWidth(size_t index)
248 size_t width;
250 for (width = 1; (index >>= 7) != 0; ++width) { }
251 return width;
254 static JS_ALWAYS_INLINE jsbytecode *
255 WriteCompactIndex(jsbytecode *pc, size_t index)
257 size_t next;
259 while ((next = index >> 7) != 0) {
260 *pc++ = (jsbytecode)(index | 0x80);
261 index = next;
263 *pc++ = (jsbytecode)index;
264 return pc;
267 static JS_ALWAYS_INLINE jsbytecode *
268 ReadCompactIndex(jsbytecode *pc, size_t *result)
270 size_t nextByte;
272 nextByte = *pc++;
273 if ((nextByte & 0x80) == 0) {
275 * Short-circuit the most common case when compact index <= 127.
277 *result = nextByte;
278 } else {
279 size_t shift = 7;
280 *result = 0x7F & nextByte;
281 do {
282 nextByte = *pc++;
283 *result |= (nextByte & 0x7F) << shift;
284 shift += 7;
285 } while ((nextByte & 0x80) != 0);
287 return pc;
290 typedef struct RECapture {
291 ptrdiff_t index; /* start of contents, -1 for empty */
292 size_t length; /* length of capture */
293 } RECapture;
295 typedef struct REMatchState {
296 const jschar *cp;
297 RECapture parens[1]; /* first of 're->parenCount' captures,
298 allocated at end of this struct */
299 } REMatchState;
301 struct REBackTrackData;
303 typedef struct REProgState {
304 jsbytecode *continue_pc; /* current continuation data */
305 jsbytecode continue_op;
306 ptrdiff_t index; /* progress in text */
307 size_t parenSoFar; /* highest indexed paren started */
308 union {
309 struct {
310 uintN min; /* current quantifier limits */
311 uintN max;
312 } quantifier;
313 struct {
314 size_t top; /* backtrack stack state */
315 size_t sz;
316 } assertion;
317 } u;
318 } REProgState;
320 typedef struct REBackTrackData {
321 size_t sz; /* size of previous stack entry */
322 jsbytecode *backtrack_pc; /* where to backtrack to */
323 jsbytecode backtrack_op;
324 const jschar *cp; /* index in text of match at backtrack */
325 size_t parenIndex; /* start index of saved paren contents */
326 size_t parenCount; /* # of saved paren contents */
327 size_t saveStateStackTop; /* number of parent states */
328 /* saved parent states follow */
329 /* saved paren contents follow */
330 } REBackTrackData;
332 #define INITIAL_STATESTACK 100
333 #define INITIAL_BACKTRACK 8000
335 typedef struct REGlobalData {
336 JSContext *cx;
337 JSRegExp *regexp; /* the RE in execution */
338 JSBool ok; /* runtime error (out_of_memory only?) */
339 size_t start; /* offset to start at */
340 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
341 const jschar *cpbegin; /* text base address */
342 const jschar *cpend; /* text limit address */
344 REProgState *stateStack; /* stack of state of current parents */
345 size_t stateStackTop;
346 size_t stateStackLimit;
348 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
349 REBackTrackData *backTrackSP;
350 size_t backTrackStackSize;
351 size_t cursz; /* size of current stack entry */
352 size_t backTrackCount; /* how many times we've backtracked */
353 size_t backTrackLimit; /* upper limit on backtrack states */
354 } REGlobalData;
356 void
357 JSRegExpStatics::clearRoots()
359 input = NULL;
360 cx->runtime->gcPoke = JS_TRUE;
363 bool
364 JSRegExpStatics::copy(const JSRegExpStatics& other)
366 clearRoots();
367 input = other.input;
368 multiline = other.multiline;
369 lastMatch = other.lastMatch;
370 lastParen = other.lastParen;
371 leftContext = other.leftContext;
372 rightContext = other.rightContext;
373 if (!parens.resize(other.parens.length()))
374 return false;
375 memcpy(parens.begin(), other.parens.begin(), sizeof(JSSubString) * parens.length());
376 return true;
379 void
380 JSRegExpStatics::clear()
382 clearRoots();
383 multiline = false;
384 lastMatch = lastParen = leftContext = rightContext = js_EmptySubString;
385 parens.clear();
389 * 1. If IgnoreCase is false, return ch.
390 * 2. Let u be ch converted to upper case as if by calling
391 * String.prototype.toUpperCase on the one-character string ch.
392 * 3. If u does not consist of a single character, return ch.
393 * 4. Let cu be u's character.
394 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
395 * code point value is less than decimal 128, then return ch.
396 * 6. Return cu.
398 static JS_ALWAYS_INLINE uintN
399 upcase(uintN ch)
401 uintN cu;
403 JS_ASSERT((uintN) (jschar) ch == ch);
404 if (ch < 128) {
405 if (ch - (uintN) 'a' <= (uintN) ('z' - 'a'))
406 ch -= (uintN) ('a' - 'A');
407 return ch;
410 cu = JS_TOUPPER(ch);
411 return (cu < 128) ? ch : cu;
415 * Return the 'canonical' inverse upcase of |ch|. That is the character
416 * |lch| such that |upcase(lch) == ch| and (|lch| is the lower-case form
417 * of |ch| or is |ch|).
419 static inline jschar inverse_upcase(jschar ch)
421 jschar lch = JS_TOLOWER(ch);
422 return (upcase(lch) == ch) ? lch : ch;
425 /* Construct and initialize an RENode, returning NULL for out-of-memory */
426 static RENode *
427 NewRENode(CompilerState *state, REOp op)
429 JSContext *cx;
430 RENode *ren;
432 cx = state->context;
433 cx->tempPool.allocateCast<RENode *>(ren, sizeof *ren);
434 if (!ren) {
435 js_ReportOutOfScriptQuota(cx);
436 return NULL;
438 ren->op = op;
439 ren->next = NULL;
440 ren->kid = NULL;
441 return ren;
445 * Validates and converts hex ascii value.
447 static JSBool
448 isASCIIHexDigit(jschar c, uintN *digit)
450 uintN cv = c;
452 if (cv < '0')
453 return JS_FALSE;
454 if (cv <= '9') {
455 *digit = cv - '0';
456 return JS_TRUE;
458 cv |= 0x20;
459 if (cv >= 'a' && cv <= 'f') {
460 *digit = cv - 'a' + 10;
461 return JS_TRUE;
463 return JS_FALSE;
467 typedef struct {
468 REOp op;
469 const jschar *errPos;
470 size_t parenIndex;
471 } REOpData;
473 static JSBool
474 ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber,
475 const jschar *arg)
477 if (state->tokenStream) {
478 return ReportCompileErrorNumber(state->context, state->tokenStream,
479 NULL, JSREPORT_UC | flags, errorNumber, arg);
481 return JS_ReportErrorFlagsAndNumberUC(state->context, flags,
482 js_GetErrorMessage, NULL,
483 errorNumber, arg);
486 static JSBool
487 ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber)
489 return ReportRegExpErrorHelper(state, flags, errorNumber, NULL);
493 * Process the op against the two top operands, reducing them to a single
494 * operand in the penultimate slot. Update progLength and treeDepth.
496 static JSBool
497 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
498 intN operandSP)
500 RENode *result;
502 switch (opData->op) {
503 case REOP_ALT:
504 result = NewRENode(state, REOP_ALT);
505 if (!result)
506 return JS_FALSE;
507 result->kid = operandStack[operandSP - 2];
508 result->u.kid2 = operandStack[operandSP - 1];
509 operandStack[operandSP - 2] = result;
511 if (state->treeDepth == TREE_DEPTH_MAX) {
512 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
513 return JS_FALSE;
515 ++state->treeDepth;
518 * Look at both alternates to see if there's a FLAT or a CLASS at
519 * the start of each. If so, use a prerequisite match.
521 if (((RENode *) result->kid)->op == REOP_FLAT &&
522 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
523 (state->flags & JSREG_FOLD) == 0) {
524 result->op = REOP_ALTPREREQ;
525 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
526 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
527 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
528 JUMP, <end> ... ENDALT */
529 state->progLength += 13;
531 else
532 if (((RENode *) result->kid)->op == REOP_CLASS &&
533 ((RENode *) result->kid)->u.ucclass.index < 256 &&
534 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
535 (state->flags & JSREG_FOLD) == 0) {
536 result->op = REOP_ALTPREREQ2;
537 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
538 result->u.altprereq.ch2 = jschar(((RENode *) result->kid)->u.ucclass.index);
539 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
540 JUMP, <end> ... ENDALT */
541 state->progLength += 13;
543 else
544 if (((RENode *) result->kid)->op == REOP_FLAT &&
545 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
546 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
547 (state->flags & JSREG_FOLD) == 0) {
548 result->op = REOP_ALTPREREQ2;
549 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
550 result->u.altprereq.ch2 =
551 jschar(((RENode *) result->u.kid2)->u.ucclass.index);
552 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
553 JUMP, <end> ... ENDALT */
554 state->progLength += 13;
556 else {
557 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
558 state->progLength += 7;
560 break;
562 case REOP_CONCAT:
563 result = operandStack[operandSP - 2];
564 while (result->next)
565 result = result->next;
566 result->next = operandStack[operandSP - 1];
567 break;
569 case REOP_ASSERT:
570 case REOP_ASSERT_NOT:
571 case REOP_LPARENNON:
572 case REOP_LPAREN:
573 /* These should have been processed by a close paren. */
574 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
575 opData->errPos);
576 return JS_FALSE;
578 default:;
580 return JS_TRUE;
584 * Parser forward declarations.
586 static JSBool ParseTerm(CompilerState *state);
587 static JSBool ParseQuantifier(CompilerState *state);
588 static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);
591 * Top-down regular expression grammar, based closely on Perl4.
593 * regexp: altern A regular expression is one or more
594 * altern '|' regexp alternatives separated by vertical bar.
596 #define INITIAL_STACK_SIZE 128
598 static JSBool
599 ParseRegExp(CompilerState *state)
601 size_t parenIndex;
602 RENode *operand;
603 REOpData *operatorStack;
604 RENode **operandStack;
605 REOp op;
606 intN i;
607 JSBool result = JS_FALSE;
609 intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
610 intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
612 /* Watch out for empty regexp */
613 if (state->cp == state->cpend) {
614 state->result = NewRENode(state, REOP_EMPTY);
615 return (state->result != NULL);
618 operatorStack = (REOpData *)
619 state->context->malloc(sizeof(REOpData) * operatorStackSize);
620 if (!operatorStack)
621 return JS_FALSE;
623 operandStack = (RENode **)
624 state->context->malloc(sizeof(RENode *) * operandStackSize);
625 if (!operandStack)
626 goto out;
628 for (;;) {
629 parenIndex = state->parenCount;
630 if (state->cp == state->cpend) {
632 * If we are at the end of the regexp and we're short one or more
633 * operands, the regexp must have the form /x|/ or some such, with
634 * left parentheses making us short more than one operand.
636 if (operatorSP >= operandSP) {
637 operand = NewRENode(state, REOP_EMPTY);
638 if (!operand)
639 goto out;
640 goto pushOperand;
642 } else {
643 switch (*state->cp) {
644 case '(':
645 ++state->cp;
646 if (state->cp + 1 < state->cpend &&
647 *state->cp == '?' &&
648 (state->cp[1] == '=' ||
649 state->cp[1] == '!' ||
650 state->cp[1] == ':')) {
651 switch (state->cp[1]) {
652 case '=':
653 op = REOP_ASSERT;
654 /* ASSERT, <next>, ... ASSERTTEST */
655 state->progLength += 4;
656 break;
657 case '!':
658 op = REOP_ASSERT_NOT;
659 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
660 state->progLength += 4;
661 break;
662 default:
663 op = REOP_LPARENNON;
664 break;
666 state->cp += 2;
667 } else {
668 op = REOP_LPAREN;
669 /* LPAREN, <index>, ... RPAREN, <index> */
670 state->progLength
671 += 2 * (1 + GetCompactIndexWidth(parenIndex));
672 state->parenCount++;
673 if (state->parenCount == 65535) {
674 ReportRegExpError(state, JSREPORT_ERROR,
675 JSMSG_TOO_MANY_PARENS);
676 goto out;
679 goto pushOperator;
681 case ')':
683 * If there's no stacked open parenthesis, throw syntax error.
685 for (i = operatorSP - 1; ; i--) {
686 if (i < 0) {
687 ReportRegExpError(state, JSREPORT_ERROR,
688 JSMSG_UNMATCHED_RIGHT_PAREN);
689 goto out;
691 if (operatorStack[i].op == REOP_ASSERT ||
692 operatorStack[i].op == REOP_ASSERT_NOT ||
693 operatorStack[i].op == REOP_LPARENNON ||
694 operatorStack[i].op == REOP_LPAREN) {
695 break;
698 /* FALL THROUGH */
700 case '|':
701 /* Expected an operand before these, so make an empty one */
702 operand = NewRENode(state, REOP_EMPTY);
703 if (!operand)
704 goto out;
705 goto pushOperand;
707 default:
708 if (!ParseTerm(state))
709 goto out;
710 operand = state->result;
711 pushOperand:
712 if (operandSP == operandStackSize) {
713 RENode **tmp;
714 operandStackSize += operandStackSize;
715 tmp = (RENode **)
716 state->context->realloc(operandStack,
717 sizeof(RENode *) * operandStackSize);
718 if (!tmp)
719 goto out;
720 operandStack = tmp;
722 operandStack[operandSP++] = operand;
723 break;
727 /* At the end; process remaining operators. */
728 restartOperator:
729 if (state->cp == state->cpend) {
730 while (operatorSP) {
731 --operatorSP;
732 if (!ProcessOp(state, &operatorStack[operatorSP],
733 operandStack, operandSP))
734 goto out;
735 --operandSP;
737 JS_ASSERT(operandSP == 1);
738 state->result = operandStack[0];
739 result = JS_TRUE;
740 goto out;
743 switch (*state->cp) {
744 case '|':
745 /* Process any stacked 'concat' operators */
746 ++state->cp;
747 while (operatorSP &&
748 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
749 --operatorSP;
750 if (!ProcessOp(state, &operatorStack[operatorSP],
751 operandStack, operandSP)) {
752 goto out;
754 --operandSP;
756 op = REOP_ALT;
757 goto pushOperator;
759 case ')':
761 * If there's no stacked open parenthesis, throw syntax error.
763 for (i = operatorSP - 1; ; i--) {
764 if (i < 0) {
765 ReportRegExpError(state, JSREPORT_ERROR,
766 JSMSG_UNMATCHED_RIGHT_PAREN);
767 goto out;
769 if (operatorStack[i].op == REOP_ASSERT ||
770 operatorStack[i].op == REOP_ASSERT_NOT ||
771 operatorStack[i].op == REOP_LPARENNON ||
772 operatorStack[i].op == REOP_LPAREN) {
773 break;
776 ++state->cp;
778 /* Process everything on the stack until the open parenthesis. */
779 for (;;) {
780 JS_ASSERT(operatorSP);
781 --operatorSP;
782 switch (operatorStack[operatorSP].op) {
783 case REOP_ASSERT:
784 case REOP_ASSERT_NOT:
785 case REOP_LPAREN:
786 operand = NewRENode(state, operatorStack[operatorSP].op);
787 if (!operand)
788 goto out;
789 operand->u.parenIndex =
790 operatorStack[operatorSP].parenIndex;
791 JS_ASSERT(operandSP);
792 operand->kid = operandStack[operandSP - 1];
793 operandStack[operandSP - 1] = operand;
794 if (state->treeDepth == TREE_DEPTH_MAX) {
795 ReportRegExpError(state, JSREPORT_ERROR,
796 JSMSG_REGEXP_TOO_COMPLEX);
797 goto out;
799 ++state->treeDepth;
800 /* FALL THROUGH */
802 case REOP_LPARENNON:
803 state->result = operandStack[operandSP - 1];
804 if (!ParseQuantifier(state))
805 goto out;
806 operandStack[operandSP - 1] = state->result;
807 goto restartOperator;
808 default:
809 if (!ProcessOp(state, &operatorStack[operatorSP],
810 operandStack, operandSP))
811 goto out;
812 --operandSP;
813 break;
816 break;
818 case '{':
820 const jschar *errp = state->cp;
822 if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
824 * This didn't even scan correctly as a quantifier, so we should
825 * treat it as flat.
827 op = REOP_CONCAT;
828 goto pushOperator;
831 state->cp = errp;
832 /* FALL THROUGH */
835 case '+':
836 case '*':
837 case '?':
838 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
839 state->cp);
840 result = JS_FALSE;
841 goto out;
843 default:
844 /* Anything else is the start of the next term. */
845 op = REOP_CONCAT;
846 pushOperator:
847 if (operatorSP == operatorStackSize) {
848 REOpData *tmp;
849 operatorStackSize += operatorStackSize;
850 tmp = (REOpData *)
851 state->context->realloc(operatorStack,
852 sizeof(REOpData) * operatorStackSize);
853 if (!tmp)
854 goto out;
855 operatorStack = tmp;
857 operatorStack[operatorSP].op = op;
858 operatorStack[operatorSP].errPos = state->cp;
859 operatorStack[operatorSP++].parenIndex = parenIndex;
860 break;
863 out:
864 if (operatorStack)
865 state->context->free(operatorStack);
866 if (operandStack)
867 state->context->free(operandStack);
868 return result;
872 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
873 * its being on the stack, and to propagate errors to its callers.
875 #define JSREG_FIND_PAREN_COUNT 0x8000
876 #define JSREG_FIND_PAREN_ERROR 0x4000
879 * Magic return value from FindParenCount and GetDecimalValue, to indicate
880 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
881 * its findMax parameter is non-null.
883 #define OVERFLOW_VALUE ((uintN)-1)
885 static uintN
886 FindParenCount(CompilerState *state)
888 CompilerState temp;
889 int i;
891 if (state->flags & JSREG_FIND_PAREN_COUNT)
892 return OVERFLOW_VALUE;
895 * Copy state into temp, flag it so we never report an invalid backref,
896 * and reset its members to parse the entire regexp. This is obviously
897 * suboptimal, but GetDecimalValue calls us only if a backref appears to
898 * refer to a forward parenthetical, which is rare.
900 temp = *state;
901 temp.flags |= JSREG_FIND_PAREN_COUNT;
902 temp.cp = temp.cpbegin;
903 temp.parenCount = 0;
904 temp.classCount = 0;
905 temp.progLength = 0;
906 temp.treeDepth = 0;
907 temp.classBitmapsMem = 0;
908 for (i = 0; i < CLASS_CACHE_SIZE; i++)
909 temp.classCache[i].start = NULL;
911 if (!ParseRegExp(&temp)) {
912 state->flags |= JSREG_FIND_PAREN_ERROR;
913 return OVERFLOW_VALUE;
915 return temp.parenCount;
919 * Extract and return a decimal value at state->cp. The initial character c
920 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
921 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
922 * state->flags to discover whether an error occurred under findMax.
924 static uintN
925 GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
926 CompilerState *state)
928 uintN value = JS7_UNDEC(c);
929 JSBool overflow = (value > max && (!findMax || value > findMax(state)));
931 /* The following restriction allows simpler overflow checks. */
932 JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
933 while (state->cp < state->cpend) {
934 c = *state->cp;
935 if (!JS7_ISDEC(c))
936 break;
937 value = 10 * value + JS7_UNDEC(c);
938 if (!overflow && value > max && (!findMax || value > findMax(state)))
939 overflow = JS_TRUE;
940 ++state->cp;
942 return overflow ? OVERFLOW_VALUE : value;
946 * Calculate the total size of the bitmap required for a class expression.
948 static JSBool
949 CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
950 const jschar *end)
952 uintN max = 0;
953 JSBool inRange = JS_FALSE;
954 jschar c, rangeStart = 0;
955 uintN n, digit, nDigits, i;
957 target->u.ucclass.bmsize = 0;
958 target->u.ucclass.sense = JS_TRUE;
960 if (src == end)
961 return JS_TRUE;
963 if (*src == '^') {
964 ++src;
965 target->u.ucclass.sense = JS_FALSE;
968 while (src != end) {
969 JSBool canStartRange = JS_TRUE;
970 jschar localMax = 0;
972 switch (*src) {
973 case '\\':
974 ++src;
975 c = *src++;
976 switch (c) {
977 case 'b':
978 localMax = 0x8;
979 break;
980 case 'f':
981 localMax = 0xC;
982 break;
983 case 'n':
984 localMax = 0xA;
985 break;
986 case 'r':
987 localMax = 0xD;
988 break;
989 case 't':
990 localMax = 0x9;
991 break;
992 case 'v':
993 localMax = 0xB;
994 break;
995 case 'c':
996 if (src < end && RE_IS_LETTER(*src)) {
997 localMax = (uintN) (*src++) & 0x1F;
998 } else {
999 --src;
1000 localMax = '\\';
1002 break;
1003 case 'x':
1004 nDigits = 2;
1005 goto lexHex;
1006 case 'u':
1007 nDigits = 4;
1008 lexHex:
1009 n = 0;
1010 for (i = 0; (i < nDigits) && (src < end); i++) {
1011 c = *src++;
1012 if (!isASCIIHexDigit(c, &digit)) {
1014 * Back off to accepting the original
1015 *'\' as a literal.
1017 src -= i + 1;
1018 n = '\\';
1019 break;
1021 n = (n << 4) | digit;
1023 localMax = jschar(n);
1024 break;
1025 case 'd':
1026 canStartRange = JS_FALSE;
1027 if (inRange) {
1028 JS_ReportErrorNumber(state->context,
1029 js_GetErrorMessage, NULL,
1030 JSMSG_BAD_CLASS_RANGE);
1031 return JS_FALSE;
1033 localMax = '9';
1034 break;
1035 case 'D':
1036 case 's':
1037 case 'S':
1038 case 'w':
1039 case 'W':
1040 canStartRange = JS_FALSE;
1041 if (inRange) {
1042 JS_ReportErrorNumber(state->context,
1043 js_GetErrorMessage, NULL,
1044 JSMSG_BAD_CLASS_RANGE);
1045 return JS_FALSE;
1047 max = 65535;
1050 * If this is the start of a range, ensure that it's less than
1051 * the end.
1053 localMax = 0;
1054 break;
1055 case '0':
1056 case '1':
1057 case '2':
1058 case '3':
1059 case '4':
1060 case '5':
1061 case '6':
1062 case '7':
1064 * This is a non-ECMA extension - decimal escapes (in this
1065 * case, octal!) are supposed to be an error inside class
1066 * ranges, but supported here for backwards compatibility.
1069 n = JS7_UNDEC(c);
1070 c = *src;
1071 if ('0' <= c && c <= '7') {
1072 src++;
1073 n = 8 * n + JS7_UNDEC(c);
1074 c = *src;
1075 if ('0' <= c && c <= '7') {
1076 src++;
1077 i = 8 * n + JS7_UNDEC(c);
1078 if (i <= 0377)
1079 n = i;
1080 else
1081 src--;
1084 localMax = jschar(n);
1085 break;
1087 default:
1088 localMax = c;
1089 break;
1091 break;
1092 default:
1093 localMax = *src++;
1094 break;
1097 if (inRange) {
1098 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1099 if (rangeStart > localMax) {
1100 JS_ReportErrorNumber(state->context,
1101 js_GetErrorMessage, NULL,
1102 JSMSG_BAD_CLASS_RANGE);
1103 return JS_FALSE;
1105 inRange = JS_FALSE;
1106 } else {
1107 if (canStartRange && src < end - 1) {
1108 if (*src == '-') {
1109 ++src;
1110 inRange = JS_TRUE;
1111 rangeStart = (jschar)localMax;
1112 continue;
1115 if (state->flags & JSREG_FOLD)
1116 rangeStart = localMax; /* one run of the uc/dc loop below */
1119 if (state->flags & JSREG_FOLD) {
1120 jschar maxch = localMax;
1122 for (i = rangeStart; i <= localMax; i++) {
1123 jschar uch, dch;
1125 uch = jschar(upcase(i));
1126 dch = inverse_upcase(jschar(i));
1127 maxch = JS_MAX(maxch, uch);
1128 maxch = JS_MAX(maxch, dch);
1130 localMax = maxch;
1133 if (localMax > max)
1134 max = uintN(localMax);
1136 target->u.ucclass.bmsize = uint16(max);
1137 return JS_TRUE;
1141 * item: assertion An item is either an assertion or
1142 * quantatom a quantified atom.
1144 * assertion: '^' Assertions match beginning of string
1145 * (or line if the class static property
1146 * RegExp.multiline is true).
1147 * '$' End of string (or line if the class
1148 * static property RegExp.multiline is
1149 * true).
1150 * '\b' Word boundary (between \w and \W).
1151 * '\B' Word non-boundary.
1153 * quantatom: atom An unquantified atom.
1154 * quantatom '{' n ',' m '}'
1155 * Atom must occur between n and m times.
1156 * quantatom '{' n ',' '}' Atom must occur at least n times.
1157 * quantatom '{' n '}' Atom must occur exactly n times.
1158 * quantatom '*' Zero or more times (same as {0,}).
1159 * quantatom '+' One or more times (same as {1,}).
1160 * quantatom '?' Zero or one time (same as {0,1}).
1162 * any of which can be optionally followed by '?' for ungreedy
1164 * atom: '(' regexp ')' A parenthesized regexp (what matched
1165 * can be addressed using a backreference,
1166 * see '\' n below).
1167 * '.' Matches any char except '\n'.
1168 * '[' classlist ']' A character class.
1169 * '[' '^' classlist ']' A negated character class.
1170 * '\f' Form Feed.
1171 * '\n' Newline (Line Feed).
1172 * '\r' Carriage Return.
1173 * '\t' Horizontal Tab.
1174 * '\v' Vertical Tab.
1175 * '\d' A digit (same as [0-9]).
1176 * '\D' A non-digit.
1177 * '\w' A word character, [0-9a-z_A-Z].
1178 * '\W' A non-word character.
1179 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1180 * '\S' A non-whitespace character.
1181 * '\' n A backreference to the nth (n decimal
1182 * and positive) parenthesized expression.
1183 * '\' octal An octal escape sequence (octal must be
1184 * two or three digits long, unless it is
1185 * 0 for the null character).
1186 * '\x' hex A hex escape (hex must be two digits).
1187 * '\u' unicode A unicode escape (must be four digits).
1188 * '\c' ctrl A control character, ctrl is a letter.
1189 * '\' literalatomchar Any character except one of the above
1190 * that follow '\' in an atom.
1191 * otheratomchar Any character not first among the other
1192 * atom right-hand sides.
1194 static JSBool
1195 ParseTerm(CompilerState *state)
1197 jschar c = *state->cp++;
1198 uintN nDigits;
1199 uintN num, tmp, n, i;
1200 const jschar *termStart;
1202 switch (c) {
1203 /* assertions and atoms */
1204 case '^':
1205 state->result = NewRENode(state, REOP_BOL);
1206 if (!state->result)
1207 return JS_FALSE;
1208 state->progLength++;
1209 return JS_TRUE;
1210 case '$':
1211 state->result = NewRENode(state, REOP_EOL);
1212 if (!state->result)
1213 return JS_FALSE;
1214 state->progLength++;
1215 return JS_TRUE;
1216 case '\\':
1217 if (state->cp >= state->cpend) {
1218 /* a trailing '\' is an error */
1219 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1220 return JS_FALSE;
1222 c = *state->cp++;
1223 switch (c) {
1224 /* assertion escapes */
1225 case 'b' :
1226 state->result = NewRENode(state, REOP_WBDRY);
1227 if (!state->result)
1228 return JS_FALSE;
1229 state->progLength++;
1230 return JS_TRUE;
1231 case 'B':
1232 state->result = NewRENode(state, REOP_WNONBDRY);
1233 if (!state->result)
1234 return JS_FALSE;
1235 state->progLength++;
1236 return JS_TRUE;
1237 /* Decimal escape */
1238 case '0':
1239 /* Give a strict warning. See also the note below. */
1240 if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT,
1241 JSMSG_INVALID_BACKREF)) {
1242 return JS_FALSE;
1244 doOctal:
1245 num = 0;
1246 while (state->cp < state->cpend) {
1247 c = *state->cp;
1248 if (c < '0' || '7' < c)
1249 break;
1250 state->cp++;
1251 tmp = 8 * num + (uintN)JS7_UNDEC(c);
1252 if (tmp > 0377)
1253 break;
1254 num = tmp;
1256 c = (jschar)num;
1257 doFlat:
1258 state->result = NewRENode(state, REOP_FLAT);
1259 if (!state->result)
1260 return JS_FALSE;
1261 state->result->u.flat.chr = c;
1262 state->result->u.flat.length = 1;
1263 state->progLength += 3;
1264 break;
1265 case '1':
1266 case '2':
1267 case '3':
1268 case '4':
1269 case '5':
1270 case '6':
1271 case '7':
1272 case '8':
1273 case '9':
1274 termStart = state->cp - 1;
1275 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1276 if (state->flags & JSREG_FIND_PAREN_ERROR)
1277 return JS_FALSE;
1278 if (num == OVERFLOW_VALUE) {
1279 /* Give a strict mode warning. */
1280 if (!ReportRegExpError(state,
1281 JSREPORT_WARNING | JSREPORT_STRICT,
1282 (c >= '8')
1283 ? JSMSG_INVALID_BACKREF
1284 : JSMSG_BAD_BACKREF)) {
1285 return JS_FALSE;
1289 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1290 * error here. However, for compatibility with IE, we treat the
1291 * whole backref as flat if the first character in it is not a
1292 * valid octal character, and as an octal escape otherwise.
1294 state->cp = termStart;
1295 if (c >= '8') {
1296 /* Treat this as flat. termStart - 1 is the \. */
1297 c = '\\';
1298 goto asFlat;
1301 /* Treat this as an octal escape. */
1302 goto doOctal;
1306 * When FindParenCount calls the regex parser recursively (to find
1307 * the number of backrefs) num can be arbitrary and the maximum
1308 * supported number of backrefs does not bound it.
1310 JS_ASSERT_IF(!(state->flags & JSREG_FIND_PAREN_COUNT),
1311 1 <= num && num <= 0x10000);
1312 state->result = NewRENode(state, REOP_BACKREF);
1313 if (!state->result)
1314 return JS_FALSE;
1315 state->result->u.parenIndex = num - 1;
1316 state->progLength
1317 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1318 break;
1319 /* Control escape */
1320 case 'f':
1321 c = 0xC;
1322 goto doFlat;
1323 case 'n':
1324 c = 0xA;
1325 goto doFlat;
1326 case 'r':
1327 c = 0xD;
1328 goto doFlat;
1329 case 't':
1330 c = 0x9;
1331 goto doFlat;
1332 case 'v':
1333 c = 0xB;
1334 goto doFlat;
1335 /* Control letter */
1336 case 'c':
1337 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1338 c = (jschar) (*state->cp++ & 0x1F);
1339 } else {
1340 /* back off to accepting the original '\' as a literal */
1341 --state->cp;
1342 c = '\\';
1344 goto doFlat;
1345 /* HexEscapeSequence */
1346 case 'x':
1347 nDigits = 2;
1348 goto lexHex;
1349 /* UnicodeEscapeSequence */
1350 case 'u':
1351 nDigits = 4;
1352 lexHex:
1353 n = 0;
1354 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1355 uintN digit;
1356 c = *state->cp++;
1357 if (!isASCIIHexDigit(c, &digit)) {
1359 * Back off to accepting the original 'u' or 'x' as a
1360 * literal.
1362 state->cp -= i + 2;
1363 n = *state->cp++;
1364 break;
1366 n = (n << 4) | digit;
1368 c = (jschar) n;
1369 goto doFlat;
1370 /* Character class escapes */
1371 case 'd':
1372 state->result = NewRENode(state, REOP_DIGIT);
1373 doSimple:
1374 if (!state->result)
1375 return JS_FALSE;
1376 state->progLength++;
1377 break;
1378 case 'D':
1379 state->result = NewRENode(state, REOP_NONDIGIT);
1380 goto doSimple;
1381 case 's':
1382 state->result = NewRENode(state, REOP_SPACE);
1383 goto doSimple;
1384 case 'S':
1385 state->result = NewRENode(state, REOP_NONSPACE);
1386 goto doSimple;
1387 case 'w':
1388 state->result = NewRENode(state, REOP_ALNUM);
1389 goto doSimple;
1390 case 'W':
1391 state->result = NewRENode(state, REOP_NONALNUM);
1392 goto doSimple;
1393 /* IdentityEscape */
1394 default:
1395 state->result = NewRENode(state, REOP_FLAT);
1396 if (!state->result)
1397 return JS_FALSE;
1398 state->result->u.flat.chr = c;
1399 state->result->u.flat.length = 1;
1400 state->result->kid = (void *) (state->cp - 1);
1401 state->progLength += 3;
1402 break;
1404 break;
1405 case '[':
1406 state->result = NewRENode(state, REOP_CLASS);
1407 if (!state->result)
1408 return JS_FALSE;
1409 termStart = state->cp;
1410 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1411 for (;;) {
1412 if (state->cp == state->cpend) {
1413 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1414 JSMSG_UNTERM_CLASS, termStart);
1416 return JS_FALSE;
1418 if (*state->cp == '\\') {
1419 state->cp++;
1420 if (state->cp != state->cpend)
1421 state->cp++;
1422 continue;
1424 if (*state->cp == ']') {
1425 state->result->u.ucclass.kidlen = state->cp - termStart;
1426 break;
1428 state->cp++;
1430 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1431 if (!state->classCache[i].start) {
1432 state->classCache[i].start = termStart;
1433 state->classCache[i].length = state->result->u.ucclass.kidlen;
1434 state->classCache[i].index = state->classCount;
1435 break;
1437 if (state->classCache[i].length ==
1438 state->result->u.ucclass.kidlen) {
1439 for (n = 0; ; n++) {
1440 if (n == state->classCache[i].length) {
1441 state->result->u.ucclass.index
1442 = state->classCache[i].index;
1443 goto claim;
1445 if (state->classCache[i].start[n] != termStart[n])
1446 break;
1450 state->result->u.ucclass.index = state->classCount++;
1452 claim:
1454 * Call CalculateBitmapSize now as we want any errors it finds
1455 * to be reported during the parse phase, not at execution.
1457 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1458 return JS_FALSE;
1460 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1461 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1462 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1464 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1465 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1466 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1467 return JS_FALSE;
1469 state->classBitmapsMem += n;
1470 /* CLASS, <index> */
1471 state->progLength
1472 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1473 break;
1475 case '.':
1476 state->result = NewRENode(state, REOP_DOT);
1477 goto doSimple;
1479 case '{':
1481 const jschar *errp = state->cp--;
1482 intN err;
1484 err = ParseMinMaxQuantifier(state, JS_TRUE);
1485 state->cp = errp;
1487 if (err < 0)
1488 goto asFlat;
1490 /* FALL THROUGH */
1492 case '*':
1493 case '+':
1494 case '?':
1495 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1496 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1497 return JS_FALSE;
1498 default:
1499 asFlat:
1500 state->result = NewRENode(state, REOP_FLAT);
1501 if (!state->result)
1502 return JS_FALSE;
1503 state->result->u.flat.chr = c;
1504 state->result->u.flat.length = 1;
1505 state->result->kid = (void *) (state->cp - 1);
1506 state->progLength += 3;
1507 break;
1509 return ParseQuantifier(state);
1512 static JSBool
1513 ParseQuantifier(CompilerState *state)
1515 RENode *term;
1516 term = state->result;
1517 if (state->cp < state->cpend) {
1518 switch (*state->cp) {
1519 case '+':
1520 state->result = NewRENode(state, REOP_QUANT);
1521 if (!state->result)
1522 return JS_FALSE;
1523 state->result->u.range.min = 1;
1524 state->result->u.range.max = (uintN)-1;
1525 /* <PLUS>, <next> ... <ENDCHILD> */
1526 state->progLength += 4;
1527 goto quantifier;
1528 case '*':
1529 state->result = NewRENode(state, REOP_QUANT);
1530 if (!state->result)
1531 return JS_FALSE;
1532 state->result->u.range.min = 0;
1533 state->result->u.range.max = (uintN)-1;
1534 /* <STAR>, <next> ... <ENDCHILD> */
1535 state->progLength += 4;
1536 goto quantifier;
1537 case '?':
1538 state->result = NewRENode(state, REOP_QUANT);
1539 if (!state->result)
1540 return JS_FALSE;
1541 state->result->u.range.min = 0;
1542 state->result->u.range.max = 1;
1543 /* <OPT>, <next> ... <ENDCHILD> */
1544 state->progLength += 4;
1545 goto quantifier;
1546 case '{': /* balance '}' */
1548 intN err;
1549 const jschar *errp = state->cp;
1551 err = ParseMinMaxQuantifier(state, JS_FALSE);
1552 if (err == 0)
1553 goto quantifier;
1554 if (err == -1)
1555 return JS_TRUE;
1557 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1558 return JS_FALSE;
1560 default:;
1563 return JS_TRUE;
1565 quantifier:
1566 if (state->treeDepth == TREE_DEPTH_MAX) {
1567 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1568 return JS_FALSE;
1571 ++state->treeDepth;
1572 ++state->cp;
1573 state->result->kid = term;
1574 if (state->cp < state->cpend && *state->cp == '?') {
1575 ++state->cp;
1576 state->result->u.range.greedy = JS_FALSE;
1577 } else {
1578 state->result->u.range.greedy = JS_TRUE;
1580 return JS_TRUE;
1583 static intN
1584 ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues)
1586 uintN min, max;
1587 jschar c;
1588 const jschar *errp = state->cp++;
1590 c = *state->cp;
1591 if (JS7_ISDEC(c)) {
1592 ++state->cp;
1593 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1594 c = *state->cp;
1596 if (!ignoreValues && min == OVERFLOW_VALUE)
1597 return JSMSG_MIN_TOO_BIG;
1599 if (c == ',') {
1600 c = *++state->cp;
1601 if (JS7_ISDEC(c)) {
1602 ++state->cp;
1603 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1604 c = *state->cp;
1605 if (!ignoreValues && max == OVERFLOW_VALUE)
1606 return JSMSG_MAX_TOO_BIG;
1607 if (!ignoreValues && min > max)
1608 return JSMSG_OUT_OF_ORDER;
1609 } else {
1610 max = (uintN)-1;
1612 } else {
1613 max = min;
1615 if (c == '}') {
1616 state->result = NewRENode(state, REOP_QUANT);
1617 if (!state->result)
1618 return JSMSG_OUT_OF_MEMORY;
1619 state->result->u.range.min = min;
1620 state->result->u.range.max = max;
1622 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1623 * where <max> is written as compact(max+1) to make
1624 * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1626 state->progLength += (1 + GetCompactIndexWidth(min)
1627 + GetCompactIndexWidth(max + 1)
1628 +3);
1629 return 0;
1633 state->cp = errp;
1634 return -1;
1637 static JSBool
1638 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
1640 ptrdiff_t offset = target - jump;
1642 /* Check that target really points forward. */
1643 JS_ASSERT(offset >= 2);
1644 if ((size_t)offset > OFFSET_MAX)
1645 return JS_FALSE;
1647 jump[0] = JUMP_OFFSET_HI(offset);
1648 jump[1] = JUMP_OFFSET_LO(offset);
1649 return JS_TRUE;
1652 /* Copy the charset data from a character class node to the charset list
1653 * in the regexp object. */
1654 static JS_ALWAYS_INLINE RECharSet *
1655 InitNodeCharSet(JSRegExp *re, RENode *node)
1657 RECharSet *charSet = &re->classList[node->u.ucclass.index];
1658 charSet->converted = JS_FALSE;
1659 charSet->length = node->u.ucclass.bmsize;
1660 charSet->u.src.startIndex = node->u.ucclass.startIndex;
1661 charSet->u.src.length = node->u.ucclass.kidlen;
1662 charSet->sense = node->u.ucclass.sense;
1663 return charSet;
1667 * Generate bytecode for the tree rooted at t using an explicit stack instead
1668 * of recursion.
1670 static jsbytecode *
1671 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
1672 jsbytecode *pc, RENode *t)
1674 EmitStateStackEntry *emitStateSP, *emitStateStack;
1675 REOp op;
1677 if (treeDepth == 0) {
1678 emitStateStack = NULL;
1679 } else {
1680 emitStateStack =
1681 (EmitStateStackEntry *)
1682 state->context->malloc(sizeof(EmitStateStackEntry) * treeDepth);
1683 if (!emitStateStack)
1684 return NULL;
1686 emitStateSP = emitStateStack;
1687 op = t->op;
1688 JS_ASSERT(op < REOP_LIMIT);
1690 for (;;) {
1691 *pc++ = op;
1692 switch (op) {
1693 case REOP_EMPTY:
1694 --pc;
1695 break;
1697 case REOP_ALTPREREQ2:
1698 case REOP_ALTPREREQ:
1699 JS_ASSERT(emitStateSP);
1700 emitStateSP->altHead = pc - 1;
1701 emitStateSP->endTermFixup = pc;
1702 pc += OFFSET_LEN;
1703 SET_ARG(pc, t->u.altprereq.ch1);
1704 pc += ARG_LEN;
1705 SET_ARG(pc, t->u.altprereq.ch2);
1706 pc += ARG_LEN;
1708 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1709 pc += OFFSET_LEN;
1711 emitStateSP->continueNode = t;
1712 emitStateSP->continueOp = REOP_JUMP;
1713 emitStateSP->jumpToJumpFlag = JS_FALSE;
1714 ++emitStateSP;
1715 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1716 t = (RENode *) t->kid;
1717 op = t->op;
1718 JS_ASSERT(op < REOP_LIMIT);
1719 continue;
1721 case REOP_JUMP:
1722 emitStateSP->nextTermFixup = pc; /* offset to following term */
1723 pc += OFFSET_LEN;
1724 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
1725 goto jump_too_big;
1726 emitStateSP->continueOp = REOP_ENDALT;
1727 ++emitStateSP;
1728 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1729 t = (RENode *) t->u.kid2;
1730 op = t->op;
1731 JS_ASSERT(op < REOP_LIMIT);
1732 continue;
1734 case REOP_ENDALT:
1736 * If we already patched emitStateSP->nextTermFixup to jump to
1737 * a nearer jump, to avoid 16-bit immediate offset overflow, we
1738 * are done here.
1740 if (emitStateSP->jumpToJumpFlag)
1741 break;
1744 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
1745 * REOP_ENDALT is executed only on successful match of the last
1746 * alternate in a group.
1748 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1749 goto jump_too_big;
1750 if (t->op != REOP_ALT) {
1751 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
1752 goto jump_too_big;
1756 * If the program is bigger than the REOP_JUMP offset range, then
1757 * we must check for alternates before this one that are part of
1758 * the same group, and fix up their jump offsets to target jumps
1759 * close enough to fit in a 16-bit unsigned offset immediate.
1761 if ((size_t)(pc - re->program) > OFFSET_MAX &&
1762 emitStateSP > emitStateStack) {
1763 EmitStateStackEntry *esp, *esp2;
1764 jsbytecode *alt, *jump;
1765 ptrdiff_t span, header;
1767 esp2 = emitStateSP;
1768 alt = esp2->altHead;
1769 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
1770 if (esp->continueOp == REOP_ENDALT &&
1771 !esp->jumpToJumpFlag &&
1772 esp->nextTermFixup + OFFSET_LEN == alt &&
1773 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
1774 ? esp->endTermFixup
1775 : esp->nextTermFixup)) > OFFSET_MAX) {
1776 alt = esp->altHead;
1777 jump = esp->nextTermFixup;
1780 * The span must be 1 less than the distance from
1781 * jump offset to jump offset, so we actually jump
1782 * to a REOP_JUMP bytecode, not to its offset!
1784 for (;;) {
1785 JS_ASSERT(jump < esp2->nextTermFixup);
1786 span = esp2->nextTermFixup - jump - 1;
1787 if ((size_t)span <= OFFSET_MAX)
1788 break;
1789 do {
1790 if (--esp2 == esp)
1791 goto jump_too_big;
1792 } while (esp2->continueOp != REOP_ENDALT);
1795 jump[0] = JUMP_OFFSET_HI(span);
1796 jump[1] = JUMP_OFFSET_LO(span);
1798 if (esp->continueNode->op != REOP_ALT) {
1800 * We must patch the offset at esp->endTermFixup
1801 * as well, for the REOP_ALTPREREQ{,2} opcodes.
1802 * If we're unlucky and endTermFixup is more than
1803 * OFFSET_MAX bytes from its target, we cheat by
1804 * jumping 6 bytes to the jump whose offset is at
1805 * esp->nextTermFixup, which has the same target.
1807 jump = esp->endTermFixup;
1808 header = esp->nextTermFixup - jump;
1809 span += header;
1810 if ((size_t)span > OFFSET_MAX)
1811 span = header;
1813 jump[0] = JUMP_OFFSET_HI(span);
1814 jump[1] = JUMP_OFFSET_LO(span);
1817 esp->jumpToJumpFlag = JS_TRUE;
1821 break;
1823 case REOP_ALT:
1824 JS_ASSERT(emitStateSP);
1825 emitStateSP->altHead = pc - 1;
1826 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
1827 pc += OFFSET_LEN;
1828 emitStateSP->continueNode = t;
1829 emitStateSP->continueOp = REOP_JUMP;
1830 emitStateSP->jumpToJumpFlag = JS_FALSE;
1831 ++emitStateSP;
1832 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1833 t = (RENode *) t->kid;
1834 op = t->op;
1835 JS_ASSERT(op < REOP_LIMIT);
1836 continue;
1838 case REOP_FLAT:
1840 * Coalesce FLATs if possible and if it would not increase bytecode
1841 * beyond preallocated limit. The latter happens only when bytecode
1842 * size for coalesced string with offset p and length 2 exceeds 6
1843 * bytes preallocated for 2 single char nodes, i.e. when
1844 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
1845 * GetCompactIndexWidth(p) > 4.
1846 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
1847 * nodes strictly decreases bytecode size, the check has to be
1848 * done only for the first coalescing.
1850 if (t->kid &&
1851 GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4)
1853 while (t->next &&
1854 t->next->op == REOP_FLAT &&
1855 (jschar*)t->kid + t->u.flat.length ==
1856 (jschar*)t->next->kid) {
1857 t->u.flat.length += t->next->u.flat.length;
1858 t->next = t->next->next;
1861 if (t->kid && t->u.flat.length > 1) {
1862 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
1863 pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin);
1864 pc = WriteCompactIndex(pc, t->u.flat.length);
1865 } else if (t->u.flat.chr < 256) {
1866 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
1867 *pc++ = (jsbytecode) t->u.flat.chr;
1868 } else {
1869 pc[-1] = (state->flags & JSREG_FOLD)
1870 ? REOP_UCFLAT1i
1871 : REOP_UCFLAT1;
1872 SET_ARG(pc, t->u.flat.chr);
1873 pc += ARG_LEN;
1875 break;
1877 case REOP_LPAREN:
1878 JS_ASSERT(emitStateSP);
1879 pc = WriteCompactIndex(pc, t->u.parenIndex);
1880 emitStateSP->continueNode = t;
1881 emitStateSP->continueOp = REOP_RPAREN;
1882 ++emitStateSP;
1883 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1884 t = (RENode *) t->kid;
1885 op = t->op;
1886 continue;
1888 case REOP_RPAREN:
1889 pc = WriteCompactIndex(pc, t->u.parenIndex);
1890 break;
1892 case REOP_BACKREF:
1893 pc = WriteCompactIndex(pc, t->u.parenIndex);
1894 break;
1896 case REOP_ASSERT:
1897 JS_ASSERT(emitStateSP);
1898 emitStateSP->nextTermFixup = pc;
1899 pc += OFFSET_LEN;
1900 emitStateSP->continueNode = t;
1901 emitStateSP->continueOp = REOP_ASSERTTEST;
1902 ++emitStateSP;
1903 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1904 t = (RENode *) t->kid;
1905 op = t->op;
1906 continue;
1908 case REOP_ASSERTTEST:
1909 case REOP_ASSERTNOTTEST:
1910 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1911 goto jump_too_big;
1912 break;
1914 case REOP_ASSERT_NOT:
1915 JS_ASSERT(emitStateSP);
1916 emitStateSP->nextTermFixup = pc;
1917 pc += OFFSET_LEN;
1918 emitStateSP->continueNode = t;
1919 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
1920 ++emitStateSP;
1921 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1922 t = (RENode *) t->kid;
1923 op = t->op;
1924 continue;
1926 case REOP_QUANT:
1927 JS_ASSERT(emitStateSP);
1928 if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) {
1929 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
1930 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
1931 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
1932 } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) {
1933 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
1934 } else {
1935 if (!t->u.range.greedy)
1936 pc[-1] = REOP_MINIMALQUANT;
1937 pc = WriteCompactIndex(pc, t->u.range.min);
1939 * Write max + 1 to avoid using size_t(max) + 1 bytes
1940 * for (uintN)-1 sentinel.
1942 pc = WriteCompactIndex(pc, t->u.range.max + 1);
1944 emitStateSP->nextTermFixup = pc;
1945 pc += OFFSET_LEN;
1946 emitStateSP->continueNode = t;
1947 emitStateSP->continueOp = REOP_ENDCHILD;
1948 ++emitStateSP;
1949 JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
1950 t = (RENode *) t->kid;
1951 op = t->op;
1952 continue;
1954 case REOP_ENDCHILD:
1955 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
1956 goto jump_too_big;
1957 break;
1959 case REOP_CLASS:
1960 if (!t->u.ucclass.sense)
1961 pc[-1] = REOP_NCLASS;
1962 pc = WriteCompactIndex(pc, t->u.ucclass.index);
1963 InitNodeCharSet(re, t);
1964 break;
1966 default:
1967 break;
1970 t = t->next;
1971 if (t) {
1972 op = t->op;
1973 } else {
1974 if (emitStateSP == emitStateStack)
1975 break;
1976 --emitStateSP;
1977 t = emitStateSP->continueNode;
1978 op = (REOp) emitStateSP->continueOp;
1982 cleanup:
1983 if (emitStateStack)
1984 state->context->free(emitStateStack);
1985 return pc;
1987 jump_too_big:
1988 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1989 pc = NULL;
1990 goto cleanup;
1993 static JSBool
1994 CompileRegExpToAST(JSContext* cx, TokenStream* ts,
1995 JSString* str, uintN flags, CompilerState& state)
1997 uintN i;
1998 size_t len;
2000 len = str->length();
2002 state.context = cx;
2003 state.tokenStream = ts;
2004 state.cp = js_UndependString(cx, str);
2005 if (!state.cp)
2006 return JS_FALSE;
2007 state.cpbegin = state.cp;
2008 state.cpend = state.cp + len;
2009 state.flags = uint16(flags);
2010 state.parenCount = 0;
2011 state.classCount = 0;
2012 state.progLength = 0;
2013 state.treeDepth = 0;
2014 state.classBitmapsMem = 0;
2015 for (i = 0; i < CLASS_CACHE_SIZE; i++)
2016 state.classCache[i].start = NULL;
2018 if (len != 0 && (flags & JSREG_FLAT)) {
2019 state.result = NewRENode(&state, REOP_FLAT);
2020 if (!state.result)
2021 return JS_FALSE;
2022 state.result->u.flat.chr = *state.cpbegin;
2023 state.result->u.flat.length = len;
2024 state.result->kid = (void *) state.cpbegin;
2025 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
2026 state.progLength += 1 + GetCompactIndexWidth(0)
2027 + GetCompactIndexWidth(len);
2028 return JS_TRUE;
2031 return ParseRegExp(&state);
2034 #ifdef JS_TRACER
2035 typedef js::Vector<LIns *, 4, js::ContextAllocPolicy> LInsList;
2037 namespace js {
2039 struct REFragment : public nanojit::Fragment
2041 REFragment(const void* _ip verbose_only(, uint32_t profFragID))
2042 : nanojit::Fragment(ip verbose_only(, profFragID))
2046 } /* namespace js */
2048 /* Return the cached fragment for the given regexp, or create one. */
2049 static Fragment*
2050 LookupNativeRegExp(JSContext* cx, uint16 re_flags,
2051 const jschar* re_chars, size_t re_length)
2053 TraceMonitor *tm = &JS_TRACE_MONITOR(cx);
2054 VMAllocator &alloc = *tm->dataAlloc;
2055 REHashMap &table = *tm->reFragments;
2057 REHashKey k(re_length, re_flags, re_chars);
2058 REFragment *frag = table.get(k);
2060 if (!frag) {
2061 verbose_only(
2062 uint32_t profFragID = (LogController.lcbits & LC_FragProfile)
2063 ? (++(tm->lastFragID)) : 0;
2065 frag = new (alloc) REFragment(0 verbose_only(, profFragID));
2067 * Copy the re_chars portion of the hash key into the Allocator, so
2068 * its lifecycle is disconnected from the lifecycle of the
2069 * underlying regexp.
2071 k.re_chars = (const jschar*) new (alloc) jschar[re_length];
2072 memcpy((void*) k.re_chars, re_chars, re_length * sizeof(jschar));
2073 table.put(k, frag);
2075 return frag;
2078 static JSBool
2079 ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet);
2081 /* Utilities for the RegExpNativeCompiler */
2083 namespace {
2085 * An efficient way to simultaneously statically guard that the sizeof(bool) is a
2086 * small power of 2 and take its log2.
2088 template <int> struct StaticLog2 {};
2089 template <> struct StaticLog2<1> { static const int result = 0; };
2090 template <> struct StaticLog2<2> { static const int result = 1; };
2091 template <> struct StaticLog2<4> { static const int result = 2; };
2092 template <> struct StaticLog2<8> { static const int result = 3; };
2096 * This table allows efficient testing for the ASCII portion of \s during a
2097 * trace. ECMA-262 15.10.2.12 defines the following characters below 128 to be
2098 * whitespace: 0x9 (0), 0xA (10), 0xB (11), 0xC (12), 0xD (13), 0x20 (32). The
2099 * index must be <= 32.
2101 static const bool js_ws[] = {
2102 /* 0 1 2 3 4 5 5 7 8 9 */
2103 /* 0 */ false, false, false, false, false, false, false, false, false, true,
2104 /* 1 */ true, true, true, true, false, false, false, false, false, false,
2105 /* 2 */ false, false, false, false, false, false, false, false, false, false,
2106 /* 3 */ false, false, true
2109 /* Sets of characters are described in terms of individuals and classes. */
2110 class CharSet {
2111 public:
2112 CharSet() : charEnd(charBuf), classes(0) {}
2114 static const uintN sBufSize = 8;
2116 bool full() { return charEnd == charBuf + sBufSize; }
2118 /* Add a single char to the set. */
2119 bool addChar(jschar c)
2121 if (full())
2122 return false;
2123 *charEnd++ = c;
2124 return true;
2127 enum Class {
2128 LineTerms = 1 << 0, /* Line Terminators (E262 7.3) */
2129 OtherSpace = 1 << 1, /* \s (E262 15.10.2.12) - LineTerms */
2130 Digit = 1 << 2, /* \d (E262 15.10.2.12) */
2131 OtherAlnum = 1 << 3, /* \w (E262 15,10.2.12) - Digit */
2132 Other = 1 << 4, /* all other characters */
2133 All = LineTerms | OtherSpace | Digit | OtherAlnum | Other,
2135 Space = LineTerms | OtherSpace,
2136 AlNum = Digit | OtherAlnum,
2137 Dot = All & ~LineTerms
2140 /* Add a set of chars to the set. */
2141 void addClass(Class c) { classes |= c; }
2143 /* Return whether two sets of chars are disjoint. */
2144 bool disjoint(const CharSet &) const;
2146 private:
2147 static bool disjoint(const jschar *beg, const jschar *end, uintN classes);
2149 mutable jschar charBuf[sBufSize];
2150 jschar *charEnd;
2151 uintN classes;
2154 /* Appease the type checker. */
2155 static inline CharSet::Class
2156 operator|(CharSet::Class c1, CharSet::Class c2) {
2157 return (CharSet::Class)(((int)c1) | ((int)c2));
2159 static inline CharSet::Class
2160 operator~(CharSet::Class c) {
2161 return (CharSet::Class)(~(int)c);
2165 * Return whether the characters in the range [beg, end) fall within any of the
2166 * classes with a bit set in 'classes'.
2168 bool
2169 CharSet::disjoint(const jschar *beg, const jschar *end, uintN classes)
2171 for (const jschar *p = beg; p != end; ++p) {
2172 if (JS7_ISDEC(*p)) {
2173 if (classes & Digit)
2174 return false;
2175 } else if (JS_ISWORD(*p)) {
2176 if (classes & OtherAlnum)
2177 return false;
2178 } else if (RE_IS_LINE_TERM(*p)) {
2179 if (classes & LineTerms)
2180 return false;
2181 } else if (JS_ISSPACE(*p)) {
2182 if (classes & OtherSpace)
2183 return false;
2184 } else {
2185 if (classes & Other)
2186 return false;
2189 return true;
2193 * Predicate version of the STL's set_intersection. Assumes both ranges are
2194 * sorted and thus runs in linear time.
2196 * FIXME: This is a reusable algorithm, perhaps it should be put somewhere.
2198 template <class InputIterator1, class InputIterator2>
2199 bool
2200 set_disjoint(InputIterator1 p1, InputIterator1 end1,
2201 InputIterator2 p2, InputIterator2 end2)
2203 if (p1 == end1 || p2 == end2)
2204 return true;
2205 while (*p1 != *p2) {
2206 if (*p1 < *p2) {
2207 ++p1;
2208 if (p1 == end1)
2209 return true;
2210 } else if (*p2 < *p1) {
2211 ++p2;
2212 if (p2 == end2)
2213 return true;
2216 return false;
2219 static JSBool
2220 CharCmp(void *arg, const void *a, const void *b, int *result)
2222 jschar ca = *(jschar *)a, cb = *(jschar *)b;
2223 *result = ca - cb;
2224 return JS_TRUE;
2227 bool
2228 CharSet::disjoint(const CharSet &other) const
2230 /* Check overlap between classes. */
2231 if (classes & other.classes)
2232 return false;
2235 * Check char-class overlap. Compare this->charBuf with other.classes and
2236 * vice versa with a loop.
2238 if (!disjoint(this->charBuf, this->charEnd, other.classes) ||
2239 !disjoint(other.charBuf, other.charEnd, this->classes))
2240 return false;
2242 /* Check char-char overlap. */
2243 jschar tmp[CharSet::sBufSize];
2244 js_MergeSort(charBuf, charEnd - charBuf, sizeof(jschar),
2245 CharCmp, 0, tmp);
2246 js_MergeSort(other.charBuf, other.charEnd - other.charBuf, sizeof(jschar),
2247 CharCmp, 0, tmp);
2248 return set_disjoint(charBuf, charEnd, other.charBuf, other.charEnd);
2252 * Return true if the given subexpression may match the empty string. The
2253 * conservative answer is |true|. If |next| is true, then the subexpression is
2254 * considered to be |node| followed by the rest of |node->next|. Otherwise, the
2255 * subexpression is considered to be |node| by itself.
2257 static bool
2258 mayMatchEmpty(RENode *node, bool next = true)
2260 if (!node)
2261 return true;
2262 switch (node->op) {
2263 case REOP_EMPTY: return true;
2264 case REOP_FLAT: return false;
2265 case REOP_CLASS: return false;
2266 case REOP_ALNUM: return false;
2267 case REOP_ALT: return (mayMatchEmpty((RENode *)node->kid) ||
2268 mayMatchEmpty((RENode *)node->u.kid2)) &&
2269 (!next || mayMatchEmpty(node->next));
2270 case REOP_QUANT: return (node->u.range.min == 0 ||
2271 mayMatchEmpty((RENode *)node->kid)) &&
2272 (!next || mayMatchEmpty(node->next));
2273 default: return true;
2278 * Enumerate the set of characters that may be consumed next by the given
2279 * subexpression in isolation. Return whether the enumeration was successful.
2281 static bool
2282 enumerateNextChars(JSContext *cx, RENode *node, CharSet &set)
2284 JS_CHECK_RECURSION(cx, return JS_FALSE);
2286 if (!node)
2287 return true;
2289 switch (node->op) {
2290 /* Record as bitflags. */
2291 case REOP_DOT: set.addClass(CharSet::Dot); return true;
2292 case REOP_DIGIT: set.addClass(CharSet::Digit); return true;
2293 case REOP_NONDIGIT: set.addClass(~CharSet::Digit); return true;
2294 case REOP_ALNUM: set.addClass(CharSet::AlNum); return true;
2295 case REOP_NONALNUM: set.addClass(~CharSet::AlNum); return true;
2296 case REOP_SPACE: set.addClass(CharSet::Space); return true;
2297 case REOP_NONSPACE: set.addClass(~CharSet::Space); return true;
2299 /* Record as individual characters. */
2300 case REOP_FLAT:
2301 return set.addChar(node->u.flat.chr);
2303 /* Control structures. */
2304 case REOP_EMPTY:
2305 return true;
2306 case REOP_ALT:
2307 case REOP_ALTPREREQ:
2308 return enumerateNextChars(cx, (RENode *)node->kid, set) &&
2309 enumerateNextChars(cx, (RENode *)node->u.kid2, set) &&
2310 (!mayMatchEmpty(node, false) ||
2311 enumerateNextChars(cx, (RENode *)node->next, set));
2312 case REOP_QUANT:
2313 return enumerateNextChars(cx, (RENode *)node->kid, set) &&
2314 (!mayMatchEmpty(node, false) ||
2315 enumerateNextChars(cx, (RENode *)node->next, set));
2317 /* Arbitrary character classes and oddities. */
2318 default:
2319 return false;
2323 class RegExpNativeCompiler {
2324 private:
2325 VMAllocator& tempAlloc;
2326 JSContext* cx;
2327 JSRegExp* re;
2328 CompilerState* cs; /* RegExp to compile */
2329 Fragment* fragment;
2330 LirWriter* lir;
2331 #ifdef DEBUG
2332 LirWriter* validate_writer;
2333 #endif
2334 #ifdef NJ_VERBOSE
2335 LirWriter* verbose_filter;
2336 #endif
2337 LirBufWriter* lirBufWriter; /* for skip */
2339 LIns* state;
2340 LIns* start;
2341 LIns* cpend;
2343 LirBuffer* const lirbuf;
2345 bool outOfMemory() {
2346 return tempAlloc.outOfMemory() || JS_TRACE_MONITOR(cx).dataAlloc->outOfMemory();
2349 JSBool isCaseInsensitive() const { return (cs->flags & JSREG_FOLD) != 0; }
2351 void targetCurrentPoint(LIns *ins)
2353 ins->setTarget(lir->ins0(LIR_label));
2356 void targetCurrentPoint(LInsList &fails)
2358 LIns *fail = lir->ins0(LIR_label);
2359 for (size_t i = 0; i < fails.length(); ++i) {
2360 fails[i]->setTarget(fail);
2362 fails.clear();
2366 * These functions return the new position after their match operation,
2367 * or NULL if there was an error.
2369 LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails)
2371 return pos;
2374 #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC)
2375 /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from
2376 * a pointer which is only 2-byte aligned.
2378 #undef USE_DOUBLE_CHAR_MATCH
2379 #else
2380 #define USE_DOUBLE_CHAR_MATCH
2381 #endif
2383 LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails)
2385 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0);
2386 if (!fails.append(to_fail))
2387 return NULL;
2388 LIns* text_ch = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY);
2390 // Extra characters that need to be compared against when doing folding.
2391 struct extra {
2392 jschar ch;
2393 LIns *match;
2395 extra extras[5];
2396 int nextras = 0;
2398 if (cs->flags & JSREG_FOLD) {
2399 ch = JS_TOUPPER(ch);
2400 jschar lch = inverse_upcase(ch);
2402 if (ch != lch) {
2403 if (L'A' <= ch && ch <= L'Z') {
2404 // Fast conversion of text character to lower case by OR-ing with 32.
2405 text_ch = lir->ins2(LIR_ori, text_ch, lir->insImmI(32));
2406 // These ASCII letters have 2 lower-case forms. We put the ASCII one in
2407 // |extras| so it is tested first, because we expect that to be the common
2408 // case. Note that the code points of the non-ASCII forms both have the
2409 // 32 bit set, so it is OK to compare against the OR-32-converted text char.
2410 ch = lch;
2411 if (ch == L'i') {
2412 extras[nextras++].ch = ch;
2413 ch = 0x131;
2414 } else if (ch == L's') {
2415 extras[nextras++].ch = ch;
2416 ch = 0x17f;
2418 goto gen;
2419 } else if (0x01c4 <= ch && ch <= 0x1e60) {
2420 // The following group of conditionals handles characters that have 1 or 2
2421 // lower-case forms in addition to JS_TOLOWER(ch).
2422 if (ch <= 0x1f1) { // DZ,LJ,NJ
2423 if (ch == 0x01c4) {
2424 extras[nextras++].ch = 0x01c5;
2425 } else if (ch == 0x01c7) {
2426 extras[nextras++].ch = 0x01c8;
2427 } else if (ch == 0x01ca) {
2428 extras[nextras++].ch = 0x01cb;
2429 } else if (ch == 0x01f1) {
2430 extras[nextras++].ch = 0x01f2;
2432 } else if (ch < 0x0392) { // no extra lower-case forms in this range
2433 } else if (ch <= 0x03a6) { // Greek
2434 if (ch == 0x0392) {
2435 extras[nextras++].ch = 0x03d0;
2436 } else if (ch == 0x0395) {
2437 extras[nextras++].ch = 0x03f5;
2438 } else if (ch == 0x0398) {
2439 extras[nextras++].ch = 0x03d1;
2440 } else if (ch == 0x0399) {
2441 extras[nextras++].ch = 0x0345;
2442 extras[nextras++].ch = 0x1fbe;
2443 } else if (ch == 0x039a) {
2444 extras[nextras++].ch = 0x03f0;
2445 } else if (ch == 0x039c) {
2446 extras[nextras++].ch = 0xb5;
2447 } else if (ch == 0x03a0) {
2448 extras[nextras++].ch = 0x03d6;
2449 } else if (ch == 0x03a1) {
2450 extras[nextras++].ch = 0x03f1;
2451 } else if (ch == 0x03a3) {
2452 extras[nextras++].ch = 0x03c2;
2453 } else if (ch == 0x03a6) {
2454 extras[nextras++].ch = 0x03d5;
2456 } else if (ch == 0x1e60) { // S with dot above
2457 extras[nextras++].ch = 0x1e9b;
2461 extras[nextras++].ch = lch;
2465 gen:
2466 for (int i = 0; i < nextras; ++i) {
2467 LIns *test = lir->ins2(LIR_eqi, text_ch, lir->insImmI(extras[i].ch));
2468 LIns *branch = lir->insBranch(LIR_jt, test, 0);
2469 extras[i].match = branch;
2472 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eqi, text_ch, lir->insImmI(ch)), 0)))
2473 return NULL;
2475 for (int i = 0; i < nextras; ++i)
2476 targetCurrentPoint(extras[i].match);
2477 return lir->ins2(LIR_addp, pos, lir->insImmWord(2));
2480 JS_INLINE bool hasCases(jschar ch)
2482 return JS_TOLOWER(ch) != JS_TOUPPER(ch);
2485 LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, LInsList& fails)
2487 #ifdef IS_BIG_ENDIAN
2488 uint32 word = (ch1 << 16) | ch2;
2489 #else
2490 uint32 word = (ch2 << 16) | ch1;
2491 #endif
2493 * Fast case-insensitive test for ASCII letters: convert text
2494 * char to lower case by bit-or-ing in 32 and compare.
2496 JSBool useFastCI = JS_FALSE;
2497 union { jschar c[2]; uint32 i; } mask;
2498 if (cs->flags & JSREG_FOLD) {
2499 jschar uch1 = JS_TOUPPER(ch1);
2500 jschar uch2 = JS_TOUPPER(ch2);
2501 JSBool mask1 = (L'A' <= uch1 && uch1 <= L'Z' && uch1 != L'I' && uch1 != L'S');
2502 JSBool mask2 = (L'A' <= uch2 && uch2 <= L'Z' && uch2 != L'I' && uch2 != L'S');
2503 if ((!mask1 && hasCases(ch1)) || (!mask2 && hasCases(ch2))) {
2504 pos = compileFlatSingleChar(ch1, pos, fails);
2505 if (!pos) return NULL;
2506 return compileFlatSingleChar(ch2, pos, fails);
2509 mask.c[0] = mask1 ? 0x0020 : 0x0;
2510 mask.c[1] = mask2 ? 0x0020 : 0x0;
2512 if (mask.i) {
2513 word |= mask.i;
2514 useFastCI = JS_TRUE;
2518 LIns* to_fail = lir->insBranch(LIR_jf,
2519 lir->ins2(LIR_ltp,
2520 pos,
2521 lir->ins2(LIR_addp,
2522 cpend,
2523 lir->insImmWord(-2))),
2525 if (!fails.append(to_fail))
2526 return NULL;
2527 LIns* text_word = lir->insLoad(LIR_ldi, pos, 0, ACC_OTHER);
2528 LIns* comp_word = useFastCI ?
2529 lir->ins2(LIR_ori, text_word, lir->insImmI(mask.i)) :
2530 text_word;
2531 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eqi, comp_word, lir->insImmI(word)), 0)))
2532 return NULL;
2534 return lir->ins2(LIR_addp, pos, lir->insImmWord(4));
2537 LIns* compileFlat(RENode *&node, LIns* pos, LInsList& fails)
2539 #ifdef USE_DOUBLE_CHAR_MATCH
2540 if (node->u.flat.length == 1) {
2541 if (node->next && node->next->op == REOP_FLAT &&
2542 node->next->u.flat.length == 1) {
2543 pos = compileFlatDoubleChar(node->u.flat.chr,
2544 node->next->u.flat.chr,
2545 pos, fails);
2546 node = node->next;
2547 } else {
2548 pos = compileFlatSingleChar(node->u.flat.chr, pos, fails);
2550 return pos;
2551 } else {
2552 size_t i;
2553 for (i = 0; i < node->u.flat.length - 1; i += 2) {
2554 if (outOfMemory())
2555 return 0;
2556 pos = compileFlatDoubleChar(((jschar*) node->kid)[i],
2557 ((jschar*) node->kid)[i+1],
2558 pos, fails);
2559 if (!pos)
2560 return 0;
2562 JS_ASSERT(pos != 0);
2563 if (i == node->u.flat.length - 1)
2564 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2565 return pos;
2567 #else
2568 if (node->u.flat.length == 1) {
2569 return compileFlatSingleChar(node->u.flat.chr, pos, fails);
2570 } else {
2571 for (size_t i = 0; i < node->u.flat.length; i++) {
2572 if (outOfMemory())
2573 return 0;
2574 pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails);
2575 if (!pos)
2576 return 0;
2578 return pos;
2580 #endif
2583 LIns* compileClass(RENode* node, LIns* pos, LInsList& fails)
2585 if (!node->u.ucclass.sense)
2586 return JS_FALSE;
2588 * If we share generated native code, we need to make a copy
2589 * of the bitmap because the original regexp's copy is destroyed when
2590 * that regexp is.
2592 RECharSet *charSet = &re->classList[node->u.ucclass.index];
2593 size_t bitmapLen = (charSet->length >> 3) + 1;
2594 /* Arbitrary size limit on bitmap. */
2595 if (bitmapLen > 1024)
2596 return NULL;
2597 Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc;
2598 /* The following line allocates charSet.u.bits if successful. */
2599 if (!charSet->converted && !ProcessCharSet(cx, re, charSet))
2600 return NULL;
2601 void* bitmapData = alloc.alloc(bitmapLen);
2602 if (outOfMemory())
2603 return NULL;
2604 memcpy(bitmapData, charSet->u.bits, bitmapLen);
2606 LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0);
2607 if (!fails.append(to_fail))
2608 return NULL;
2609 LIns* text_ch = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY);
2610 if (!fails.append(lir->insBranch(LIR_jf,
2611 lir->ins2(LIR_lei, text_ch, lir->insImmI(charSet->length)),
2612 0))) {
2613 return NULL;
2615 LIns* byteIndex = lir->insI2P(lir->ins2(LIR_rshi, text_ch, lir->insImmI(3)));
2616 LIns* bitmap = lir->insImmP(bitmapData);
2617 LIns* byte = lir->insLoad(LIR_lduc2ui, lir->ins2(LIR_addp, bitmap, byteIndex), (int) 0,
2618 ACC_READONLY);
2619 LIns* bitMask = lir->ins2(LIR_lshi, lir->insImmI(1),
2620 lir->ins2(LIR_andi, text_ch, lir->insImmI(0x7)));
2621 LIns* test = lir->ins2(LIR_eqi, lir->ins2(LIR_andi, byte, bitMask), lir->insImmI(0));
2623 LIns* to_next = lir->insBranch(LIR_jt, test, 0);
2624 if (!fails.append(to_next))
2625 return NULL;
2626 return lir->ins2(LIR_addp, pos, lir->insImmWord(2));
2629 /* Factor out common code to index js_alnum. */
2630 LIns *compileTableRead(LIns *chr, const bool *tbl)
2632 if (sizeof(bool) != 1) {
2633 LIns *sizeLog2 = lir->insImmI(StaticLog2<sizeof(bool)>::result);
2634 chr = lir->ins2(LIR_lshi, chr, sizeLog2);
2636 LIns *addr = lir->ins2(LIR_addp, lir->insImmP(tbl), lir->insUI2P(chr));
2637 return lir->insLoad(LIR_lduc2ui, addr, 0, ACC_READONLY);
2640 /* Compile a builtin character class. */
2641 LIns *compileBuiltinClass(RENode *node, LIns *pos, LInsList &fails)
2643 /* All the builtins checked below consume one character. */
2644 if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0)))
2645 return NULL;
2646 LIns *chr = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY);
2648 switch (node->op) {
2649 case REOP_DOT:
2651 /* Accept any character except those in ECMA-262 15.10.2.8. */
2652 LIns *eq1 = lir->ins2(LIR_eqi, chr, lir->insImmI('\n'));
2653 if (!fails.append(lir->insBranch(LIR_jt, eq1, NULL)))
2654 return NULL;
2655 LIns *eq2 = lir->ins2(LIR_eqi, chr, lir->insImmI('\r'));
2656 if (!fails.append(lir->insBranch(LIR_jt, eq2, NULL)))
2657 return NULL;
2658 LIns *eq3 = lir->ins2(LIR_eqi, chr, lir->insImmI(LINE_SEPARATOR));
2659 if (!fails.append(lir->insBranch(LIR_jt, eq3, NULL)))
2660 return NULL;
2661 LIns *eq4 = lir->ins2(LIR_eqi, chr, lir->insImmI(PARA_SEPARATOR));
2662 if (!fails.append(lir->insBranch(LIR_jt, eq4, NULL)))
2663 return NULL;
2664 break;
2666 case REOP_DIGIT:
2668 LIns *ge = lir->ins2(LIR_gei, chr, lir->insImmI('0'));
2669 if (!fails.append(lir->insBranch(LIR_jf, ge, NULL)))
2670 return NULL;
2671 LIns *le = lir->ins2(LIR_lei, chr, lir->insImmI('9'));
2672 if (!fails.append(lir->insBranch(LIR_jf, le, NULL)))
2673 return NULL;
2674 break;
2676 case REOP_NONDIGIT:
2678 /* Use 'and' to give a predictable branch for success path. */
2679 LIns *ge = lir->ins2(LIR_gei, chr, lir->insImmI('0'));
2680 LIns *le = lir->ins2(LIR_lei, chr, lir->insImmI('9'));
2681 LIns *both = lir->ins2(LIR_andi, ge, le);
2682 if (!fails.append(lir->insBranch(LIR_jf, lir->insEqI_0(both), NULL)))
2683 return NULL;
2684 break;
2686 case REOP_ALNUM:
2689 * Compile the condition:
2690 * ((uint)*cp) < 128 && js_alnum[(uint)*cp]
2692 LIns *rangeCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(128));
2693 if (!fails.append(lir->insBranch(LIR_jf, rangeCnd, NULL)))
2694 return NULL;
2695 LIns *tableVal = compileTableRead(chr, js_alnum);
2696 if (!fails.append(lir->insBranch(LIR_jt, lir->insEqI_0(tableVal), NULL)))
2697 return NULL;
2698 break;
2700 case REOP_NONALNUM:
2703 * Compile the condition:
2704 * ((uint)*cp) >= 128 || !js_alnum[(uint)*cp]
2706 LIns *rangeCnd = lir->ins2(LIR_geui, chr, lir->insImmI(128));
2707 LIns *rangeBr = lir->insBranch(LIR_jt, rangeCnd, NULL);
2708 LIns *tableVal = compileTableRead(chr, js_alnum);
2709 if (!fails.append(lir->insBranch(LIR_jf, lir->insEqI_0(tableVal), NULL)))
2710 return NULL;
2711 LIns *success = lir->ins0(LIR_label);
2712 rangeBr->setTarget(success);
2713 break;
2715 case REOP_SPACE:
2716 case REOP_NONSPACE:
2719 * ECMA-262 7.2, 7.3, and 15.10.2.12 define a bunch of Unicode code
2720 * points for whitespace. We optimize here for the common case of
2721 * ASCII characters using a table lookup for the lower block that
2722 * can actually contain spaces. For the rest, use a (more or less)
2723 * binary search to minimize tests.
2725 * [0000,0020]: 9, A, B, C, D, 20
2726 * (0020,00A0): none
2727 * [00A0,2000): A0, 1680, 180E
2728 * [2000,200A]: all
2729 * (200A, max): 2028, 2029, 202F, 205F, 3000
2731 /* Below 0x20? */
2732 LIns *tableRangeCnd = lir->ins2(LIR_leui, chr, lir->insImmI(0x20));
2733 LIns *tableRangeBr = lir->insBranch(LIR_jt, tableRangeCnd, NULL);
2734 /* Fall through means *chr > 0x20. */
2736 /* Handle (0x20,0xA0). */
2737 LIns *asciiCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(0xA0));
2738 LIns *asciiMissBr = lir->insBranch(LIR_jt, asciiCnd, NULL);
2739 /* Fall through means *chr >= 0xA0. */
2741 /* Partition around [0x2000,0x200A]. */
2742 LIns *belowCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(0x2000));
2743 LIns *belowBr = lir->insBranch(LIR_jt, belowCnd, NULL);
2744 LIns *aboveCnd = lir->ins2(LIR_gtui, chr, lir->insImmI(0x200A));
2745 LIns *aboveBr = lir->insBranch(LIR_jt, aboveCnd, NULL);
2746 LIns *intervalMatchBr = lir->insBranch(LIR_j, NULL, NULL);
2748 /* Handle [0xA0,0x2000). */
2749 LIns *belowLbl = lir->ins0(LIR_label);
2750 belowBr->setTarget(belowLbl);
2751 LIns *eq1Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0xA0));
2752 LIns *eq1Br = lir->insBranch(LIR_jt, eq1Cnd, NULL);
2753 LIns *eq2Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x1680));
2754 LIns *eq2Br = lir->insBranch(LIR_jt, eq2Cnd, NULL);
2755 LIns *eq3Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x180E));
2756 LIns *eq3Br = lir->insBranch(LIR_jt, eq3Cnd, NULL);
2757 LIns *belowMissBr = lir->insBranch(LIR_j, NULL, NULL);
2759 /* Handle (0x200A, max). */
2760 LIns *aboveLbl = lir->ins0(LIR_label);
2761 aboveBr->setTarget(aboveLbl);
2762 LIns *eq4Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x2028));
2763 LIns *eq4Br = lir->insBranch(LIR_jt, eq4Cnd, NULL);
2764 LIns *eq5Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x2029));
2765 LIns *eq5Br = lir->insBranch(LIR_jt, eq5Cnd, NULL);
2766 LIns *eq6Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x202F));
2767 LIns *eq6Br = lir->insBranch(LIR_jt, eq6Cnd, NULL);
2768 LIns *eq7Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x205F));
2769 LIns *eq7Br = lir->insBranch(LIR_jt, eq7Cnd, NULL);
2770 LIns *eq8Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x3000));
2771 LIns *eq8Br = lir->insBranch(LIR_jt, eq8Cnd, NULL);
2772 LIns *aboveMissBr = lir->insBranch(LIR_j, NULL, NULL);
2774 /* Handle [0,0x20]. */
2775 LIns *tableLbl = lir->ins0(LIR_label);
2776 tableRangeBr->setTarget(tableLbl);
2777 LIns *tableVal = compileTableRead(chr, js_ws);
2778 LIns *tableCnd = lir->insEqI_0(tableVal);
2779 LIns *tableMatchBr = lir->insBranch(LIR_jf, tableCnd, NULL);
2781 /* Collect misses. */
2782 LIns *missLbl = lir->ins0(LIR_label);
2783 asciiMissBr->setTarget(missLbl);
2784 belowMissBr->setTarget(missLbl);
2785 aboveMissBr->setTarget(missLbl);
2786 LIns *missBr = lir->insBranch(LIR_j, NULL, NULL);
2787 if (node->op == REOP_SPACE) {
2788 if (!fails.append(missBr))
2789 return NULL;
2792 /* Collect matches. */
2793 LIns *matchLbl = lir->ins0(LIR_label);
2794 intervalMatchBr->setTarget(matchLbl);
2795 tableMatchBr->setTarget(matchLbl);
2796 eq1Br->setTarget(matchLbl); eq2Br->setTarget(matchLbl);
2797 eq3Br->setTarget(matchLbl); eq4Br->setTarget(matchLbl);
2798 eq5Br->setTarget(matchLbl); eq6Br->setTarget(matchLbl);
2799 eq7Br->setTarget(matchLbl); eq8Br->setTarget(matchLbl);
2800 if (node->op == REOP_NONSPACE) {
2801 LIns *matchBr = lir->insBranch(LIR_j, NULL, NULL);
2802 if (!fails.append(matchBr))
2803 return NULL;
2805 /* Fall through means match == success. */
2807 /* Collect successes to fall through. */
2808 LIns *success = lir->ins0(LIR_label);
2809 if (node->op == REOP_NONSPACE)
2810 missBr->setTarget(success);
2811 break;
2813 default:
2814 return NULL;
2817 return lir->ins2(LIR_addp, pos, lir->insImmWord(2));
2820 LIns *compileAlt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2822 RENode *leftRe = (RENode *)node->kid, *rightRe = (RENode *)node->u.kid2;
2825 * If the RE continues after the alternative, we need to ensure that no
2826 * backtracking is required. Recursive calls to compileNode will fail
2827 * on capturing parens, so the only thing we have to check here is that,
2828 * if the left subexpression matches, we can keep going without later
2829 * deciding we need to try the right subexpression.
2831 if (!atEnd) {
2833 * If there is no character overlap between left and right, then
2834 * there is only one possible path through the alternative.
2836 CharSet leftSet, rightSet;
2837 if (!enumerateNextChars(cx, leftRe, leftSet) ||
2838 !enumerateNextChars(cx, rightRe, rightSet) ||
2839 !leftSet.disjoint(rightSet))
2840 return NULL;
2843 * If there is an empty path through either subexpression, the above
2844 * check is incomplete; we need to include |node->next| as well.
2846 bool epsLeft = mayMatchEmpty(leftRe),
2847 epsRight = mayMatchEmpty(rightRe);
2848 if (epsRight && epsLeft) {
2849 return NULL;
2850 } else if (epsLeft || epsRight) {
2851 CharSet nextSet;
2852 if (!enumerateNextChars(cx, node->next, nextSet) ||
2853 (epsLeft && !nextSet.disjoint(rightSet)) ||
2854 (epsRight && !nextSet.disjoint(leftSet))) {
2855 return NULL;
2860 /* Try left branch. */
2861 LInsList kidFails(cx);
2862 LIns *branchEnd = compileNode(leftRe, pos, atEnd, kidFails);
2863 if (!branchEnd)
2864 return NULL;
2867 * Since there are no phis, simulate by writing to and reading from
2868 * memory (REGlobalData::stateStack, since it is unused).
2870 lir->insStore(branchEnd, state,
2871 offsetof(REGlobalData, stateStack), ACC_OTHER);
2872 LIns *leftSuccess = lir->insBranch(LIR_j, NULL, NULL);
2874 /* Try right branch. */
2875 targetCurrentPoint(kidFails);
2876 if (!(branchEnd = compileNode(rightRe, pos, atEnd, fails)))
2877 return NULL;
2878 lir->insStore(branchEnd, state,
2879 offsetof(REGlobalData, stateStack), ACC_OTHER);
2881 /* Land success on the left branch. */
2882 targetCurrentPoint(leftSuccess);
2883 return addName(fragment->lirbuf,
2884 lir->insLoad(LIR_ldp, state, offsetof(REGlobalData, stateStack), ACC_OTHER),
2885 "pos");
2888 LIns *compileOpt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2891 * Since there are no phis, simulate by writing to and reading from
2892 * memory (REGlobalData::stateStack, since it is unused).
2894 lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
2896 /* Try ? body. */
2897 LInsList kidFails(cx);
2898 if (!(pos = compileNode(node, pos, atEnd, kidFails)))
2899 return NULL;
2900 lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
2902 /* Join success and failure and get new position. */
2903 targetCurrentPoint(kidFails);
2904 pos = addName(fragment->lirbuf,
2905 lir->insLoad(LIR_ldp, state, offsetof(REGlobalData, stateStack), ACC_OTHER),
2906 "pos");
2908 return pos;
2911 LIns *compileQuant(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
2913 /* Only support greedy *, +, ?. */
2914 if (!node->u.range.greedy ||
2915 node->u.range.min > 1 ||
2916 (node->u.range.max > 1 && node->u.range.max < (uintN)-1)) {
2917 return NULL;
2920 RENode *bodyRe = (RENode *)node->kid;
2923 * If the RE continues after the alternative, we need to ensure that no
2924 * backtracking is required. Recursive calls to compileNode will fail
2925 * on capturing parens, so the only thing we have to check here is that,
2926 * if the quantifier body matches, we can continue matching the body
2927 * without later deciding we need to undo the body matches.
2929 if (!atEnd) {
2931 * If there is no character overlap between the body and
2932 * |node->next|, then all possible body matches are used.
2934 CharSet bodySet, nextSet;
2935 if (!enumerateNextChars(cx, bodyRe, bodySet) ||
2936 !enumerateNextChars(cx, node->next, nextSet) ||
2937 !bodySet.disjoint(nextSet)) {
2938 return NULL;
2942 /* Fork off ? and {1,1}. */
2943 if (node->u.range.max == 1) {
2944 if (node->u.range.min == 1)
2945 return compileNode(bodyRe, pos, atEnd, fails);
2946 else
2947 return compileOpt(bodyRe, pos, atEnd, fails);
2950 /* For +, compile a copy of the body where failure is real failure. */
2951 if (node->u.range.min == 1) {
2952 if (!(pos = compileNode(bodyRe, pos, atEnd, fails)))
2953 return NULL;
2957 * Since there are no phis, simulate by writing to and reading from
2958 * memory (REGlobalData::stateStack, since it is unused).
2960 lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
2962 /* Begin iteration: load loop variables. */
2963 LIns *loopTop = lir->ins0(LIR_label);
2964 LIns *iterBegin = addName(fragment->lirbuf,
2965 lir->insLoad(LIR_ldp, state,
2966 offsetof(REGlobalData, stateStack), ACC_OTHER),
2967 "pos");
2969 /* Match quantifier body. */
2970 LInsList kidFails(cx);
2971 LIns *iterEnd = compileNode(bodyRe, iterBegin, atEnd, kidFails);
2972 if (!iterEnd)
2973 return NULL;
2976 * If there is an epsilon path through the body then, when it is taken,
2977 * we need to abort the loop or else we will loop forever.
2979 if (mayMatchEmpty(bodyRe)) {
2980 LIns *eqCnd = lir->ins2(LIR_eqp, iterBegin, iterEnd);
2981 if (!kidFails.append(lir->insBranch(LIR_jt, eqCnd, NULL)))
2982 return NULL;
2985 /* End iteration: store loop variables, increment, jump */
2986 lir->insStore(iterEnd, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
2987 lir->insBranch(LIR_j, NULL, loopTop);
2990 * Using '+' as branch, the intended control flow is:
2992 * ...
2993 * A -> |
2994 * |<---.
2995 * B -> | |
2996 * +--. |
2997 * C -> | | |
2998 * +--. |
2999 * D -> | | |
3000 * +--|-'
3001 * X -> | |
3002 * |<-'
3003 * E -> |
3004 * ...
3006 * We are currently at point X. Since the regalloc makes a single,
3007 * linear, backwards sweep over the IR (going from E to A), point X
3008 * must tell the regalloc what LIR insns are live at the end of D.
3009 * Thus, we need to report *all* insns defined *before* the end of D
3010 * that may be used *after* D. This means insns defined in A, B, C, or
3011 * D and used in B, C, D, or E. Since insns in B, C, and D are
3012 * conditionally executed, and we (currently) don't have real phi
3013 * nodes, we need only consider insns defined in A and used in E.
3015 lir->ins1(LIR_livep, state);
3016 lir->ins1(LIR_livep, cpend);
3017 lir->ins1(LIR_livep, start);
3019 /* After the loop: reload 'pos' from memory and continue. */
3020 targetCurrentPoint(kidFails);
3021 return iterBegin;
3025 * Compile the regular expression rooted at 'node'. Return 0 on failed
3026 * compilation. Otherwise, generate code that falls through on success (the
3027 * returned LIns* is the current 'pos') and jumps to the end on failure (by
3028 * adding the guard LIns to 'fails').
3030 LIns *compileNode(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
3032 for (; pos && node; node = node->next) {
3033 if (outOfMemory())
3034 return NULL;
3036 bool childNextIsEnd = atEnd && !node->next;
3038 switch (node->op) {
3039 case REOP_EMPTY:
3040 pos = compileEmpty(node, pos, fails);
3041 break;
3042 case REOP_FLAT:
3043 pos = compileFlat(node, pos, fails);
3044 break;
3045 case REOP_ALT:
3046 case REOP_ALTPREREQ:
3047 pos = compileAlt(node, pos, childNextIsEnd, fails);
3048 break;
3049 case REOP_QUANT:
3050 pos = compileQuant(node, pos, childNextIsEnd, fails);
3051 break;
3052 case REOP_CLASS:
3053 pos = compileClass(node, pos, fails);
3054 break;
3055 case REOP_DOT:
3056 case REOP_DIGIT:
3057 case REOP_NONDIGIT:
3058 case REOP_ALNUM:
3059 case REOP_NONALNUM:
3060 case REOP_SPACE:
3061 case REOP_NONSPACE:
3062 pos = compileBuiltinClass(node, pos, fails);
3063 break;
3064 default:
3065 return NULL;
3068 return pos;
3072 * This function kicks off recursive compileNode compilation, finishes the
3073 * success path, and lets the failed-match path fall through.
3075 bool compileRootNode(RENode *root, LIns *pos, LIns *anchorFail)
3077 /* Compile the regular expression body. */
3078 LInsList fails(cx);
3079 pos = compileNode(root, pos, true, fails);
3080 if (!pos)
3081 return false;
3083 /* Fall-through from compileNode means success. */
3084 lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
3085 lir->ins0(LIR_regfence);
3086 lir->ins1(LIR_reti, lir->insImmI(1));
3088 /* Stick return here so we don't have to jump over it every time. */
3089 if (anchorFail) {
3090 targetCurrentPoint(anchorFail);
3091 lir->ins0(LIR_regfence);
3092 lir->ins1(LIR_reti, lir->insImmI(0));
3095 /* Target failed matches. */
3096 targetCurrentPoint(fails);
3097 return true;
3100 /* Compile a regular expressions that can only match on the first char. */
3101 bool compileSticky(RENode *root, LIns *start)
3103 if (!compileRootNode(root, start, NULL))
3104 return false;
3106 /* Failed to match on first character, so fail whole match. */
3107 lir->ins0(LIR_regfence);
3108 lir->ins1(LIR_reti, lir->insImmI(0));
3109 return !outOfMemory();
3112 /* Compile normal regular expressions that can match starting at any char. */
3113 bool compileAnchoring(RENode *root, LIns *start)
3115 /* Guard outer anchoring loop. Use <= to allow empty regexp match. */
3116 LIns *anchorFail = lir->insBranch(LIR_jf, lir->ins2(LIR_lep, start, cpend), 0);
3118 if (!compileRootNode(root, start, anchorFail))
3119 return false;
3121 /* Outer loop increment. */
3122 lir->insStore(lir->ins2(LIR_addp, start, lir->insImmWord(2)), state,
3123 offsetof(REGlobalData, skipped), ACC_OTHER);
3125 return !outOfMemory();
3128 inline LIns*
3129 addName(LirBuffer* lirbuf, LIns* ins, const char* name)
3131 #ifdef NJ_VERBOSE
3132 debug_only_stmt(lirbuf->printer->lirNameMap->addName(ins, name);)
3133 #endif
3134 return ins;
3138 * Insert the side exit and guard record for a compiled regexp. Most
3139 * of the fields are not used. The important part is the regexp source
3140 * and flags, which we use as the fragment lookup key.
3142 GuardRecord* insertGuard(LIns* loopLabel, const jschar* re_chars, size_t re_length)
3144 if (loopLabel) {
3145 lir->insBranch(LIR_j, NULL, loopLabel);
3146 LirBuffer* lirbuf = fragment->lirbuf;
3147 lir->ins1(LIR_livep, lirbuf->state);
3148 lir->ins1(LIR_livep, lirbuf->param1);
3151 Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc;
3153 /* Must only create a VMSideExit; see StackFilter::getTops. */
3154 size_t len = (sizeof(GuardRecord) +
3155 sizeof(VMSideExit) +
3156 (re_length-1) * sizeof(jschar));
3157 GuardRecord* guard = (GuardRecord *) alloc.alloc(len);
3158 VMSideExit* exit = (VMSideExit*)(guard+1);
3159 guard->exit = exit;
3160 guard->exit->target = fragment;
3161 fragment->lastIns = lir->insGuard(LIR_x, NULL, guard);
3162 // guard->profCount is calloc'd to zero
3163 verbose_only(
3164 guard->profGuardID = fragment->guardNumberer++;
3165 guard->nextInFrag = fragment->guardsForFrag;
3166 fragment->guardsForFrag = guard;
3168 return guard;
3171 public:
3172 RegExpNativeCompiler(JSContext* cx, JSRegExp* re, CompilerState* cs, Fragment* fragment)
3173 : tempAlloc(*JS_TRACE_MONITOR(cx).reTempAlloc), cx(cx),
3174 re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL),
3175 lirbuf(new (tempAlloc) LirBuffer(tempAlloc))
3177 fragment->lirbuf = lirbuf;
3178 #ifdef DEBUG
3179 lirbuf->printer = new (tempAlloc) LInsPrinter(tempAlloc);
3180 #endif
3183 ~RegExpNativeCompiler() {
3184 /* Purge the tempAlloc used during recording. */
3185 tempAlloc.reset();
3188 JSBool compile()
3190 GuardRecord* guard = NULL;
3191 const jschar* re_chars;
3192 size_t re_length;
3193 TraceMonitor* tm = &JS_TRACE_MONITOR(cx);
3194 Assembler *assm = tm->assembler;
3195 LIns* loopLabel = NULL;
3197 if (outOfMemory() || OverfullJITCache(tm))
3198 return JS_FALSE;
3200 re->source->getCharsAndLength(re_chars, re_length);
3202 * If the regexp is too long nanojit will assert when we
3203 * try to insert the guard record.
3205 if (re_length > 1024) {
3206 re->flags |= JSREG_NOCOMPILE;
3207 return JS_FALSE;
3210 /* At this point we have an empty fragment. */
3211 LirBuffer* lirbuf = fragment->lirbuf;
3212 if (outOfMemory())
3213 goto fail;
3214 /* FIXME Use bug 463260 smart pointer when available. */
3215 lir = lirBufWriter = new LirBufWriter(lirbuf, nanojit::AvmCore::config);
3217 /* FIXME Use bug 463260 smart pointer when available. */
3218 #ifdef NJ_VERBOSE
3219 debug_only_stmt(
3220 if (LogController.lcbits & LC_TMRegexp) {
3221 lir = verbose_filter = new VerboseWriter(tempAlloc, lir, lirbuf->printer,
3222 &LogController);
3225 #endif
3226 #ifdef DEBUG
3227 lir = validate_writer = new ValidateWriter(lir, lirbuf->printer, "regexp writer pipeline");
3228 #endif
3231 * Although we could just load REGlobalData::cpend from 'state', by
3232 * passing it as a parameter, we avoid loading it every iteration.
3234 lir->ins0(LIR_start);
3236 for (int i = 0; i < NumSavedRegs; ++i)
3237 lir->insParam(i, 1);
3238 #ifdef DEBUG
3239 for (int i = 0; i < NumSavedRegs; ++i)
3240 addName(lirbuf, lirbuf->savedRegs[i], regNames[Assembler::savedRegs[i]]);
3241 #endif
3243 lirbuf->state = state = addName(lirbuf, lir->insParam(0, 0), "state");
3244 lirbuf->param1 = cpend = addName(lirbuf, lir->insParam(1, 0), "cpend");
3246 loopLabel = lir->ins0(LIR_label);
3247 // If profiling, record where the loop label is, so that the
3248 // assembler can insert a frag-entry-counter increment at that
3249 // point
3250 verbose_only( if (LogController.lcbits & LC_FragProfile) {
3251 NanoAssert(!fragment->loopLabel);
3252 fragment->loopLabel = loopLabel;
3255 start = addName(lirbuf,
3256 lir->insLoad(LIR_ldp, state, offsetof(REGlobalData, skipped), ACC_OTHER),
3257 "start");
3259 if (cs->flags & JSREG_STICKY) {
3260 if (!compileSticky(cs->result, start))
3261 goto fail;
3262 } else {
3263 if (!compileAnchoring(cs->result, start))
3264 goto fail;
3267 guard = insertGuard(loopLabel, re_chars, re_length);
3269 if (outOfMemory())
3270 goto fail;
3273 * Deep in the nanojit compiler, the StackFilter is trying to throw
3274 * away stores above the VM interpreter/native stacks. We have no such
3275 * stacks, so rely on the fact that lirbuf->sp and lirbuf->rp are null
3276 * to ensure our stores are ignored.
3278 JS_ASSERT(!lirbuf->sp && !lirbuf->rp);
3280 assm->compile(fragment, tempAlloc, /*optimize*/true verbose_only(, lirbuf->printer));
3281 if (assm->error() != nanojit::None)
3282 goto fail;
3284 delete lirBufWriter;
3285 #ifdef DEBUG
3286 delete validate_writer;
3287 #endif
3288 #ifdef NJ_VERBOSE
3289 debug_only_stmt( if (LogController.lcbits & LC_TMRegexp)
3290 delete verbose_filter; )
3291 #endif
3292 return JS_TRUE;
3293 fail:
3294 if (outOfMemory() || OverfullJITCache(tm)) {
3295 delete lirBufWriter;
3296 // recover profiling data from expiring Fragments
3297 verbose_only(
3298 REHashMap::Iter iter(*(tm->reFragments));
3299 while (iter.next()) {
3300 nanojit::Fragment* frag = iter.value();
3301 FragProfiling_FragFinalizer(frag, tm);
3304 FlushJITCache(cx);
3305 } else {
3306 if (!guard) insertGuard(loopLabel, re_chars, re_length);
3307 re->flags |= JSREG_NOCOMPILE;
3308 delete lirBufWriter;
3310 #ifdef DEBUG
3311 delete validate_writer;
3312 #endif
3313 #ifdef NJ_VERBOSE
3314 debug_only_stmt( if (LogController.lcbits & LC_TMRegexp)
3315 delete verbose_filter; )
3316 #endif
3317 return JS_FALSE;
3322 * Compile a regexp to native code in the given fragment.
3324 static inline JSBool
3325 CompileRegExpToNative(JSContext* cx, JSRegExp* re, Fragment* fragment)
3327 JSBool rv = JS_FALSE;
3328 void* mark;
3329 CompilerState state;
3330 RegExpNativeCompiler rc(cx, re, &state, fragment);
3332 JS_ASSERT(!fragment->code());
3333 mark = cx->tempPool.getMark();
3334 if (!CompileRegExpToAST(cx, NULL, re->source, re->flags, state)) {
3335 goto out;
3337 rv = rc.compile();
3338 out:
3339 cx->tempPool.release(mark);
3340 return rv;
3343 /* Function type for a compiled native regexp. */
3344 typedef void *(FASTCALL *NativeRegExp)(REGlobalData*, const jschar *);
3347 * Return a compiled native regexp if one already exists or can be created
3348 * now, or NULL otherwise.
3350 static NativeRegExp
3351 GetNativeRegExp(JSContext* cx, JSRegExp* re)
3353 const jschar *re_chars;
3354 size_t re_length;
3355 re->source->getCharsAndLength(re_chars, re_length);
3356 Fragment *fragment = LookupNativeRegExp(cx, re->flags, re_chars, re_length);
3357 JS_ASSERT(fragment);
3358 if (!fragment->code() && fragment->recordAttempts == 0) {
3359 fragment->recordAttempts++;
3360 if (!CompileRegExpToNative(cx, re, fragment))
3361 return NULL;
3363 union { NIns *code; NativeRegExp func; } u;
3364 u.code = fragment->code();
3365 return u.func;
3367 #endif
3369 JSRegExp *
3370 js_NewRegExp(JSContext *cx, TokenStream *ts,
3371 JSString *str, uintN flags, JSBool flat)
3373 JSRegExp *re;
3374 void *mark;
3375 CompilerState state;
3376 size_t resize;
3377 jsbytecode *endPC;
3378 uintN i;
3380 re = NULL;
3381 mark = cx->tempPool.getMark();
3384 * Parsing the string as flat is now expressed internally using
3385 * a flag, so that we keep this information in the JSRegExp, but
3386 * we keep the 'flat' parameter for now for compatibility.
3388 if (flat) flags |= JSREG_FLAT;
3389 if (!CompileRegExpToAST(cx, ts, str, flags, state))
3390 goto out;
3392 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3393 re = (JSRegExp *) cx->malloc(resize);
3394 if (!re)
3395 goto out;
3397 re->nrefs = 1;
3398 JS_ASSERT(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3399 re->classCount = state.classCount;
3400 if (re->classCount) {
3401 re->classList = (RECharSet *)
3402 cx->malloc(re->classCount * sizeof(RECharSet));
3403 if (!re->classList) {
3404 js_DestroyRegExp(cx, re);
3405 re = NULL;
3406 goto out;
3408 for (i = 0; i < re->classCount; i++)
3409 re->classList[i].converted = JS_FALSE;
3410 } else {
3411 re->classList = NULL;
3414 /* Compile the bytecode version. */
3415 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3416 if (!endPC) {
3417 js_DestroyRegExp(cx, re);
3418 re = NULL;
3419 goto out;
3421 *endPC++ = REOP_END;
3423 * Check whether size was overestimated and shrink using realloc.
3424 * This is safe since no pointers to newly parsed regexp or its parts
3425 * besides re exist here.
3427 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3428 JSRegExp *tmp;
3429 JS_ASSERT((size_t)(endPC - re->program) < state.progLength + 1);
3430 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3431 tmp = (JSRegExp *) cx->realloc(re, resize);
3432 if (tmp)
3433 re = tmp;
3436 re->flags = uint16(flags);
3437 re->parenCount = state.parenCount;
3438 re->source = str;
3440 out:
3441 cx->tempPool.release(mark);
3442 return re;
3445 JSRegExp *
3446 js_NewRegExpOpt(JSContext *cx, JSString *str, JSString *opt, JSBool flat)
3448 uintN flags;
3449 const jschar *s;
3450 size_t i, n;
3451 char charBuf[2];
3453 flags = 0;
3454 if (opt) {
3455 opt->getCharsAndLength(s, n);
3456 for (i = 0; i < n; i++) {
3457 #define HANDLE_FLAG(name) \
3458 JS_BEGIN_MACRO \
3459 if (flags & (name)) \
3460 goto bad_flag; \
3461 flags |= (name); \
3462 JS_END_MACRO
3463 switch (s[i]) {
3464 case 'g':
3465 HANDLE_FLAG(JSREG_GLOB);
3466 break;
3467 case 'i':
3468 HANDLE_FLAG(JSREG_FOLD);
3469 break;
3470 case 'm':
3471 HANDLE_FLAG(JSREG_MULTILINE);
3472 break;
3473 case 'y':
3474 HANDLE_FLAG(JSREG_STICKY);
3475 break;
3476 default:
3477 bad_flag:
3478 charBuf[0] = (char)s[i];
3479 charBuf[1] = '\0';
3480 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR,
3481 js_GetErrorMessage, NULL,
3482 JSMSG_BAD_REGEXP_FLAG, charBuf);
3483 return NULL;
3485 #undef HANDLE_FLAG
3488 return js_NewRegExp(cx, NULL, str, flags, flat);
3492 * Save the current state of the match - the position in the input
3493 * text as well as the position in the bytecode. The state of any
3494 * parent expressions is also saved (preceding state).
3495 * Contents of parenCount parentheses from parenIndex are also saved.
3497 static REBackTrackData *
3498 PushBackTrackState(REGlobalData *gData, REOp op,
3499 jsbytecode *target, REMatchState *x, const jschar *cp,
3500 size_t parenIndex, size_t parenCount)
3502 size_t i;
3503 REBackTrackData *result =
3504 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
3506 size_t sz = sizeof(REBackTrackData) +
3507 gData->stateStackTop * sizeof(REProgState) +
3508 parenCount * sizeof(RECapture);
3510 ptrdiff_t btsize = gData->backTrackStackSize;
3511 ptrdiff_t btincr = ((char *)result + sz) -
3512 ((char *)gData->backTrackStack + btsize);
3514 re_debug("\tBT_Push: %lu,%lu",
3515 (unsigned long) parenIndex, (unsigned long) parenCount);
3517 if (btincr > 0) {
3518 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
3520 btincr = JS_ROUNDUP(btincr, btsize);
3521 gData->cx->regexpPool.growCast<REBackTrackData *>(gData->backTrackStack, btsize, btincr);
3522 if (!gData->backTrackStack) {
3523 js_ReportOutOfScriptQuota(gData->cx);
3524 gData->ok = JS_FALSE;
3525 return NULL;
3527 gData->backTrackStackSize = btsize + btincr;
3528 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
3530 gData->backTrackSP = result;
3531 result->sz = gData->cursz;
3532 gData->cursz = sz;
3534 result->backtrack_op = op;
3535 result->backtrack_pc = target;
3536 result->cp = cp;
3537 result->parenCount = parenCount;
3538 result->parenIndex = parenIndex;
3540 result->saveStateStackTop = gData->stateStackTop;
3541 JS_ASSERT(gData->stateStackTop);
3542 memcpy(result + 1, gData->stateStack,
3543 sizeof(REProgState) * result->saveStateStackTop);
3545 if (parenCount != 0) {
3546 memcpy((char *)(result + 1) +
3547 sizeof(REProgState) * result->saveStateStackTop,
3548 &x->parens[parenIndex],
3549 sizeof(RECapture) * parenCount);
3550 for (i = 0; i != parenCount; i++)
3551 x->parens[parenIndex + i].index = -1;
3554 return result;
3559 * Consecutive literal characters.
3561 #if 0
3562 static REMatchState *
3563 FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
3564 size_t length)
3566 size_t i;
3567 if (length > gData->cpend - x->cp)
3568 return NULL;
3569 for (i = 0; i != length; i++) {
3570 if (matchChars[i] != x->cp[i])
3571 return NULL;
3573 x->cp += length;
3574 return x;
3576 #endif
3578 static JS_ALWAYS_INLINE REMatchState *
3579 FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
3580 size_t length)
3582 size_t i;
3583 JS_ASSERT(gData->cpend >= x->cp);
3584 if (length > (size_t)(gData->cpend - x->cp))
3585 return NULL;
3586 for (i = 0; i != length; i++) {
3587 if (upcase(matchChars[i]) != upcase(x->cp[i]))
3588 return NULL;
3590 x->cp += length;
3591 return x;
3595 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
3596 * 2. If E is not a character then go to step 6.
3597 * 3. Let ch be E's character.
3598 * 4. Let A be a one-element RECharSet containing the character ch.
3599 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
3600 * 6. E must be an integer. Let n be that integer.
3601 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
3602 * 8. Return an internal Matcher closure that takes two arguments, a State x
3603 * and a Continuation c, and performs the following:
3604 * 1. Let cap be x's captures internal array.
3605 * 2. Let s be cap[n].
3606 * 3. If s is undefined, then call c(x) and return its result.
3607 * 4. Let e be x's endIndex.
3608 * 5. Let len be s's length.
3609 * 6. Let f be e+len.
3610 * 7. If f>InputLength, return failure.
3611 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
3612 * such that Canonicalize(s[i]) is not the same character as
3613 * Canonicalize(Input [e+i]), then return failure.
3614 * 9. Let y be the State (f, cap).
3615 * 10. Call c(y) and return its result.
3617 static REMatchState *
3618 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
3620 size_t len, i;
3621 const jschar *parenContent;
3622 RECapture *cap = &x->parens[parenIndex];
3624 if (cap->index == -1)
3625 return x;
3627 len = cap->length;
3628 if (x->cp + len > gData->cpend)
3629 return NULL;
3631 parenContent = &gData->cpbegin[cap->index];
3632 if (gData->regexp->flags & JSREG_FOLD) {
3633 for (i = 0; i < len; i++) {
3634 if (upcase(parenContent[i]) != upcase(x->cp[i]))
3635 return NULL;
3637 } else {
3638 for (i = 0; i < len; i++) {
3639 if (parenContent[i] != x->cp[i])
3640 return NULL;
3643 x->cp += len;
3644 return x;
3648 /* Add a single character to the RECharSet */
3649 static void
3650 AddCharacterToCharSet(RECharSet *cs, jschar c)
3652 uintN byteIndex = (uintN)(c >> 3);
3653 JS_ASSERT(c <= cs->length);
3654 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
3658 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
3659 static void
3660 AddCharacterRangeToCharSet(RECharSet *cs, uintN c1, uintN c2)
3662 uintN i;
3664 uintN byteIndex1 = c1 >> 3;
3665 uintN byteIndex2 = c2 >> 3;
3667 JS_ASSERT(c2 <= cs->length && c1 <= c2);
3669 c1 &= 0x7;
3670 c2 &= 0x7;
3672 if (byteIndex1 == byteIndex2) {
3673 cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1;
3674 } else {
3675 cs->u.bits[byteIndex1] |= 0xFF << c1;
3676 for (i = byteIndex1 + 1; i < byteIndex2; i++)
3677 cs->u.bits[i] = 0xFF;
3678 cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2);
3682 struct CharacterRange {
3683 jschar start;
3684 jschar end;
3688 * The following characters are taken from the ECMA-262 standard, section 7.2
3689 * and 7.3, and the Unicode 3 standard, Table 6-1.
3691 static const CharacterRange WhiteSpaceRanges[] = {
3692 /* TAB, LF, VT, FF, CR */
3693 { 0x0009, 0x000D },
3694 /* SPACE */
3695 { 0x0020, 0x0020 },
3696 /* NO-BREAK SPACE */
3697 { 0x00A0, 0x00A0 },
3699 * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM
3700 * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE,
3701 * HAIR SPACE, ZERO WIDTH SPACE
3703 { 0x2000, 0x200B },
3704 /* LS, PS */
3705 { 0x2028, 0x2029 },
3706 /* NARROW NO-BREAK SPACE */
3707 { 0x202F, 0x202F },
3708 /* IDEOGRAPHIC SPACE */
3709 { 0x3000, 0x3000 }
3712 /* ECMA-262 standard, section 15.10.2.6. */
3713 static const CharacterRange WordRanges[] = {
3714 { jschar('0'), jschar('9') },
3715 { jschar('A'), jschar('Z') },
3716 { jschar('_'), jschar('_') },
3717 { jschar('a'), jschar('z') }
3720 static void
3721 AddCharacterRanges(RECharSet *charSet,
3722 const CharacterRange *range,
3723 const CharacterRange *end)
3725 for (; range < end; ++range)
3726 AddCharacterRangeToCharSet(charSet, range->start, range->end);
3729 static void
3730 AddInvertedCharacterRanges(RECharSet *charSet,
3731 const CharacterRange *range,
3732 const CharacterRange *end)
3734 uint16 previous = 0;
3735 for (; range < end; ++range) {
3736 AddCharacterRangeToCharSet(charSet, previous, range->start - 1);
3737 previous = range->end + 1;
3739 AddCharacterRangeToCharSet(charSet, previous, charSet->length);
3742 /* Compile the source of the class into a RECharSet */
3743 static JSBool
3744 ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet)
3746 const jschar *src, *end;
3747 JSBool inRange = JS_FALSE;
3748 jschar rangeStart = 0;
3749 uintN byteLength, n;
3750 jschar c, thisCh;
3751 intN nDigits, i;
3753 JS_ASSERT(!charSet->converted);
3755 * Assert that startIndex and length points to chars inside [] inside
3756 * source string.
3758 JS_ASSERT(1 <= charSet->u.src.startIndex);
3759 JS_ASSERT(charSet->u.src.startIndex < re->source->length());
3760 JS_ASSERT(charSet->u.src.length <= re->source->length()
3761 - 1 - charSet->u.src.startIndex);
3763 charSet->converted = JS_TRUE;
3764 src = re->source->chars() + charSet->u.src.startIndex;
3765 end = src + charSet->u.src.length;
3766 JS_ASSERT(src[-1] == '[');
3767 JS_ASSERT(end[0] == ']');
3769 byteLength = (charSet->length >> 3) + 1;
3770 charSet->u.bits = (uint8 *)cx->malloc(byteLength);
3771 if (!charSet->u.bits) {
3772 JS_ReportOutOfMemory(cx);
3773 return JS_FALSE;
3775 memset(charSet->u.bits, 0, byteLength);
3777 if (src == end)
3778 return JS_TRUE;
3780 if (*src == '^') {
3781 JS_ASSERT(charSet->sense == JS_FALSE);
3782 ++src;
3783 } else {
3784 JS_ASSERT(charSet->sense == JS_TRUE);
3787 while (src != end) {
3788 switch (*src) {
3789 case '\\':
3790 ++src;
3791 c = *src++;
3792 switch (c) {
3793 case 'b':
3794 thisCh = 0x8;
3795 break;
3796 case 'f':
3797 thisCh = 0xC;
3798 break;
3799 case 'n':
3800 thisCh = 0xA;
3801 break;
3802 case 'r':
3803 thisCh = 0xD;
3804 break;
3805 case 't':
3806 thisCh = 0x9;
3807 break;
3808 case 'v':
3809 thisCh = 0xB;
3810 break;
3811 case 'c':
3812 if (src < end && JS_ISWORD(*src)) {
3813 thisCh = (jschar)(*src++ & 0x1F);
3814 } else {
3815 --src;
3816 thisCh = '\\';
3818 break;
3819 case 'x':
3820 nDigits = 2;
3821 goto lexHex;
3822 case 'u':
3823 nDigits = 4;
3824 lexHex:
3825 n = 0;
3826 for (i = 0; (i < nDigits) && (src < end); i++) {
3827 uintN digit;
3828 c = *src++;
3829 if (!isASCIIHexDigit(c, &digit)) {
3831 * Back off to accepting the original '\'
3832 * as a literal
3834 src -= i + 1;
3835 n = '\\';
3836 break;
3838 n = (n << 4) | digit;
3840 thisCh = (jschar)n;
3841 break;
3842 case '0':
3843 case '1':
3844 case '2':
3845 case '3':
3846 case '4':
3847 case '5':
3848 case '6':
3849 case '7':
3851 * This is a non-ECMA extension - decimal escapes (in this
3852 * case, octal!) are supposed to be an error inside class
3853 * ranges, but supported here for backwards compatibility.
3855 n = JS7_UNDEC(c);
3856 c = *src;
3857 if ('0' <= c && c <= '7') {
3858 src++;
3859 n = 8 * n + JS7_UNDEC(c);
3860 c = *src;
3861 if ('0' <= c && c <= '7') {
3862 src++;
3863 i = 8 * n + JS7_UNDEC(c);
3864 if (i <= 0377)
3865 n = i;
3866 else
3867 src--;
3870 thisCh = (jschar)n;
3871 break;
3873 case 'd':
3874 AddCharacterRangeToCharSet(charSet, '0', '9');
3875 continue; /* don't need range processing */
3876 case 'D':
3877 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
3878 AddCharacterRangeToCharSet(charSet,
3879 (jschar)('9' + 1),
3880 (jschar)charSet->length);
3881 continue;
3882 case 's':
3883 AddCharacterRanges(charSet, WhiteSpaceRanges,
3884 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
3885 continue;
3886 case 'S':
3887 AddInvertedCharacterRanges(charSet, WhiteSpaceRanges,
3888 WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
3889 continue;
3890 case 'w':
3891 AddCharacterRanges(charSet, WordRanges,
3892 WordRanges + JS_ARRAY_LENGTH(WordRanges));
3893 continue;
3894 case 'W':
3895 AddInvertedCharacterRanges(charSet, WordRanges,
3896 WordRanges + JS_ARRAY_LENGTH(WordRanges));
3897 continue;
3898 default:
3899 thisCh = c;
3900 break;
3903 break;
3905 default:
3906 thisCh = *src++;
3907 break;
3910 if (inRange) {
3911 if (re->flags & JSREG_FOLD) {
3912 int i;
3914 JS_ASSERT(rangeStart <= thisCh);
3915 for (i = rangeStart; i <= thisCh; i++) {
3916 jschar uch, dch;
3918 AddCharacterToCharSet(charSet, jschar(i));
3919 uch = jschar(upcase(i));
3920 dch = inverse_upcase(jschar(i));
3921 if (i != uch)
3922 AddCharacterToCharSet(charSet, uch);
3923 if (i != dch)
3924 AddCharacterToCharSet(charSet, dch);
3926 } else {
3927 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
3929 inRange = JS_FALSE;
3930 } else {
3931 if (re->flags & JSREG_FOLD) {
3932 AddCharacterToCharSet(charSet, jschar(upcase(thisCh)));
3933 AddCharacterToCharSet(charSet, inverse_upcase(thisCh));
3934 } else {
3935 AddCharacterToCharSet(charSet, thisCh);
3937 if (src < end - 1) {
3938 if (*src == '-') {
3939 ++src;
3940 inRange = JS_TRUE;
3941 rangeStart = thisCh;
3946 return JS_TRUE;
3949 static inline JSBool
3950 MatcherProcessCharSet(REGlobalData *gData, RECharSet *charSet) {
3951 JSBool rv = ProcessCharSet(gData->cx, gData->regexp, charSet);
3952 if (!rv) gData->ok = JS_FALSE;
3953 return rv;
3956 void
3957 js_DestroyRegExp(JSContext *cx, JSRegExp *re)
3959 if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) {
3960 if (re->classList) {
3961 uintN i;
3962 for (i = 0; i < re->classCount; i++) {
3963 if (re->classList[i].converted)
3964 cx->free(re->classList[i].u.bits);
3965 re->classList[i].u.bits = NULL;
3967 cx->free(re->classList);
3969 cx->free(re);
3973 static JSBool
3974 ReallocStateStack(REGlobalData *gData)
3976 size_t limit = gData->stateStackLimit;
3977 size_t sz = sizeof(REProgState) * limit;
3979 gData->cx->regexpPool.growCast<REProgState *>(gData->stateStack, sz, sz);
3980 if (!gData->stateStack) {
3981 js_ReportOutOfScriptQuota(gData->cx);
3982 gData->ok = JS_FALSE;
3983 return JS_FALSE;
3985 gData->stateStackLimit = limit + limit;
3986 return JS_TRUE;
3989 #define PUSH_STATE_STACK(data) \
3990 JS_BEGIN_MACRO \
3991 ++(data)->stateStackTop; \
3992 if ((data)->stateStackTop == (data)->stateStackLimit && \
3993 !ReallocStateStack((data))) { \
3994 return NULL; \
3996 JS_END_MACRO
3999 * Apply the current op against the given input to see if it's going to match
4000 * or fail. Return false if we don't get a match, true if we do. If updatecp is
4001 * true, then update the current state's cp. Always update startpc to the next
4002 * op.
4004 static JS_ALWAYS_INLINE REMatchState *
4005 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
4006 jsbytecode **startpc, JSBool updatecp)
4008 REMatchState *result = NULL;
4009 jschar matchCh;
4010 size_t parenIndex;
4011 size_t offset, length, index;
4012 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
4013 jschar *source;
4014 const jschar *startcp = x->cp;
4015 jschar ch;
4016 RECharSet *charSet;
4018 #ifdef REGEXP_DEBUG
4019 const char *opname = reop_names[op];
4020 re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
4021 gData->stateStackTop * 2, "", opname);
4022 #endif
4023 switch (op) {
4024 case REOP_EMPTY:
4025 result = x;
4026 break;
4027 case REOP_BOL:
4028 if (x->cp != gData->cpbegin) {
4029 if (!gData->cx->regExpStatics.multiline &&
4030 !(gData->regexp->flags & JSREG_MULTILINE)) {
4031 break;
4033 if (!RE_IS_LINE_TERM(x->cp[-1]))
4034 break;
4036 result = x;
4037 break;
4038 case REOP_EOL:
4039 if (x->cp != gData->cpend) {
4040 if (!gData->cx->regExpStatics.multiline &&
4041 !(gData->regexp->flags & JSREG_MULTILINE)) {
4042 break;
4044 if (!RE_IS_LINE_TERM(*x->cp))
4045 break;
4047 result = x;
4048 break;
4049 case REOP_WBDRY:
4050 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
4051 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
4052 result = x;
4054 break;
4055 case REOP_WNONBDRY:
4056 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
4057 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
4058 result = x;
4060 break;
4061 case REOP_DOT:
4062 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
4063 result = x;
4064 result->cp++;
4066 break;
4067 case REOP_DIGIT:
4068 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
4069 result = x;
4070 result->cp++;
4072 break;
4073 case REOP_NONDIGIT:
4074 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
4075 result = x;
4076 result->cp++;
4078 break;
4079 case REOP_ALNUM:
4080 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
4081 result = x;
4082 result->cp++;
4084 break;
4085 case REOP_NONALNUM:
4086 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
4087 result = x;
4088 result->cp++;
4090 break;
4091 case REOP_SPACE:
4092 if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) {
4093 result = x;
4094 result->cp++;
4096 break;
4097 case REOP_NONSPACE:
4098 if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) {
4099 result = x;
4100 result->cp++;
4102 break;
4103 case REOP_BACKREF:
4104 pc = ReadCompactIndex(pc, &parenIndex);
4105 JS_ASSERT(parenIndex < gData->regexp->parenCount);
4106 result = BackrefMatcher(gData, x, parenIndex);
4107 break;
4108 case REOP_FLAT:
4109 pc = ReadCompactIndex(pc, &offset);
4110 JS_ASSERT(offset < gData->regexp->source->length());
4111 pc = ReadCompactIndex(pc, &length);
4112 JS_ASSERT(1 <= length);
4113 JS_ASSERT(length <= gData->regexp->source->length() - offset);
4114 if (length <= (size_t)(gData->cpend - x->cp)) {
4115 source = gData->regexp->source->chars() + offset;
4116 re_debug_chars(source, length);
4117 for (index = 0; index != length; index++) {
4118 if (source[index] != x->cp[index])
4119 return NULL;
4121 x->cp += length;
4122 result = x;
4124 break;
4125 case REOP_FLAT1:
4126 matchCh = *pc++;
4127 re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
4128 if (x->cp != gData->cpend && *x->cp == matchCh) {
4129 result = x;
4130 result->cp++;
4132 break;
4133 case REOP_FLATi:
4134 pc = ReadCompactIndex(pc, &offset);
4135 JS_ASSERT(offset < gData->regexp->source->length());
4136 pc = ReadCompactIndex(pc, &length);
4137 JS_ASSERT(1 <= length);
4138 JS_ASSERT(length <= gData->regexp->source->length() - offset);
4139 source = gData->regexp->source->chars();
4140 result = FlatNIMatcher(gData, x, source + offset, length);
4141 break;
4142 case REOP_FLAT1i:
4143 matchCh = *pc++;
4144 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
4145 result = x;
4146 result->cp++;
4148 break;
4149 case REOP_UCFLAT1:
4150 matchCh = GET_ARG(pc);
4151 re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
4152 pc += ARG_LEN;
4153 if (x->cp != gData->cpend && *x->cp == matchCh) {
4154 result = x;
4155 result->cp++;
4157 break;
4158 case REOP_UCFLAT1i:
4159 matchCh = GET_ARG(pc);
4160 pc += ARG_LEN;
4161 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
4162 result = x;
4163 result->cp++;
4165 break;
4166 case REOP_CLASS:
4167 pc = ReadCompactIndex(pc, &index);
4168 JS_ASSERT(index < gData->regexp->classCount);
4169 if (x->cp != gData->cpend) {
4170 charSet = &gData->regexp->classList[index];
4171 JS_ASSERT(charSet->converted);
4172 ch = *x->cp;
4173 index = ch >> 3;
4174 if (ch <= charSet->length &&
4175 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
4176 result = x;
4177 result->cp++;
4180 break;
4181 case REOP_NCLASS:
4182 pc = ReadCompactIndex(pc, &index);
4183 JS_ASSERT(index < gData->regexp->classCount);
4184 if (x->cp != gData->cpend) {
4185 charSet = &gData->regexp->classList[index];
4186 JS_ASSERT(charSet->converted);
4187 ch = *x->cp;
4188 index = ch >> 3;
4189 if (ch > charSet->length ||
4190 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
4191 result = x;
4192 result->cp++;
4195 break;
4197 default:
4198 JS_ASSERT(JS_FALSE);
4200 if (result) {
4201 if (!updatecp)
4202 x->cp = startcp;
4203 *startpc = pc;
4204 re_debug(" * ");
4205 return result;
4207 x->cp = startcp;
4208 return NULL;
4211 static JS_ALWAYS_INLINE REMatchState *
4212 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
4214 REMatchState *result = NULL;
4215 REBackTrackData *backTrackData;
4216 jsbytecode *nextpc, *testpc;
4217 REOp nextop;
4218 RECapture *cap;
4219 REProgState *curState;
4220 const jschar *startcp;
4221 size_t parenIndex, k;
4222 size_t parenSoFar = 0;
4224 jschar matchCh1, matchCh2;
4225 RECharSet *charSet;
4227 JSBool anchor;
4228 jsbytecode *pc = gData->regexp->program;
4229 REOp op = (REOp) *pc++;
4232 * If the first node is a simple match, step the index into the string
4233 * until that match is made, or fail if it can't be found at all.
4235 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
4236 anchor = JS_FALSE;
4237 while (x->cp <= gData->cpend) {
4238 nextpc = pc; /* reset back to start each time */
4239 result = SimpleMatch(gData, x, op, &nextpc, JS_TRUE);
4240 if (result) {
4241 anchor = JS_TRUE;
4242 x = result;
4243 pc = nextpc; /* accept skip to next opcode */
4244 op = (REOp) *pc++;
4245 JS_ASSERT(op < REOP_LIMIT);
4246 break;
4248 gData->skipped++;
4249 x->cp++;
4251 if (!anchor)
4252 goto bad;
4255 for (;;) {
4256 #ifdef REGEXP_DEBUG
4257 const char *opname = reop_names[op];
4258 re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
4259 gData->stateStackTop * 2, "", opname);
4260 #endif
4261 if (REOP_IS_SIMPLE(op)) {
4262 result = SimpleMatch(gData, x, op, &pc, JS_TRUE);
4263 } else {
4264 curState = &gData->stateStack[gData->stateStackTop];
4265 switch (op) {
4266 case REOP_END:
4267 goto good;
4268 case REOP_ALTPREREQ2:
4269 nextpc = pc + GET_OFFSET(pc); /* start of next op */
4270 pc += ARG_LEN;
4271 matchCh2 = GET_ARG(pc);
4272 pc += ARG_LEN;
4273 k = GET_ARG(pc);
4274 pc += ARG_LEN;
4276 if (x->cp != gData->cpend) {
4277 if (*x->cp == matchCh2)
4278 goto doAlt;
4280 charSet = &gData->regexp->classList[k];
4281 if (!charSet->converted && !MatcherProcessCharSet(gData, charSet))
4282 goto bad;
4283 matchCh1 = *x->cp;
4284 k = matchCh1 >> 3;
4285 if ((matchCh1 > charSet->length ||
4286 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
4287 charSet->sense) {
4288 goto doAlt;
4291 result = NULL;
4292 break;
4294 case REOP_ALTPREREQ:
4295 nextpc = pc + GET_OFFSET(pc); /* start of next op */
4296 pc += ARG_LEN;
4297 matchCh1 = GET_ARG(pc);
4298 pc += ARG_LEN;
4299 matchCh2 = GET_ARG(pc);
4300 pc += ARG_LEN;
4301 if (x->cp == gData->cpend ||
4302 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
4303 result = NULL;
4304 break;
4306 /* else false thru... */
4308 case REOP_ALT:
4309 doAlt:
4310 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
4311 pc += ARG_LEN; /* start of this alternate */
4312 curState->parenSoFar = parenSoFar;
4313 PUSH_STATE_STACK(gData);
4314 op = (REOp) *pc++;
4315 startcp = x->cp;
4316 if (REOP_IS_SIMPLE(op)) {
4317 if (!SimpleMatch(gData, x, op, &pc, JS_TRUE)) {
4318 op = (REOp) *nextpc++;
4319 pc = nextpc;
4320 continue;
4322 result = x;
4323 op = (REOp) *pc++;
4325 nextop = (REOp) *nextpc++;
4326 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
4327 goto bad;
4328 continue;
4331 * Occurs at (successful) end of REOP_ALT,
4333 case REOP_JUMP:
4335 * If we have not gotten a result here, it is because of an
4336 * empty match. Do the same thing REOP_EMPTY would do.
4338 if (!result)
4339 result = x;
4341 --gData->stateStackTop;
4342 pc += GET_OFFSET(pc);
4343 op = (REOp) *pc++;
4344 continue;
4347 * Occurs at last (successful) end of REOP_ALT,
4349 case REOP_ENDALT:
4351 * If we have not gotten a result here, it is because of an
4352 * empty match. Do the same thing REOP_EMPTY would do.
4354 if (!result)
4355 result = x;
4357 --gData->stateStackTop;
4358 op = (REOp) *pc++;
4359 continue;
4361 case REOP_LPAREN:
4362 pc = ReadCompactIndex(pc, &parenIndex);
4363 re_debug("[ %lu ]", (unsigned long) parenIndex);
4364 JS_ASSERT(parenIndex < gData->regexp->parenCount);
4365 if (parenIndex + 1 > parenSoFar)
4366 parenSoFar = parenIndex + 1;
4367 x->parens[parenIndex].index = x->cp - gData->cpbegin;
4368 x->parens[parenIndex].length = 0;
4369 op = (REOp) *pc++;
4370 continue;
4372 case REOP_RPAREN:
4374 ptrdiff_t delta;
4376 pc = ReadCompactIndex(pc, &parenIndex);
4377 JS_ASSERT(parenIndex < gData->regexp->parenCount);
4378 cap = &x->parens[parenIndex];
4379 delta = x->cp - (gData->cpbegin + cap->index);
4380 cap->length = (delta < 0) ? 0 : (size_t) delta;
4381 op = (REOp) *pc++;
4383 if (!result)
4384 result = x;
4385 continue;
4387 case REOP_ASSERT:
4388 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
4389 pc += ARG_LEN; /* start of ASSERT child */
4390 op = (REOp) *pc++;
4391 testpc = pc;
4392 if (REOP_IS_SIMPLE(op) &&
4393 !SimpleMatch(gData, x, op, &testpc, JS_FALSE)) {
4394 result = NULL;
4395 break;
4397 curState->u.assertion.top =
4398 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
4399 curState->u.assertion.sz = gData->cursz;
4400 curState->index = x->cp - gData->cpbegin;
4401 curState->parenSoFar = parenSoFar;
4402 PUSH_STATE_STACK(gData);
4403 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
4404 nextpc, x, x->cp, 0, 0)) {
4405 goto bad;
4407 continue;
4409 case REOP_ASSERT_NOT:
4410 nextpc = pc + GET_OFFSET(pc);
4411 pc += ARG_LEN;
4412 op = (REOp) *pc++;
4413 testpc = pc;
4414 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
4415 SimpleMatch(gData, x, op, &testpc, JS_FALSE) &&
4416 *testpc == REOP_ASSERTNOTTEST) {
4417 result = NULL;
4418 break;
4420 curState->u.assertion.top
4421 = (char *)gData->backTrackSP -
4422 (char *)gData->backTrackStack;
4423 curState->u.assertion.sz = gData->cursz;
4424 curState->index = x->cp - gData->cpbegin;
4425 curState->parenSoFar = parenSoFar;
4426 PUSH_STATE_STACK(gData);
4427 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
4428 nextpc, x, x->cp, 0, 0)) {
4429 goto bad;
4431 continue;
4433 case REOP_ASSERTTEST:
4434 --gData->stateStackTop;
4435 --curState;
4436 x->cp = gData->cpbegin + curState->index;
4437 gData->backTrackSP =
4438 (REBackTrackData *) ((char *)gData->backTrackStack +
4439 curState->u.assertion.top);
4440 gData->cursz = curState->u.assertion.sz;
4441 if (result)
4442 result = x;
4443 break;
4445 case REOP_ASSERTNOTTEST:
4446 --gData->stateStackTop;
4447 --curState;
4448 x->cp = gData->cpbegin + curState->index;
4449 gData->backTrackSP =
4450 (REBackTrackData *) ((char *)gData->backTrackStack +
4451 curState->u.assertion.top);
4452 gData->cursz = curState->u.assertion.sz;
4453 result = (!result) ? x : NULL;
4454 break;
4455 case REOP_STAR:
4456 curState->u.quantifier.min = 0;
4457 curState->u.quantifier.max = (uintN)-1;
4458 goto quantcommon;
4459 case REOP_PLUS:
4460 curState->u.quantifier.min = 1;
4461 curState->u.quantifier.max = (uintN)-1;
4462 goto quantcommon;
4463 case REOP_OPT:
4464 curState->u.quantifier.min = 0;
4465 curState->u.quantifier.max = 1;
4466 goto quantcommon;
4467 case REOP_QUANT:
4468 pc = ReadCompactIndex(pc, &k);
4469 curState->u.quantifier.min = k;
4470 pc = ReadCompactIndex(pc, &k);
4471 /* max is k - 1 to use one byte for (uintN)-1 sentinel. */
4472 curState->u.quantifier.max = k - 1;
4473 JS_ASSERT(curState->u.quantifier.min
4474 <= curState->u.quantifier.max);
4475 quantcommon:
4476 if (curState->u.quantifier.max == 0) {
4477 pc = pc + GET_OFFSET(pc);
4478 op = (REOp) *pc++;
4479 result = x;
4480 continue;
4482 /* Step over <next> */
4483 nextpc = pc + ARG_LEN;
4484 op = (REOp) *nextpc++;
4485 startcp = x->cp;
4486 if (REOP_IS_SIMPLE(op)) {
4487 if (!SimpleMatch(gData, x, op, &nextpc, JS_TRUE)) {
4488 if (curState->u.quantifier.min == 0)
4489 result = x;
4490 else
4491 result = NULL;
4492 pc = pc + GET_OFFSET(pc);
4493 break;
4495 op = (REOp) *nextpc++;
4496 result = x;
4498 curState->index = startcp - gData->cpbegin;
4499 curState->continue_op = REOP_REPEAT;
4500 curState->continue_pc = pc;
4501 curState->parenSoFar = parenSoFar;
4502 PUSH_STATE_STACK(gData);
4503 if (curState->u.quantifier.min == 0 &&
4504 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
4505 0, 0)) {
4506 goto bad;
4508 pc = nextpc;
4509 continue;
4511 case REOP_ENDCHILD: /* marks the end of a quantifier child */
4512 pc = curState[-1].continue_pc;
4513 op = (REOp) curState[-1].continue_op;
4515 if (!result)
4516 result = x;
4517 continue;
4519 case REOP_REPEAT:
4520 --curState;
4521 do {
4522 --gData->stateStackTop;
4523 if (!result) {
4524 /* Failed, see if we have enough children. */
4525 if (curState->u.quantifier.min == 0)
4526 goto repeatDone;
4527 goto break_switch;
4529 if (curState->u.quantifier.min == 0 &&
4530 x->cp == gData->cpbegin + curState->index) {
4531 /* matched an empty string, that'll get us nowhere */
4532 result = NULL;
4533 goto break_switch;
4535 if (curState->u.quantifier.min != 0)
4536 curState->u.quantifier.min--;
4537 if (curState->u.quantifier.max != (uintN) -1)
4538 curState->u.quantifier.max--;
4539 if (curState->u.quantifier.max == 0)
4540 goto repeatDone;
4541 nextpc = pc + ARG_LEN;
4542 nextop = (REOp) *nextpc;
4543 startcp = x->cp;
4544 if (REOP_IS_SIMPLE(nextop)) {
4545 nextpc++;
4546 if (!SimpleMatch(gData, x, nextop, &nextpc, JS_TRUE)) {
4547 if (curState->u.quantifier.min == 0)
4548 goto repeatDone;
4549 result = NULL;
4550 goto break_switch;
4552 result = x;
4554 curState->index = startcp - gData->cpbegin;
4555 PUSH_STATE_STACK(gData);
4556 if (curState->u.quantifier.min == 0 &&
4557 !PushBackTrackState(gData, REOP_REPEAT,
4558 pc, x, startcp,
4559 curState->parenSoFar,
4560 parenSoFar -
4561 curState->parenSoFar)) {
4562 goto bad;
4564 } while (*nextpc == REOP_ENDCHILD);
4565 pc = nextpc;
4566 op = (REOp) *pc++;
4567 parenSoFar = curState->parenSoFar;
4568 continue;
4570 repeatDone:
4571 result = x;
4572 pc += GET_OFFSET(pc);
4573 goto break_switch;
4575 case REOP_MINIMALSTAR:
4576 curState->u.quantifier.min = 0;
4577 curState->u.quantifier.max = (uintN)-1;
4578 goto minimalquantcommon;
4579 case REOP_MINIMALPLUS:
4580 curState->u.quantifier.min = 1;
4581 curState->u.quantifier.max = (uintN)-1;
4582 goto minimalquantcommon;
4583 case REOP_MINIMALOPT:
4584 curState->u.quantifier.min = 0;
4585 curState->u.quantifier.max = 1;
4586 goto minimalquantcommon;
4587 case REOP_MINIMALQUANT:
4588 pc = ReadCompactIndex(pc, &k);
4589 curState->u.quantifier.min = k;
4590 pc = ReadCompactIndex(pc, &k);
4591 /* See REOP_QUANT comments about k - 1. */
4592 curState->u.quantifier.max = k - 1;
4593 JS_ASSERT(curState->u.quantifier.min
4594 <= curState->u.quantifier.max);
4595 minimalquantcommon:
4596 curState->index = x->cp - gData->cpbegin;
4597 curState->parenSoFar = parenSoFar;
4598 PUSH_STATE_STACK(gData);
4599 if (curState->u.quantifier.min != 0) {
4600 curState->continue_op = REOP_MINIMALREPEAT;
4601 curState->continue_pc = pc;
4602 /* step over <next> */
4603 pc += OFFSET_LEN;
4604 op = (REOp) *pc++;
4605 } else {
4606 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
4607 pc, x, x->cp, 0, 0)) {
4608 goto bad;
4610 --gData->stateStackTop;
4611 pc = pc + GET_OFFSET(pc);
4612 op = (REOp) *pc++;
4614 continue;
4616 case REOP_MINIMALREPEAT:
4617 --gData->stateStackTop;
4618 --curState;
4620 re_debug("{%d,%d}", curState->u.quantifier.min,
4621 curState->u.quantifier.max);
4622 #define PREPARE_REPEAT() \
4623 JS_BEGIN_MACRO \
4624 curState->index = x->cp - gData->cpbegin; \
4625 curState->continue_op = REOP_MINIMALREPEAT; \
4626 curState->continue_pc = pc; \
4627 pc += ARG_LEN; \
4628 for (k = curState->parenSoFar; k < parenSoFar; k++) \
4629 x->parens[k].index = -1; \
4630 PUSH_STATE_STACK(gData); \
4631 op = (REOp) *pc++; \
4632 JS_ASSERT(op < REOP_LIMIT); \
4633 JS_END_MACRO
4635 if (!result) {
4636 re_debug(" - ");
4638 * Non-greedy failure - try to consume another child.
4640 if (curState->u.quantifier.max == (uintN) -1 ||
4641 curState->u.quantifier.max > 0) {
4642 PREPARE_REPEAT();
4643 continue;
4645 /* Don't need to adjust pc since we're going to pop. */
4646 break;
4648 if (curState->u.quantifier.min == 0 &&
4649 x->cp == gData->cpbegin + curState->index) {
4650 /* Matched an empty string, that'll get us nowhere. */
4651 result = NULL;
4652 break;
4654 if (curState->u.quantifier.min != 0)
4655 curState->u.quantifier.min--;
4656 if (curState->u.quantifier.max != (uintN) -1)
4657 curState->u.quantifier.max--;
4658 if (curState->u.quantifier.min != 0) {
4659 PREPARE_REPEAT();
4660 continue;
4662 curState->index = x->cp - gData->cpbegin;
4663 curState->parenSoFar = parenSoFar;
4664 PUSH_STATE_STACK(gData);
4665 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
4666 pc, x, x->cp,
4667 curState->parenSoFar,
4668 parenSoFar - curState->parenSoFar)) {
4669 goto bad;
4671 --gData->stateStackTop;
4672 pc = pc + GET_OFFSET(pc);
4673 op = (REOp) *pc++;
4674 JS_ASSERT(op < REOP_LIMIT);
4675 continue;
4676 default:
4677 JS_ASSERT(JS_FALSE);
4678 result = NULL;
4680 break_switch:;
4684 * If the match failed and there's a backtrack option, take it.
4685 * Otherwise this is a complete and utter failure.
4687 if (!result) {
4688 if (gData->cursz == 0)
4689 return NULL;
4690 if (!JS_CHECK_OPERATION_LIMIT(gData->cx)) {
4691 gData->ok = JS_FALSE;
4692 return NULL;
4695 /* Potentially detect explosive regex here. */
4696 gData->backTrackCount++;
4697 if (gData->backTrackLimit &&
4698 gData->backTrackCount >= gData->backTrackLimit) {
4699 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
4700 JSMSG_REGEXP_TOO_COMPLEX);
4701 gData->ok = JS_FALSE;
4702 return NULL;
4705 backTrackData = gData->backTrackSP;
4706 gData->cursz = backTrackData->sz;
4707 gData->backTrackSP =
4708 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
4709 x->cp = backTrackData->cp;
4710 pc = backTrackData->backtrack_pc;
4711 op = (REOp) backTrackData->backtrack_op;
4712 JS_ASSERT(op < REOP_LIMIT);
4713 gData->stateStackTop = backTrackData->saveStateStackTop;
4714 JS_ASSERT(gData->stateStackTop);
4716 memcpy(gData->stateStack, backTrackData + 1,
4717 sizeof(REProgState) * backTrackData->saveStateStackTop);
4718 curState = &gData->stateStack[gData->stateStackTop - 1];
4720 if (backTrackData->parenCount) {
4721 memcpy(&x->parens[backTrackData->parenIndex],
4722 (char *)(backTrackData + 1) +
4723 sizeof(REProgState) * backTrackData->saveStateStackTop,
4724 sizeof(RECapture) * backTrackData->parenCount);
4725 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
4726 } else {
4727 for (k = curState->parenSoFar; k < parenSoFar; k++)
4728 x->parens[k].index = -1;
4729 parenSoFar = curState->parenSoFar;
4732 re_debug("\tBT_Pop: %ld,%ld",
4733 (unsigned long) backTrackData->parenIndex,
4734 (unsigned long) backTrackData->parenCount);
4735 continue;
4737 x = result;
4740 * Continue with the expression.
4742 op = (REOp)*pc++;
4743 JS_ASSERT(op < REOP_LIMIT);
4746 bad:
4747 re_debug("\n");
4748 return NULL;
4750 good:
4751 re_debug("\n");
4752 return x;
4755 static REMatchState *
4756 MatchRegExp(REGlobalData *gData, REMatchState *x)
4758 const jschar *cpOrig = x->cp;
4760 #ifdef JS_TRACER
4761 NativeRegExp native;
4763 /* Run with native regexp if possible. */
4764 if (TRACING_ENABLED(gData->cx) &&
4765 !(gData->regexp->flags & JSREG_NOCOMPILE) &&
4766 (native = GetNativeRegExp(gData->cx, gData->regexp))) {
4769 * For efficient native execution, store offset as a direct pointer into
4770 * the buffer and convert back after execution finishes.
4772 gData->skipped = (ptrdiff_t)cpOrig;
4774 #ifdef JS_JIT_SPEW
4775 debug_only_stmt({
4776 VOUCH_DOES_NOT_REQUIRE_STACK();
4777 JSStackFrame *caller = (JS_ON_TRACE(gData->cx))
4778 ? NULL
4779 : js_GetScriptedCaller(gData->cx, NULL);
4780 debug_only_printf(LC_TMRegexp,
4781 "entering REGEXP trace at %s:%u@%u, code: %p\n",
4782 caller ? caller->script->filename : "<unknown>",
4783 caller ? js_FramePCToLineNumber(gData->cx, caller) : 0,
4784 caller ? FramePCOffset(caller) : 0,
4785 JS_FUNC_TO_DATA_PTR(void *, native));
4787 #endif
4789 void *result;
4790 #if defined(JS_NO_FASTCALL) && defined(NANOJIT_IA32)
4792 * Although a NativeRegExp takes one argument and SIMULATE_FASTCALL is
4793 * passing two, the second goes into 'edx' and can safely be ignored.
4795 SIMULATE_FASTCALL(result, gData, gData->cpend, native);
4796 #else
4797 result = native(gData, gData->cpend);
4798 #endif
4799 debug_only_print0(LC_TMRegexp, "leaving REGEXP trace\n");
4800 if (!result)
4801 return NULL;
4803 /* Restore REGlobalData::skipped and fill REMatchState. */
4804 x->cp = (const jschar *)gData->stateStack;
4805 gData->skipped = (const jschar *)gData->skipped - cpOrig;
4806 return x;
4808 #endif
4811 * Have to include the position beyond the last character
4812 * in order to detect end-of-input/line condition.
4814 for (const jschar *p = cpOrig; p <= gData->cpend; p++) {
4815 gData->skipped = p - cpOrig;
4816 x->cp = p;
4817 for (uintN j = 0; j < gData->regexp->parenCount; j++)
4818 x->parens[j].index = -1;
4819 REMatchState *result = ExecuteREBytecode(gData, x);
4820 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
4821 return result;
4822 gData->backTrackSP = gData->backTrackStack;
4823 gData->cursz = 0;
4824 gData->stateStackTop = 0;
4825 p = cpOrig + gData->skipped;
4827 return NULL;
4830 #define MIN_BACKTRACK_LIMIT 400000
4832 static REMatchState *
4833 InitMatch(JSContext *cx, REGlobalData *gData, JSRegExp *re, size_t length)
4835 REMatchState *result;
4836 uintN i;
4838 gData->backTrackStackSize = INITIAL_BACKTRACK;
4839 JSArenaPool &regexpPool = cx->regexpPool;
4840 regexpPool.allocateCast<REBackTrackData *>(gData->backTrackStack, INITIAL_BACKTRACK);
4841 if (!gData->backTrackStack)
4842 goto bad;
4844 gData->backTrackSP = gData->backTrackStack;
4845 gData->cursz = 0;
4846 gData->backTrackCount = 0;
4847 gData->backTrackLimit = 0;
4848 if (JS_GetOptions(cx) & JSOPTION_RELIMIT) {
4849 gData->backTrackLimit = length * length * length; /* O(n^3) */
4850 if (gData->backTrackLimit < MIN_BACKTRACK_LIMIT)
4851 gData->backTrackLimit = MIN_BACKTRACK_LIMIT;
4854 gData->stateStackLimit = INITIAL_STATESTACK;
4855 regexpPool.allocateCast<REProgState *>(gData->stateStack,
4856 sizeof(REProgState) * INITIAL_STATESTACK);
4857 if (!gData->stateStack)
4858 goto bad;
4860 gData->stateStackTop = 0;
4861 gData->cx = cx;
4862 gData->regexp = re;
4863 gData->ok = JS_TRUE;
4865 regexpPool.allocateCast<REMatchState *>(result, offsetof(REMatchState, parens)
4866 + re->parenCount * sizeof(RECapture));
4867 if (!result)
4868 goto bad;
4870 for (i = 0; i < re->classCount; i++) {
4871 if (!re->classList[i].converted &&
4872 !MatcherProcessCharSet(gData, &re->classList[i])) {
4873 return NULL;
4877 return result;
4879 bad:
4880 js_ReportOutOfScriptQuota(cx);
4881 gData->ok = JS_FALSE;
4882 return NULL;
4885 JSBool
4886 js_ExecuteRegExp(JSContext *cx, JSRegExp *re, JSString *str, size_t *indexp,
4887 JSBool test, jsval *rval)
4889 REGlobalData gData;
4890 REMatchState *x, *result;
4892 const jschar *cp, *ep;
4893 size_t i, length, start;
4894 JSBool ok;
4895 JSRegExpStatics *res;
4896 ptrdiff_t matchlen;
4897 uintN num;
4898 JSString *parstr, *matchstr;
4899 JSObject *obj;
4901 RECapture *parsub = NULL;
4902 void *mark;
4903 int64 *timestamp;
4906 * It's safe to load from cp because JSStrings have a zero at the end,
4907 * and we never let cp get beyond cpend.
4909 start = *indexp;
4910 str->getCharsAndLength(cp, length);
4911 if (start > length)
4912 start = length;
4913 gData.cpbegin = cp;
4914 gData.cpend = cp + length;
4915 cp += start;
4916 gData.start = start;
4917 gData.skipped = 0;
4919 if (!cx->regexpPool.getSecond()) {
4921 * The first arena in the regexpPool must have a timestamp at its base.
4923 cx->regexpPool.allocateCast<int64 *>(timestamp, sizeof *timestamp);
4924 if (!timestamp)
4925 return JS_FALSE;
4926 *timestamp = JS_Now();
4928 mark = cx->regexpPool.getMark();
4930 x = InitMatch(cx, &gData, re, length);
4932 if (!x) {
4933 ok = JS_FALSE;
4934 goto out;
4936 x->cp = cp;
4939 * Call the recursive matcher to do the real work. Return null on mismatch
4940 * whether testing or not. On match, return an extended Array object.
4942 result = MatchRegExp(&gData, x);
4943 ok = gData.ok;
4944 if (!ok)
4945 goto out;
4946 if (!result) {
4947 *rval = JSVAL_NULL;
4948 goto out;
4950 cp = result->cp;
4951 i = cp - gData.cpbegin;
4952 *indexp = i;
4953 matchlen = i - (start + gData.skipped);
4954 JS_ASSERT(matchlen >= 0);
4955 ep = cp;
4956 cp -= matchlen;
4958 if (test) {
4960 * Testing for a match and updating cx->regExpStatics: don't allocate
4961 * an array object, do return true.
4963 *rval = JSVAL_TRUE;
4965 /* Avoid warning. (gcc doesn't detect that obj is needed iff !test); */
4966 obj = NULL;
4967 } else {
4969 * The array returned on match has element 0 bound to the matched
4970 * string, elements 1 through state.parenCount bound to the paren
4971 * matches, an index property telling the length of the left context,
4972 * and an input property referring to the input string.
4974 obj = js_NewSlowArrayObject(cx);
4975 if (!obj) {
4976 ok = JS_FALSE;
4977 goto out;
4979 *rval = OBJECT_TO_JSVAL(obj);
4981 #define DEFVAL(val, id) { \
4982 ok = js_DefineProperty(cx, obj, id, val, \
4983 JS_PropertyStub, JS_PropertyStub, \
4984 JSPROP_ENUMERATE); \
4985 if (!ok) \
4986 goto out; \
4989 matchstr = js_NewDependentString(cx, str, cp - str->chars(),
4990 matchlen);
4991 if (!matchstr) {
4992 ok = JS_FALSE;
4993 goto out;
4995 DEFVAL(STRING_TO_JSVAL(matchstr), INT_TO_JSID(0));
4998 res = &cx->regExpStatics;
4999 res->input = str;
5000 if (!res->parens.resize(re->parenCount)) {
5001 ok = JS_FALSE;
5002 goto out;
5004 if (re->parenCount == 0) {
5005 res->lastParen = js_EmptySubString;
5006 } else {
5007 for (num = 0; num < re->parenCount; num++) {
5008 JSSubString *sub = &res->parens[num];
5009 parsub = &result->parens[num];
5010 if (parsub->index == -1) {
5011 sub->chars = NULL;
5012 sub->length = 0;
5013 } else {
5014 sub->chars = gData.cpbegin + parsub->index;
5015 sub->length = parsub->length;
5017 if (test)
5018 continue;
5019 if (parsub->index == -1) {
5020 ok = js_DefineProperty(cx, obj, INT_TO_JSID(num + 1), JSVAL_VOID, NULL, NULL,
5021 JSPROP_ENUMERATE);
5022 } else {
5023 parstr = js_NewDependentString(cx, str,
5024 gData.cpbegin + parsub->index -
5025 str->chars(),
5026 parsub->length);
5027 if (!parstr) {
5028 ok = JS_FALSE;
5029 goto out;
5031 ok = js_DefineProperty(cx, obj, INT_TO_JSID(num + 1), STRING_TO_JSVAL(parstr),
5032 NULL, NULL, JSPROP_ENUMERATE);
5034 if (!ok)
5035 goto out;
5037 if (parsub->index == -1) {
5038 res->lastParen = js_EmptySubString;
5039 } else {
5040 res->lastParen.chars = gData.cpbegin + parsub->index;
5041 res->lastParen.length = parsub->length;
5045 if (!test) {
5047 * Define the index and input properties last for better for/in loop
5048 * order (so they come after the elements).
5050 DEFVAL(INT_TO_JSVAL(start + gData.skipped),
5051 ATOM_TO_JSID(cx->runtime->atomState.indexAtom));
5052 DEFVAL(STRING_TO_JSVAL(str),
5053 ATOM_TO_JSID(cx->runtime->atomState.inputAtom));
5056 #undef DEFVAL
5058 res->lastMatch.chars = cp;
5059 res->lastMatch.length = matchlen;
5062 * For JS1.3 and ECMAv2, emulate Perl5 exactly:
5064 * js1.3 "hi", "hi there" "hihitherehi therebye"
5066 res->leftContext.chars = str->chars();
5067 res->leftContext.length = start + gData.skipped;
5068 res->rightContext.chars = ep;
5069 res->rightContext.length = gData.cpend - ep;
5071 out:
5072 cx->regexpPool.release(mark);
5073 return ok;
5076 /************************************************************************/
5078 static JSBool
5079 SetRegExpLastIndex(JSContext *cx, JSObject *obj, jsdouble lastIndex)
5081 JS_ASSERT(obj->isRegExp());
5082 return JS_NewNumberValue(cx, lastIndex, obj->addressOfRegExpLastIndex());
5085 static JSBool
5086 regexp_getProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
5088 jsint slot;
5089 JSRegExp *re;
5091 if (!JSVAL_IS_INT(id))
5092 return JS_TRUE;
5093 while (obj->getClass() != &js_RegExpClass) {
5094 obj = obj->getProto();
5095 if (!obj)
5096 return JS_TRUE;
5098 slot = JSVAL_TO_INT(id);
5099 if (slot == REGEXP_LAST_INDEX) {
5100 *vp = obj->getRegExpLastIndex();
5101 return JS_TRUE;
5104 JS_LOCK_OBJ(cx, obj);
5105 re = (JSRegExp *) obj->getPrivate();
5106 if (re) {
5107 switch (slot) {
5108 case REGEXP_SOURCE:
5109 *vp = STRING_TO_JSVAL(re->source);
5110 break;
5111 case REGEXP_GLOBAL:
5112 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_GLOB) != 0);
5113 break;
5114 case REGEXP_IGNORE_CASE:
5115 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_FOLD) != 0);
5116 break;
5117 case REGEXP_MULTILINE:
5118 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_MULTILINE) != 0);
5119 break;
5120 case REGEXP_STICKY:
5121 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_STICKY) != 0);
5122 break;
5125 JS_UNLOCK_OBJ(cx, obj);
5126 return JS_TRUE;
5129 static JSBool
5130 regexp_setProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
5132 JSBool ok;
5133 jsint slot;
5134 jsdouble lastIndex;
5136 ok = JS_TRUE;
5137 if (!JSVAL_IS_INT(id))
5138 return ok;
5139 while (obj->getClass() != &js_RegExpClass) {
5140 obj = obj->getProto();
5141 if (!obj)
5142 return JS_TRUE;
5144 slot = JSVAL_TO_INT(id);
5145 if (slot == REGEXP_LAST_INDEX) {
5146 if (!JS_ValueToNumber(cx, *vp, &lastIndex))
5147 return JS_FALSE;
5148 lastIndex = js_DoubleToInteger(lastIndex);
5149 ok = SetRegExpLastIndex(cx, obj, lastIndex);
5151 return ok;
5154 #define REGEXP_PROP_ATTRS (JSPROP_PERMANENT | JSPROP_SHARED)
5155 #define RO_REGEXP_PROP_ATTRS (REGEXP_PROP_ATTRS | JSPROP_READONLY)
5157 #define G regexp_getProperty
5158 #define S regexp_setProperty
5160 static JSPropertySpec regexp_props[] = {
5161 {"source", REGEXP_SOURCE, RO_REGEXP_PROP_ATTRS,G,S},
5162 {"global", REGEXP_GLOBAL, RO_REGEXP_PROP_ATTRS,G,S},
5163 {"ignoreCase", REGEXP_IGNORE_CASE, RO_REGEXP_PROP_ATTRS,G,S},
5164 {"lastIndex", REGEXP_LAST_INDEX, REGEXP_PROP_ATTRS,G,S},
5165 {"multiline", REGEXP_MULTILINE, RO_REGEXP_PROP_ATTRS,G,S},
5166 {"sticky", REGEXP_STICKY, RO_REGEXP_PROP_ATTRS,G,S},
5167 {0,0,0,0,0}
5170 #undef G
5171 #undef S
5174 * RegExp class static properties and their Perl counterparts:
5176 * RegExp.input $_
5177 * RegExp.multiline $*
5178 * RegExp.lastMatch $&
5179 * RegExp.lastParen $+
5180 * RegExp.leftContext $`
5181 * RegExp.rightContext $'
5183 enum regexp_static_tinyid {
5184 REGEXP_STATIC_INPUT = -1,
5185 REGEXP_STATIC_MULTILINE = -2,
5186 REGEXP_STATIC_LAST_MATCH = -3,
5187 REGEXP_STATIC_LAST_PAREN = -4,
5188 REGEXP_STATIC_LEFT_CONTEXT = -5,
5189 REGEXP_STATIC_RIGHT_CONTEXT = -6
5192 void
5193 js_InitRegExpStatics(JSContext *cx)
5196 * To avoid multiple allocations in InitMatch(), the arena size parameter
5197 * should be at least as big as:
5198 * INITIAL_BACKTRACK
5199 * + (sizeof(REProgState) * INITIAL_STATESTACK)
5200 * + (offsetof(REMatchState, parens) + avgParanSize * sizeof(RECapture))
5202 cx->regexpPool.init("regexp", 12 * 1024 - 40, /* FIXME: bug 421435 */
5203 sizeof(void *), &cx->scriptStackQuota);
5205 JS_ClearRegExpStatics(cx);
5208 JS_FRIEND_API(void)
5209 js_SaveAndClearRegExpStatics(JSContext *cx, JSRegExpStatics *statics,
5210 AutoValueRooter *tvr)
5212 statics->copy(cx->regExpStatics);
5213 if (statics->input)
5214 tvr->setString(statics->input);
5215 JS_ClearRegExpStatics(cx);
5218 JS_FRIEND_API(void)
5219 js_RestoreRegExpStatics(JSContext *cx, JSRegExpStatics *statics,
5220 AutoValueRooter *tvr)
5222 /* Clear/free any new JSRegExpStatics data before clobbering. */
5223 cx->regExpStatics.copy(*statics);
5226 void
5227 js_TraceRegExpStatics(JSTracer *trc, JSContext *acx)
5229 JSRegExpStatics *res = &acx->regExpStatics;
5231 if (res->input)
5232 JS_CALL_STRING_TRACER(trc, res->input, "res->input");
5235 void
5236 js_FreeRegExpStatics(JSContext *cx)
5238 JS_ClearRegExpStatics(cx);
5239 cx->regexpPool.finish();
5242 static JSBool
5243 regexp_static_getProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
5245 jsint slot;
5246 JSRegExpStatics *res;
5247 JSString *str;
5248 JSSubString *sub;
5250 res = &cx->regExpStatics;
5251 if (!JSVAL_IS_INT(id))
5252 return JS_TRUE;
5253 slot = JSVAL_TO_INT(id);
5254 switch (slot) {
5255 case REGEXP_STATIC_INPUT:
5256 *vp = res->input ? STRING_TO_JSVAL(res->input)
5257 : JS_GetEmptyStringValue(cx);
5258 return JS_TRUE;
5259 case REGEXP_STATIC_MULTILINE:
5260 *vp = BOOLEAN_TO_JSVAL(res->multiline);
5261 return JS_TRUE;
5262 case REGEXP_STATIC_LAST_MATCH:
5263 sub = &res->lastMatch;
5264 break;
5265 case REGEXP_STATIC_LAST_PAREN:
5266 sub = &res->lastParen;
5267 break;
5268 case REGEXP_STATIC_LEFT_CONTEXT:
5269 sub = &res->leftContext;
5270 break;
5271 case REGEXP_STATIC_RIGHT_CONTEXT:
5272 sub = &res->rightContext;
5273 break;
5274 default:
5275 sub = (size_t(slot) < res->parens.length()) ? &res->parens[slot] : &js_EmptySubString;
5276 break;
5278 str = js_NewStringCopyN(cx, sub->chars, sub->length);
5279 if (!str)
5280 return JS_FALSE;
5281 *vp = STRING_TO_JSVAL(str);
5282 return JS_TRUE;
5285 static JSBool
5286 regexp_static_setProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
5288 JSRegExpStatics *res;
5290 if (!JSVAL_IS_INT(id))
5291 return JS_TRUE;
5292 res = &cx->regExpStatics;
5293 /* XXX use if-else rather than switch to keep MSVC1.52 from crashing */
5294 if (JSVAL_TO_INT(id) == REGEXP_STATIC_INPUT) {
5295 if (!JSVAL_IS_STRING(*vp) &&
5296 !JS_ConvertValue(cx, *vp, JSTYPE_STRING, vp)) {
5297 return JS_FALSE;
5299 res->input = JSVAL_TO_STRING(*vp);
5300 } else if (JSVAL_TO_INT(id) == REGEXP_STATIC_MULTILINE) {
5301 if (!JSVAL_IS_BOOLEAN(*vp) &&
5302 !JS_ConvertValue(cx, *vp, JSTYPE_BOOLEAN, vp)) {
5303 return JS_FALSE;
5305 res->multiline = JSVAL_TO_BOOLEAN(*vp);
5307 return JS_TRUE;
5309 #define REGEXP_STATIC_PROP_ATTRS (REGEXP_PROP_ATTRS | JSPROP_ENUMERATE)
5310 #define RO_REGEXP_STATIC_PROP_ATTRS (REGEXP_STATIC_PROP_ATTRS | JSPROP_READONLY)
5312 static JSPropertySpec regexp_static_props[] = {
5313 {"input",
5314 REGEXP_STATIC_INPUT,
5315 REGEXP_STATIC_PROP_ATTRS,
5316 regexp_static_getProperty, regexp_static_setProperty},
5317 {"multiline",
5318 REGEXP_STATIC_MULTILINE,
5319 REGEXP_STATIC_PROP_ATTRS,
5320 regexp_static_getProperty, regexp_static_setProperty},
5321 {"lastMatch",
5322 REGEXP_STATIC_LAST_MATCH,
5323 RO_REGEXP_STATIC_PROP_ATTRS,
5324 regexp_static_getProperty, regexp_static_getProperty},
5325 {"lastParen",
5326 REGEXP_STATIC_LAST_PAREN,
5327 RO_REGEXP_STATIC_PROP_ATTRS,
5328 regexp_static_getProperty, regexp_static_getProperty},
5329 {"leftContext",
5330 REGEXP_STATIC_LEFT_CONTEXT,
5331 RO_REGEXP_STATIC_PROP_ATTRS,
5332 regexp_static_getProperty, regexp_static_getProperty},
5333 {"rightContext",
5334 REGEXP_STATIC_RIGHT_CONTEXT,
5335 RO_REGEXP_STATIC_PROP_ATTRS,
5336 regexp_static_getProperty, regexp_static_getProperty},
5338 /* XXX should have block scope and local $1, etc. */
5339 {"$1", 0, RO_REGEXP_STATIC_PROP_ATTRS,
5340 regexp_static_getProperty, regexp_static_getProperty},
5341 {"$2", 1, RO_REGEXP_STATIC_PROP_ATTRS,
5342 regexp_static_getProperty, regexp_static_getProperty},
5343 {"$3", 2, RO_REGEXP_STATIC_PROP_ATTRS,
5344 regexp_static_getProperty, regexp_static_getProperty},
5345 {"$4", 3, RO_REGEXP_STATIC_PROP_ATTRS,
5346 regexp_static_getProperty, regexp_static_getProperty},
5347 {"$5", 4, RO_REGEXP_STATIC_PROP_ATTRS,
5348 regexp_static_getProperty, regexp_static_getProperty},
5349 {"$6", 5, RO_REGEXP_STATIC_PROP_ATTRS,
5350 regexp_static_getProperty, regexp_static_getProperty},
5351 {"$7", 6, RO_REGEXP_STATIC_PROP_ATTRS,
5352 regexp_static_getProperty, regexp_static_getProperty},
5353 {"$8", 7, RO_REGEXP_STATIC_PROP_ATTRS,
5354 regexp_static_getProperty, regexp_static_getProperty},
5355 {"$9", 8, RO_REGEXP_STATIC_PROP_ATTRS,
5356 regexp_static_getProperty, regexp_static_getProperty},
5358 {0,0,0,0,0}
5361 static void
5362 regexp_finalize(JSContext *cx, JSObject *obj)
5364 JSRegExp *re = (JSRegExp *) obj->getPrivate();
5365 if (!re)
5366 return;
5367 js_DestroyRegExp(cx, re);
5370 /* Forward static prototype. */
5371 static JSBool
5372 regexp_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
5373 JSBool test, jsval *rval);
5375 static JSBool
5376 regexp_call(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
5378 return regexp_exec_sub(cx, JSVAL_TO_OBJECT(argv[-2]), argc, argv,
5379 JS_FALSE, rval);
5382 #if JS_HAS_XDR
5384 #include "jsxdrapi.h"
5386 JSBool
5387 js_XDRRegExpObject(JSXDRState *xdr, JSObject **objp)
5389 JSRegExp *re;
5390 JSString *source;
5391 uint32 flagsword;
5392 JSObject *obj;
5394 if (xdr->mode == JSXDR_ENCODE) {
5395 re = (JSRegExp *) (*objp)->getPrivate();
5396 if (!re)
5397 return JS_FALSE;
5398 source = re->source;
5399 flagsword = (uint32)re->flags;
5401 if (!JS_XDRString(xdr, &source) ||
5402 !JS_XDRUint32(xdr, &flagsword)) {
5403 return JS_FALSE;
5405 if (xdr->mode == JSXDR_DECODE) {
5406 obj = NewObject(xdr->cx, &js_RegExpClass, NULL, NULL);
5407 if (!obj)
5408 return JS_FALSE;
5409 obj->clearParent();
5410 obj->clearProto();
5411 re = js_NewRegExp(xdr->cx, NULL, source, (uint8)flagsword, JS_FALSE);
5412 if (!re)
5413 return JS_FALSE;
5414 obj->setPrivate(re);
5415 obj->zeroRegExpLastIndex();
5416 *objp = obj;
5418 return JS_TRUE;
5421 #else /* !JS_HAS_XDR */
5423 #define js_XDRRegExpObject NULL
5425 #endif /* !JS_HAS_XDR */
5427 static void
5428 regexp_trace(JSTracer *trc, JSObject *obj)
5430 JSRegExp *re = (JSRegExp *) obj->getPrivate();
5431 if (re && re->source)
5432 JS_CALL_STRING_TRACER(trc, re->source, "source");
5435 JSClass js_RegExpClass = {
5436 js_RegExp_str,
5437 JSCLASS_HAS_PRIVATE |
5438 JSCLASS_HAS_RESERVED_SLOTS(JSObject::REGEXP_FIXED_RESERVED_SLOTS) |
5439 JSCLASS_MARK_IS_TRACE | JSCLASS_HAS_CACHED_PROTO(JSProto_RegExp),
5440 JS_PropertyStub, JS_PropertyStub,
5441 JS_PropertyStub, JS_PropertyStub,
5442 JS_EnumerateStub, JS_ResolveStub,
5443 JS_ConvertStub, regexp_finalize,
5444 NULL, NULL,
5445 regexp_call, NULL,
5446 js_XDRRegExpObject, NULL,
5447 JS_CLASS_TRACE(regexp_trace), 0
5450 static const jschar empty_regexp_ucstr[] = {'(', '?', ':', ')', 0};
5452 JSBool
5453 js_regexp_toString(JSContext *cx, JSObject *obj, jsval *vp)
5455 JSRegExp *re;
5456 const jschar *source;
5457 jschar *chars;
5458 size_t length, nflags;
5459 uintN flags;
5460 JSString *str;
5462 if (!JS_InstanceOf(cx, obj, &js_RegExpClass, vp + 2))
5463 return JS_FALSE;
5464 JS_LOCK_OBJ(cx, obj);
5465 re = (JSRegExp *) obj->getPrivate();
5466 if (!re) {
5467 JS_UNLOCK_OBJ(cx, obj);
5468 *vp = STRING_TO_JSVAL(cx->runtime->emptyString);
5469 return JS_TRUE;
5472 re->source->getCharsAndLength(source, length);
5473 if (length == 0) {
5474 source = empty_regexp_ucstr;
5475 length = JS_ARRAY_LENGTH(empty_regexp_ucstr) - 1;
5477 length += 2;
5478 nflags = 0;
5479 for (flags = re->flags; flags != 0; flags &= flags - 1)
5480 nflags++;
5481 chars = (jschar*) cx->malloc((length + nflags + 1) * sizeof(jschar));
5482 if (!chars) {
5483 JS_UNLOCK_OBJ(cx, obj);
5484 return JS_FALSE;
5487 chars[0] = '/';
5488 js_strncpy(&chars[1], source, length - 2);
5489 chars[length-1] = '/';
5490 if (nflags) {
5491 if (re->flags & JSREG_GLOB)
5492 chars[length++] = 'g';
5493 if (re->flags & JSREG_FOLD)
5494 chars[length++] = 'i';
5495 if (re->flags & JSREG_MULTILINE)
5496 chars[length++] = 'm';
5497 if (re->flags & JSREG_STICKY)
5498 chars[length++] = 'y';
5500 JS_UNLOCK_OBJ(cx, obj);
5501 chars[length] = 0;
5503 str = js_NewString(cx, chars, length);
5504 if (!str) {
5505 cx->free(chars);
5506 return JS_FALSE;
5508 *vp = STRING_TO_JSVAL(str);
5509 return JS_TRUE;
5512 static JSBool
5513 regexp_toString(JSContext *cx, uintN argc, jsval *vp)
5515 JSObject *obj;
5517 obj = JS_THIS_OBJECT(cx, vp);
5518 return obj && js_regexp_toString(cx, obj, vp);
5521 static JSBool
5522 regexp_compile_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
5523 jsval *rval)
5525 JSString *opt, *str;
5526 JSRegExp *oldre, *re;
5527 JSObject *obj2;
5528 size_t length, nbytes;
5529 const jschar *cp, *start, *end;
5530 jschar *nstart, *ncp, *tmp;
5532 if (!JS_InstanceOf(cx, obj, &js_RegExpClass, argv))
5533 return JS_FALSE;
5534 opt = NULL;
5535 if (argc == 0) {
5536 str = cx->runtime->emptyString;
5537 } else {
5538 if (JSVAL_IS_OBJECT(argv[0])) {
5540 * If we get passed in a RegExp object we construct a new
5541 * RegExp that is a duplicate of it by re-compiling the
5542 * original source code. ECMA requires that it be an error
5543 * here if the flags are specified. (We must use the flags
5544 * from the original RegExp also).
5546 obj2 = JSVAL_TO_OBJECT(argv[0]);
5547 if (obj2 && obj2->getClass() == &js_RegExpClass) {
5548 if (argc >= 2 && !JSVAL_IS_VOID(argv[1])) { /* 'flags' passed */
5549 JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL,
5550 JSMSG_NEWREGEXP_FLAGGED);
5551 return JS_FALSE;
5553 JS_LOCK_OBJ(cx, obj2);
5554 re = (JSRegExp *) obj2->getPrivate();
5555 if (!re) {
5556 JS_UNLOCK_OBJ(cx, obj2);
5557 return JS_FALSE;
5559 re = js_NewRegExp(cx, NULL, re->source, re->flags, JS_FALSE);
5560 JS_UNLOCK_OBJ(cx, obj2);
5561 goto created;
5564 str = js_ValueToString(cx, argv[0]);
5565 if (!str)
5566 return JS_FALSE;
5567 argv[0] = STRING_TO_JSVAL(str);
5568 if (argc > 1) {
5569 if (JSVAL_IS_VOID(argv[1])) {
5570 opt = NULL;
5571 } else {
5572 opt = js_ValueToString(cx, argv[1]);
5573 if (!opt)
5574 return JS_FALSE;
5575 argv[1] = STRING_TO_JSVAL(opt);
5579 /* Escape any naked slashes in the regexp source. */
5580 str->getCharsAndLength(start, length);
5581 end = start + length;
5582 nstart = ncp = NULL;
5583 for (cp = start; cp < end; cp++) {
5584 if (*cp == '/' && (cp == start || cp[-1] != '\\')) {
5585 nbytes = (++length + 1) * sizeof(jschar);
5586 if (!nstart) {
5587 nstart = (jschar *) cx->malloc(nbytes);
5588 if (!nstart)
5589 return JS_FALSE;
5590 ncp = nstart + (cp - start);
5591 js_strncpy(nstart, start, cp - start);
5592 } else {
5593 tmp = (jschar *) cx->realloc(nstart, nbytes);
5594 if (!tmp) {
5595 cx->free(nstart);
5596 return JS_FALSE;
5598 ncp = tmp + (ncp - nstart);
5599 nstart = tmp;
5601 *ncp++ = '\\';
5603 if (nstart)
5604 *ncp++ = *cp;
5607 if (nstart) {
5608 /* Don't forget to store the backstop after the new string. */
5609 JS_ASSERT((size_t)(ncp - nstart) == length);
5610 *ncp = 0;
5611 str = js_NewString(cx, nstart, length);
5612 if (!str) {
5613 cx->free(nstart);
5614 return JS_FALSE;
5616 argv[0] = STRING_TO_JSVAL(str);
5620 re = js_NewRegExpOpt(cx, str, opt, JS_FALSE);
5621 created:
5622 if (!re)
5623 return JS_FALSE;
5624 JS_LOCK_OBJ(cx, obj);
5625 oldre = (JSRegExp *) obj->getPrivate();
5626 obj->setPrivate(re);
5627 obj->zeroRegExpLastIndex();
5628 JS_UNLOCK_OBJ(cx, obj);
5629 if (oldre)
5630 js_DestroyRegExp(cx, oldre);
5631 *rval = OBJECT_TO_JSVAL(obj);
5632 return JS_TRUE;
5635 static JSBool
5636 regexp_compile(JSContext *cx, uintN argc, jsval *vp)
5638 JSObject *obj;
5640 obj = JS_THIS_OBJECT(cx, vp);
5641 return obj && regexp_compile_sub(cx, obj, argc, vp + 2, vp);
5644 static JSBool
5645 regexp_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
5646 JSBool test, jsval *rval)
5648 JSBool ok, sticky;
5649 JSRegExp *re;
5650 jsdouble lastIndex;
5651 JSString *str;
5652 size_t i;
5654 ok = JS_InstanceOf(cx, obj, &js_RegExpClass, argv);
5655 if (!ok)
5656 return JS_FALSE;
5657 JS_LOCK_OBJ(cx, obj);
5658 re = (JSRegExp *) obj->getPrivate();
5659 if (!re) {
5660 JS_UNLOCK_OBJ(cx, obj);
5661 return JS_TRUE;
5664 /* NB: we must reach out: after this paragraph, in order to drop re. */
5665 HOLD_REGEXP(cx, re);
5666 sticky = (re->flags & JSREG_STICKY) != 0;
5667 if (re->flags & (JSREG_GLOB | JSREG_STICKY)) {
5668 jsval v = obj->getRegExpLastIndex();
5669 if (JSVAL_IS_INT(v)) {
5670 lastIndex = JSVAL_TO_INT(v);
5671 } else {
5672 JS_ASSERT(JSVAL_IS_DOUBLE(v));
5673 lastIndex = *JSVAL_TO_DOUBLE(v);
5675 } else {
5676 lastIndex = 0;
5678 JS_UNLOCK_OBJ(cx, obj);
5680 /* Now that obj is unlocked, it's safe to (potentially) grab the GC lock. */
5681 if (argc == 0) {
5682 str = cx->regExpStatics.input;
5683 if (!str) {
5684 const char *bytes = js_GetStringBytes(cx, re->source);
5686 if (bytes) {
5687 JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL,
5688 JSMSG_NO_INPUT,
5689 bytes,
5690 (re->flags & JSREG_GLOB) ? "g" : "",
5691 (re->flags & JSREG_FOLD) ? "i" : "",
5692 (re->flags & JSREG_MULTILINE) ? "m" : "",
5693 (re->flags & JSREG_STICKY) ? "y" : "");
5695 ok = JS_FALSE;
5696 goto out;
5698 } else {
5699 str = js_ValueToString(cx, argv[0]);
5700 if (!str) {
5701 ok = JS_FALSE;
5702 goto out;
5704 argv[0] = STRING_TO_JSVAL(str);
5707 if (lastIndex < 0 || str->length() < lastIndex) {
5708 obj->zeroRegExpLastIndex();
5709 *rval = JSVAL_NULL;
5710 } else {
5711 i = (size_t) lastIndex;
5712 ok = js_ExecuteRegExp(cx, re, str, &i, test, rval);
5713 if (ok &&
5714 ((re->flags & JSREG_GLOB) || (*rval != JSVAL_NULL && sticky))) {
5715 if (*rval == JSVAL_NULL)
5716 obj->zeroRegExpLastIndex();
5717 else
5718 ok = SetRegExpLastIndex(cx, obj, i);
5722 out:
5723 DROP_REGEXP(cx, re);
5724 return ok;
5727 static JSBool
5728 regexp_exec(JSContext *cx, uintN argc, jsval *vp)
5730 return regexp_exec_sub(cx, JS_THIS_OBJECT(cx, vp), argc, vp + 2, JS_FALSE,
5731 vp);
5734 static JSBool
5735 regexp_test(JSContext *cx, uintN argc, jsval *vp)
5737 if (!regexp_exec_sub(cx, JS_THIS_OBJECT(cx, vp), argc, vp + 2, JS_TRUE, vp))
5738 return JS_FALSE;
5739 if (*vp != JSVAL_TRUE)
5740 *vp = JSVAL_FALSE;
5741 return JS_TRUE;
5744 static JSFunctionSpec regexp_methods[] = {
5745 #if JS_HAS_TOSOURCE
5746 JS_FN(js_toSource_str, regexp_toString, 0,0),
5747 #endif
5748 JS_FN(js_toString_str, regexp_toString, 0,0),
5749 JS_FN("compile", regexp_compile, 2,0),
5750 JS_FN("exec", regexp_exec, 1,0),
5751 JS_FN("test", regexp_test, 1,0),
5752 JS_FS_END
5755 static JSBool
5756 RegExp(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
5758 if (!JS_IsConstructing(cx)) {
5760 * If first arg is regexp and no flags are given, just return the arg.
5761 * (regexp_compile_sub detects the regexp + flags case and throws a
5762 * TypeError.) See 10.15.3.1.
5764 if ((argc < 2 || JSVAL_IS_VOID(argv[1])) &&
5765 !JSVAL_IS_PRIMITIVE(argv[0]) &&
5766 JSVAL_TO_OBJECT(argv[0])->getClass() == &js_RegExpClass) {
5767 *rval = argv[0];
5768 return JS_TRUE;
5771 /* Otherwise, replace obj with a new RegExp object. */
5772 obj = NewObject(cx, &js_RegExpClass, NULL, NULL);
5773 if (!obj)
5774 return JS_FALSE;
5777 * regexp_compile_sub does not use rval to root its temporaries so we
5778 * can use it to root obj.
5780 *rval = OBJECT_TO_JSVAL(obj);
5782 return regexp_compile_sub(cx, obj, argc, argv, rval);
5785 JSObject *
5786 js_InitRegExpClass(JSContext *cx, JSObject *obj)
5788 JSObject *proto = js_InitClass(cx, obj, NULL, &js_RegExpClass, RegExp, 1,
5789 regexp_props, regexp_methods,
5790 regexp_static_props, NULL);
5791 if (!proto)
5792 return NULL;
5794 JSObject *ctor = JS_GetConstructor(cx, proto);
5795 if (!ctor)
5796 return NULL;
5798 /* Give RegExp.prototype private data so it matches the empty string. */
5799 jsval rval;
5800 if (!JS_AliasProperty(cx, ctor, "input", "$_") ||
5801 !JS_AliasProperty(cx, ctor, "multiline", "$*") ||
5802 !JS_AliasProperty(cx, ctor, "lastMatch", "$&") ||
5803 !JS_AliasProperty(cx, ctor, "lastParen", "$+") ||
5804 !JS_AliasProperty(cx, ctor, "leftContext", "$`") ||
5805 !JS_AliasProperty(cx, ctor, "rightContext", "$'") ||
5806 !regexp_compile_sub(cx, proto, 0, NULL, &rval)) {
5807 return NULL;
5810 return proto;
5813 JSObject *
5814 js_NewRegExpObject(JSContext *cx, TokenStream *ts,
5815 const jschar *chars, size_t length, uintN flags)
5817 JSString *str;
5818 JSObject *obj;
5819 JSRegExp *re;
5821 str = js_NewStringCopyN(cx, chars, length);
5822 if (!str)
5823 return NULL;
5824 AutoValueRooter tvr(cx, str);
5825 re = js_NewRegExp(cx, ts, str, flags, JS_FALSE);
5826 if (!re)
5827 return NULL;
5828 obj = NewObject(cx, &js_RegExpClass, NULL, NULL);
5829 if (!obj) {
5830 js_DestroyRegExp(cx, re);
5831 return NULL;
5833 obj->setPrivate(re);
5834 obj->zeroRegExpLastIndex();
5835 return obj;
5838 JSObject * JS_FASTCALL
5839 js_CloneRegExpObject(JSContext *cx, JSObject *obj, JSObject *proto)
5841 JS_ASSERT(obj->getClass() == &js_RegExpClass);
5842 JS_ASSERT(proto);
5843 JS_ASSERT(proto->getClass() == &js_RegExpClass);
5844 JSObject *clone = NewObjectWithGivenProto(cx, &js_RegExpClass, proto, NULL);
5845 if (!clone)
5846 return NULL;
5847 JSRegExp *re = static_cast<JSRegExp *>(obj->getPrivate());
5848 clone->setPrivate(re);
5849 clone->zeroRegExpLastIndex();
5850 HOLD_REGEXP(cx, re);
5851 return clone;
5854 #ifdef JS_TRACER
5855 JS_DEFINE_CALLINFO_3(extern, OBJECT, js_CloneRegExpObject, CONTEXT, OBJECT, OBJECT, 0,
5856 ACC_STORE_ANY)
5857 #endif
5859 bool
5860 js_ContainsRegExpMetaChars(const jschar *chars, size_t length)
5862 for (size_t i = 0; i < length; ++i) {
5863 jschar c = chars[i];
5864 switch (c) {
5865 /* Taken from the PatternCharacter production in 15.10.1. */
5866 case '^': case '$': case '\\': case '.': case '*': case '+':
5867 case '?': case '(': case ')': case '[': case ']': case '{':
5868 case '}': case '|':
5869 return true;
5870 default:;
5873 return false;
5876 JSBool
5877 js_ObjectIsRegExp(JSObject *obj)
5879 return obj->isRegExp();